"""
TODO doc me
"""
import sys
from petl.io import Uncacheable
from petl.util import RowContainer, data, iterpeek
from petlx.util import UnsatisfiedDependency
from petlx.array import guessdtype
dep_message = """
The package pytables is required. Instructions for installation can be found
at http://pytables.github.com/usersguide/installation.html or try apt-get install
python-tables.
"""
def fromhdf5(source, where=None, name=None, condition=None,
             condvars=None, start=None, stop=None, step=None):
    """
    Provides access to an HDF5 table. E.g.::

        >>> from petl import look
        >>> from petlx.hdf5 import fromhdf5
        >>> table1 = fromhdf5('test1.h5', '/testgroup', 'testtable')
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    Some alternative signatures::

        >>> # just specify path to table node
        ... table1 = fromhdf5('test1.h5', '/testgroup/testtable')
        >>>
        >>> # use an existing tables.File object
        ... import tables
        >>> h5file = tables.openFile('test1.h5')
        >>> table1 = fromhdf5(h5file, '/testgroup/testtable')
        >>>
        >>> # use an existing tables.Table object
        ... h5tbl = h5file.getNode('/testgroup/testtable')
        >>> table1 = fromhdf5(h5tbl)
        >>>
        >>> # use a condition to filter data
        ... table2 = fromhdf5(h5tbl, condition="(foo < 3)")
        >>> look(table2)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+

    The `source` may be a file name, a ``tables.File`` or a ``tables.Table``.
    If `condition` is given, rows are selected with ``Table.where`` (with
    `condvars` passed through); otherwise ``Table.iterrows`` is used. The
    `start`, `stop` and `step` arguments are passed through in both cases.

    Nothing is opened or read until the returned view is iterated.

    .. versionadded:: 0.3

    """
    # collect the optional settings once and hand them on unchanged
    options = dict(where=where, name=name,
                   condition=condition, condvars=condvars,
                   start=start, stop=stop, step=step)
    return HDF5View(source, **options)
class HDF5View(RowContainer):
    """
    Row container over an HDF5 table.

    The constructor only records its arguments; each call to ``iter()``
    starts a fresh :func:`iterhdf5` generator over the table.

    """

    def __init__(self, source, where=None, name=None, condition=None,
                 condvars=None, start=None, stop=None, step=None):
        # where to find the table
        self.source, self.where, self.name = source, where, name
        # optional row selection
        self.condition, self.condvars = condition, condvars
        # optional row range
        self.start, self.stop, self.step = start, stop, step

    def __iter__(self):
        return iterhdf5(source=self.source, where=self.where, name=self.name,
                        condition=self.condition, condvars=self.condvars,
                        start=self.start, stop=self.stop, step=self.step)

    def cachetag(self):
        """Always raise ``Uncacheable``; no cache tag is computed yet."""
        # TODO
        raise Uncacheable()
def _get_hdf5_table(source, where, name, mode='r'):
    """
    Resolve `source` to a ``(h5file, h5tbl)`` pair.

    `source` may be:

    * a ``tables.Table``, which is returned as-is with ``h5file`` set to
      ``None`` (`where`, `name` and `mode` are ignored);
    * a file name, which is opened with ``tables.openFile`` using `mode`;
    * an already open ``tables.File``.

    In the last two cases the table node is looked up with
    ``h5file.getNode(where, name=name)``.

    Callers are responsible for closing the returned file when they passed
    in a file name. If the lookup fails, a file opened here is closed before
    the exception propagates, so no handle is leaked.

    Raises ``UnsatisfiedDependency`` if PyTables cannot be imported,
    ``Exception`` if `source` is of an unsupported type, and
    ``AssertionError`` if the node found is not a table.

    """
    try:
        import tables
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    # allow for polymorphic args
    if isinstance(source, tables.Table):
        return None, source

    opened_here = False
    if isinstance(source, basestring):
        # assume it's the name of an HDF5 file
        h5file = tables.openFile(source, mode=mode)
        opened_here = True
    elif isinstance(source, tables.File):
        h5file = source
    else:
        raise Exception('invalid source argument, expected file name or tables.File or tables.Table object, found: %r' % source)

    try:
        h5tbl = h5file.getNode(where, name=name)
        # raised explicitly (rather than via an assert statement) so that the
        # check is still performed when Python runs with -O
        if not isinstance(h5tbl, tables.Table):
            raise AssertionError('node is not a table: %r' % h5tbl)
    except BaseException:
        # the caller never receives the handle on failure, so a file that
        # was opened here has to be closed here
        if opened_here:
            h5file.close()
        raise

    return h5file, h5tbl
def iterhdf5(source, where, name, condition, condvars, start, stop, step):
    """
    Generate the header row followed by the data rows of an HDF5 table.

    The header is a tuple of the table's column names. Data rows come from
    ``Table.where`` when `condition` is not None and from ``Table.iterrows``
    otherwise; each one is yielded as ``row[:]``.

    If `source` is a file name, the file opened for it is closed when the
    generator finishes or is closed.

    """
    h5file, h5tbl = _get_hdf5_table(source, where, name)
    try:
        header = tuple(h5tbl.colnames)
        yield header

        # pick the row source: plain scan, or a filtered query
        if condition is None:
            rows = h5tbl.iterrows(start=start, stop=stop, step=step)
        else:
            rows = h5tbl.where(condition, condvars=condvars,
                               start=start, stop=stop, step=step)

        for record in rows:
            # slicing the row object gives its values as a tuple
            yield record[:]
    finally:
        # only close what was opened on the caller's behalf
        if isinstance(source, basestring):
            h5file.close()
def fromhdf5sorted(source, where=None, name=None, sortby=None, checkCSI=False,
                   start=None, stop=None, step=None):
    """
    Provides access to an HDF5 table, sorted by an indexed column, e.g.::

        >>> # set up a new hdf5 table to demonstrate with
        ... import tables
        >>> h5file = tables.openFile("test1.h5", mode="w", title="Test file")
        >>> h5file.createGroup('/', 'testgroup', 'Test Group')
        /testgroup (Group) 'Test Group'
          children := []
        >>> class FooBar(tables.IsDescription):
        ...     foo = tables.Int32Col(pos=0)
        ...     bar = tables.StringCol(6, pos=2)
        ...
        >>> h5table = h5file.createTable('/testgroup', 'testtable', FooBar, 'Test Table')
        >>>
        >>> # load some data into the table
        ... table1 = (('foo', 'bar'),
        ...           (3, 'asdfgh'),
        ...           (2, 'qwerty'),
        ...           (1, 'zxcvbn'))
        >>>
        >>> for row in table1[1:]:
        ...     for i, f in enumerate(table1[0]):
        ...         h5table.row[f] = row[i]
        ...     h5table.row.append()
        ...
        >>> h5table.cols.foo.createCSIndex() # CS index is required
        0
        >>> h5file.flush()
        >>> h5file.close()
        >>>
        >>> # access the data, sorted by the indexed column
        ... from petl import look
        >>> from petlx.hdf5 import fromhdf5sorted
        >>> table2 = fromhdf5sorted('test1.h5', '/testgroup', 'testtable', sortby='foo')
        >>> look(table2)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'zxcvbn' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'asdfgh' |
        +-------+----------+

    The `sortby` argument is required; an ``AssertionError`` is raised if it
    is None. `sortby`, `checkCSI`, `start`, `stop` and `step` are passed
    through to ``Table.itersorted`` when the returned view is iterated.

    .. versionadded:: 0.3

    """
    # raised explicitly (rather than via an assert statement) so that the
    # check is still performed when Python runs with -O; the exception type
    # and message are the same as before
    if sortby is None:
        raise AssertionError('no column specified to sort by')
    return HDF5SortedView(source, where=where, name=name,
                          sortby=sortby, checkCSI=checkCSI,
                          start=start, stop=stop, step=step)
class HDF5SortedView(RowContainer):
    """
    Row container over an HDF5 table, read in the order of a sorted column.

    The constructor only records its arguments; each call to ``iter()``
    starts a fresh :func:`iterhdf5sorted` generator over the table.

    """

    def __init__(self, source, where=None, name=None, sortby=None,
                 checkCSI=False, start=None, stop=None, step=None):
        # where to find the table
        self.source, self.where, self.name = source, where, name
        # sort settings
        self.sortby, self.checkCSI = sortby, checkCSI
        # optional row range
        self.start, self.stop, self.step = start, stop, step

    def __iter__(self):
        return iterhdf5sorted(source=self.source, where=self.where,
                              name=self.name, sortby=self.sortby,
                              checkCSI=self.checkCSI, start=self.start,
                              stop=self.stop, step=self.step)

    def cachetag(self):
        """Always raise ``Uncacheable``; no cache tag is computed yet."""
        # TODO
        raise Uncacheable()
def iterhdf5sorted(source, where, name, sortby, checkCSI, start, stop, step):
    """
    Generate the header row followed by the data rows of an HDF5 table, in
    the order produced by ``Table.itersorted`` for the `sortby` column.

    The header is a tuple of the table's column names; each data row is
    yielded as ``row[:]``.

    If `source` is a file name, the file opened for it is closed when the
    generator finishes or is closed.

    """
    h5file, h5tbl = _get_hdf5_table(source, where, name)
    try:
        header = tuple(h5tbl.colnames)
        yield header

        ordered = h5tbl.itersorted(sortby, checkCSI=checkCSI,
                                   start=start, stop=stop, step=step)
        for record in ordered:
            # slicing the row object gives its values as a tuple
            yield record[:]
    finally:
        # only close what was opened on the caller's behalf
        if isinstance(source, basestring):
            h5file.close()
def tohdf5(table, source, where=None, name=None, create=False,
           description=None, title='', filters=None, expectedrows=10000,
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    When `create` is `True`, `source` must be a file name or a
    ``tables.File``; the first `sample` rows are used to guess the datatype
    if `description` is None, and `title`, `filters`, `expectedrows`,
    `chunkshape`, `byteorder` and `createparents` are passed through to
    ``File.createTable``. When `create` is `False`, `source` may also be a
    ``tables.Table``.

    If `source` is a file name, the file is opened in append mode and is
    always closed again before returning, including when an error occurs.

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """
    it = iter(table)

    # a file is only closed here if it was opened here from a file name
    opened_here = isinstance(source, basestring)
    h5file = None

    # the try block starts before the file is opened so that a failure at
    # any later step (guessing the dtype, dropping or creating the node,
    # loading) cannot leak the handle
    try:
        if create:
            try:
                import tables
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)

            if opened_here:
                # assume it's the name of an HDF5 file
                h5file = tables.openFile(source, mode='a')  # don't replace the whole file!
            elif isinstance(source, tables.File):
                h5file = source
            else:
                raise Exception('invalid source argument, expected file name or tables.File, found: %r' % source)

            # determine datatype
            if description is None:
                peek, it = iterpeek(it, sample)
                # use a numpy dtype
                description = guessdtype(peek)

            # check if the table node already exists
            try:
                h5file.getNode(where, name)
            except tables.NoSuchNodeError:
                pass
            else:
                # drop the node
                h5file.removeNode(where, name)

            # create the table
            h5table = h5file.createTable(where, name, description, title=title,
                                         filters=filters, expectedrows=expectedrows,
                                         chunkshape=chunkshape, byteorder=byteorder,
                                         createparents=createparents)
        else:
            h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

        # truncate the existing table (also applied to a table that has
        # just been created, as before)
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)

    finally:
        if opened_here and h5file is not None:
            # close the file if we opened it here
            h5file.close()
def appendhdf5(table, source, where=None, name=None):
    """
    Like :func:`tohdf5` but don't truncate the table before loading.

    The target table must already exist. `source` may be a file name, a
    ``tables.File`` or a ``tables.Table``; a file opened from a file name is
    opened in append mode and closed again once loading finishes or fails.

    .. versionadded:: 0.3

    """
    h5file, h5table = _get_hdf5_table(source, where, name, mode='a')
    close_when_done = isinstance(source, basestring)
    try:
        # load the data
        _insert(table, h5table)
    finally:
        if close_when_done:
            # the file was opened on the caller's behalf, so release it
            h5file.close()
def _insert(table, h5table):
    """
    Append the data rows of `table` to `h5table`, then flush it.

    Values are matched to HDF5 columns by position: the order of fields in
    the input must be the same as ``h5table.colnames``, but the field names
    themselves don't need to match. The header row of `table` is skipped.

    """
    colnames = h5table.colnames
    for record in data(table):  # data() skips the header
        rowbuf = h5table.row
        for pos, colname in enumerate(colnames):
            rowbuf[colname] = record[pos]
        rowbuf.append()
    h5table.flush()
# Runs at import time, after everything above has been defined: hands this
# module object to the package's ``integrate`` hook.
# NOTE(review): presumably this exposes the functions above through petl's
# fluent/wrapper interface - confirm against the ``integration`` module.
from .integration import integrate
integrate(sys.modules[__name__])