# Source code for petlx.hdf5

"""
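Read from and write to HDF5 tables via PyTables.

Provides :func:`fromhdf5` and :func:`fromhdf5sorted` for extracting rows from
an HDF5 table, and :func:`tohdf5` and :func:`appendhdf5` for loading rows into
one. The pytables package is imported lazily, when a table is first accessed.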

"""

import sys

from petl.util import RowContainer, data, iterpeek


from petlx.util import UnsatisfiedDependency
from petlx.array import guessdtype


# Installation hint passed to UnsatisfiedDependency, together with the
# original ImportError, when ``import tables`` fails in this module.
dep_message = """
The package pytables is required. Instructions for installation can be found 
at http://pytables.github.com/usersguide/installation.html or try apt-get install 
python-tables.
"""


def fromhdf5(source, where=None, name=None, condition=None,
             condvars=None, start=None, stop=None, step=None):
    """
    Provides access to an HDF5 table. E.g.::

        >>> from petl import look
        >>> from petlx.hdf5 import fromhdf5
        >>> table1 = fromhdf5('test1.h5', '/testgroup', 'testtable')
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    Some alternative signatures::

        >>> # just specify path to table node
        ... table1 = fromhdf5('test1.h5', '/testgroup/testtable')
        >>>
        >>> # use an existing tables.File object
        ... import tables
        >>> h5file = tables.openFile('test1.h5')
        >>> table1 = fromhdf5(h5file, '/testgroup/testtable')
        >>>
        >>> # use an existing tables.Table object
        ... h5tbl = h5file.getNode('/testgroup/testtable')
        >>> table1 = fromhdf5(h5tbl)
        >>>
        >>> # use a condition to filter data
        ... table2 = fromhdf5(h5tbl, condition="(foo < 3)")
        >>> look(table2)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+

    The `source` argument may be the name of an HDF5 file, a `tables.File`
    object or a `tables.Table` object. Unless `source` is already a table,
    `where` and `name` are used to look up the table node within the file.

    If `condition` is given, rows are selected via the table's ``where()``
    method (with `condvars` passed through); otherwise all rows are read via
    ``iterrows()``. In both cases `start`, `stop` and `step` are passed
    through to restrict the range of rows.

    Nothing is opened or read until the returned container is iterated.

    .. versionadded:: 0.3

    """

    return HDF5View(source, where=where, name=name,
                    condition=condition, condvars=condvars,
                    start=start, stop=stop, step=step)


class HDF5View(RowContainer):
    """
    Row container over an HDF5 table, optionally filtered by a condition.

    The constructor only records its arguments; the table is resolved and
    read by :func:`iterhdf5` each time the view is iterated.

    """

    def __init__(self, source, where=None, name=None, condition=None,
                 condvars=None, start=None, stop=None, step=None):
        # where to find the table
        self.source, self.where, self.name = source, where, name
        # optional row filter
        self.condition, self.condvars = condition, condvars
        # optional row range
        self.start, self.stop, self.step = start, stop, step

    def __iter__(self):
        # a fresh generator is built for every iteration of the view
        return iterhdf5(source=self.source,
                        where=self.where,
                        name=self.name,
                        condition=self.condition,
                        condvars=self.condvars,
                        start=self.start,
                        stop=self.stop,
                        step=self.step)
    

def _get_hdf5_table(source, where, name, mode='r'):
    """
    Resolve `source` to a ``(h5file, h5tbl)`` pair.

    `source` may be:

    * a `tables.Table` - returned as-is, with ``None`` in place of the file;
    * a string - taken to be the name of an HDF5 file, which is opened with
      the given `mode`;
    * a `tables.File` - used directly.

    In the latter two cases the table node is looked up via
    ``h5file.getNode(where, name=name)``.

    Raises `UnsatisfiedDependency` if pytables cannot be imported,
    `Exception` if `source` is of an unsupported type, and `AssertionError`
    if the node found is not a table. If the file was opened here and the
    lookup fails, the file is closed again before the error propagates.

    """

    try:
        import tables
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    # allow for polymorphic args
    if isinstance(source, tables.Table):
        # nothing to open or look up
        return None, source

    if isinstance(source, basestring):
        # assume it's the name of an HDF5 file
        h5file = tables.openFile(source, mode=mode)
        opened_here = True
    elif isinstance(source, tables.File):
        h5file = source
        opened_here = False
    else:
        raise Exception('invalid source argument, expected file name or tables.File or tables.Table object, found: %r' % source)

    resolved = False
    try:
        h5tbl = h5file.getNode(where, name=name)
        # explicit raise rather than an assert statement, so the check is
        # still enforced when Python runs with -O
        if not isinstance(h5tbl, tables.Table):
            raise AssertionError('node is not a table: %r' % h5tbl)
        resolved = True
    finally:
        # the caller never receives the handle on failure, so a file
        # opened here must be closed here to avoid leaking it
        if opened_here and not resolved:
            h5file.close()

    return h5file, h5tbl

    
def iterhdf5(source, where, name, condition, condvars, start, stop, step):
    """
    Generate the header row followed by the data rows of an HDF5 table.

    The header is a tuple of the table's column names. If `condition` is not
    None the rows come from the table's ``where()`` method, otherwise from
    ``iterrows()``; `start`, `stop` and `step` are passed through either way.
    If `source` is a file name, the file is closed when the generator
    finishes or is closed.

    """

    h5file, h5tbl = _get_hdf5_table(source, where, name)

    try:
        # header row
        yield tuple(h5tbl.colnames)

        # pick the access method: plain scan or conditional query
        if condition is None:
            rows = h5tbl.iterrows(start=start, stop=stop, step=step)
        else:
            rows = h5tbl.where(condition, condvars=condvars,
                               start=start, stop=stop, step=step)

        for record in rows:
            # slicing gives the row's values as a tuple
            yield record[:]

    finally:
        # only close the file if it was opened on our behalf
        if isinstance(source, basestring):
            h5file.close()


def fromhdf5sorted(source, where=None, name=None, sortby=None, checkCSI=False,
                   start=None, stop=None, step=None):
    """
    Provides access to an HDF5 table, sorted by an indexed column, e.g.::

        >>> # set up a new hdf5 table to demonstrate with
        ... import tables
        >>> h5file = tables.openFile("test1.h5", mode="w", title="Test file")
        >>> h5file.createGroup('/', 'testgroup', 'Test Group')
        /testgroup (Group) 'Test Group'
          children := []
        >>> class FooBar(tables.IsDescription):
        ...     foo = tables.Int32Col(pos=0)
        ...     bar = tables.StringCol(6, pos=2)
        ...
        >>> h5table = h5file.createTable('/testgroup', 'testtable', FooBar, 'Test Table')
        >>>
        >>> # load some data into the table
        ... table1 = (('foo', 'bar'),
        ...           (3, 'asdfgh'),
        ...           (2, 'qwerty'),
        ...           (1, 'zxcvbn'))
        >>>
        >>> for row in table1[1:]:
        ...     for i, f in enumerate(table1[0]):
        ...         h5table.row[f] = row[i]
        ...     h5table.row.append()
        ...
        >>> h5table.cols.foo.createCSIndex() # CS index is required
        0
        >>> h5file.flush()
        >>> h5file.close()
        >>>
        >>> # access the data, sorted by the indexed column
        ... from petl import look
        >>> from petlx.hdf5 import fromhdf5sorted
        >>> table2 = fromhdf5sorted('test1.h5', '/testgroup', 'testtable', sortby='foo')
        >>> look(table2)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'zxcvbn' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'asdfgh' |
        +-------+----------+

    The `source`, `where` and `name` arguments identify the table in the same
    way as for :func:`fromhdf5`. The `sortby` argument is required; it is
    passed, along with `checkCSI`, `start`, `stop` and `step`, to the table's
    ``itersorted()`` method.

    Raises `AssertionError` if `sortby` is None.

    .. versionadded:: 0.3

    """

    # explicit raise rather than an assert statement, so the check is still
    # enforced when Python runs with -O
    if sortby is None:
        raise AssertionError('no column specified to sort by')
    return HDF5SortedView(source, where=where, name=name,
                          sortby=sortby, checkCSI=checkCSI,
                          start=start, stop=stop, step=step)


class HDF5SortedView(RowContainer):
    """
    Row container over an HDF5 table, read in the order of a sort column.

    The constructor only records its arguments; the table is resolved and
    read by :func:`iterhdf5sorted` each time the view is iterated.

    """

    def __init__(self, source, where=None, name=None, sortby=None,
                 checkCSI=False, start=None, stop=None, step=None):
        # where to find the table
        self.source, self.where, self.name = source, where, name
        # sort column and index check flag
        self.sortby, self.checkCSI = sortby, checkCSI
        # optional row range
        self.start, self.stop, self.step = start, stop, step

    def __iter__(self):
        # a fresh generator is built for every iteration of the view
        return iterhdf5sorted(source=self.source,
                              where=self.where,
                              name=self.name,
                              sortby=self.sortby,
                              checkCSI=self.checkCSI,
                              start=self.start,
                              stop=self.stop,
                              step=self.step)
    

def iterhdf5sorted(source, where, name, sortby, checkCSI, start, stop, step):
    """
    Generate the header row followed by the rows of an HDF5 table, as
    returned by the table's ``itersorted()`` method for column `sortby`.

    The header is a tuple of the table's column names. If `source` is a file
    name, the file is closed when the generator finishes or is closed.

    """

    h5file, h5tbl = _get_hdf5_table(source, where, name)

    try:
        # header row
        yield tuple(h5tbl.colnames)

        sorted_rows = h5tbl.itersorted(sortby, checkCSI=checkCSI,
                                       start=start, stop=stop, step=step)
        for record in sorted_rows:
            # slicing gives the row's values as a tuple
            yield record[:]

    finally:
        # only close the file if it was opened on our behalf
        if isinstance(source, basestring):
            h5file.close()


def tohdf5(table, source, where=None, name=None, create=False,
           description=None, title='', filters=None, expectedrows=10000,
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, any existing table is dropped, and a new table is created;
    if `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    When `create` is `True`, `source` must be a file name or a `tables.File`
    object; a file name is opened in append mode so that other nodes in the
    file are preserved. The `description`, `title`, `filters`,
    `expectedrows`, `chunkshape`, `byteorder` and `createparents` arguments
    are passed to ``createTable()``. When guessing the datatype, up to
    `sample` rows of the input are inspected.

    If `source` is a file name, the file is closed before returning, whether
    or not loading succeeds.

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """

    it = iter(table)

    # stays None until a file handle has actually been obtained, so that the
    # cleanup below is safe no matter where a failure occurs
    h5file = None

    try:

        if create:

            try:
                import tables
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)

            if isinstance(source, basestring):
                # assume it's the name of an HDF5 file
                h5file = tables.openFile(source, mode='a') # don't replace the whole file!
            elif isinstance(source, tables.File):
                h5file = source
            else:
                raise Exception('invalid source argument, expected file name or tables.File, found: %r' % source)

            # determine datatype
            if description is None:
                peek, it = iterpeek(it, sample)
                # use a numpy dtype
                description = guessdtype(peek)

            # check if the table node already exists
            try:
                h5file.getNode(where, name)
            except tables.NoSuchNodeError:
                # nothing to drop
                pass
            else:
                # drop the node
                h5file.removeNode(where, name)

            # create the table
            h5table = h5file.createTable(where, name, description, title=title,
                                         filters=filters, expectedrows=expectedrows,
                                         chunkshape=chunkshape, byteorder=byteorder,
                                         createparents=createparents)

        else:
            h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

        # truncate the existing table
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)

    finally:
        if h5file is not None and isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()


def appendhdf5(table, source, where=None, name=None):
    """
    Like :func:`tohdf5` but don't truncate the table before loading.

    The target table must already exist. The `source` argument may be the
    name of an HDF5 file (opened in append mode, and closed again before
    returning), a `tables.File` object or a `tables.Table` object; `where`
    and `name` are used to look up the table node unless `source` is already
    a table.

    .. versionadded:: 0.3

    """

    h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

    try:

        # load the data
        _insert(table, h5table)

    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()


def _insert(table, h5table):
    """
    Append the data rows of `table` to `h5table`, then flush the table.

    Values are matched to HDF5 columns by position, so the input fields must
    be in the same order as the HDF5 table's columns; the field names
    themselves do not need to match.

    """
    h5row = h5table.row
    # data() skips the header row, which is not needed here
    for values in data(table):
        for position, colname in enumerate(h5table.colnames):
            h5row[colname] = values[position]
        h5row.append()
    h5table.flush()
    
    
# Hand this module object to petlx's integration hook.
# NOTE(review): presumably this registers the functions defined above with
# petlx's integration layer, and is placed at the end of the module so that
# they are all defined first - confirm against petlx.integration.
from petlx.integration import integrate
integrate(sys.modules[__name__])