Source code for petlx.array

"""
A module providing convenience functionality for moving to/from numpy structured
arrays.

"""


import sys
from petl.util import columns, iterpeek, RowContainer
from petlx.util import UnsatisfiedDependency


dep_message = """
The package numpy is required. Instructions for installation can be found 
at http://docs.scipy.org/doc/numpy/user/install.html or try apt-get install 
python-numpy.
"""


def guessdtype(table):
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        # get numpy to infer dtypes for each field individually
        fields, table = iterpeek(table, 1)
        cols = columns(table)
        dtype = []
        for f in fields:
            a = np.array(cols[f]) # load into 1D array to get numpy to infer a dtype for the column
            dtype.append((f, a.dtype))
        return np.dtype(dtype)


[docs]def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Convenience function to load data from the given `table` into a numpy 
    structured array. E.g.::

        >>> from petl import look
        >>> from petlx.array import toarray
        >>> look(table)
        +-----------+-------+-------+
        | 'foo'     | 'bar' | 'baz' |
        +===========+=======+=======+
        | 'apples'  | 1     | 2.5   |
        +-----------+-------+-------+
        | 'oranges' | 3     | 4.4   |
        +-----------+-------+-------+
        | 'pears'   | 7     | 0.1   |
        +-----------+-------+-------+
        
        >>> a = toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)], 
              dtype=[('foo', '|S7'), ('bar', '<i8'), ('baz', '<f8')])
        >>> a['foo']
        array(['apples', 'oranges', 'pears'], 
              dtype='|S7')
        >>> a['bar']
        array([1, 3, 7])
        >>> a['baz']
        array([ 2.5,  4.4,  0.1])
        >>> a['foo'][0]
        'apples'
        >>> a['bar'][1]
        3
        >>> a['baz'][2]
        0.10000000000000001
        
    If no datatype is specified, `sample` rows will be examined to infer an
    appropriate datatype for each field.
        
    The datatype can be specified as a string, e.g.:

        >>> a = toarray(table, dtype='a4, i2, f4')
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.400000095367432),
               ('pear', 7, 0.10000000149011612)], 
              dtype=[('foo', '|S4'), ('bar', '<i2'), ('baz', '<f4')])

    The datatype can also be partially specified, in which case datatypes will
    be inferred for other fields, e.g.:
    
        >>> a = toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.4), ('pear', 7, 0.1)], 
              dtype=[('foo', '|S4'), ('bar', '<i8'), ('baz', '<f8')])
    
    """
    
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        it = iter(table)
        peek, it = iterpeek(it, sample)
        fields = it.next()
        
        if dtype is None:
            dtype = guessdtype(peek)
           
        elif isinstance(dtype, basestring):
            # insert field names from source table
            typestrings = [s.strip() for s in dtype.split(',')]
            dtype = [(f, t) for f, t in zip(fields, typestrings)]
            
        elif isinstance(dtype, dict) and ('names' not in dtype or 'formats' not in dtype):
            # allow for partial specification of dtype
            cols = columns(peek)
            newdtype = {'names': [], 'formats': []}
            for f in fields:
                newdtype['names'].append(f)
                if f in dtype and isinstance(dtype[f], tuple):
                    # assume fully specified
                    newdtype['formats'].append(dtype[f][0])
                elif f not in dtype:
                    # not specified at all
                    a = np.array(cols[f])
                    newdtype['formats'].append(a.dtype)
                else:
                    # assume directly specified, just need to add offset
                    newdtype['formats'].append(dtype[f])
            dtype = newdtype
            
        else:
            pass # leave dtype as-is
                         
        it = (tuple(row) for row in it) # numpy is fussy about having tuples, need to make sure
        sa = np.fromiter(it, dtype=dtype, count=count)

        return sa


[docs]def torecarray(*args, **kwargs):
    """
    Convenient shorthand for ``toarray(...).view(np.recarray)``.
    
    .. versionadded:: 0.5.1
    
    """
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        return toarray(*args, **kwargs).view(np.recarray)


[docs]def fromarray(a):
    """
    Extract a table from a numpy structured array.
    
    .. versionadded:: 0.4
    
    """
    
    return ArrayContainer(a)


class ArrayContainer(RowContainer):
    
    def __init__(self, a):
        self.a = a
        
    def __iter__(self):
        yield tuple(self.a.dtype.names)
        for row in self.a:
            yield tuple(row)
            

from petlx.integration import integrate
integrate(sys.modules[__name__])