# Source code for petlx.bio.gff3

# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


from petl.compat import PY2
if PY2:
    from urllib import unquote_plus
else:
    from urllib.parse import unquote_plus


import petl as etl
# activate tabix extension
import petlx.bio.tabix


def gff3_parse_attributes(attributes_string):
    """
    Parse a string of GFF3 attributes ('key=value' pairs delimited by ';')
    and return a dictionary.

    Each ';'-delimited field is handled as follows:

    * If the field contains '=', only the first '=' separates the key from
      the value, so values may themselves contain '=' characters. The key is
      URL-decoded (via `unquote_plus`) and then stripped of surrounding
      whitespace; the value is stripped and then URL-decoded.
    * A non-empty field without '=' is stored under its decoded, stripped
      text with the value ``True``.
    * Empty fields (e.g., from a trailing ';') are ignored.

    If the same key occurs more than once, the last occurrence wins.

    """

    attributes = dict()
    for field in attributes_string.split(';'):
        # partition splits on the first '=' only, so a value such as
        # 'Note=x=y' is kept intact instead of failing to unpack
        key, sep, value = field.partition('=')
        if sep:
            attributes[unquote_plus(key).strip()] = unquote_plus(value.strip())
        elif field:
            # not strictly kosher: a bare flag with no value
            attributes[unquote_plus(field).strip()] = True
    return attributes


# Column names, in order, for the nine fields of a GFF3 feature row.
GFF3_HEADER = (
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
)


def fromgff3(filename, region=None):
    """
    Extract feature rows from a GFF3 file, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    A region query string of the form '[seqid]' or '[seqid]:[start]-[end]'
    may be given for the `region` argument. If given, requires the GFF3
    file to be position sorted, bgzipped and tabix indexed. Requires pysam to be
    installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    The returned table has the fields listed in `GFF3_HEADER`. Rows whose
    first value starts with '#' and rows that are not exactly 9 values long
    are dropped, the 'attributes' field is parsed into a dict by
    `gff3_parse_attributes`, and 'start' and 'end' are converted to int.

    """

    if region is None:
        # no region requested: parse the whole file as tab-delimited
        table = etl.fromtsv(filename)
    else:
        # region requested: extract only the matching rows via tabix
        table = etl.fromtabix(filename, region=region)

    return (
        table
        # the file carries no header row of its own, so add the column names
        .pushheader(GFF3_HEADER)
        .skipcomments('#')
        # ignore any row not 9 values long (e.g., trailing fasta)
        .rowlenselect(9)
        # parse attributes into a dict
        .convert('attributes', gff3_parse_attributes)
        # parse coordinates
        .convert(('start', 'end'), int)
    )


# expose fromgff3 as an attribute of the petl module, so that it can be
# called as etl.fromgff3(...)
etl.fromgff3 = fromgff3