Commits on Source (6)
Metadata-Version: 1.1
Name: gffutils
Version: 0.9
Version: 0.10.1
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
......
python-gffutils (0.10.1-1) unstable; urgency=medium
* New upstream version
* Freshen patches, removing two applied upstream
* Remove obsolete field Name from debian/upstream/metadata.
-- Michael R. Crusoe <michael.crusoe@gmail.com> Wed, 01 Jan 2020 12:49:11 +0100
python-gffutils (0.9-2) unstable; urgency=medium
* debhelper 11
......
From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Ensure that tests work post-installation
--- python-gffutils.orig/gffutils/test/test.py
+++ python-gffutils/gffutils/test/test.py
@@ -621,6 +621,7 @@
# Serving test/data folder
served_folder = gffutils.example_filename('')
+ savedir = os.getcwd()
os.chdir(served_folder)
print("Starting SimpleHTTPServer in thread")
@@ -654,6 +655,7 @@
print('Server shutdown.')
httpd.shutdown()
server_thread.join()
+ os.chdir(savedir)
def test_empty_files():
......@@ -6,46 +6,19 @@ Subject: Support newer bedtools version
---
--- python-gffutils.orig/gffutils/iterators.py
+++ python-gffutils/gffutils/iterators.py
@@ -115,6 +115,7 @@
@@ -115,7 +115,6 @@
Subclass for iterating over features provided as a filename
"""
def open_function(self, data):
+ data = os.path.expanduser(data)
- data = os.path.expanduser(data)
if data.endswith('.gz'):
import gzip
return gzip.open(data)
--- python-gffutils.orig/gffutils/test/test.py
+++ python-gffutils/gffutils/test/test.py
@@ -880,7 +880,7 @@
assert len(filelist) == 1, filelist
assert filelist[0].endswith('.gffutils')
- #...and another one for gff. This time, make sure the suffix
+ #...and another one for gff. This time, make sure the suffix
db = gffutils.create_db(
gffutils.example_filename('FBgn0031208.gff'), ':memory:', _keep_tempfiles=True)
filelist = os.listdir(tempdir)
@@ -1006,7 +1006,7 @@
def test_deprecation_handler():
- return
+ return
# TODO: when infer_gene_extent actually gets deprecated, test here.
assert_raises(ValueError, gffutils.create_db,
@@ -1141,7 +1141,7 @@
assert f.attributes['null'][0] == '\x00'
assert f.attributes['comma'][0] == ','
- # Commas indicate
+ # Commas indicate
assert f.attributes['Parent'] == ['A,', 'B%', 'C']
assert str(f) == s
@@ -1174,6 +1174,18 @@
assert db['e']['Note'] == [',']
assert db['f']['Note'] == [',']
@@ -1315,6 +1315,18 @@
g = gffutils.feature.feature_from_line(str(f))
assert g == f
+
+def test_issue_105():
......@@ -62,107 +35,3 @@ Subject: Support newer bedtools version
if __name__ == "__main__":
# this test case fails
#test_attributes_modify()
--- python-gffutils.orig/gffutils/pybedtools_integration.py
+++ python-gffutils/gffutils/pybedtools_integration.py
@@ -2,11 +2,13 @@
Module for integration with pybedtools
"""
+import os
import pybedtools
from pybedtools import featurefuncs
from gffutils import helpers
import six
+
def to_bedtool(iterator):
"""
Convert any iterator into a pybedtools.BedTool object.
@@ -22,7 +24,7 @@
def tsses(db, merge_overlapping=False, attrs=None, attrs_sep=":",
- merge_kwargs=dict(o='distinct', s=True, c=4), as_bed6=False):
+ merge_kwargs=None, as_bed6=False, bedtools_227_or_later=True):
"""
Create 1-bp transcription start sites for all transcripts in the database
and return as a sorted pybedtools.BedTool object pointing to a temporary
@@ -74,13 +76,21 @@
attributes is supplied, e.g. ["gene_id", "transcript_id"], then these
will be joined by `attr_join_sep` and then placed in the name field.
-
attrs_sep: str
If `as_bed6=True` or `merge_overlapping=True`, then use this character
to separate attributes in the name field of the output BED. If also
using `merge_overlapping=True`, you'll probably want this to be
different than `merge_sep` in order to parse things out later.
+ bedtools_227_or_later : bool
+ In version 2.27, BEDTools changed the output for merge. By default,
+ this function expects BEDTools version 2.27 or later, but set this to
+ False to assume the older behavior.
+
+ For testing purposes, the environment variable
+ GFFUTILS_USES_BEDTOOLS_227_OR_LATER is set to either "true" or "false"
+ and is used to override this argument.
+
Examples
--------
@@ -146,7 +156,22 @@
"""
- _merge_kwargs = dict(o='distinct', s=True, c=4)
+ _override = os.environ.get('GFFUTILS_USES_BEDTOOLS_227_OR_LATER', None)
+ if _override is not None:
+ if _override == 'true':
+ bedtools_227_or_later = True
+ elif _override == 'false':
+ bedtools_227_or_later = False
+ else:
+ raise ValueError(
+ "Unknown value for GFFUTILS_USES_BEDTOOLS_227_OR_LATER "
+ "environment variable: {0}".format(_override))
+
+ if bedtools_227_or_later:
+ _merge_kwargs = dict(o='distinct', s=True, c='4,5,6')
+ else:
+ _merge_kwargs = dict(o='distinct', s=True, c='4')
+
if merge_kwargs is not None:
_merge_kwargs.update(merge_kwargs)
@@ -195,18 +220,18 @@
x = x.each(to_bed).saveas()
if merge_overlapping:
-
- def fix_merge(f):
- f = featurefuncs.extend_fields(f, 6)
- return pybedtools.Interval(
- f.chrom,
- f.start,
- f.stop,
- f[4],
- '.',
- f[3])
-
- x = x.merge(**_merge_kwargs).each(fix_merge).saveas()
-
+ if bedtools_227_or_later:
+ x = x.merge(**_merge_kwargs)
+ else:
+ def fix_merge(f):
+ f = featurefuncs.extend_fields(f, 6)
+ return pybedtools.Interval(
+ f.chrom,
+ f.start,
+ f.stop,
+ f[4],
+ '.',
+ f[3])
+ x = x.merge(**_merge_kwargs).saveas().each(fix_merge).saveas()
return x
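For orientation, a minimal usage sketch of the patched tsses() behaviour follows (hedged: 'annotation.db' is a hypothetical, pre-built database file, and the actual BED output depends on the installed BEDTools version):

import os
import gffutils
from gffutils.pybedtools_integration import tsses

db = gffutils.FeatureDB('annotation.db')   # hypothetical database

# State the expected BEDTools behaviour explicitly...
tss = tsses(db, merge_overlapping=True, bedtools_227_or_later=True)

# ...or let the environment variable used by the test suite override it.
os.environ['GFFUTILS_USES_BEDTOOLS_227_OR_LATER'] = 'false'
tss_old = tsses(db, merge_overlapping=True)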
From 7ecd042d46d88ea6f4e1baabaa4421bc0954f284 Mon Sep 17 00:00:00 2001
From: Abishek <jesuisabhishek@gmail.com>
Date: Fri, 22 Feb 2019 01:46:07 +0530
Subject: [PATCH] "raise StopIteration" to "return"
---
gffutils/iterators.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- python-gffutils.orig/gffutils/iterators.py
+++ python-gffutils/gffutils/iterators.py
@@ -130,7 +130,7 @@
self.current_item_number = i
if line == '##FASTA' or line.startswith('>'):
- raise StopIteration
+ return
if line.startswith('##'):
self._directive_handler(line)
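The patch above is needed because PEP 479 (enforced from Python 3.7) turns a StopIteration raised inside a generator body into a RuntimeError; a plain return ends the generator cleanly. A standalone illustration, not gffutils code:

def lines_until_fasta(lines):
    for line in lines:
        if line == '##FASTA':
            # Raising StopIteration here would surface as
            # "RuntimeError: generator raised StopIteration" on Python 3.7+.
            return          # ends the generator cleanly instead
        yield line

print(list(lines_until_fasta(['a', 'b', '##FASTA', 'c'])))   # ['a', 'b']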
py37
newer_bedtools
flexible_tests
python3
......@@ -7,4 +7,3 @@ Registry:
Entry: GFFutils
- Name: conda:bioconda
Entry: gffutils
Name: gffutils
Metadata-Version: 1.1
Name: gffutils
Version: 0.9
Version: 0.10.1
Summary: Work with GFF and GTF files in a flexible database framework
Home-page: https://github.com/daler/gffutils
Author: Ryan Dale
......
......@@ -16,9 +16,9 @@ gffutils/feature.py
gffutils/gffwriter.py
gffutils/helpers.py
gffutils/inspect.py
gffutils/inspection.py
gffutils/interface.py
gffutils/iterators.py
gffutils/merge_criteria.py
gffutils/parser.py
gffutils/pybedtools_integration.py
gffutils/version.py
......@@ -36,8 +36,11 @@ gffutils/test/feature_test.py
gffutils/test/helpers_test.py
gffutils/test/parser_test.py
gffutils/test/performance_test.py
gffutils/test/synth_test_base.py
gffutils/test/test.py
gffutils/test/test_biopython_integration.py
gffutils/test/test_merge.py
gffutils/test/test_merge_all.py
gffutils/test/data/F3-unique-3.v2.gff
gffutils/test/data/FBgn0031208.gff
gffutils/test/data/FBgn0031208.gtf
......@@ -67,6 +70,7 @@ gffutils/test/data/mouse_extra_comma.gff3
gffutils/test/data/ncbi_gff3.txt
gffutils/test/data/nonascii
gffutils/test/data/random-chr.gff
gffutils/test/data/synthetic.gff3
gffutils/test/data/unsanitized.gff
gffutils/test/data/wormbase_gff2.txt
gffutils/test/data/wormbase_gff2_alt.txt
\ No newline at end of file
pyfaidx
six
argh
argcomplete
pyfaidx>=0.5.5.2
six>=1.12.0
argh>=0.26.2
argcomplete>=1.9.4
simplejson
import six
import collections
try:
collectionsAbc = collections.abc
except AttributeError:
collectionsAbc = collections
from gffutils import constants
# collections.MutableMapping is apparently the best way to provide dict-like
# interface (http://stackoverflow.com/a/3387975)
class Attributes(collections.MutableMapping):
class Attributes(collectionsAbc.MutableMapping):
def __init__(self, *args, **kwargs):
"""
An Attributes object acts much like a dictionary. However, values are
......
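A small sketch of the dict-like interface that the collections.abc change preserves (assuming the usual gffutils convention, visible in the test changes above, that attribute values are stored as lists of strings, and that the class lives in gffutils/attributes.py):

from gffutils.attributes import Attributes   # module path assumed

attrs = Attributes()
attrs['ID'] = ['gene1']                  # values are lists of strings
attrs['Parent'] = ['mRNA1', 'mRNA2']
print('Parent' in attrs)                 # True: MutableMapping gives dict behaviour
print(list(attrs.keys()))                # ['ID', 'Parent']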
......@@ -53,6 +53,7 @@ OFFSETS = [
# for BED (0-based, half-open) or GFF (1-based, closed intervals)
COORD_OFFSETS = {'bed': 0, 'gff': 1}
MAX_CHROM_SIZE = 2 ** 29
def bins(start, stop, fmt='gff', one=True):
"""
......@@ -72,6 +73,15 @@ def bins(start, stop, fmt='gff', one=True):
fmt : 'gff' | 'bed'
This specifies 1-based start coords (gff) or 0-based start coords (bed)
"""
# For very large coordinates, return 1 which is "somewhere on the
# chromosome".
if start >= MAX_CHROM_SIZE or stop >= MAX_CHROM_SIZE:
if one:
return 1
else:
return set([1])
# Jump to highest resolution bin that will fit these coords (depending on
# whether we have a BED or GFF-style coordinate).
#
......@@ -114,6 +124,7 @@ def bins(start, stop, fmt='gff', one=True):
# larger bin size)
start >>= NEXT_SHIFT
stop >>= NEXT_SHIFT
return bins
......@@ -214,6 +225,12 @@ def test():
assert bins(1, -1000, one=True) == 1
assert bins(1, -1000, one=False) == set([1])
# Juuuust fits inside the max chrom size
assert bins(536870910, 536870911, one=True) == 8776
# Too big; falls back to 1.
assert bins(536870911, 536870912, one=True) == 1
if __name__ == "__main__":
print_bin_sizes()
test()
......@@ -5,7 +5,6 @@ objects.
import six
try:
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
except ImportError:
import warnings
warnings.warn("BioPython must be installed to use this module")
......@@ -79,4 +78,4 @@ def from_seqfeature(s, **kwargs):
attributes.pop('seqid', '.')
attributes.pop('frame', '.')
return Feature(seqid, source, featuretype, start, stop, score, strand,
frame, attributes, **kwargs)
frame, attributes, id=id, **kwargs)
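A hedged round-trip sketch for the id fix above (assumes BioPython is installed and that the companion to_seqfeature converter lives in the same module; `feat` stands for any gffutils.Feature, e.g. one pulled from a FeatureDB):

from gffutils.biopython_integration import to_seqfeature, from_seqfeature

def round_trip(feat):
    """Convert a gffutils.Feature to a Bio.SeqFeature and back."""
    seq_feature = to_seqfeature(feat)
    return from_seqfeature(seq_feature)

# With id=id now passed through, round_trip(feat).id should equal feat.id.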
......@@ -48,7 +48,7 @@ def deprecation_handler(kwargs):
class _DBCreator(object):
def __init__(self, data, dbfn, force=False, verbose=False, id_spec=None,
merge_strategy='merge', checklines=10, transform=None,
merge_strategy='error', checklines=10, transform=None,
force_dialect_check=False, from_string=False, dialect=None,
default_encoding='utf-8',
disable_infer_genes=False,
......@@ -97,7 +97,6 @@ class _DBCreator(object):
self.disable_infer_genes = disable_infer_genes
self.disable_infer_transcripts = disable_infer_transcripts
self._autoincrements = collections.defaultdict(int)
if force:
if os.path.exists(dbfn):
os.unlink(dbfn)
......@@ -124,7 +123,10 @@ class _DBCreator(object):
force_dialect_check=force_dialect_check, from_string=from_string,
dialect=dialect
)
if '_autoincrements' in kwargs:
self._autoincrements = kwargs['_autoincrements']
else:
self._autoincrements = collections.defaultdict(int)
def set_verbose(self, verbose=None):
if verbose == 'debug':
......@@ -207,7 +209,8 @@ class _DBCreator(object):
Raise error
"warning"
Log a warning
Log a warning, which indicates that all future instances of the
same ID will be ignored
"merge":
Combine old and new attributes -- but only if everything else
......@@ -248,8 +251,8 @@ class _DBCreator(object):
% ([i.id for i in self._candidate_merges(f)]))
# If force_merge_fields was provided, don't pay attention to these
# fields if they're different. We are assuming attributes will be
# different, hence the [:-1]
# fields if they're different. We are assuming the attributes field
# will be different, hence the [:-1]
_gffkeys_to_check = list(
set(constants._gffkeys[:-1])
.difference(self.force_merge_fields))
......@@ -486,7 +489,7 @@ class _DBCreator(object):
c.execute('CREATE INDEX seqidstartendstrand ON features (seqid, start, end, strand)')
# speeds computation 1000x in some cases
logger.info("Running ANALYSE features")
logger.info("Running ANALYZE features")
c.execute('ANALYZE features')
self.conn.commit()
......@@ -505,6 +508,7 @@ class _DBCreator(object):
self._update_relations()
self._finalize()
# TODO: not sure this is used anywhere
def update(self, iterator):
self._populate_from_lines(iterator)
self._update_relations()
......@@ -645,7 +649,7 @@ class _GFFDBCreator(_DBCreator):
else:
suffix = '.gffutils'
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
fout = open(tmp, 'w')
with open(tmp, 'w') as fout:
# Here we look for "grandchildren" -- for each ID, get the child
# (parenthetical subquery below); then for each of those get *its*
......@@ -662,10 +666,10 @@ class _GFFDBCreator(_DBCreator):
''', tuple(parent))
for grandchild in c2:
fout.write('\t'.join((parent[0], grandchild[0])) + '\n')
fout.close()
def relations_generator():
for line in open(fout.name):
with open(fout.name) as fin:
for line in fin:
parent, child = line.strip().split('\t')
yield dict(parent=parent, child=child, level=2)
......@@ -720,7 +724,7 @@ class _GTFDBCreator(_DBCreator):
warnings.warn(
"It appears you have a transcript feature in your GTF "
"file. You may want to use the "
"`disable_infer_transcripts` "
"`disable_infer_transcripts=True` "
"option to speed up database creation")
elif (
f.featuretype == 'gene' and
......@@ -729,7 +733,7 @@ class _GTFDBCreator(_DBCreator):
warnings.warn(
"It appears you have a gene feature in your GTF "
"file. You may want to use the "
"`disable_infer_genes` "
"`disable_infer_genes=True` "
"option to speed up database creation")
lines_seen = i + 1
......@@ -838,9 +842,9 @@ class _GTFDBCreator(_DBCreator):
suffix = self._keep_tempfiles
else:
suffix = '.gffutils'
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
fout = open(tmp, 'w')
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix).name
with open(tmp, 'w') as fout:
self._tmpfile = tmp
# This takes some explanation...
......@@ -959,15 +963,14 @@ class _GTFDBCreator(_DBCreator):
last_gene_id = gene_id
n_features += 1
fout.close()
def derived_feature_generator():
"""
Generator of items from the file that was just created...
"""
keys = ['parent', 'seqid', 'start', 'end', 'strand',
'featuretype', 'bin', 'attributes']
for line in open(fout.name):
with open(fout.name) as fin:
for line in fin:
d = dict(list(zip(keys, line.strip().split('\t'))))
d.pop('parent')
d['score'] = '.'
......@@ -1009,7 +1012,8 @@ class _GTFDBCreator(_DBCreator):
if not self._keep_tempfiles:
os.unlink(fout.name)
# TODO: recreate indexes?
# TODO: recreate indexes? Typically the _finalize() method will be
# called after this one, and indexes are created in _finalize().
def create_db(data, dbfn, id_spec=None, force=False, verbose=False,
......
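The clarified warnings above spell out the exact keywords; a hedged create_db sketch with hypothetical file names (note the diff also changes _DBCreator's internal merge_strategy default from 'merge' to 'error'):

import gffutils

# 'annotation.gtf' already contains gene and transcript features, so
# inference can be switched off for a large speed-up.
db = gffutils.create_db(
    'annotation.gtf', 'annotation.db',
    disable_infer_genes=True,
    disable_infer_transcripts=True,
    merge_strategy='merge',   # be explicit rather than relying on the default
    force=True,               # overwrite an existing database file
)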
from pyfaidx import Fasta
import six
import simplejson
import simplejson as json
from gffutils import constants
from gffutils import helpers
from gffutils import parser
......@@ -145,7 +145,7 @@ class Feature(object):
attributes = helpers._unjsonify(attributes, isattributes=True)
# it's a string but not JSON: assume original attributes string.
except simplejson.JSONDecodeError:
except json.JSONDecodeError:
# But Feature.attributes is still a dict
attributes, _dialect = parser._split_keyvals(attributes)
......@@ -159,7 +159,7 @@ class Feature(object):
if isinstance(extra, six.string_types):
try:
extra = helpers._unjsonify(extra)
except simplejson.JSONDecodeError:
except json.JSONDecodeError:
extra = extra.split('\t')
self.seqid = seqid
......
import copy
import sys
import os
import simplejson
import simplejson as json
import time
import tempfile
import six
......@@ -256,16 +256,16 @@ def _bin_from_dict(d):
def _jsonify(x):
"""Use most compact form of JSON"""
if isinstance(x, dict_class):
return simplejson.dumps(x._d, separators=(',', ':'))
return simplejson.dumps(x, separators=(',', ':'))
return json.dumps(x._d, separators=(',', ':'))
return json.dumps(x, separators=(',', ':'))
def _unjsonify(x, isattributes=False):
"""Convert JSON string to an ordered defaultdict."""
if isattributes:
obj = simplejson.loads(x)
obj = json.loads(x)
return dict_class(obj)
return simplejson.loads(x)
return json.loads(x)
def _feature_to_fields(f, jsonify=True):
......@@ -318,7 +318,7 @@ def merge_attributes(attr1, attr2):
"""
new_d = copy.deepcopy(attr1)
new_d.update(attr2)
new_d.update(copy.deepcopy(attr2))
#all of attr2 key : values just overwrote attr1, fix it
for k, v in new_d.items():
......@@ -330,7 +330,7 @@ def merge_attributes(attr1, attr2):
if not isinstance(v, list):
v = [v]
new_d[k].extend(v)
return dict((k, list(set(v))) for k, v in new_d.items())
return dict((k, sorted(set(v))) for k, v in new_d.items())
def dialect_compare(dialect1, dialect2):
......
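The switch from list(set(v)) to sorted(set(v)) makes merged attribute values deterministic across runs; a small sketch (assuming merge_attributes accepts plain dicts of lists, as the attribute machinery uses internally):

from gffutils import helpers

a = {'gene_id': ['g2'], 'source_tag': ['x']}
b = {'gene_id': ['g1', 'g2']}

merged = helpers.merge_attributes(a, b)
# Previously the order of ['g1', 'g2'] depended on set iteration order;
# with sorted() the output is stable:
print(merged)   # {'gene_id': ['g1', 'g2'], 'source_tag': ['x']}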
from gffutils import iterators
from gffutils import interface
from collections import Counter
import sys
def inspect(data, look_for=['featuretype', 'chrom', 'attribute_keys',
'feature_count'], limit=None, verbose=True):
"""
Inspect a GFF or GTF data source.
This function is useful for figuring out the different featuretypes found
in a file (for potential removal before creating a FeatureDB).
Returns a dictionary with a key for each item in `look_for` and
a corresponding value that is a dictionary of how many of each unique item
were found.
There will always be a `feature_count` key, indicating how many features
were looked at (if `limit` is provided, then `feature_count` will be the
same as `limit`).
For example, if `look_for` is ['chrom', 'featuretype'], then the result
will be a dictionary like::
{
'chrom': {
'chr1': 500,
'chr2': 435,
'chr3': 200,
...
...
},
'featuretype': {
'gene': 150,
'exon': 324,
...
},
'feature_count': 5000
}
Parameters
----------
data : str, FeatureDB instance, or iterator of Features
If `data` is a string, assume it's a GFF or GTF filename. If it's
a FeatureDB instance, then its `all_features()` method will be
automatically called. Otherwise, assume it's an iterable of Feature
objects.
look_for : list
List of things to keep track of. Options are:
- any attribute of a Feature object, such as chrom, source, start,
stop, strand.
- "attribute_keys", which will look at all the individual
attribute keys of each feature
limit : int
Number of features to look at. Default is no limit.
verbose : bool
Report how many features have been processed.
Returns
-------
dict
"""
results = {}
obj_attrs = []
for i in look_for:
if i not in ['attribute_keys', 'feature_count']:
obj_attrs.append(i)
results[i] = Counter()
attr_keys = 'attribute_keys' in look_for
d = iterators.DataIterator(data)
feature_count = 0
for f in d:
if verbose:
sys.stderr.write('\r%s features inspected' % feature_count)
sys.stderr.flush()
for obj_attr in obj_attrs:
results[obj_attr].update([getattr(f, obj_attr)])
if attr_keys:
results['attribute_keys'].update(f.attributes.keys())
feature_count += 1
if limit and feature_count == limit:
break
new_results = {}
for k, v in results.items():
new_results[k] = dict(v)
new_results['feature_count'] = feature_count
return new_results
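A hedged usage sketch for the inspect() helper above ('annotation.gff3' is a hypothetical file):

from gffutils.inspect import inspect

report = inspect('annotation.gff3',
                 look_for=['featuretype', 'chrom', 'attribute_keys'],
                 limit=1000, verbose=False)

print(report['feature_count'])   # up to 1000 features examined
print(report['featuretype'])     # e.g. {'gene': 150, 'exon': 324, ...}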
import collections
import os
import six
import sqlite3
......@@ -6,10 +7,62 @@ import warnings
from gffutils import bins
from gffutils import helpers
from gffutils import constants
from gffutils import merge_criteria as mc
from gffutils.feature import Feature
from gffutils.exceptions import FeatureNotFoundError
def assign_child(parent, child):
"""
Helper for add_relation()
Sets 'Parent' attribute to parent['ID']
Parameters
----------
parent : Feature
Parent Feature
child : Feature
Child Feature
Returns
-------
Child Feature
"""
child.attributes['Parent'] = parent['ID']
return child
# Reusable constant for FeatureDB.merge()
no_children = tuple()
def _finalize_merge(feature, feature_children):
"""
Helper for FeatureDB.merge() to update source and assign children property
Parameters
----------
feature : Feature
feature to finalise
feature_children
list of children to assign
Returns
-------
feature, modified
"""
if len(feature_children) > 1:
feature.source = ','.join(set(child.source for child in feature_children))
feature.children = feature_children
else:
feature.children = no_children
return feature
class FeatureDB(object):
# docstring to be filled in for methods that call out to
# helpers.make_query()
......@@ -147,7 +200,7 @@ class FeatureDB(object):
'''
SELECT base, n FROM autoincrements
''')
self._autoincrements = dict(c)
self._autoincrements = collections.defaultdict(int, c)
self.set_pragmas(pragmas)
......@@ -604,17 +657,15 @@ class FeatureDB(object):
# Only use bins if we have defined boundaries and completely_within is
# True. Otherwise you can't know how far away a feature stretches
# (which means bins are not computable ahead of time)
_bin_clause = ''
if (start is not None) and (end is not None) and completely_within:
if start <= bins.MAX_CHROM_SIZE and end <= bins.MAX_CHROM_SIZE:
_bins = list(bins.bins(start, end, one=False))
# See issue #45
if len(_bins) < 900:
_bin_clause = ' or ' .join(['bin = ?' for _ in _bins])
_bin_clause = 'AND ( %s )' % _bin_clause
args += _bins
else:
_bin_clause = ''
else:
_bin_clause = ''
query = ' '.join([
constants._SELECT,
......@@ -662,11 +713,9 @@ class FeatureDB(object):
Providing N features will return N - 1 new features.
This method purposefully does *not* do any merging or sorting of
coordinates, so you may want to use :meth:`FeatureDB.merge` first.
The new features' attributes will be a merge of the neighboring
features' attributes. This is useful if you have provided a list of
exons; the introns will then retain the transcript and/or gene parents.
coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
when selecting features use the `order_by` kwarg, e.g.,
`db.features_of_type('gene', order_by=('seqid', 'start'))`.
Parameters
----------
......@@ -678,6 +727,14 @@ class FeatureDB(object):
then the featuretypes will be constructed from the neighboring
features, e.g., `inter_exon_exon`.
merge_attributes : bool
If True, new features' attributes will be a merge of the neighboring
features' attributes. This is useful if you have provided a list of
exons; the introns will then retain the transcript and/or gene
parents as a single item. Otherwise, if False, the attribute will
be a comma-separated list of values, potentially listing the same
gene ID twice.
attribute_func : callable or None
If None, then nothing special is done to the attributes. If
callable, then the callable accepts two attribute dictionaries and
......@@ -705,9 +762,21 @@ class FeatureDB(object):
if new_featuretype is None:
new_featuretype = 'inter_%s_%s' % (
last_feature.featuretype, f.featuretype)
assert last_feature.strand == f.strand
assert last_feature.chrom == f.chrom
strand = last_feature.strand
if last_feature.strand != f.strand:
new_strand = '.'
else:
new_strand = f.strand
if last_feature.chrom != f.chrom:
# We've moved to a new chromosome. For example, if we're
# getting intergenic regions from all genes, they will be on
# different chromosomes. We still assume sorted features, but
# don't complain if they're on different chromosomes -- just
# move on.
last_feature = f
continue
strand = new_strand
chrom = last_feature.chrom
# Shrink
......@@ -748,6 +817,7 @@ class FeatureDB(object):
dialect = None
yield self._feature_returner(**fields)
interfeature_start = f.stop
last_feature = f
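Following the docstring's advice to hand interfeatures() pre-sorted features, a hedged sketch ('annotation.db' is a hypothetical database):

import gffutils

db = gffutils.FeatureDB('annotation.db')

genes = db.features_of_type('gene', order_by=('seqid', 'start'))
for intergenic in db.interfeatures(genes,
                                   new_featuretype='intergenic_region',
                                   merge_attributes=True):
    print(intergenic)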
def delete(self, features, make_backup=True, **kwargs):
"""
......@@ -797,7 +867,19 @@ class FeatureDB(object):
def update(self, data, make_backup=True, **kwargs):
"""
Update database with features in `data`.
Update the on-disk database with features in `data`.
WARNING: If you used any non-default kwargs for gffutils.create_db when
creating the database in the first place (especially
`disable_infer_transcripts` or `disable_infer_genes`) then you should
use those same arguments here.
The returned object is the same FeatureDB, but since it is pointing to
the same database and that has been just updated, the new features can
now be accessed.
Parameters
----------
data : str, iterable, FeatureDB instance
If FeatureDB, all data will be used. If string, assume it's
......@@ -810,15 +892,15 @@ class FeatureDB(object):
makes a copy of the existing database and saves it with a .bak
extension.
Notes
-----
Other kwargs are used in the same way as in gffutils.create_db; see the
help for that function for details.
kwargs :
Additional kwargs are passed to gffutils.create_db; see the help
for that function for details.
Returns
-------
FeatureDB with updated features.
Returns self but with the underlying database updated.
"""
from gffutils import create
from gffutils import iterators
if make_backup:
......@@ -833,6 +915,7 @@ class FeatureDB(object):
# Handle all sorts of input
data = iterators.DataIterator(data, **_iterator_kwargs)
kwargs['_autoincrements'] = self._autoincrements
if self.dialect['fmt'] == 'gtf':
if 'id_spec' not in kwargs:
......@@ -849,10 +932,18 @@ class FeatureDB(object):
else:
raise ValueError
peek, data._iter = iterators.peek(data._iter, 1)
if len(peek) == 0: return db # If the file is empty then do nothing
db._populate_from_lines(data)
db._update_relations()
# Note that the autoincrements gets updated here
db._finalize()
return db
# Read it back in directly from the stored autoincrements table
self._autoincrements.update(db._autoincrements)
return self
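A hedged sketch of the clarified update() contract: repeat the same non-default creation kwargs and treat the return value as the same, now-updated FeatureDB (hypothetical file names):

import gffutils

db = gffutils.FeatureDB('annotation.db')

# The database was originally built with inference disabled, so the same
# kwargs are repeated here, as the docstring warns.
db = db.update('new_features.gtf',
               disable_infer_genes=True,
               disable_infer_transcripts=True,
               make_backup=True)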
def add_relation(self, parent, child, level, parent_func=None,
child_func=None):
......@@ -1000,8 +1091,11 @@ class FeatureDB(object):
):
yield intron
def merge(self, features, ignore_strand=False):
def _old_merge(self, features, ignore_strand=False):
"""
DEPRECATED, only retained here for backwards compatibility. Please use
merge().
Merge overlapping features together.
Parameters
......@@ -1095,6 +1189,197 @@ class FeatureDB(object):
attributes='')
yield self._feature_returner(**merged_feature)
def merge(self, features,
merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type),
multiline=False):
"""
Merge features matching criteria together
Returned Features have a special property called 'children' that is
a list of the component features. This only exists for the lifetime of
the Feature instance.
Parameters
----------
features : iterable
Iterable of Feature instances to merge
merge_criteria : list
List of merge criteria callbacks. All must evaluate to True in
order for a feature to be merged. See notes below on callback
signature.
multiline : bool
True to emit multiple features with the same ID attribute, False
otherwise.
Returns
-------
Generator yielding merged Features
Notes
-----
See the `gffutils.merge_criteria` module (imported here as `mc`) for
existing callback functions. For writing custom callbacks, functions
must have the following signature::
callback(
acc: gffutils.Feature,
cur: gffutils.Feature,
components: [gffutils.Feature]
) -> bool
Where:
- `acc`: current accumulated feature
- `cur`: candidate feature to merge
- `components`: list of features that compose acc
The function should return True to merge `cur` into `acc`, False to set
`cur` to `acc` (that is, start a new merged feature).
If the merge criteria allow different feature types, then the merged
features should have their featuretype property reassigned to a more
specific ontology value.
"""
if not isinstance(merge_criteria, list):
try:
merge_criteria = list(merge_criteria)
except TypeError:
merge_criteria = [merge_criteria]
# To start, we create a merged feature of just the first feature.
features = iter(features)
last_id = None
current_merged = None
feature_children = []
for feature in features:
if current_merged is None:
if all(criteria(feature, feature, feature_children) for criteria in merge_criteria):
current_merged = feature
feature_children = [feature]
else:
yield _finalize_merge(feature, no_children)
last_id = None
continue
if len(feature_children) == 0: # current_merged is last feature and unchecked
if all(criteria(current_merged, current_merged, feature_children) for criteria in merge_criteria):
feature_children.append(current_merged)
else:
yield _finalize_merge(current_merged, no_children)
current_merged = feature
last_id = None
continue
if all(criteria(current_merged, feature, feature_children) for criteria in merge_criteria):
# Criteria satisfied, merge
# TODO Test multiline records and iron out the following code
# if multiline and (feature.start > current_merged.end + 1 or feature.end + 1 < current_merged.start):
# # Feature is possibly multiline (discontiguous), keep ID but start new record
# yield _finalize_merge(current_merged, feature_children)
# current_merged = feature
# feature_children = [feature]
if len(feature_children) == 1:
# Current merged is only child and merge is going to occur, make copy
current_merged = vars(current_merged).copy()
del current_merged['attributes']
del current_merged['extra']
del current_merged['dialect']
del current_merged['keep_order']
del current_merged['sort_attribute_values']
current_merged = self._feature_returner(**current_merged)
if not last_id:
# Generate unique ID for new Feature
self._autoincrements[current_merged.featuretype] += 1
last_id = current_merged.featuretype + '_' + str(
self._autoincrements[current_merged.featuretype])
current_merged['ID'] = last_id
current_merged.id = last_id
feature_children.append(feature)
# Set mismatched properties to ambiguous values
if feature.seqid not in current_merged.seqid.split(','): current_merged.seqid += ',' + feature.seqid
if feature.strand != current_merged.strand: current_merged.strand = '.'
if feature.frame != current_merged.frame: current_merged.frame = '.'
if feature.featuretype != current_merged.featuretype: current_merged.featuretype = "sequence_feature"
if feature.start < current_merged.start:
# Extends prior, so set a new start position
current_merged.start = feature.start
if feature.end > current_merged.end:
# Extends further, so set a new stop position
current_merged.end = feature.end
else:
yield _finalize_merge(current_merged, feature_children)
current_merged = feature
feature_children = []
last_id = None
if current_merged:
yield _finalize_merge(current_merged, feature_children)
def merge_all(self,
merge_order=('seqid', 'featuretype', 'strand', 'start'),
merge_criteria=(mc.seqid, mc.overlap_end_inclusive, mc.strand, mc.feature_type),
featuretypes_groups=(None,),
exclude_components=False):
"""
Merge all features in the database according to the given criteria.
Features that are merged become children of the new merged record,
and the resulting records are added to the database.
Parameters
----------
merge_order : list
Ordered list of columns with which to group features before evaluating criteria
merge_criteria : list
List of merge criteria callbacks. See merge().
featuretypes_groups : list
iterable of sets of featuretypes to merge together
exclude_components : bool
If True, child features are removed from the database; if False, they are kept and linked as children of the merged record.
Returns
-------
list of merged features
"""
if not len(featuretypes_groups):
# Can't be empty
featuretypes_groups = (None,)
result_features = []
# Merge features per featuregroup
for featuregroup in featuretypes_groups:
for merged in self.merge(self.all_features(featuretype=featuregroup, order_by=merge_order),
merge_criteria=merge_criteria):
# If feature is result of merge
if merged.children:
self._insert(merged, self.conn.cursor())
if exclude_components:
# Remove child features from DB
self.delete(merged.children)
else:
# Add child relations to DB
for child in merged.children:
self.add_relation(merged, child, 1, child_func=assign_child)
result_features.append(merged)
else:
pass # Do nothing, feature is already in DB
return result_features
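A hedged merge_all() sketch (assumes a pre-built database and that a list is accepted as one featuretypes group, per the docstring's "iterable of sets of featuretypes"):

import gffutils

db = gffutils.FeatureDB('annotation.db')   # hypothetical database
merged_exons = db.merge_all(
    featuretypes_groups=(['exon'],),   # merge exons only
    exclude_components=False,          # keep component features as children
)
print(len(merged_exons), 'merged records written back to the database')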
def children_bp(self, feature, child_featuretype='exon', merge=False,
ignore_strand=False):
"""
......
......@@ -115,6 +115,7 @@ class _FileIterator(_BaseIterator):
Subclass for iterating over features provided as a filename
"""
def open_function(self, data):
data = os.path.expanduser(data)
if data.endswith('.gz'):
import gzip
return gzip.open(data)
......@@ -130,7 +131,7 @@ class _FileIterator(_BaseIterator):
self.current_item_number = i
if line == '##FASTA' or line.startswith('>'):
raise StopIteration
return
if line.startswith('##'):
self._directive_handler(line)
......
"""
Merge criteria used by FeatureDB merge() and merge_all()
"""
from gffutils.feature import Feature
def seqid(acc, cur, components):
return cur.seqid == acc.seqid
def strand(acc, cur, components):
return acc.strand == cur.strand
def feature_type(acc, cur, components):
return acc.featuretype == cur.featuretype
def exact_coordinates_only(acc, cur, components):
return cur.start == acc.start and cur.stop == acc.end
def overlap_end_inclusive(acc, cur, components):
return acc.start <= cur.start <= acc.end + 1
def overlap_start_inclusive(acc, cur, components):
return acc.start <= cur.end + 1 <= acc.end + 1
def overlap_any_inclusive(acc, cur, components):
return acc.start <= cur.start <= acc.end + 1 or acc.start <= cur.end + 1 <= acc.end + 1
def overlap_end_threshold(threshold):
def partial(acc, cur, components):
return acc.start <= cur.start <= acc.end + threshold
return partial
def overlap_start_threshold(threshold):
def partial(acc, cur, components):
return acc.start - threshold <= cur.end + 1 <= acc.end + 1
return partial
def overlap_any_threshold(threshold):
def partial(acc, cur, components):
return acc.start - threshold <= cur.end + 1 <= acc.end + 1 or acc.start <= cur.start <= acc.end + threshold
return partial
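To close, a hedged sketch tying the merge_criteria module back to FeatureDB.merge(): the built-in callbacks compose with a custom one as long as it follows the documented (acc, cur, components) -> bool signature ('annotation.db' is hypothetical):

import gffutils
from gffutils import merge_criteria as mc

db = gffutils.FeatureDB('annotation.db')

def same_source(acc, cur, components):
    # Custom criterion: only merge features sharing the source column.
    return acc.source == cur.source

exons = db.features_of_type('exon', order_by=('seqid', 'strand', 'start'))
for merged in db.merge(exons,
                       merge_criteria=[mc.seqid, mc.overlap_end_inclusive,
                                       mc.strand, same_source]):
    # merged.children lists the component features (empty if nothing merged).
    print(merged, len(merged.children))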