...
 
Commits (3)
......@@ -20,6 +20,7 @@ before_install:
install:
- conda create --yes -n testenv python=$TRAVIS_PYTHON_VERSION Cython numpy nose
- source activate testenv
- pip install python-lzo
- python setup.py build_ext --inplace
- python setup.py install
script:
......
[![Build Status](https://travis-ci.org/bxlab/bx-python.svg?branch=master)](https://travis-ci.org/bxlab/bx-python)
[![Read the Docs](https://img.shields.io/readthedocs/bx-python.svg)](https://bx-python.readthedocs.io/)
# bx-python
The bx-python project is a Python library and associated set of scripts to allow for rapid implementation of genome scale analyses. The library contains a variety of useful modules, but the particular strengths are:
......@@ -10,6 +12,10 @@ The bx-python project is a python library and associated set of scripts to allow
* "Binned bitsets" which act just like chromosome sized bit arrays, but lazily allocate regions and allow large blocks of all set or all unset bits to be stored compactly
* "Intersecter" for performing fast intersection tests that preserve both query and target intervals and associated annotation
## Requirements
Build currently requires liblzo (e.g. `sudo apt-get install liblzo2-dev` on Debian/Ubuntu).
## Installing
The package can be installed with pip:
......
python-bx (0.8.2-1) unstable; urgency=low
* New upstream version.
* Policy compliance with 4.2.1.
-- Steffen Moeller <moeller@debian.org> Wed, 17 Oct 2018 12:16:56 +0200
python-bx (0.8.1-2) unstable; urgency=low
* Initial release (Closes: #851242)
......
......@@ -19,7 +19,7 @@ Build-Depends:
python3-nose,
python3-numpy,
cython3
Standards-Version: 4.1.4
Standards-Version: 4.2.1
Homepage: https://github.com/bxlab/bx-python
Vcs-Git: https://salsa.debian.org/med-team/python-bx.git
Vcs-Browser: https://salsa.debian.org/med-team/python-bx
......
......@@ -99,7 +99,9 @@ class GenomicIntervalReader( TableReader ):
... "chr2\\tbar\\t20\\t300\\txxx",
... "#I am a comment",
... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
>>> header = next(r)
>>> elements = list( r )
>>> elements.insert(0, header)
>>> assert type( elements[0] ) is Header
>>> str( elements[0] )
'#chrom\\tname\\tstart\\tend\\textra'
......@@ -174,6 +176,15 @@ class GenomicIntervalReader( TableReader ):
return bitsets
class NiceReaderWrapper( GenomicIntervalReader ):
"""
>>> r = NiceReaderWrapper( [ "#chrom\\tname\\tstart\\tend\\textra",
... "chr1\\tfoo\\t1\\t100\\txxx",
... "chr2\\tbar\\t20\\t300\\txxx",
... "#I am a comment",
... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
>>> assert type(next(r)) == Header
"""
def __init__( self, reader, **kwargs ):
GenomicIntervalReader.__init__( self, reader, **kwargs )
self.outstream = kwargs.get("outstream", None)
......@@ -187,7 +198,7 @@ class NiceReaderWrapper( GenomicIntervalReader ):
def __next__( self ):
while 1:
try:
nextitem = GenomicIntervalReader.next( self )
nextitem = super(NiceReaderWrapper, self ).__next__()
return nextitem
except ParseError as e:
if self.outstream:
......
......@@ -21,6 +21,9 @@ from bx.cookbook import argparse
from bx.intervals.intersection import IntervalTree, Interval
elem_t = np.dtype([('chrom', np.str_, 30), ('start', np.int64), ('end', np.int64), ('id', np.str_, 100)])
narrowPeak_t = np.dtype([('chrom', np.str_, 30), ('start', np.int64), ('end', np.int64), ('id', np.str_, 100),
('score', np.int64), ('strand', np.str_, 1), ('signalValue', np.float),
('pValue', np.float), ('qValue', np.float), ('peak', np.int64)])
LOG_LEVELS = {"info" : logging.INFO, "debug" : logging.DEBUG, "silent" : logging.ERROR}
logging.basicConfig()
......@@ -116,18 +119,43 @@ def union_elements(elements):
def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
BED4_FRM = "%s\t%d\t%d\t%s\n"
BED12_FRM = "%s\t%d\t%d\t%s\t1000\t+\t%d\t%d\t0,0,0\t%d\t%s\t%s\n"
NPEAK_FRM = "%s\t%d\t%d\t%s\t%d\t%s\t%f\t%f\t%f\t%d\n"
assert len( set(from_elem_list['chrom']) ) <= 1
mapped_elem_count = 0
mapped_summit_count = 0
for from_elem in from_elem_list:
matching_block_ids = [attrgetter("value")(_) for _ in tree.find(chrom, from_elem['start'], from_elem['end'])]
# do the actual mapping
to_elem_slices = [_ for _ in (transform(from_elem, all_epo[i], opt.gap) for i in matching_block_ids) if _]
""" # Original version: silently discard split alignments
if len(to_elem_slices) > 1 or len(to_elem_slices) == 0:
log.debug("%s no match or in different chain/chromosomes" % (str(from_elem)))
continue
to_elem_slices = to_elem_slices[0]
"""
""" Modified version below allows liftOver-like behavior of
keeping the longest alignment when alignments are split across
multiple chains. Added by Adam Diehl (adadiehl@umich.edu)
"""
max_elem_idx = 0
if len(to_elem_slices) == 0:
log.debug("%s: no match in target: discarding." % (str(from_elem)))
continue
elif len(to_elem_slices) > 1 and opt.keep_split:
log.debug("%s spans multiple chains/chromosomes. Using longest alignment." % (str(from_elem)))
max_elem_len = 0
for i in xrange(len(to_elem_slices)):
elem_len = to_elem_slices[i][-1][2] - to_elem_slices[i][0][2]
if elem_len > max_elem_len:
max_elem_len = elem_len
max_elem_idx = i
elif len(to_elem_slices) > 1:
log.debug("%s spans multiple chains/chromosomes: discarding." % (str(from_elem)))
continue
to_elem_slices = to_elem_slices[max_elem_idx]
""" End AGD modifications """
# apply threshold
if (from_elem[2] - from_elem[1]) * opt.threshold > reduce(lambda b,a: a[2]-a[1] + b, to_elem_slices, 0):
......@@ -139,19 +167,45 @@ def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
if to_elem_list:
mapped_elem_count += 1
log.debug("\tjoined to %d elements" % (len(to_elem_list)))
start = to_elem_list[0][1]
end = to_elem_list[-1][2]
if opt.format == "BED4":
for tel in to_elem_list:
out_fd.write(BED4_FRM % tel)
else:
start = to_elem_list[0][1]
end = to_elem_list[-1][2]
elif opt.format == "BED12":
out_fd.write(BED12_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
start, end, len(to_elem_list),
",".join( "%d" % (e[2]-e[1]) for e in to_elem_list ),
",".join( "%d" % (e[1]-start) for e in to_elem_list ) )
)
log.info("%s %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
else:
# narrowPeak convention is to report the peak location relative to start
peak = int((start + end)/2) - start
if opt.in_format == "narrowPeak":
# Map the peak location
#sys.stderr.write("{}\n".format(from_elem))
matching_block_ids = [attrgetter("value")(_) for _ in tree.find(chrom, from_elem['peak'], from_elem['peak'])]
p_elem_slices = [_ for _ in (transform( np.array((chrom, from_elem['peak'], from_elem['peak'], '.'), dtype=elem_t), all_epo[i], opt.gap) for i in matching_block_ids) if _]
if len(p_elem_slices) >= 1:
mapped_summit_count += 1
sys.stderr.write("{}\n".format(p_elem_slices))
# Make sure the peak is between the start and end positions
if p_elem_slices[0][0][1] >= start and p_elem_slices[0][0][1] <= end:
peak = p_elem_slices[0][0][1] - start
else:
mapped_summit_count -= 1
log.debug("Warning: elem {} summit mapped location falls outside the mapped element start and end. Using the mapped elem midpoint instead.".format(from_elem))
else:
log.debug("Warning: elem {} summit maps to a gap region in the target alignment. Using the mapped elem midpoint instead.".format(from_elem))
out_fd.write(NPEAK_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
from_elem['score'], from_elem['strand'], from_elem['signalValue'],
from_elem['pValue'], from_elem['qValue'], peak))
log.info("%s: %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
if opt.format == "narrowPeak" and opt.in_format == "narrowPeak":
log.info("%s: %d peak summits from %d mapped elements mapped" % (chrom, mapped_summit_count, mapped_elem_count))
def transform_file(ELEMS, ofname, EPO, TREE, opt):
"transform/map the elements of this file and dump the output on 'ofname'"
......@@ -193,16 +247,29 @@ def loadChains(path):
assert all( t[0].tStrand == '+' for t in EPO ), "all target strands should be +"
return EPO
def loadFeatures(path):
"load BED4 features (all other columns are ignored)"
def loadFeatures(path, opt):
"""
Load features. For BED, only BED4 columns are loaded.
For narrowPeak, all columns are loaded.
"""
log.info("loading from %s ..." % path)
data = []
with open(path) as fd:
for line in fd:
cols = line.split()
data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3]) )
return np.array(data, dtype=elem_t)
if opt.in_format == "BED":
with open(path) as fd:
for line in fd:
cols = line.split()
data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3]) )
data = np.array(data, dtype=elem_t)
else:
with open(path) as fd:
for line in fd:
cols = line.split()
data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3], int(cols[4]),
cols[5], float(cols[6]), float(cols[7]), float(cols[8]),
int(cols[-1])+int(cols[1])) )
data = np.array(data, dtype=narrowPeak_t)
return data
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__, epilog="Olgert Denas (Taylor Lab)",
......@@ -212,8 +279,8 @@ if __name__ == "__main__":
help="Input to process. If more than a file is specified, all files will be mapped and placed on --output, which should be a directory.")
parser.add_argument("alignment", help="Alignment file (.chain or .pkl)")
parser.add_argument("-f", '--format', choices=("BED4", "BED12"), default="BED4",
help="Output format.")
parser.add_argument("-f", '--format', choices=("BED4", "BED12", "narrowPeak"), default="BED4",
help="Output format. BED4 output reports all aligned blocks as separate BED records. BED12 reports a single BED record for each mapped element, with individual blocks given in the BED12 fields. NarrowPeak reports a single narrowPeak record for each mapped element, in which the chromosome, start, end, and peak positions are mapped to the target species and all other columns are passed through unchanged.")
parser.add_argument("-o", '--output', metavar="FILE", default='stdout',
type=lambda s: ((s in ('stdout', '-') and "/dev/stdout") or s),
help="Output file. Mandatory if more than on file in input.")
......@@ -225,7 +292,10 @@ if __name__ == "__main__":
help="Ignore elements with an insertion/deletion of this or bigger size.")
parser.add_argument('-v', '--verbose', type=str, choices=list(LOG_LEVELS.keys()), default='info',
help='Verbosity level')
parser.add_argument("-k", '--keep_split', default=False, action='store_true',
help="If elements span multiple chains, report the segment with the longest overlap instead of silently dropping them. (This is the default behavior for liftOver.)")
parser.add_argument("-i", "--in_format", choices=["BED", "narrowPeak"], default="BED",
help="Input file format.")
opt = parser.parse_args()
log.setLevel(LOG_LEVELS[opt.verbose])
......@@ -257,4 +327,4 @@ if __name__ == "__main__":
log.warning("overwriting %s ..." % outpath)
transform_file(loadFeatures(inpath), outpath, EPO, TREE, opt)
else:
transform_file(loadFeatures( opt.input[0] ), opt.output, EPO, TREE, opt)
transform_file(loadFeatures( opt.input[0], opt ), opt.output, EPO, TREE, opt)
......@@ -17,22 +17,21 @@ from glob import glob
def main():
build_requires = [ 'python-lzo', 'numpy' ]
metadata = \
dict( name = "bx-python",
version = "0.8.1",
install_requires=build_requires + ['six'],
version = "0.8.2",
setup_requires=['numpy', 'cython'],
install_requires=['numpy', 'six'],
py_modules = [ 'psyco_full' ],
package_dir = { '': 'lib' },
package_data = { '': ['*.ps'] },
scripts = glob( "scripts/*.py" ),
test_suite = 'nose.collector',
tests_require = ['nose'],
tests_require = ['nose','python-lzo'],
author = "James Taylor, Bob Harris, David King, Brent Pedersen, Kanwei Li, and others",
author_email = "james@jamestaylor.org",
description = "Tools for manipulating biological data, particularly multiple sequence alignments",
url = "http://bitbucket.org/james_taylor/bx-python/wiki/Home",
url = "https://github.com/bxlab/bx-python",
license = "MIT",
classifiers = [
"Development Status :: 5 - Production/Stable",
......@@ -57,6 +56,8 @@ def main():
numpy = None
try:
import numpy
# Suppress numpy tests
numpy.test = None
except:
pass
......@@ -81,10 +82,8 @@ def main():
from distutils.core import Command
# Use build_ext from Cython
command_classes = {}
# Use build_ext from Cython if found
command_classes = {}
try:
import Cython.Distutils
command_classes['build_ext'] = Cython.Distutils.build_ext
......@@ -220,47 +219,5 @@ def get_extension_modules( numpy_include=None ):
include_dirs=[ "src/bunzip" ] ) )
return extensions
# ---- Monkey patches -------------------------------------------------------
def monkey_patch_doctest():
#
# Doctest and coverage don't get along, so we need to create
# a monkeypatch that will replace the part of doctest that
# interferes with coverage reports.
#
# The monkeypatch is based on this zope patch:
# http://svn.zope.org/Zope3/trunk/src/zope/testing/doctest.py?rev=28679&r1=28703&r2=28705
#
try:
import doctest
_orp = doctest._OutputRedirectingPdb
class NoseOutputRedirectingPdb(_orp):
def __init__(self, out):
self.__debugger_used = False
_orp.__init__(self, out)
def set_trace(self):
self.__debugger_used = True
_orp.set_trace(self)
def set_continue(self):
# Calling set_continue unconditionally would break unit test coverage
# reporting, as Bdb.set_continue calls sys.settrace(None).
if self.__debugger_used:
_orp.set_continue(self)
doctest._OutputRedirectingPdb = NoseOutputRedirectingPdb
except:
pass
def monkey_patch_numpy():
# Numpy pushes its tests into every importers namespace, yeccch.
try:
import numpy
numpy.test = None
except:
pass
if __name__ == "__main__":
monkey_patch_doctest()
monkey_patch_numpy()
main()