...
 
Commits (3)
......@@ -20,6 +20,7 @@ before_install:
install:
- conda create --yes -n testenv python=$TRAVIS_PYTHON_VERSION Cython numpy nose
- source activate testenv
- pip install python-lzo
- python setup.py build_ext --inplace
- python setup.py install
script:
......
[![Build Status](https://travis-ci.org/bxlab/bx-python.svg?branch=master)](https://travis-ci.org/bxlab/bx-python)
[![Read the Docs](https://img.shields.io/readthedocs/bx-python.svg)](https://bx-python.readthedocs.io/)
# bx-python
The bx-python project is a Python library and associated set of scripts to allow for rapid implementation of genome scale analyses. The library contains a variety of useful modules, but the particular strengths are:
......@@ -10,6 +12,10 @@ The bx-python project is a python library and associated set of scripts to allow
* "Binned bitsets" which act just like chromosome sized bit arrays, but lazily allocate regions and allow large blocks of all set or all unset bits to be stored compactly
* "Intersecter" for performing fast intersection tests that preserve both query and target intervals and associated annotation
## Requirements
Build currently requires liblzo (e.g. `sudo apt-get install liblzo2-dev` on Debian/Ubuntu).
## Installing
The package can be installed with pip:
......
python-bx (0.8.2-1) unstable; urgency=low
* New upstream version.
* Policy compliance with 4.2.1.
-- Steffen Moeller <moeller@debian.org> Wed, 17 Oct 2018 12:16:56 +0200
python-bx (0.8.1-2) unstable; urgency=low
* Initial release (Closes: #851242)
......
......@@ -19,7 +19,7 @@ Build-Depends:
python3-nose,
python3-numpy,
cython3
Standards-Version: 4.1.4
Standards-Version: 4.2.1
Homepage: https://github.com/bxlab/bx-python
Vcs-Git: https://salsa.debian.org/med-team/python-bx.git
Vcs-Browser: https://salsa.debian.org/med-team/python-bx
......
......@@ -99,7 +99,9 @@ class GenomicIntervalReader( TableReader ):
... "chr2\\tbar\\t20\\t300\\txxx",
... "#I am a comment",
... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
>>> header = next(r)
>>> elements = list( r )
>>> elements.insert(0, header)
>>> assert type( elements[0] ) is Header
>>> str( elements[0] )
'#chrom\\tname\\tstart\\tend\\textra'
......@@ -174,6 +176,15 @@ class GenomicIntervalReader( TableReader ):
return bitsets
class NiceReaderWrapper( GenomicIntervalReader ):
"""
>>> r = NiceReaderWrapper( [ "#chrom\\tname\\tstart\\tend\\textra",
... "chr1\\tfoo\\t1\\t100\\txxx",
... "chr2\\tbar\\t20\\t300\\txxx",
... "#I am a comment",
... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
>>> assert type(next(r)) == Header
"""
def __init__( self, reader, **kwargs ):
GenomicIntervalReader.__init__( self, reader, **kwargs )
self.outstream = kwargs.get("outstream", None)
......@@ -187,7 +198,7 @@ class NiceReaderWrapper( GenomicIntervalReader ):
def __next__( self ):
while 1:
try:
nextitem = GenomicIntervalReader.next( self )
nextitem = super(NiceReaderWrapper, self ).__next__()
return nextitem
except ParseError as e:
if self.outstream:
......
......@@ -21,6 +21,9 @@ from bx.cookbook import argparse
from bx.intervals.intersection import IntervalTree, Interval
elem_t = np.dtype([('chrom', np.str_, 30), ('start', np.int64), ('end', np.int64), ('id', np.str_, 100)])
narrowPeak_t = np.dtype([('chrom', np.str_, 30), ('start', np.int64), ('end', np.int64), ('id', np.str_, 100),
('score', np.int64), ('strand', np.str_, 1), ('signalValue', np.float),
('pValue', np.float), ('qValue', np.float), ('peak', np.int64)])
LOG_LEVELS = {"info" : logging.INFO, "debug" : logging.DEBUG, "silent" : logging.ERROR}
logging.basicConfig()
......@@ -116,18 +119,43 @@ def union_elements(elements):
def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
BED4_FRM = "%s\t%d\t%d\t%s\n"
BED12_FRM = "%s\t%d\t%d\t%s\t1000\t+\t%d\t%d\t0,0,0\t%d\t%s\t%s\n"
NPEAK_FRM = "%s\t%d\t%d\t%s\t%d\t%s\t%f\t%f\t%f\t%d\n"
assert len( set(from_elem_list['chrom']) ) <= 1
mapped_elem_count = 0
mapped_summit_count = 0
for from_elem in from_elem_list:
matching_block_ids = [attrgetter("value")(_) for _ in tree.find(chrom, from_elem['start'], from_elem['end'])]
# do the actual mapping
to_elem_slices = [_ for _ in (transform(from_elem, all_epo[i], opt.gap) for i in matching_block_ids) if _]
""" # Original version: silently discard split alignments
if len(to_elem_slices) > 1 or len(to_elem_slices) == 0:
log.debug("%s no match or in different chain/chromosomes" % (str(from_elem)))
continue
to_elem_slices = to_elem_slices[0]
"""
""" Modified version below allows liftOver-like behavior of
keeping the longest alignment when alignments are split across
multiple chains. Added by Adam Diehl (adadiehl@umich.edu)
"""
max_elem_idx = 0
if len(to_elem_slices) == 0:
log.debug("%s: no match in target: discarding." % (str(from_elem)))
continue
elif len(to_elem_slices) > 1 and opt.keep_split:
log.debug("%s spans multiple chains/chromosomes. Using longest alignment." % (str(from_elem)))
max_elem_len = 0
for i in xrange(len(to_elem_slices)):
elem_len = to_elem_slices[i][-1][2] - to_elem_slices[i][0][2]
if elem_len > max_elem_len:
max_elem_len = elem_len
max_elem_idx = i
elif len(to_elem_slices) > 1:
log.debug("%s spans multiple chains/chromosomes: discarding." % (str(from_elem)))
continue
to_elem_slices = to_elem_slices[max_elem_idx]
""" End AGD modifications """
# apply threshold
if (from_elem[2] - from_elem[1]) * opt.threshold > reduce(lambda b,a: a[2]-a[1] + b, to_elem_slices, 0):
......@@ -139,19 +167,45 @@ def transform_by_chrom(all_epo, from_elem_list, tree, chrom, opt, out_fd):
if to_elem_list:
mapped_elem_count += 1
log.debug("\tjoined to %d elements" % (len(to_elem_list)))
start = to_elem_list[0][1]
end = to_elem_list[-1][2]
if opt.format == "BED4":
for tel in to_elem_list:
out_fd.write(BED4_FRM % tel)
else:
start = to_elem_list[0][1]
end = to_elem_list[-1][2]
elif opt.format == "BED12":
out_fd.write(BED12_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
start, end, len(to_elem_list),
",".join( "%d" % (e[2]-e[1]) for e in to_elem_list ),
",".join( "%d" % (e[1]-start) for e in to_elem_list ) )
)
log.info("%s %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
else:
# narrowPeak convention is to report the peak location relative to start
peak = int((start + end)/2) - start
if opt.in_format == "narrowPeak":
# Map the peak location
#sys.stderr.write("{}\n".format(from_elem))
matching_block_ids = [attrgetter("value")(_) for _ in tree.find(chrom, from_elem['peak'], from_elem['peak'])]
p_elem_slices = [_ for _ in (transform( np.array((chrom, from_elem['peak'], from_elem['peak'], '.'), dtype=elem_t), all_epo[i], opt.gap) for i in matching_block_ids) if _]
if len(p_elem_slices) >= 1:
mapped_summit_count += 1
sys.stderr.write("{}\n".format(p_elem_slices))
# Make sure the peak is between the start and end positions
if p_elem_slices[0][0][1] >= start and p_elem_slices[0][0][1] <= end:
peak = p_elem_slices[0][0][1] - start
else:
mapped_summit_count -= 1
log.debug("Warning: elem {} summit mapped location falls outside the mapped element start and end. Using the mapped elem midpoint instead.".format(from_elem))
else:
log.debug("Warning: elem {} summit maps to a gap region in the target alignment. Using the mapped elem midpoint instead.".format(from_elem))
out_fd.write(NPEAK_FRM % (to_elem_list[0][0], start, end, from_elem['id'],
from_elem['score'], from_elem['strand'], from_elem['signalValue'],
from_elem['pValue'], from_elem['qValue'], peak))
log.info("%s: %d of %d elements mapped" % (chrom, mapped_elem_count, from_elem_list.shape[0]))
if opt.format == "narrowPeak" and opt.in_format == "narrowPeak":
log.info("%s: %d peak summits from %d mapped elements mapped" % (chrom, mapped_summit_count, mapped_elem_count))
def transform_file(ELEMS, ofname, EPO, TREE, opt):
"transform/map the elements of this file and dump the output on 'ofname'"
......@@ -193,16 +247,29 @@ def loadChains(path):
assert all( t[0].tStrand == '+' for t in EPO ), "all target strands should be +"
return EPO
def loadFeatures(path):
"load BED4 features (all other columns are ignored)"
def loadFeatures(path, opt):
"""
Load features. For BED, only BED4 columns are loaded.
For narrowPeak, all columns are loaded.
"""
log.info("loading from %s ..." % path)
data = []
with open(path) as fd:
for line in fd:
cols = line.split()
data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3]) )
return np.array(data, dtype=elem_t)
if opt.in_format == "BED":
with open(path) as fd:
for line in fd:
cols = line.split()
data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3]) )
data = np.array(data, dtype=elem_t)
else:
with open(path) as fd:
for line in fd:
cols = line.split()
data.append( (cols[0], int(cols[1]), int(cols[2]), cols[3], int(cols[4]),
cols[5], float(cols[6]), float(cols[7]), float(cols[8]),
int(cols[-1])+int(cols[1])) )
data = np.array(data, dtype=narrowPeak_t)
return data
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__, epilog="Olgert Denas (Taylor Lab)",
......@@ -212,8 +279,8 @@ if __name__ == "__main__":
help="Input to process. If more than a file is specified, all files will be mapped and placed on --output, which should be a directory.")
parser.add_argument("alignment", help="Alignment file (.chain or .pkl)")
parser.add_argument("-f", '--format', choices=("BED4", "BED12"), default="BED4",
help="Output format.")
parser.add_argument("-f", '--format', choices=("BED4", "BED12", "narrowPeak"), default="BED4",
help="Output format. BED4 output reports all aligned blocks as separate BED records. BED12 reports a single BED record for each mapped element, with individual blocks given in the BED12 fields. NarrowPeak reports a single narrowPeak record for each mapped element, in which the chromosome, start, end, and peak positions are mapped to the target species and all other columns are passed through unchanged.")
parser.add_argument("-o", '--output', metavar="FILE", default='stdout',
type=lambda s: ((s in ('stdout', '-') and "/dev/stdout") or s),
help="Output file. Mandatory if more than on file in input.")
......@@ -225,7 +292,10 @@ if __name__ == "__main__":
help="Ignore elements with an insertion/deletion of this or bigger size.")
parser.add_argument('-v', '--verbose', type=str, choices=list(LOG_LEVELS.keys()), default='info',
help='Verbosity level')
parser.add_argument("-k", '--keep_split', default=False, action='store_true',
help="If elements span multiple chains, report the segment with the longest overlap instead of silently dropping them. (This is the default behavior for liftOver.)")
parser.add_argument("-i", "--in_format", choices=["BED", "narrowPeak"], default="BED",
help="Input file format.")
opt = parser.parse_args()
log.setLevel(LOG_LEVELS[opt.verbose])
......@@ -257,4 +327,4 @@ if __name__ == "__main__":
log.warning("overwriting %s ..." % outpath)
transform_file(loadFeatures(inpath), outpath, EPO, TREE, opt)
else:
transform_file(loadFeatures( opt.input[0] ), opt.output, EPO, TREE, opt)
transform_file(loadFeatures( opt.input[0], opt ), opt.output, EPO, TREE, opt)
......@@ -17,22 +17,21 @@ from glob import glob
def main():
build_requires = [ 'python-lzo', 'numpy' ]
metadata = \
dict( name = "bx-python",
version = "0.8.1",
install_requires=build_requires + ['six'],
version = "0.8.2",
setup_requires=['numpy', 'cython'],
install_requires=['numpy', 'six'],
py_modules = [ 'psyco_full' ],
package_dir = { '': 'lib' },
package_data = { '': ['*.ps'] },
scripts = glob( "scripts/*.py" ),
test_suite = 'nose.collector',
tests_require = ['nose'],
tests_require = ['nose','python-lzo'],
author = "James Taylor, Bob Harris, David King, Brent Pedersen, Kanwei Li, and others",
author_email = "james@jamestaylor.org",
description = "Tools for manipulating biological data, particularly multiple sequence alignments",
url = "http://bitbucket.org/james_taylor/bx-python/wiki/Home",
url = "https://github.com/bxlab/bx-python",
license = "MIT",
classifiers = [
"Development Status :: 5 - Production/Stable",
......@@ -57,6 +56,8 @@ def main():
numpy = None
try:
import numpy
# Suppress numpy tests
numpy.test = None
except:
pass
......@@ -81,10 +82,8 @@ def main():
from distutils.core import Command
# Use build_ext from Cython
command_classes = {}
# Use build_ext from Cython if found
command_classes = {}
try:
import Cython.Distutils
command_classes['build_ext'] = Cython.Distutils.build_ext
......@@ -220,47 +219,5 @@ def get_extension_modules( numpy_include=None ):
include_dirs=[ "src/bunzip" ] ) )
return extensions
# ---- Monkey patches -------------------------------------------------------
def monkey_patch_doctest():
#
# Doctest and coverage don't get along, so we need to create
# a monkeypatch that will replace the part of doctest that
# interferes with coverage reports.
#
# The monkeypatch is based on this zope patch:
# http://svn.zope.org/Zope3/trunk/src/zope/testing/doctest.py?rev=28679&r1=28703&r2=28705
#
try:
import doctest
_orp = doctest._OutputRedirectingPdb
class NoseOutputRedirectingPdb(_orp):
def __init__(self, out):
self.__debugger_used = False
_orp.__init__(self, out)
def set_trace(self):
self.__debugger_used = True
_orp.set_trace(self)
def set_continue(self):
# Calling set_continue unconditionally would break unit test coverage
# reporting, as Bdb.set_continue calls sys.settrace(None).
if self.__debugger_used:
_orp.set_continue(self)
doctest._OutputRedirectingPdb = NoseOutputRedirectingPdb
except:
pass
def monkey_patch_numpy():
# Numpy pushes its tests into every importers namespace, yeccch.
try:
import numpy
numpy.test = None
except:
pass
if __name__ == "__main__":
monkey_patch_doctest()
monkey_patch_numpy()
main()