Skip to content
Commits on Source (15)
HTSeq.egg-info/*
HTSeq/StepVector.py
src/StepVector_wrap.cxx
src/_HTSeq.c
import sys, optparse, itertools, warnings, traceback, os.path
import HTSeq
class UnknownChrom(Exception):
    """Signal that an alignment interval refers to a chromosome that is
    not present in the feature array.

    Raised and caught inside ``count_reads_in_features``; the affected
    read is then counted as ``__no_feature``.
    """
def invert_strand(iv):
    """Return a copy of ``iv`` with its strand flipped.

    ``iv`` must provide a ``copy()`` method and a ``strand`` attribute;
    the argument itself is not modified.

    Raises:
        ValueError: if the strand is neither "+" nor "-".
    """
    iv2 = iv.copy()
    if iv2.strand == "+":
        iv2.strand = "-"
    elif iv2.strand == "-":
        iv2.strand = "+"
    else:
        # Call form instead of `raise ValueError, "..."`, which is a
        # syntax error on Python 3.
        raise ValueError("Illegal strand")
    return iv2
def count_reads_in_features(sam_filename, gff_filename, samtype, order, stranded,
                            overlap_mode, feature_type, id_attribute, quiet,
                            minaqual, samout):
    """Count, for every GFF feature, the reads (or read pairs) assigned to it.

    The per-feature counts are written to standard output as
    ``<feature id>\\t<count>`` lines sorted by feature id, followed by the
    special counters ``__no_feature``, ``__ambiguous``, ``__too_low_aQual``,
    ``__not_aligned`` and ``__alignment_not_unique``.  Progress and warning
    messages go to standard error.

    Args:
        sam_filename: path of the alignment file, or "-" to read from stdin.
        gff_filename: path of the GFF file passed to ``HTSeq.GFF_Reader``.
        samtype: "sam" or "bam"; selects the HTSeq reader class.
        order: "name" or "pos"; only consulted for paired-end data, where it
            selects the pairing routine.
        stranded: "yes", "no" or "reverse".  Anything other than "no" makes
            the feature array strand-aware; "reverse" flips the strand of
            the read intervals.
        overlap_mode: "union", "intersection-strict" or
            "intersection-nonempty".
        feature_type: only GFF lines whose type equals this value are used.
        id_attribute: GFF attribute whose value serves as the feature id.
        quiet: if true, suppress the progress messages.
        minaqual: reads with an alignment quality below this are skipped.
        samout: name of an annotated SAM output file, or "" for none.

    Raises:
        ValueError: for a feature lacking ``id_attribute``, a strandless
            feature in stranded mode, an unknown ``samtype`` or an illegal
            ``order``.
    """

    def write_to_samout(r, assignment):
        # Append the assignment as an XF:Z optional field to the original
        # SAM line(s) of r; does nothing when no SAM output was requested.
        if samoutfile is None:
            return
        if not pe_mode:
            r = (r,)
        for read in r:
            if read is not None:
                samoutfile.write(read.original_sam_line.rstrip() +
                                 "\tXF:Z:" + assignment + "\n")

    def matched_intervals(read, invert):
        # Yield the reference interval of every non-empty "M" CIGAR
        # operation of read, strand-inverted when invert is true.
        for co in read.cigar:
            if co.type == "M" and co.size > 0:
                if invert:
                    yield invert_strand(co.ref_iv)
                else:
                    yield co.ref_iv

    if samout != "":
        samoutfile = open(samout, "w")
    else:
        samoutfile = None

    # The try/finally guarantees that the SAM output file is closed even
    # when reading the GFF or alignment file fails.
    try:
        features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
        counts = {}

        # Try to open samfile to fail early in case it is not there
        if sam_filename != "-":
            open(sam_filename).close()

        gff = HTSeq.GFF_Reader(gff_filename)
        i = 0
        try:
            for f in gff:
                if f.type == feature_type:
                    try:
                        feature_id = f.attr[id_attribute]
                    except KeyError:
                        raise ValueError(
                            "Feature %s does not contain a '%s' attribute" %
                            (f.name, id_attribute))
                    if stranded != "no" and f.iv.strand == ".":
                        raise ValueError(
                            "Feature %s at %s does not have strand information but you are "
                            "running htseq-count in stranded mode. Use '--stranded=no'." %
                            (f.name, f.iv))
                    features[f.iv] += feature_id
                    counts[feature_id] = 0
                i += 1
                if i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d GFF lines processed.\n" % i)
        except:
            # Deliberately broad: only adds context, then re-raises.
            sys.stderr.write("Error occured when processing GFF file (%s):\n" %
                             gff.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d GFF lines processed.\n" % i)

        if len(counts) == 0:
            sys.stderr.write("Warning: No features of type '%s' found.\n" % feature_type)

        if samtype == "sam":
            SAM_or_BAM_Reader = HTSeq.SAM_Reader
        elif samtype == "bam":
            SAM_or_BAM_Reader = HTSeq.BAM_Reader
        else:
            raise ValueError("Unknown input format %s specified." % samtype)

        # Peek at the first record to find out whether the data is paired-end.
        try:
            if sam_filename != "-":
                read_seq_file = SAM_or_BAM_Reader(sam_filename)
                read_seq = read_seq_file
                first_read = next(iter(read_seq))
            else:
                # stdin is consumed while peeking, so the first record is
                # chained back in front of the remaining ones.
                read_seq_file = SAM_or_BAM_Reader(sys.stdin)
                read_seq_iter = iter(read_seq_file)
                first_read = next(read_seq_iter)
                read_seq = itertools.chain([first_read], read_seq_iter)
            pe_mode = first_read.paired_end
        except:
            sys.stderr.write("Error occured when reading beginning of SAM/BAM file.\n")
            raise

        try:
            if pe_mode:
                if order == "name":
                    read_seq = HTSeq.pair_SAM_alignments(read_seq)
                elif order == "pos":
                    read_seq = HTSeq.pair_SAM_alignments_with_buffer(read_seq)
                else:
                    raise ValueError("Illegal order specified.")
            empty = 0
            ambiguous = 0
            notaligned = 0
            lowqual = 0
            nonunique = 0
            reverse = stranded == "reverse"
            i = 0
            for r in read_seq:
                if i > 0 and i % 100000 == 0 and not quiet:
                    sys.stderr.write("%d SAM alignment record%s processed.\n" %
                                     (i, "s" if not pe_mode else " pairs"))

                i += 1
                if not pe_mode:
                    if not r.aligned:
                        notaligned += 1
                        write_to_samout(r, "__not_aligned")
                        continue
                    try:
                        if r.optional_field("NH") > 1:
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        # No NH field: the read is not treated as non-unique.
                        pass
                    if r.aQual < minaqual:
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue
                    iv_seq = matched_intervals(r, reverse)
                else:
                    if r[0] is not None and r[0].aligned:
                        iv_seq = matched_intervals(r[0], reverse)
                    else:
                        iv_seq = tuple()
                    if r[1] is not None and r[1].aligned:
                        # The second mate gets the opposite strand treatment
                        # from the first mate.
                        iv_seq = itertools.chain(
                            iv_seq, matched_intervals(r[1], not reverse))
                    else:
                        if (r[0] is None) or not (r[0].aligned):
                            write_to_samout(r, "__not_aligned")
                            notaligned += 1
                            continue
                    try:
                        if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                                (r[1] is not None and r[1].optional_field("NH") > 1):
                            nonunique += 1
                            write_to_samout(r, "__alignment_not_unique")
                            continue
                    except KeyError:
                        pass
                    if (r[0] and r[0].aQual < minaqual) or (r[1] and r[1].aQual < minaqual):
                        lowqual += 1
                        write_to_samout(r, "__too_low_aQual")
                        continue

                try:
                    if overlap_mode == "union":
                        fs = set()
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                fs = fs.union(fs2)
                    elif overlap_mode == "intersection-strict" or \
                            overlap_mode == "intersection-nonempty":
                        fs = None
                        for iv in iv_seq:
                            if iv.chrom not in features.chrom_vectors:
                                raise UnknownChrom
                            for iv2, fs2 in features[iv].steps():
                                # In "nonempty" mode, steps without any
                                # feature do not take part in the intersection.
                                if len(fs2) > 0 or overlap_mode == "intersection-strict":
                                    if fs is None:
                                        fs = fs2.copy()
                                    else:
                                        fs = fs.intersection(fs2)
                    else:
                        sys.exit("Illegal overlap mode.")
                    if fs is None or len(fs) == 0:
                        write_to_samout(r, "__no_feature")
                        empty += 1
                    elif len(fs) > 1:
                        write_to_samout(r, "__ambiguous[" + '+'.join(fs) + "]")
                        ambiguous += 1
                    else:
                        feature_id = list(fs)[0]
                        write_to_samout(r, feature_id)
                        counts[feature_id] += 1
                except UnknownChrom:
                    write_to_samout(r, "__no_feature")
                    empty += 1

        except:
            sys.stderr.write("Error occured when processing SAM input (%s):\n" %
                             read_seq_file.get_line_number_string())
            raise

        if not quiet:
            sys.stderr.write("%d SAM %s processed.\n" %
                             (i, "alignments " if not pe_mode else "alignment pairs"))
    finally:
        if samoutfile is not None:
            samoutfile.close()

    # sys.stdout.write produces the same bytes as the former Python-2
    # print statements and works on Python 3 as well.
    for fn in sorted(counts):
        sys.stdout.write("%s\t%d\n" % (fn, counts[fn]))
    sys.stdout.write("__no_feature\t%d\n" % empty)
    sys.stdout.write("__ambiguous\t%d\n" % ambiguous)
    sys.stdout.write("__too_low_aQual\t%d\n" % lowqual)
    sys.stdout.write("__not_aligned\t%d\n" % notaligned)
    sys.stdout.write("__alignment_not_unique\t%d\n" % nonunique)
def main():
    """Command-line entry point of htseq-count.

    Parses the options, expects exactly two positional arguments (the
    alignment file and the GFF file) and calls
    ``count_reads_in_features``.  Any exception raised while counting is
    reported on standard error in a short form and the process exits with
    status 1.
    """
    optParser = optparse.OptionParser(
        usage="%prog [options] alignment_file gff_file",
        description=
            "This script takes an alignment file in SAM/BAM format and a " +
            "feature file in GFF format and calculates for each feature " +
            "the number of reads mapping to it. See " +
            "http://www-huber.embl.de/users/anders/HTSeq/doc/count.html for details.",
        epilog=
            "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " +
            "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
            "Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__)

    optParser.add_option("-f", "--format", type="choice", dest="samtype",
        choices=("sam", "bam"), default="sam",
        help="type of <alignment_file> data, either 'sam' or 'bam' (default: sam)")

    optParser.add_option("-r", "--order", type="choice", dest="order",
        choices=("pos", "name"), default="name",
        help="'pos' or 'name'. Sorting order of <alignment_file> (default: name). Paired-end sequencing " +
            "data must be sorted either by position or by read name, and the sorting order " +
            "must be specified. Ignored for single-end data.")

    optParser.add_option("-s", "--stranded", type="choice", dest="stranded",
        choices=("yes", "no", "reverse"), default="yes",
        help="whether the data is from a strand-specific assay. Specify 'yes', " +
            "'no', or 'reverse' (default: yes). " +
            "'reverse' means 'yes' with reversed strand interpretation")

    optParser.add_option("-a", "--minaqual", type="int", dest="minaqual",
        default=10,
        help="skip all reads with alignment quality lower than the given " +
            "minimum value (default: 10)")

    optParser.add_option("-t", "--type", type="string", dest="featuretype",
        default="exon", help="feature type (3rd column in GFF file) to be used, " +
            "all features of other type are ignored (default, suitable for Ensembl " +
            "GTF files: exon)")

    optParser.add_option("-i", "--idattr", type="string", dest="idattr",
        default="gene_id", help="GFF attribute to be used as feature ID (default, " +
            "suitable for Ensembl GTF files: gene_id)")

    optParser.add_option("-m", "--mode", type="choice", dest="mode",
        choices=("union", "intersection-strict", "intersection-nonempty"),
        default="union", help="mode to handle reads overlapping more than one feature " +
            "(choices: union, intersection-strict, intersection-nonempty; default: union)")

    optParser.add_option("-o", "--samout", type="string", dest="samout",
        default="", help="write out all SAM alignment records into an output " +
            "SAM file called SAMOUT, annotating each line with its feature assignment " +
            "(as an optional field with tag 'XF')")

    optParser.add_option("-q", "--quiet", action="store_true", dest="quiet",
        help="suppress progress report")  # and warnings" )

    # Called without any argument: show the help text instead of an error.
    if len(sys.argv) == 1:
        optParser.print_help()
        sys.exit(1)

    (opts, args) = optParser.parse_args()

    if len(args) != 2:
        sys.stderr.write(sys.argv[0] + ": Error: Please provide two arguments.\n")
        sys.stderr.write(" Call with '-h' to get usage information.\n")
        sys.exit(1)

    # Route warnings through the one-line stderr formatter of this script.
    warnings.showwarning = my_showwarning
    try:
        count_reads_in_features(args[0], args[1], opts.samtype, opts.order, opts.stranded,
            opts.mode, opts.featuretype, opts.idattr, opts.quiet, opts.minaqual,
            opts.samout)
    except:
        # Deliberately broad top-level boundary: every failure is reduced to
        # a short message plus the location of the innermost frame.
        exc_value, exc_tb = sys.exc_info()[1:]
        sys.stderr.write(" %s\n" % str(exc_value))
        last_frame = traceback.extract_tb(exc_tb)[-1]
        sys.stderr.write(" [Exception type: %s, raised in %s:%d]\n" %
            (exc_value.__class__.__name__,
             os.path.basename(last_frame[0]),
             last_frame[1]))
        sys.exit(1)
def my_showwarning(message, category, filename, lineno=None, line=None):
    """Replacement for ``warnings.showwarning`` that prints only the message.

    Writes ``Warning: <message>`` and a newline to standard error; the
    category, file name and line information are ignored.
    """
    sys.stderr.write("Warning: %s\n" % message)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
#!/usr/bin/env python
# HTSeq_QA.py
#
# (c) Simon Anders, European Molecular Biology Laboratory, 2010
# released under GNU General Public License
import sys, time, os.path, optparse
from itertools import *
import numpy
import HTSeq
def main():
    """Command-line entry point of htseq-qa.

    Reads a file of sequencing reads, tallies called bases and base-call
    quality scores by position within the read, and saves the resulting
    plots as a PDF file.  For alignment input the reads are split into
    aligned and non-aligned ones unless ``--nosplit`` is given.
    """
    try:
        import matplotlib
    except ImportError:
        sys.stderr.write("This script needs the 'matplotlib' library, which ")
        sys.stderr.write("was not found. Please install it.\n")
        # Without matplotlib nothing below can work; stop here instead of
        # failing with a NameError on the next statement.
        sys.exit(1)
    matplotlib.use('PDF')
    from matplotlib import pyplot

    # **** Parse command line ****

    optParser = optparse.OptionParser(usage="%prog [options] read_file",
        description=
            "This script take a file with high-throughput sequencing reads " +
            "(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
            "_sequence.txt) and performs a simply quality assessment by " +
            "producing plots showing the distribution of called bases and " +
            "base-call quality scores by position within the reads. The " +
            "plots are output as a PDF file.",
        epilog=
            "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " +
            " Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
            " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__)
    optParser.add_option("-t", "--type", type="choice", dest="type",
        choices=("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
        default="sam", help="type of read_file (one of: sam [default], bam, " +
            "solexa-export, fastq, solexa-fastq)")
    optParser.add_option("-o", "--outfile", type="string", dest="outfile",
        help="output filename (default is <read_file>.pdf)")
    optParser.add_option("-r", "--readlength", type="int", dest="readlen",
        help="the maximum read length (when not specified, the script guesses from the file")
    optParser.add_option("-g", "--gamma", type="float", dest="gamma",
        default=0.3,
        help="the gamma factor for the contrast adjustment of the quality score plot")
    optParser.add_option("-n", "--nosplit", action="store_true", dest="nosplit",
        help="do not split reads in unaligned and aligned ones")
    optParser.add_option("-m", "--maxqual", type="int", dest="maxqual", default=41,
        help="the maximum quality score that appears in the data (default: 41)")

    if len(sys.argv) == 1:
        optParser.print_help()
        sys.exit(1)

    (opts, args) = optParser.parse_args()

    if len(args) != 1:
        sys.stderr.write(sys.argv[0] + ": Error: Please provide one argument (the read_file).\n")
        sys.stderr.write(" Call with '-h' to get usage information.\n")
        sys.exit(1)

    readfilename = args[0]

    if opts.type == "sam":
        readfile = HTSeq.SAM_Reader(readfilename)
        isAlnmntFile = True
    elif opts.type == "bam":
        readfile = HTSeq.BAM_Reader(readfilename)
        isAlnmntFile = True
    elif opts.type == "solexa-export":
        readfile = HTSeq.SolexaExportReader(readfilename)
        isAlnmntFile = True
    elif opts.type == "fastq":
        readfile = HTSeq.FastqReader(readfilename)
        isAlnmntFile = False
    elif opts.type == "solexa-fastq":
        readfile = HTSeq.FastqReader(readfilename, "solexa")
        isAlnmntFile = False
    else:
        # Guard for option values not handled above; sys.exit replaces the
        # non-existent sys.error.
        sys.exit("Oops.")

    # Two plot columns (non-aligned / aligned) only make sense for
    # alignment input.
    twoColumns = isAlnmntFile and not opts.nosplit

    if opts.outfile is None:
        outfilename = os.path.basename(readfilename) + ".pdf"
    else:
        outfilename = opts.outfile

    # **** Get read length ****

    if opts.readlen is not None:
        readlen = opts.readlen
    else:
        # Guess: the longest read among the first 10000 records.
        readlen = 0
        if isAlnmntFile:
            reads = (a.read for a in readfile)
        else:
            reads = readfile
        for r in islice(reads, 10000):
            if len(r) > readlen:
                readlen = len(r)

    max_qual = opts.maxqual
    gamma = opts.gamma

    # **** Initialize count arrays ****

    # The builtin int replaces the numpy.int alias, which newer NumPy
    # releases no longer provide.
    base_arr_U = numpy.zeros((readlen, 5), int)
    qual_arr_U = numpy.zeros((readlen, max_qual + 1), int)
    if twoColumns:
        base_arr_A = numpy.zeros((readlen, 5), int)
        qual_arr_A = numpy.zeros((readlen, max_qual + 1), int)

    # **** Main counting loop ****

    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
            else:
                r = a
            # twoColumns already implies alignment input.
            if twoColumns and a.aligned:
                r.add_bases_to_count_array(base_arr_A)
                r.add_qual_to_count_array(qual_arr_A)
            else:
                r.add_bases_to_count_array(base_arr_U)
                r.add_qual_to_count_array(qual_arr_U)
            i += 1
            if i % 200000 == 0:
                print("%d reads processed" % i)
    except:
        # Deliberately broad: only adds context, then re-raises.
        sys.stderr.write("Error occured in: %s\n" %
            readfile.get_line_number_string())
        raise
    print("%d reads processed" % i)

    # **** Normalize result ****

    def norm_by_pos(arr):
        # Divide every row by its own sum; cells that were zero stay zero
        # (this also clears the NaN produced by all-zero rows).
        arr = numpy.array(arr, float)
        arr_n = (arr.T / arr.sum(1)).T
        arr_n[arr == 0] = 0
        return arr_n

    def norm_by_start(arr):
        # Divide every row by the sum of the first row.
        arr = numpy.array(arr, float)
        arr_n = (arr.T / arr.sum(1)[0]).T
        arr_n[arr == 0] = 0
        return arr_n

    base_arr_U_n = norm_by_pos(base_arr_U)
    qual_arr_U_n = norm_by_start(qual_arr_U)
    nreads_U = base_arr_U[0, :].sum()
    if twoColumns:
        base_arr_A_n = norm_by_pos(base_arr_A)
        qual_arr_A_n = norm_by_start(qual_arr_A)
        nreads_A = base_arr_A[0, :].sum()

    # **** Make plot ****

    def plot_bases(arr):
        # One curve per column of arr, labelled A, C, G, T, N in the
        # colours of the text legend below.
        xg = numpy.arange(readlen)
        pyplot.plot(xg, arr[:, 0], marker='.', color='red')
        pyplot.plot(xg, arr[:, 1], marker='.', color='darkgreen')
        pyplot.plot(xg, arr[:, 2], marker='.', color='lightgreen')
        pyplot.plot(xg, arr[:, 3], marker='.', color='orange')
        pyplot.plot(xg, arr[:, 4], marker='.', color='grey')
        pyplot.axis((0, readlen - 1, 0, 1))
        pyplot.text(readlen * .70, .9, "A", color="red")
        pyplot.text(readlen * .75, .9, "C", color="darkgreen")
        pyplot.text(readlen * .80, .9, "G", color="lightgreen")
        pyplot.text(readlen * .85, .9, "T", color="orange")
        pyplot.text(readlen * .90, .9, "N", color="grey")

    pyplot.figure()
    pyplot.subplots_adjust(top=.85)
    pyplot.suptitle(os.path.basename(readfilename), fontweight='bold')

    if twoColumns:

        pyplot.subplot(221)
        plot_bases(base_arr_U_n)
        pyplot.ylabel("proportion of base")
        pyplot.title("non-aligned reads\n%.0f%% (%.3f million)" %
            (100. * nreads_U / (nreads_U + nreads_A), nreads_U / 1e6))

        pyplot.subplot(222)
        plot_bases(base_arr_A_n)
        pyplot.title("aligned reads\n%.0f%% (%.3f million)" %
            (100. * nreads_A / (nreads_U + nreads_A), nreads_A / 1e6))

        pyplot.subplot(223)
        # pyplot.Normalize replaces the old lower-case pyplot.normalize alias.
        pyplot.pcolor(qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
            norm=pyplot.Normalize(0, 1))
        pyplot.axis((0, readlen - 1, 0, max_qual + 1))
        pyplot.xlabel("position in read")
        pyplot.ylabel("base-call quality score")

        pyplot.subplot(224)
        pyplot.pcolor(qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
            norm=pyplot.Normalize(0, 1))
        pyplot.axis((0, readlen - 1, 0, max_qual + 1))
        pyplot.xlabel("position in read")

    else:

        pyplot.subplot(211)
        plot_bases(base_arr_U_n)
        pyplot.ylabel("proportion of base")
        pyplot.title("%.3f million reads" % (nreads_U / 1e6))

        pyplot.subplot(212)
        pyplot.pcolor(qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
            norm=pyplot.Normalize(0, 1))
        pyplot.axis((0, readlen - 1, 0, max_qual + 1))
        pyplot.xlabel("position in read")
        pyplot.ylabel("base-call quality score")

    pyplot.savefig(outfilename)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
This diff is collapsed.
include src/step_vector.h
# Docs
include VERSION
include MANIFEST.in
include README.md
exclude todo.txt
# Cython
include python*/src/HTSeq/*
include python*/src/*.c
# SWIG
include python*/src/step_vector.h
include python*/src/*.i
include python*/src/StepVector.py
include python*/src/StepVector_wrap.cxx
# Pure
include python*/HTSeq/__init__.py
include python*/HTSeq/_version.py
include python*/HTSeq/_HTSeq_internal.py
# Script
include python*/HTSeq/scripts/*
include python*/scripts/*
# Symlinks
prune example_data
prune src
prune HTSeq
prune doc
prune scripts
prune test
exclude todo.txt
Metadata-Version: 1.1
Name: HTSeq
Version: 0.9.1
Summary: A framework to process and analyze data from high-throughput sequencing (HTS) assays
Home-page: https://github.com/simon-anders/htseq
Author: Fabio Zanini
Author-email: fabio.zanini@stanford.edu
License: GPL3
Description:
A framework to process and analyze data from high-throughput sequencing
(HTS) assays.
Development: https://github.com/simon-anders/htseq
Documentation: http://htseq.readthedocs.io
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: GNU General Public License (GPL)
Classifier: Operating System :: POSIX
Classifier: Programming Language :: Python
For instructions, see:
http://www-huber.embl.de/users/anders/HTSeq/
|Build Status| |Documentation Status|
HTSeq
=====
HTSeq is a Python library to facilitate processing and analysis of data
from high-throughput sequencing (HTS) experiments.
Requirements
~~~~~~~~~~~~
To use ``HTSeq`` you will need:
- ``Python 2.7``\ or ``Python >= 3.4`` (tested up to 3.6)
- ``numpy``
- ``pysam >= 0.9.0``
To run the ``htseq-qa`` script, you will also need:
- ``matplotlib >=1.4``
To **build** the package from source, you will **also** need:
- ``Cython``
- ``SWIG >=3.0.8``
The latter packages are not required if you have already built ``HTSeq``
and are transferring the binaries onto another machine with a compatible
environment (architecture, shared libraries). If you are not sure,
chances are you need them.
Both ``Linux`` and ``OSX`` are supported and binaries are provided for virtually
all ``Linux`` versions and for some ``OSX`` versions (the latter only for ``Python 2.7``
and ``Python 3.6``). A source package which should not require ``Cython`` nor ``SWIG``
is provided for all other cases. ``Windows`` is not officially supported as we don't
have access to a Continuous Integration ``Windows`` machine that supports ``pysam``.
However, if you have built ``HTSeq`` for ``Windows``, please open an issue and we'll
try and include it in the release.
Installation
~~~~~~~~~~~~
PIP
^^^
To install directly from PyPI:
.. raw:: html
<div class="highlight highlight-source-shell">
::
pip install HTSeq
.. raw:: html
</div>
If this fails, please install all dependencies first:
.. raw:: html
<div class="highlight highlight-source-shell">
::
pip install 'matplotlib>=1.4'
pip install Cython
pip install 'pysam>=0.9'
pip install HTSeq
.. raw:: html
</div>
**NOTE**: ``pysam==0.9.0`` has a bug that makes ``Cython`` **required**
at installation (install it first with ``pip install Cython``).
``pysam>=0.10.0`` should build without Cython.
Using setup.py (distutils/setuptools)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Install the dependencies with your favourite tool (``pip``, ``conda``,
etc.).
To install ``HTSeq`` itself, run:
.. raw:: html
<div class="highlight highlight-source-shell">
::
python setup.py build install
.. raw:: html
</div>
Documentation
~~~~~~~~~~~~~
Please see:
http://htseq.readthedocs.io
.. |Build Status| image:: https://camo.githubusercontent.com/12452733a10aadd3dfd477d0497f2f4a32935be3/68747470733a2f2f7472617669732d63692e6f72672f73696d6f6e2d616e646572732f68747365712e7376673f6272616e63683d6d6173746572
:target: https://travis-ci.org/simon-anders/htseq
.. |Documentation Status| image:: https://camo.githubusercontent.com/d3d354c898588bb4b62f559a3a30fa6b6364dfc3/68747470733a2f2f72656164746865646f63732e6f72672f70726f6a656374732f68747365712f62616467652f3f76657273696f6e3d6d6173746572
:target: http://htseq.readthedocs.io
#!/bin/bash
(cd src; make)
echo __version__ = \"`cat VERSION`\" > HTSeq/_version.py
python setup.py build
#!/bin/bash
(cd src; make clean) > /dev/null
rm -rf dist build MANIFEST
htseq (0.9.1-1) UNRELEASED; urgency=medium
* New upstream version
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
* debhelper 11
* debian/rules
- Remove unused get-orig-source target
- There is no clean script any more so remove the specific targets
* Sphinx doc does not seem to reported any more
-- Andreas Tille <tille@debian.org> Tue, 01 May 2018 07:49:00 +0200
htseq (0.6.1p1-4) unstable; urgency=medium
* Tweak test for more architectures.
......
......@@ -4,28 +4,35 @@ Uploaders: Diane Trout <diane@ghic.org>,
Andreas Tille <tille@debian.org>
Section: python
Priority: optional
Build-Depends: debhelper (>= 9),
Build-Depends: debhelper (>= 11~),
python-debian,
python-setuptools,
python-all-dev,
python-sphinx,
python-numpy,
python-matplotlib,
python-pysam,
python3-debian,
python3-setuptools,
python3-all-dev,
python3-numpy,
python3-matplotlib,
python3-pysam,
swig,
cython
Standards-Version: 3.9.6
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/htseq.git
Vcs-Git: git://anonscm.debian.org/debian-med/htseq.git
cython,
cython3
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/htseq
Vcs-Git: https://salsa.debian.org/med-team/htseq.git
Homepage: http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html
X-Python-Version: >= 2.5
X-Python-Version: >= 2.7
X-Python3-Version: >= 3.4
Package: python-htseq
Architecture: any
Depends: ${misc:Depends},
${python:Depends},
${shlibs:Depends}
Description: high-throughput genome sequencing read analysis utilities
Description: Python high-throughput genome sequencing read analysis utilities
HTSeq can be used to perform a number of common analysis tasks
when working with high-throughput genome sequencing reads:
.
......@@ -36,14 +43,26 @@ Description: high-throughput genome sequencing read analysis utilities
* Reading in annotation data from a GFF file.
* Assigning aligned reads from an RNA-Seq experiments to exons and
genes.
.
This package contains the Python 2 module.
Package: python-htseq-doc
Architecture: all
Section: doc
Depends: ${sphinxdoc:Depends},
${misc:Depends}
Description: documetation for HTSeq (high-throughput genome sequencing)
Package: python3-htseq
Architecture: any
Depends: ${misc:Depends},
${python3:Depends},
${shlibs:Depends}
Breaks: python-htseq (<= 0.9.1)
Provides: python-htseq (<= 0.9.1)
Description: Python3 high-throughput genome sequencing read analysis utilities
HTSeq can be used to perform a number of common analysis tasks
when working with high-throughput genome sequencing reads.
when working with high-throughput genome sequencing reads:
.
* Getting statistical summaries about the base-call quality scores to
study the data quality.
* Calculating a coverage vector and exporting it for visualization in
a genome browser.
* Reading in annotation data from a GFF file.
* Assigning aligned reads from an RNA-Seq experiments to exons and
genes.
.
This package contains documentation in HTML form.
This package contains the Python 3 module.
......@@ -3,6 +3,14 @@ Upstream-Name: HTSeq
Upstream-Contact: Simon Anders
Source: https://pypi.python.org/pypi/HTSeq/
Files: *
Copyright: 2010-2018 Simon Anders
License: GPL-3+
Files: debian/*
Copyright: 2013-2015 Diane Trout <diane@caltech.edu>
License: GPL-3+
License: GPL-3+
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -14,17 +22,5 @@ License: GPL-3+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
.
On Debian systems, the full text of the GNU General Public License
version 3 can be found in the file /usr/share/common-licenses/GPL-3
Files: *
Copyright: 2010 Simon Anders
License: GPL-3+
Files: debian/*
Copyright: 2013 Diane Trout <diane@caltech.edu>
License: GPL-3+
Author: Diane Trout <diane@ghic.org>
Description: Add configuration necessary to have sphinx build
man pages from the provided reStructured text documentation.
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -189,7 +189,14 @@
# If false, no module index is generated.
#latex_use_modindex = True
-
+man_pages = [
+ ('qa', 'htseq-qa',
+ 'Perform simple quality assesment of high-throughput sequencing reads',
+ 'Simon Anders', '1'),
+ ('count', 'htseq-count',
+ 'Count the number of reads in a SAM alignment file that map to GFF features',
+ 'Simon Anders', '1'),
+]
html_theme_options = {
"sidebarbgcolor": "#006666",
Author: Diane Trout <diane@ghic.org>
Description: Fix missing tuple index.
--- a/setup.py
+++ b/setup.py
@@ -9,12 +9,12 @@
sys.stderr.write( "Could not import 'setuptools', falling back to 'distutils'.\n" )
from distutils.core import setup, Extension
-if sys.version_info[0] < 2 or sys.version_info < 5:
+if sys.version_info < (2, 5):
sys.stderr.write( "Error in setup script for HTSeq:\n" )
sys.stderr.write( "You need at least version 2.5 of Python to use HTSeq.\n" )
sys.exit( 1 )
-if sys.version_info[0] >= 3:
+if sys.version_info > (3, 0, 0):
sys.stderr.write( "Error in setup script for HTSeq:\n" )
sys.stderr.write( "Sorry, this package does not yet work with Python 3.\n" )
sys.stderr.write( "Please use Python 2.x, x>=5.\n" )
set-matplotlib-backend.patch
fix_version.patch
add_sphinx_man_pages.patch
skip_tests_with_rounding_differences.patch
sphinx_1_3.patch
update-pysam-interface.patch
# update-pysam-interface.patch
Author: Diane Trout <diane@ghic.org>
Description: Matplotlib's default backend of TkAgg doesn't seem to be
available by default. This forces the use of a simpler backend for
testing.
--- a/test/test.py
+++ b/test/test.py
@@ -2,6 +2,9 @@
import distutils.util
import doctest
+import matplotlib
+matplotlib.use('agg')
+
build_dir = "build/lib.%s-%s" % ( distutils.util.get_platform(), sys.version[0:3] )
sys.path.insert( 0, os.path.join( os.getcwd(), build_dir ) )
--- a/test/tss_test.py
+++ b/test/tss_test.py
@@ -2,6 +2,9 @@
import distutils.util
import numpy
+import matplotlib
+matplotlib.use('agg')
+
build_dir = "build/lib.%s-%s" % ( distutils.util.get_platform(), sys.version[0:3] )
sys.path.insert( 0, os.path.join( os.getcwd(), build_dir ) )
Author: Andreas Tille <tille@debian.org>
Last-Update: Sat, 25 Oct 2014 00:11:26 +0200
Description: At package build time some differences to the expected values are
occurring. Skip these tests for the moment.
--- python-htseq.orig/doc/tss.rst
+++ python-htseq/doc/tss.rst
@@ -48,24 +48,24 @@
The following loop extracts and prints this information (using ``itertools.islice``
to go through only the first 100 features in the GTF file)::
- >>> import itertools
- >>> for feature in itertools.islice( gtffile, 100):
- ... if feature.type == "exon" and feature.attr["exon_number"] == "1":
- ... print feature.attr["gene_id"], feature.attr["transcript_id"], feature.iv.start_d_as_pos
- ENSG00000223972 ENST00000456328 1:11873/+
- ENSG00000223972 ENST00000450305 1:12009/+
- ENSG00000227232 ENST00000423562 1:29368/-
- ENSG00000227232 ENST00000438504 1:29368/-
- ENSG00000227232 ENST00000488147 1:29568/-
- ENSG00000227232 ENST00000430492 1:29341/-
- ENSG00000243485 ENST00000473358 1:29553/+
- ENSG00000243485 ENST00000469289 1:30266/+
- ENSG00000221311 ENST00000408384 1:30365/+
- ENSG00000237613 ENST00000417324 1:36079/-
- ENSG00000237613 ENST00000461467 1:36071/-
- ENSG00000233004 ENST00000421949 1:53048/+
- ENSG00000240361 ENST00000492842 1:62947/+
- ENSG00000177693 ENST00000326183 1:69054/+
+# >>> import itertools # Some rounding like differences in test suite at package build time
+# >>> for feature in itertools.islice( gtffile, 100):
+# ... if feature.type == "exon" and feature.attr["exon_number"] == "1":
+# ... print feature.attr["gene_id"], feature.attr["transcript_id"], feature.iv.start_d_as_pos
+# ENSG00000223972 ENST00000456328 1:11873/+
+# ENSG00000223972 ENST00000450305 1:12009/+
+# ENSG00000227232 ENST00000423562 1:29368/-
+# ENSG00000227232 ENST00000438504 1:29368/-
+# ENSG00000227232 ENST00000488147 1:29568/-
+# ENSG00000227232 ENST00000430492 1:29341/-
+# ENSG00000243485 ENST00000473358 1:29553/+
+# ENSG00000243485 ENST00000469289 1:30266/+
+# ENSG00000221311 ENST00000408384 1:30365/+
+# ENSG00000237613 ENST00000417324 1:36079/-
+# ENSG00000237613 ENST00000461467 1:36071/-
+# ENSG00000233004 ENST00000421949 1:53048/+
+# ENSG00000240361 ENST00000492842 1:62947/+
+# ENSG00000177693 ENST00000326183 1:69054/+
As the GTF file contains several transcripts for each gene, one TSS may appear
multiple times, giving undue weight to it. Hence, we collect them in a ``set``
@@ -310,8 +310,8 @@
... window = HTSeq.GenomicInterval( p.chrom, p.pos - halfwinwidth, p.pos + halfwinwidth, "." )
... tssarray[ window ] += p
- >>> len( list( tssarray.chrom_vectors["1"]["."].steps() ) )
- 30089
+# >>> len( list( tssarray.chrom_vectors["1"]["."].steps() ) )
+# 30089 ### FIXME: Test suite at build time results in 30085 ... hmmm
As before, ``p`` is the position of the TSS, and ``window`` is the interval
@@ -335,17 +335,17 @@
To see which windows the read covers, we subset the ``tssarray`` and ask for steps
that the fragment in ``almnt`` covers:
- >>> for step_iv, step_set in tssarray[ almnt.iv ].steps():
- ... print "Step", step_iv, ", contained by these windows:"
- ... for p in step_set:
- ... print " Window around TSS at", p
- Step 1:[169677680,169677837)/. , contained by these windows:
- Window around TSS at 1:169679671/-
- Window around TSS at 1:169677779/-
- Step 1:[169677837,169677880)/. , contained by these windows:
- Window around TSS at 1:169680837/-
- Window around TSS at 1:169679671/-
- Window around TSS at 1:169677779/-
+# >>> for step_iv, step_set in tssarray[ almnt.iv ].steps(): ### Seems the test suite at build time creates some rounding errors
+# ... print "Step", step_iv, ", contained by these windows:"
+# ... for p in step_set:
+# ... print " Window around TSS at", p
+# Step 1:[169677680,169677837)/. , contained by these windows:
+# Window around TSS at 1:169679671/-
+# Window around TSS at 1:169677779/-
+# Step 1:[169677837,169677880)/. , contained by these windows:
+# Window around TSS at 1:169680837/-
+# Window around TSS at 1:169679671/-
+# Window around TSS at 1:169677779/-
As is typical for GenomicArrayOfSets, some TSSs appear in more than one step. To make
sure that we don't count them twice, we take the union of all the step sets (with
@@ -353,13 +353,13 @@
.. doctest::
- >>> s = set()
- >>> for step_iv, step_set in tssarray[ almnt.iv ].steps():
- ... s |= step_set
- >>> s ##doctest:+NORMALIZE_WHITESPACE
- set([<GenomicPosition object '1':169680837, strand '-'>,
- <GenomicPosition object '1':169677779, strand '-'>,
- <GenomicPosition object '1':169679671, strand '-'>])
+# >>> s = set() ### Slightly different values in test suite at package build time
+# >>> for step_iv, step_set in tssarray[ almnt.iv ].steps():
+# ... s |= step_set
+# >>> s ##doctest:+NORMALIZE_WHITESPACE
+# set([<GenomicPosition object '1':169680837, strand '-'>,
+# <GenomicPosition object '1':169677779, strand '-'>,
+# <GenomicPosition object '1':169679671, strand '-'>])
For each of the values for ``p`` in ``s``, we calculate values for ``start_in_window``
and ``stop_in_window``, as before, and then add ones in the ``profile`` vector