Commits on Source (4)
Metadata-Version: 1.1
Metadata-Version: 2.1
Name: HTSeq
Version: 0.9.1
Version: 0.10.0
Summary: A framework to process and analyze data from high-throughput sequencing (HTS) assays
Home-page: https://github.com/simon-anders/htseq
Author: Fabio Zanini
Author-email: fabio.zanini@stanford.edu
Author: Simon Anders
Author-email: sanders@fs.tum.de
Maintainer: Fabio Zanini
Maintainer-email: fabio.zanini@stanford.edu
License: GPL3
Description:
A framework to process and analyze data from high-throughput sequencing
......@@ -21,3 +23,4 @@ Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: GNU General Public License (GPL)
Classifier: Operating System :: POSIX
Classifier: Programming Language :: Python
Provides-Extra: htseq-qa
htseq (0.10.0-1) unstable; urgency=medium
* New upstream version
-- Andreas Tille <tille@debian.org> Mon, 28 May 2018 17:26:34 +0200
htseq (0.9.1-1) unstable; urgency=medium
* New upstream version
......
......@@ -3,7 +3,10 @@
See htseq.readthedocs.io/en/master/index.html for documentation.
"""
import itertools, warnings, os, shlex
import itertools
import warnings
import os
import shlex
try:
from _HTSeq import *
......@@ -44,7 +47,7 @@ class FileOrSequence( object ):
def __iter__(self):
self.line_no = 1
if isinstance( self.fos, str ):
if isinstance(self.fos, basestring):
if self.fos.lower().endswith((".gz", ".gzip")):
lines = gzip.open(self.fos, 'rt')
else:
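For reference, the logic introduced in this hunk — choosing gzip.open over plain open based on the file suffix, in text mode — can be sketched in isolation as follows (a hypothetical standalone helper, not part of HTSeq itself):

    import gzip

    def open_maybe_gzipped(filename):
        # Mirrors the suffix check added above: gzip-compressed files are
        # opened in text mode, everything else with plain open().
        if filename.lower().endswith((".gz", ".gzip")):
            return gzip.open(filename, "rt")
        return open(filename)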
......@@ -83,7 +86,6 @@ class FileOrSequence( object ):
#########################
class GenomicFeature(object):
"""A genomic feature, i.e., an interval on a genome with metadata.
At minimum, the following information should be provided by slots:
......@@ -144,6 +146,7 @@ class GenomicFeature( object ):
_re_attr_main = re.compile("\s*([^\s\=]+)[\s=]+(.*)")
_re_attr_empty = re.compile("^\s*$")
def parse_GFF_attribute_string(attrStr, extra_return_first_value=False):
"""Parses a GFF attribute string and returns it as a dictionary.
......@@ -158,10 +161,10 @@ def parse_GFF_attribute_string( attrStr, extra_return_first_value=False ):
if _re_attr_empty.match(attr):
continue
if attr.count('"') not in (0, 2):
raise ValueError, "The attribute string seems to contain mismatched quotes."
raise ValueError("The attribute string seems to contain mismatched quotes.")
mo = _re_attr_main.match(attr)
if not mo:
raise ValueError, "Failure parsing GFF attribute line"
raise ValueError("Failure parsing GFF attribute line")
val = mo.group(2)
if val.startswith('"') and val.endswith('"'):
val = val[1:-1]
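For reference, the function whose raise statements are modernized here returns a plain dictionary of attribute names to values; a usage sketch, assuming it is called through the HTSeq namespace and fed a GTF-style attribute string:

    import HTSeq

    attrs = HTSeq.parse_GFF_attribute_string(
        'gene_id "ENSG00000223972"; gene_name "DDX11L1";')
    print(attrs["gene_id"], attrs["gene_name"])
    # With extra_return_first_value=True the function also returns the value
    # of the first attribute, which GFF_Reader uses as the feature name.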
......@@ -174,10 +177,11 @@ def parse_GFF_attribute_string( attrStr, extra_return_first_value=False ):
else:
return d
_re_gff_meta_comment = re.compile("##\s*(\S+)\s+(\S*)")
class GFF_Reader( FileOrSequence ):
class GFF_Reader(FileOrSequence):
"""Parse a GFF file
Pass the constructor either a file name or an iterator of lines of a
......@@ -192,7 +196,6 @@ class GFF_Reader( FileOrSequence ):
self.end_included = end_included
self.metadata = {}
def __iter__(self):
for line in FileOrSequence.__iter__(self):
if line == "\n":
......@@ -256,7 +259,6 @@ def make_feature_dict( feature_sequence ):
return res
#########################
## GenomicArray
#########################
......@@ -272,6 +274,7 @@ def read_chrom_lens( filename, delimiter="\t" ):
_re_fasta_header_line = re.compile(r'>\s*(\S+)\s*(.*)')
class FastaReader(FileOrSequence):
"""A Fasta_Reader is associated with a FASTA file or an open connection
to a file-like object with content in FASTA format.
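A minimal FastaReader sketch, assuming a file "genome.fa"; each record is an HTSeq.Sequence with a name and a sequence:

    import HTSeq

    for seq in HTSeq.FastaReader("genome.fa"):
        print(seq.name, len(seq.seq))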
......@@ -365,6 +368,7 @@ class FastaReader( FileOrSequence ):
else:
return ans[0].get_reverse_complement()
class FastqReader(FileOrSequence):
"""A Fastq object is associated with a FASTQ self.file. When an iterator
is requested from the object, the FASTQ file is read.
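A minimal FastqReader sketch, assuming a file "reads.fastq" with standard Phred-scaled qualities; each record is a SequenceWithQualities:

    import HTSeq

    for read in HTSeq.FastqReader("reads.fastq", qual_scale="phred"):
        print(read.name, read.seq[:10], read.qual[:10])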
......@@ -376,7 +380,7 @@ class FastqReader( FileOrSequence ):
FileOrSequence.__init__(self, file_)
self.qual_scale = qual_scale
if qual_scale not in ("phred", "solexa", "solexa-old"):
raise ValueError, "Illegal quality scale."
raise ValueError("Illegal quality scale.")
self.raw_iterator = raw_iterator
def __iter__(self):
......@@ -388,7 +392,8 @@ class FastqReader( FileOrSequence ):
qual = fin.next()
if qual == "":
if id1 != "":
warnings.warn( "Number of lines in FASTQ file is not "
warnings.warn(
"Number of lines in FASTQ file is not "
"a multiple of 4. Discarding the last, "
"incomplete record")
break
......@@ -412,6 +417,7 @@ class FastqReader( FileOrSequence ):
self.qual_scale)
yield s
class BowtieReader(FileOrSequence):
"""A BowtieFile object is associated with a Bowtie output file that
contains short read alignments. It can generate an iterator of Alignment
......@@ -428,6 +434,7 @@ class BowtieReader( FileOrSequence ):
RuntimeWarning)
yield algnt
def bundle_multiple_alignments(sequence_of_alignments):
"""Some alignment programs, e.g., Bowtie, can output multiple alignments,
i.e., the same read is reported consecutively with different alignments.
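As the docstring notes, the alignments for one read must appear consecutively; a hedged usage sketch, assuming a suitably ordered SAM file "hits.sam":

    import HTSeq

    alignments = HTSeq.SAM_Reader("hits.sam")
    for bundle in HTSeq.bundle_multiple_alignments(alignments):
        # each bundle is a list of alignments reported for the same read
        print(bundle[0].read.name, len(bundle))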
......@@ -537,6 +544,7 @@ class SolexaExportReader( FileOrSequence ):
start + len(fields['read_seq']), strand)
yield record
class SAM_Reader(FileOrSequence):
"""A SAM_Reader object is associated with a SAM file that
contains short read alignments. It can generate an iterator of Alignment
......@@ -554,8 +562,8 @@ class SAM_Reader( FileOrSequence ):
raise
yield algnt
class GenomicArrayOfSets( GenomicArray ):
class GenomicArrayOfSets(GenomicArray):
"""A GenomicArrayOfSets is a specialization of GenomicArray that allows to store
sets of objects. On construction, the step vectors are initialized with empty sets.
By using the 'add_value' method, objects can be added to intervals. If an object
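The 'add_value' idiom described above is usually spelled with '+=' on an interval; a minimal sketch, with chromosome names and coordinates chosen purely for illustration:

    import HTSeq

    gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    gas[HTSeq.GenomicInterval("chr1", 100, 300, ".")] += "gene_A"
    gas[HTSeq.GenomicInterval("chr1", 200, 400, ".")] += "gene_B"
    for iv, step_set in gas[HTSeq.GenomicInterval("chr1", 150, 350, ".")].steps():
        # the overlapping region reports {"gene_A", "gene_B"}
        print(iv, sorted(step_set))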
......@@ -871,7 +879,7 @@ class VCF_Reader( FileOrSequence ):
def meta_info(self, header_filename=None):
ret = []
if header_filename == None:
if header_filename is None:
the_iter = FileOrSequence.__iter__(self)
else:
the_iter = open(header_filename, "r")
......@@ -941,13 +949,15 @@ class WiggleReader( FileOrSequence ):
pos = int(tmp[0])
yield (GenomicInterval(chrom, pos, pos + span, '.'), float(tmp[1]))
class BAM_Reader(object):
def __init__( self, filename ):
def __init__(self, filename, check_sq=True):
global pysam
self.filename = filename
self.sf = None # This one is only used by __getitem__
self.record_no = -1
self.check_sq = check_sq
try:
import pysam
except ImportError:
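The new check_sq flag is stored here and later passed straight to pysam.Samfile; setting it to False allows opening BAM files that lack @SQ header lines, such as unaligned BAM output. A hedged usage sketch with an assumed file name:

    import HTSeq

    bam = HTSeq.BAM_Reader("unaligned.bam", check_sq=False)
    for aln in bam:
        print(aln.read.name, aln.aligned)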
......@@ -955,7 +965,7 @@ class BAM_Reader( object ):
raise
def __iter__(self):
sf = pysam.Samfile(self.filename, "rb")
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
self.record_no = 0
for pa in sf:
#yield SAM_Alignment.from_pysam_AlignedRead(pa, sf)
......@@ -963,7 +973,7 @@ class BAM_Reader( object ):
self.record_no += 1
def fetch(self, reference=None, start=None, end=None, region=None):
sf = pysam.Samfile(self.filename, "rb")
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
self.record_no = 0
try:
for pa in sf.fetch(reference, start, end, region):
......@@ -986,17 +996,17 @@ class BAM_Reader( object ):
def __getitem__(self, iv):
if not isinstance(iv, GenomicInterval):
raise TypeError, "Use a HTSeq.GenomicInterval to access regions within .bam-file!"
raise TypeError("Use a HTSeq.GenomicInterval to access regions within .bam-file!")
if self.sf is None:
self.sf = pysam.Samfile( self.filename, "rb" )
self.sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
# NOTE: pysam 0.9 renamed _hasIndex to has_index
if (hasattr(self.sf, '_hasIndex') and (not self.sf._hasIndex())) or (not self.sf.has_index()):
raise ValueError, "The .bam-file has no index, random-access is disabled!"
raise ValueError("The .bam-file has no index, random-access is disabled!")
for pa in self.sf.fetch(iv.chrom, iv.start+1, iv.end):
yield SAM_Alignment.from_pysam_AlignedRead(pa, self.sf)
def get_header_dict(self):
sf = pysam.Samfile(self.filename, "rb")
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
return sf.header
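Random access through __getitem__ still requires an index (a .bai file next to a coordinate-sorted BAM); a short sketch, with the file name and coordinates assumed for illustration:

    import HTSeq

    bam = HTSeq.BAM_Reader("sorted_indexed.bam")
    window = HTSeq.GenomicInterval("chr1", 100000, 101000, ".")
    for aln in bam[window]:
        print(aln.iv)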
......@@ -1014,7 +1024,14 @@ class BAM_Writer( object ):
self.referencelengths = referencelengths
self.text = text
self.header = header
self.sf = pysam.Samfile( self.filename, mode="wb", template = self.template, referencenames = self.referencenames, referencelengths = self.referencelengths, text = self.text, header = self.header )
self.sf = pysam.Samfile(
self.filename,
mode="wb",
template=self.template,
referencenames=self.referencenames,
referencelengths=self.referencelengths,
text=self.text,
header=self.header)
@classmethod
def from_BAM_Reader(cls, fn, br):
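The from_BAM_Reader classmethod shown here builds a writer whose header is copied from an existing reader; a hedged sketch with assumed file names and an arbitrary quality cutoff:

    import HTSeq

    bam_in = HTSeq.BAM_Reader("input.bam")
    bam_out = HTSeq.BAM_Writer.from_BAM_Reader("filtered.bam", bam_in)
    for aln in bam_in:
        if aln.aligned and aln.aQual >= 20:
            bam_out.write(aln)
    bam_out.close()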
......@@ -1039,13 +1056,12 @@ class BED_Reader( FileOrSequence ):
continue
fields = line.split()
if len(fields) < 3:
raise ValueError, "BED file line contains less than 3 fields"
raise ValueError("BED file line contains less than 3 fields")
if len(fields) > 9:
raise ValueError, "BED file line contains more than 9 fields"
raise ValueError("BED file line contains more than 9 fields")
iv = GenomicInterval(fields[0], int(fields[1]), int(fields[2]), fields[5] if len(fields) > 5 else ".")
f = GenomicFeature(fields[3] if len(fields) > 3 else "unnamed", "BED line", iv)
f.score = float(fields[4]) if len(fields) > 4 else None
f.thick = GenomicInterval(iv.chrom, int(fields[6]), int(fields[7]), iv.strand) if len(fields) > 7 else None
f.itemRgb = [int(a) for a in fields[8].split(",")] if len(fields) > 8 else None
yield(f)
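For reference, the BED_Reader patched here yields GenomicFeature objects whose optional fields (score, thick, itemRgb) default to None when the corresponding columns are absent; a minimal usage sketch with an assumed file name:

    import HTSeq

    for feature in HTSeq.BED_Reader("regions.bed"):
        print(feature.name, feature.iv, feature.score)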
__version__ = "0.9.1"
\ No newline at end of file
__version__ = "0.10.0"
\ No newline at end of file
......@@ -87,6 +87,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write("%d GFF lines processed.\n" % i)
sys.stderr.flush()
except:
sys.stderr.write(
"Error occured when processing GFF file (%s):\n" %
......@@ -95,6 +96,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
if not quiet:
sys.stderr.write("%d GFF lines processed.\n" % i)
sys.stderr.flush()
if len(counts) == 0:
sys.stderr.write(
......@@ -156,6 +158,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
sys.stderr.write(
"%d SAM alignment record%s processed.\n" %
(i, "s" if not pe_mode else " pairs"))
sys.stderr.flush()
i += 1
if not pe_mode:
......@@ -298,6 +301,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
sys.stderr.write(
"%d SAM %s processed.\n" %
(i, "alignments " if not pe_mode else "alignment pairs"))
sys.stderr.flush()
if samoutfile is not None:
samoutfile.close()
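The sys.stderr.flush() calls added throughout count.py make the progress messages appear promptly when stderr is block-buffered, for example when htseq-count runs inside a pipeline or under a batch scheduler. The pattern in isolation, using a hypothetical helper name:

    import sys

    def report_progress(n, what):
        # Flushing after each message keeps progress visible even when
        # stderr is not attached to a terminal.
        sys.stderr.write("%d %s processed.\n" % (n, what))
        sys.stderr.flush()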
......
......@@ -576,7 +576,9 @@ cdef class GenomicArray(object):
self.storage, self.memmap_dir)
else:
self.chrom_vectors[chrom] = {
strand_nostrand: ChromVector.create(iv, self.typecode, self.storage)}
strand_nostrand: ChromVector.create(iv, self.typecode,
self.storage,
self.memmap_dir)}
def __reduce__(self):
return (_GenomicArray_unpickle, (self.stranded, self.typecode, self.chrom_vectors))
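This hunk fixes the unstranded code path so that memmap_dir is also forwarded to ChromVector.create; a sketch of the configuration the fix affects, with the chromosome length and directory chosen for illustration (the directory must already exist):

    import HTSeq

    ga = HTSeq.GenomicArray({"chr1": 1000000}, stranded=False,
                            typecode="i", storage="memmap",
                            memmap_dir="/tmp/htseq_vectors")
    ga[HTSeq.GenomicInterval("chr1", 100, 200, ".")] = 5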
......
......@@ -993,11 +993,12 @@ class WiggleReader(FileOrSequence):
class BAM_Reader(object):
def __init__(self, filename):
def __init__(self, filename, check_sq=True):
global pysam
self.filename = filename
self.sf = None # This one is only used by __getitem__
self.record_no = -1
self.check_sq = check_sq
try:
import pysam
except ImportError:
......@@ -1006,7 +1007,7 @@ class BAM_Reader(object):
raise
def __iter__(self):
sf = pysam.Samfile(self.filename, "rb")
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
self.record_no = 0
for pa in sf:
# yield SAM_Alignment.from_pysam_AlignedRead( pa, sf )
......@@ -1014,7 +1015,7 @@ class BAM_Reader(object):
self.record_no += 1
def fetch(self, reference=None, start=None, end=None, region=None):
sf = pysam.Samfile(self.filename, "rb")
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
self.record_no = 0
try:
for pa in sf.fetch(reference, start, end, region):
......@@ -1041,7 +1042,7 @@ class BAM_Reader(object):
raise TypeError(
"Use a HTSeq.GenomicInterval to access regions within .bam-file!")
if self.sf is None:
self.sf = pysam.Samfile(self.filename, "rb")
self.sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
# NOTE: pysam 0.9 renamed _hasIndex to has_index
if (hasattr(self.sf, '_hasIndex') and (not self.sf._hasIndex())) or (not self.sf.has_index()):
raise ValueError(
......@@ -1050,7 +1051,7 @@ class BAM_Reader(object):
yield SAM_Alignment.from_pysam_AlignedRead(pa, self.sf)
def get_header_dict(self):
sf = pysam.Samfile(self.filename, "rb")
sf = pysam.Samfile(self.filename, "rb", check_sq=self.check_sq)
return sf.header
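fetch() forwards its arguments to pysam and, like __getitem__, requires an index; a short sketch with an assumed file name and coordinates:

    import HTSeq

    bam = HTSeq.BAM_Reader("sorted_indexed.bam")
    for aln in bam.fetch(reference="chr1", start=10000, end=20000):
        print(aln.read.name, aln.iv)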
......
__version__ = "0.9.1"
\ No newline at end of file
__version__ = "0.10.0"
\ No newline at end of file
......@@ -87,6 +87,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
i += 1
if i % 100000 == 0 and not quiet:
sys.stderr.write("%d GFF lines processed.\n" % i)
sys.stderr.flush()
except:
sys.stderr.write(
"Error occured when processing GFF file (%s):\n" %
......@@ -95,6 +96,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
if not quiet:
sys.stderr.write("%d GFF lines processed.\n" % i)
sys.stderr.flush()
if len(counts) == 0:
sys.stderr.write(
......@@ -156,6 +158,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
sys.stderr.write(
"%d SAM alignment record%s processed.\n" %
(i, "s" if not pe_mode else " pairs"))
sys.stderr.flush()
i += 1
if not pe_mode:
......@@ -298,6 +301,7 @@ def count_reads_in_features(sam_filenames, gff_filename,
sys.stderr.write(
"%d SAM %s processed.\n" %
(i, "alignments " if not pe_mode else "alignment pairs"))
sys.stderr.flush()
if samoutfile is not None:
samoutfile.close()
......
......@@ -579,7 +579,9 @@ cdef class GenomicArray(object):
self.storage, self.memmap_dir)
else:
self.chrom_vectors[chrom] = {
strand_nostrand: ChromVector.create(iv, self.typecode, self.storage)}
strand_nostrand: ChromVector.create(iv, self.typecode,
self.storage,
self.memmap_dir)}
def __reduce__(self):
return (_GenomicArray_unpickle, (self.stranded, self.typecode, self.chrom_vectors))
......@@ -1513,11 +1515,18 @@ cdef class SAM_Alignment(AlignmentWithSequenceReversal):
else:
cigar = "*"
return '\t'.join((self.read.name, str(self.flag), query_start.chrom,
str(query_start.start +
1), str(self.aQual), cigar, mate_start.chrom,
str(mate_start.pos + 1), str(self.inferred_insert_size),
self.read_as_aligned.seq, self.read_as_aligned.qualstr,
return '\t'.join(
(self.read.name,
str(self.flag),
query_start.chrom,
str(query_start.start + 1),
str(self.aQual),
cigar,
mate_start.chrom,
str(mate_start.pos + 1),
str(self.inferred_insert_size),
self.read_as_aligned.seq.decode(),
self.read_as_aligned.qualstr.decode(),
'\t'.join(self.raw_optional_fields())))
def optional_field(SAM_Alignment self, str tag):
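With seq and qualstr now decoded, get_sam_line() returns a plain str that can be written directly to a text-mode SAM file; a hedged sketch with assumed file names:

    import HTSeq

    with open("subset.sam", "w") as out:
        for aln in HTSeq.BAM_Reader("example.bam"):
            out.write(aln.get_sam_line() + "\n")
            break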
......
[egg_info]
tag_build =
tag_date = 0
tag_svn_revision = 0