Skip to content
Commits on Source (6)
......@@ -7,9 +7,7 @@ python:
- '3.6'
- '3.5'
- '3.4'
- '3.3'
- '2.7'
- '2.6'
- 'pypy'
- 'pypy3'
install:
......
|Travis| |PyPI| |Landscape| |Coverage| |Depsy| |Appveyor|
|Travis| |PyPI| |Coverage| |Depsy|
Description
-----------
......@@ -54,7 +54,8 @@ Acts like a dictionary.
.. code:: python
>>> genes.keys() ('AB821309.1', 'KF435150.1', 'KF435149.1', 'NR_104216.1', 'NR_104215.1', 'NR_104212.1', 'NM_001282545.1', 'NM_001282543.1', 'NM_000465.3', 'NM_001282549.1', 'NM_001282548.1', 'XM_005249645.1', 'XM_005249644.1', 'XM_005249643.1', 'XM_005249642.1', 'XM_005265508.1', 'XM_005265507.1', 'XR_241081.1', 'XR_241080.1', 'XR_241079.1')
>>> genes.keys()
('AB821309.1', 'KF435150.1', 'KF435149.1', 'NR_104216.1', 'NR_104215.1', 'NR_104212.1', 'NM_001282545.1', 'NM_001282543.1', 'NM_000465.3', 'NM_001282549.1', 'NM_001282548.1', 'XM_005249645.1', 'XM_005249644.1', 'XM_005249643.1', 'XM_005249642.1', 'XM_005265508.1', 'XM_005265507.1', 'XR_241081.1', 'XR_241080.1', 'XR_241079.1')
>>> genes['NM_001282543.1'][200:230]
>NM_001282543.1:201-230
......@@ -544,7 +545,7 @@ Support for compressed FASTA
----------------------------
``pyfaidx`` can create and read ``.fai`` indices for FASTA files that have
been compressed using the `bgzip <http://www.htslib.org/doc/tabix.html>`_
been compressed using the `bgzip <https://www.htslib.org/doc/bgzip.html>`_
tool from `samtools <http://www.htslib.org/>`_. ``bgzip`` writes compressed
data in a ``BGZF`` format. ``BGZF`` is ``gzip`` compatible, consisting of
multiple concatenated ``gzip`` blocks, each with an additional ``gzip``
......
python-pyfaidx (0.5.7-1) unstable; urgency=medium
* Team upload.
* New upstream version
* Standards-Version: 4.4.1
* Set upstream metadata fields: Repository, Repository-Browse.
-- Steffen Moeller <moeller@debian.org> Wed, 01 Jan 2020 19:58:16 +0100
python-pyfaidx (0.5.5.2-2) unstable; urgency=medium
* Drop Python2 support
......
......@@ -13,7 +13,7 @@ Build-Depends: debhelper-compat (= 12),
python3-numpy,
python3-six,
python3-mock
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/python-pyfaidx
Vcs-Git: https://salsa.debian.org/med-team/python-pyfaidx.git
Homepage: https://github.com/mdshw5/pyfaidx
......
......@@ -13,3 +13,5 @@ Reference:
Registry:
- Name: bio.tools
Entry: pyfaidx
Repository: https://github.com/mdshw5/pyfaidx
Repository-Browse: https://github.com/mdshw5/pyfaidx
......@@ -25,7 +25,7 @@ if sys.version_info > (3, ):
dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
__version__ = '0.5.5.2'
__version__ = '0.5.7'
class KeyFunctionError(ValueError):
......@@ -123,7 +123,7 @@ class Sequence(object):
>chr1
AC
"""
if self.start is None or self.end is None:
if self.start is None or self.end is None or len(self.seq) == 0:
correction_factor = 0
elif len(
self.seq
......@@ -461,7 +461,7 @@ class Faidx(object):
rname, rlen, offset, lenc, lenb = line.split('\t')
rlen, offset, lenc, lenb = map(int,
(rlen, offset, lenc, lenb))
newlines = int(ceil(rlen / lenc) * (lenb - lenc))
newlines = int(ceil(rlen / lenc) * (lenb - lenc)) if lenc else 0
bend = offset + newlines + rlen
rec = IndexRecord(rlen, offset, lenc, lenb, bend,
prev_bend)
......@@ -508,8 +508,8 @@ class Faidx(object):
rname = None # reference sequence name
offset = 0 # binary offset of end of current line
rlen = 0 # reference character length
blen = None # binary line length (includes newline)
clen = None # character line length
blen = 0 # binary line length (includes newline)
clen = 0 # character line length
bad_lines = [] # lines > || < than blen
thisoffset = offset
valid_entry = False
......@@ -535,9 +535,9 @@ class Faidx(object):
"Inconsistent line found in >{0} at "
"line {1:n}.".format(
rname, bad_lines[0][0] + 1))
blen = None
blen = 0
rlen = 0
clen = None
clen = 0
bad_lines = []
try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258)
rname = line.rstrip('\n\r')[1:].split()[
......@@ -648,8 +648,8 @@ class Faidx(object):
# Calculate offset (https://github.com/samtools/htslib/blob/20238f354894775ed22156cdd077bc0d544fa933/faidx.c#L398)
newlines_before = int(
(start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 else 0
newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc))
(start0 - 1) / i.lenc * (i.lenb - i.lenc)) if start0 > 0 and i.lenc else 0
newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc)) if i.lenc else 0
newlines_inside = newlines_to_end - newlines_before
seq_blen = newlines_inside + seq_len
bstart = i.offset + newlines_before + start0
......@@ -669,12 +669,15 @@ class Faidx(object):
else:
self.file.seek(bstart)
# If the requested sequence exceeds len(FastaRecord), return as much as possible
if bstart + seq_blen > i.bend and not self.strict_bounds:
seq_blen = i.bend - bstart
# Otherwise it should be safe to read the sequence
if seq_blen > 0:
seq = self.file.read(seq_blen).decode()
elif seq_blen <= 0 and not self.strict_bounds:
# If the requested sequence length is zero or negative, we will pad the empty string with default_seq.
# This was changed to support #155 with strict_bounds=True.
elif seq_blen <= 0:
seq = ''
if not internals:
......@@ -994,13 +997,9 @@ class Fasta(object):
sequence_always_upper=sequence_always_upper,
rebuild=rebuild,
build_index=build_index)
self.keys = self.faidx.index.keys
if not self.mutable:
self.records = dict(
[(rname, FastaRecord(rname, self)) for rname in self.keys()])
elif self.mutable:
self.records = dict([(rname, MutableFastaRecord(rname, self))
for rname in self.keys()])
_record_constructor = MutableFastaRecord if self.mutable else FastaRecord
self.records = OrderedDict([(rname, _record_constructor(rname, self)) for rname in self.faidx.index.keys()])
def __contains__(self, rname):
"""Return True if genome contains record."""
......@@ -1057,6 +1056,15 @@ class Fasta(object):
# len(Sequence.seq) != end - start
return Sequence(name=name, seq=seq, start=None, end=None)
def keys(self):
    """Return the keys of ``self.records`` (the names the records are stored under)."""
    records = self.records
    return records.keys()
def values(self):
    """Return the values of ``self.records`` (the stored record objects)."""
    records = self.records
    return records.values()
def items(self):
    """Return the ``(name, record)`` pairs of ``self.records``."""
    records = self.records
    return records.items()
def close(self):
    """Call ``self.__exit__()`` with no arguments.

    NOTE(review): what is actually released is decided by ``__exit__``,
    which is not visible from here -- presumably the underlying file
    handle; confirm against its definition.
    """
    self.__exit__()
......@@ -1259,6 +1267,21 @@ def check_bad_lines(rname, bad_lines, i):
"Please report this issue at https://github.com/mdshw5/pyfaidx/issues " + \
str(bad_lines))
def get_valid_filename(s):
    """
    Convert *s* into a string that can be used as a clean filename.

    Adapted from https://github.com/django/django/blob/efc3e32d6d7fb9bb41be73b80c8607b653c1fbd6/django/utils/text.py#L222-L232

    The value is passed through ``str()``, leading and trailing whitespace
    is stripped, each remaining space character becomes an underscore, and
    every character that is not a word character, dash, or dot is dropped.

    >>> get_valid_filename("HPV16_144-1.fa")
    'HPV16_144-1.fa'
    >>> get_valid_filename("chromosome 6.fa")
    'chromosome_6.fa'
    """
    trimmed = str(s).strip()
    underscored = trimmed.replace(' ', '_')
    # Whitelist approach: anything outside [-\w.] is removed, not replaced.
    return re.sub(r'(?u)[^-\w.]', '', underscored)
if __name__ == "__main__":
import doctest
......
......@@ -3,11 +3,9 @@ import argparse
import sys
import os.path
import re
from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split
from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split, get_valid_filename
from collections import defaultdict
keepcharacters = (' ', '.', '_')
def write_sequence(args):
_, ext = os.path.splitext(args.fasta)
if ext:
......@@ -36,7 +34,7 @@ def write_sequence(args):
continue
if args.split_files: # open output file based on sequence name
filename = '.'.join(str(e) for e in (name, start, end, ext) if e)
filename = ''.join(c for c in filename if c.isalnum() or c in keepcharacters)
filename = get_valid_filename(filename)
outfile = open(filename, 'w')
elif args.out:
outfile = args.out
......
......@@ -36,12 +36,11 @@ setup(
"Intended Audience :: Science/Research",
"Natural Language :: English",
"Operating System :: Unix",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 2.6",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
......
......@@ -46,6 +46,7 @@ class TestFastaRecord(TestCase):
long_names = []
for record in fasta:
long_names.append(record.long_name)
print(tuple(zip(deflines, long_names)))
assert deflines == long_names
def test_issue_62(self):
......
......@@ -6,6 +6,53 @@ from unittest import TestCase
path = os.path.dirname(__file__)
os.chdir(path)
class TestFeatureZeroLength:
    """Tests for handling zero-length entries, added in #155"""

    def setUp(self):
        # Records B and C consist of a defline with no sequence lines, and
        # the file deliberately ends without a trailing newline.
        with open('data/zero_length.fasta', 'w') as fasta:
            fasta.write(">A\nATCG\n>B\n>C\n>D\nGTA\nGC")

    def tearDown(self):
        # Remove each fixture independently and tolerate a missing file:
        # if Fasta() fails before the .fai index is written, an unguarded
        # os.remove() would raise here and hide the real test failure.
        for fixture in ('data/zero_length.fasta',
                        'data/zero_length.fasta.fai'):
            try:
                os.remove(fixture)
            except EnvironmentError:
                pass

    def test_index_zero_length(self):
        # Passes as long as constructing Fasta over the file does not raise.
        fasta = Fasta('data/zero_length.fasta')

    def test_fetch_zero_length(self):
        # Fetching a record that has no sequence lines must give an empty string.
        fasta = Fasta('data/zero_length.fasta')
        b = fasta["B"]
        assert str(b) == ''
class TestZeroLengthSequenceSubRange(TestCase):
    # Slices an empty range ([100:100]) out of a record from data/genes.fasta
    # with strict_bounds=True and expects an empty sequence back.

    def setUp(self):
        # No fixture preparation is required.
        pass

    def tearDown(self):
        index_path = 'data/genes.fasta.fai'
        try:
            os.remove(index_path)
        except EnvironmentError:
            pass  # some tests may delete this file

    def test_as_raw_zero_length_subsequence(self):
        # With as_raw=True the slice result is compared directly against ''.
        genes = Fasta('data/genes.fasta', as_raw=True, strict_bounds=True)
        empty_slice = genes['gi|557361099|gb|KF435150.1|'][100:100]
        assert empty_slice == ''

    def test_zero_length_subsequence(self):
        # Without as_raw the result's .seq attribute carries the string.
        genes = Fasta('data/genes.fasta', strict_bounds=True)
        empty_slice = genes['gi|557361099|gb|KF435150.1|'][100:100]
        assert empty_slice.seq == ''
class TestFeatureBoundsCheck:
def setUp(self):
pass
......