Skip to content
Commits on Source (5)
......@@ -11,8 +11,9 @@ python:
- 'pypy'
- 'pypy3'
install:
- pip wheel -f wheelhouse coverage biopython cython pysam pyvcf numpy || true
- pip install -f wheelhouse biopython cython pysam pyfasta coverage pyvcf numpy || true
- pip wheel -f wheelhouse cython pysam numpy || true
- pip install -f wheelhouse cython pysam pyfasta coverage pyvcf numpy || true
- pip install -f wheelhouse -e git+https://github.com/biopython/biopython.git#egg=biopython || true
- python setup.py install
- if [ ! -f samtools-1.2 ]; then curl -sL https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 | tar -xjv; fi
- cd samtools-1.2
......
......@@ -367,7 +367,7 @@ cli script: faidx
-x, --split-files write each region to a separate file (names are derived from regions)
-l, --lazy fill in --default-seq for missing ranges. default: False
-s DEFAULT_SEQ, --default-seq DEFAULT_SEQ
default base for missing positions and masking. default: N
default base for missing positions and masking. default: None
-d DELIMITER, --delimiter DELIMITER
delimiter for splitting names to multiple values (duplicate names will be discarded). default: None
-e HEADER_FUNCTION, --header-function HEADER_FUNCTION
......
python-pyfaidx (0.5.5.1-1) UNRELEASED; urgency=medium
* New upstream version
* Standards-Version: 4.2.1
* Secure URI in copyright format
-- Andreas Tille <tille@debian.org> Thu, 25 Oct 2018 20:30:14 +0200
python-pyfaidx (0.5.4-1) unstable; urgency=medium
* New upstream version
......
......@@ -20,7 +20,7 @@ Build-Depends: debhelper (>= 11~),
python3-numpy,
python3-six,
python3-mock
Standards-Version: 4.1.4
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/python-pyfaidx
Vcs-Git: https://salsa.debian.org/med-team/python-pyfaidx.git
Homepage: https://github.com/mdshw5/pyfaidx
......
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: pyfaidx
Upstream-Contact: Matthew Shirley <mdshw5@gmail.com>
Source: https://github.com/mdshw5/pyfaidx
......
......@@ -25,7 +25,7 @@ if sys.version_info > (3, ):
dna_bases = re.compile(r'([ACTGNactgnYRWSKMDVHBXyrwskmdvhbx]+)')
__version__ = '0.5.4'
__version__ = '0.5.5.1'
class KeyFunctionError(ValueError):
......@@ -335,9 +335,13 @@ class Faidx(object):
# Only try to import Bio if we actually need the bgzf reader.
try:
from Bio import bgzf
from Bio import __version__ as bgzf_version
from distutils.version import LooseVersion
if LooseVersion(bgzf_version) < LooseVersion('1.73'):
raise ImportError
except ImportError:
raise ImportError(
"BioPython must be installed to read gzipped files.")
"BioPython >= 1.73 must be installed to read block gzip files.")
else:
self._fasta_opener = bgzf.open
self._bgzf = True
......@@ -499,7 +503,7 @@ class Faidx(object):
def build_index(self):
try:
with self._fasta_opener(self.filename, 'r') as fastafile:
with self._fasta_opener(self.filename, 'rb') as fastafile:
with open(self.indexname, 'w') as indexfile:
rname = None # reference sequence name
offset = 0 # binary offset of end of current line
......@@ -508,10 +512,11 @@ class Faidx(object):
clen = None # character line length
bad_lines = [] # lines > || < than blen
thisoffset = offset
valid_entry = False
lastline = None
for i, line in enumerate(fastafile):
line_blen = len(line)
line = line.decode()
line_clen = len(line.rstrip('\n\r'))
lastline = i
# write an index line
......@@ -557,6 +562,12 @@ class Faidx(object):
offset += line_blen
rlen += line_clen
# check that we find at least 1 valid FASTA record
if not valid_entry:
raise FastaIndexingError(
"The FASTA file %s does not contain a valid sequence. "
"Check that sequence definition lines start with '>'." % self.filename)
# write the final index line, if there is one.
if lastline is not None:
valid_entry = check_bad_lines(
......@@ -667,7 +678,7 @@ class Faidx(object):
seq = ''
if not internals:
return seq.replace('\n', '')
return seq.replace('\n', '').replace('\r', '')
else:
return (seq, locals())
......@@ -709,14 +720,20 @@ class Faidx(object):
)
elif len(seq) == len(file_seq) - internals['newlines_inside']:
line_len = internals['i'].lenc
if '\r\n' in file_seq:
newline_char = '\r\n'
elif '\r' in file_seq:
newline_char = '\r'
else:
newline_char = '\n'
self.file.seek(internals['bstart'])
if internals['newlines_inside'] == 0:
self.file.write(seq.encode())
elif internals['newlines_inside'] > 0:
n = 0
m = file_seq.index('\n')
m = file_seq.index(newline_char)
while m < len(seq):
self.file.write(''.join([seq[n:m], '\n']).encode())
self.file.write(''.join([seq[n:m], newline_char]).encode())
n = m
m += line_len
self.file.write(seq[n:].encode())
......
......@@ -12,17 +12,17 @@ def write_sequence(args):
_, ext = os.path.splitext(args.fasta)
if ext:
ext = ext[1:] # remove the dot from extension
filt_function = re.compile(args.regex).search
if args.invert_match:
filt_function = lambda x: not re.compile(args.regex).search(x)
fasta = Fasta(args.fasta, default_seq=args.default_seq, key_function=eval(args.header_function), strict_bounds=not args.lazy, split_char=args.delimiter, filt_function=filt_function, read_long_names=args.long_names, rebuild=not args.no_rebuild)
regions_to_fetch, split_function = split_regions(args)
if not regions_to_fetch:
regions_to_fetch = fasta.keys()
if args.invert_match:
sequences_to_exclude = set([split_function(region)[0] for region in regions_to_fetch])
fasta = Fasta(args.fasta, default_seq=args.default_seq, key_function=eval(args.header_function), strict_bounds=not args.lazy, split_char=args.delimiter, rebuild=not args.no_rebuild)
regions_to_fetch = (key for key in fasta.keys() if key not in sequences_to_exclude)
split_function = ucsc_split
header = False
for region in regions_to_fetch:
......@@ -163,11 +163,11 @@ def main(ext_args=None):
header.add_argument('-t', '--no-coords', action="store_true", default=False, help="omit coordinates (e.g. chr:start-end) from output headers. default: %(default)s")
output.add_argument('-x', '--split-files', action="store_true", default=False, help="write each region to a separate file (names are derived from regions)")
output.add_argument('-l', '--lazy', action="store_true", default=False, help="fill in --default-seq for missing ranges. default: %(default)s")
output.add_argument('-s', '--default-seq', type=check_seq_length, default='N', help='default base for missing positions and masking. default: %(default)s')
output.add_argument('-s', '--default-seq', type=check_seq_length, default=None, help='default base for missing positions and masking. default: %(default)s')
header.add_argument('-d', '--delimiter', type=str, default=None, help='delimiter for splitting names to multiple values (duplicate names will be discarded). default: %(default)s')
header.add_argument('-e', '--header-function', type=str, default='lambda x: x.split()[0]', help='python function to modify header lines e.g: "lambda x: x.split("|")[0]". default: %(default)s')
header.add_argument('-u', '--duplicates-action', type=str, default="stop", choices=("stop", "first", "last", "longest", "shortest"), help='entry to take when duplicate sequence names are encountered. default: %(default)s')
matcher = header.add_mutually_exclusive_group()
matcher = parser.add_argument_group('matching arguments')
matcher.add_argument('-g', '--regex', type=str, default='.*', help='selected sequences are those matching regular expression. default: %(default)s')
matcher.add_argument('-v', '--invert-match', action="store_true", default=False, help="selected sequences are those not matching 'regions' argument. default: %(default)s")
masking = output.add_mutually_exclusive_group()
......@@ -199,7 +199,10 @@ def main(ext_args=None):
def check_seq_length(value):
    """Validate the value supplied for ``--default-seq``.

    Serves as the argparse ``type=`` callable for the ``-s/--default-seq``
    option.

    Args:
        value: The option value, or ``None`` when no default base is set.

    Returns:
        ``value`` unchanged when it is ``None`` or exactly one character long.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not ``None`` and its
            length is not exactly 1.
    """
    if value is None:
        # The option's default value: nothing to validate.
        return value
    if len(value) != 1:
        # Anything other than exactly one character is rejected.
        raise argparse.ArgumentTypeError("--default-seq value must be a single character!")
    return value
......
This diff is collapsed.
......@@ -50,3 +50,11 @@ class TestCLI(TestCase):
print(noto_strand.read())
self.assertTrue(filecmp.cmp(auto_strand.name, noto_strand.name))
def test_regexp(self):
    """ Smoke test: call main() on data/genes.fasta with '-g XR'.

    Makes no assertions; the test only fails if main() raises.
    """
    main(['data/genes.fasta', '-g', 'XR'])
def test_not_regexp(self):
    """ Smoke test: call main() on data/genes.fasta with '-g XR' plus '-v'.

    Makes no assertions; the test only fails if main() raises.
    """
    main(['data/genes.fasta', '-g', 'XR','-v'])
def test_not_regexp_multi(self):
    """ Smoke test: call main() with '-g' given twice ('XR', 'XM') plus '-v'.

    Makes no assertions; the test only fails if main() raises.
    """
    # NOTE(review): presumably only the last '-g' pattern ('XM') takes effect
    # unless the option is declared with an append action -- confirm against
    # the parser setup in main().
    main(['data/genes.fasta', '-g', 'XR', '-g', 'XM', '-v'])
......@@ -55,6 +55,33 @@ class TestIndexing(TestCase):
result_index = open(index_file).read()
assert result_index == expect_index
def test_build_issue_141(self):
    """ Compare the index reported for data/issue_141.fasta with the expected .fai text.

    The index file is read through a context manager so the handle is
    closed, and it is removed afterwards even if reading fails.
    """
    expect_index = ("gi|563317589|dbj|AB821309.1| 3510 115 70 72\n"
                    "gi|557361099|gb|KF435150.1| 481 3842 70 72\n"
                    "gi|557361097|gb|KF435149.1| 642 4429 70 72\n"
                    "gi|543583796|ref|NR_104216.1| 4573 5213 70 72\n"
                    "gi|543583795|ref|NR_104215.1| 5317 10040 70 72\n"
                    "gi|543583794|ref|NR_104212.1| 5374 15631 70 72\n"
                    "gi|543583788|ref|NM_001282545.1| 4170 21274 70 72\n"
                    "gi|543583786|ref|NM_001282543.1| 5466 25679 70 72\n"
                    "gi|543583785|ref|NM_000465.3| 5523 31415 70 72\n"
                    "gi|543583740|ref|NM_001282549.1| 3984 37211 70 72\n"
                    "gi|543583738|ref|NM_001282548.1| 4113 41424 70 72\n"
                    "gi|530384540|ref|XM_005249645.1| 2752 45784 70 72\n"
                    "gi|530384538|ref|XM_005249644.1| 3004 48745 70 72\n"
                    "gi|530384536|ref|XM_005249643.1| 3109 51964 70 72\n"
                    "gi|530384534|ref|XM_005249642.1| 3097 55292 70 72\n"
                    "gi|530373237|ref|XM_005265508.1| 2794 58640 70 72\n"
                    "gi|530373235|ref|XM_005265507.1| 2848 61675 70 72\n"
                    "gi|530364726|ref|XR_241081.1| 1009 64742 70 72\n"
                    "gi|530364725|ref|XR_241080.1| 4884 65918 70 72\n"
                    "gi|530364724|ref|XR_241079.1| 2819 71079 70 72\n")
    index_file = Faidx('data/issue_141.fasta').indexname
    try:
        # Close the handle deterministically instead of relying on GC.
        with open(index_file) as index_handle:
            result_index = index_handle.read()
    finally:
        # Remove the exact file that was read so later tests start clean.
        os.remove(index_file)
    print(result_index)
    assert result_index == expect_index
def test_build_issue_111(self):
expect_index = ("gi|563317589|dbj|AB821309 3510 114 70 71\n"
"gi|557361099|gb|KF435150 481 3789 70 71\n"
......@@ -295,3 +322,18 @@ class TestIndexing(TestCase):
""" Ensure that index file is not built when build_index=False. See mdshw5/pyfaidx#134.
"""
faidx = Faidx('data/genes.fasta', build_index=False)
@raises(FastaIndexingError)
def test_issue_144_no_defline(self):
    """ Ensure that an exception is raised when a file contains no deflines. See mdshw5/pyfaidx#144.

    A two-line file of bare sequence (no '>' line) is written to a temporary
    directory and handed to Faidx; the decorator expects FastaIndexingError.
    The temporary directory is always removed.
    """
    tmp_dir = mkdtemp()
    try:
        fasta_path = os.path.join(tmp_dir, 'issue_144.fasta')
        # Write a simple FASTA-like file that lacks any definition line.
        with open(fasta_path, 'w') as fasta_out:
            fasta_out.write("CTCCGGGCCCAT\nATAAAGCCTAAA\n")
        # The result is not needed; only the raised exception matters.
        Faidx(fasta_path)
    finally:
        shutil.rmtree(tmp_dir)