Skip to content
Commits on Source (7)
......@@ -11,6 +11,7 @@ python:
- "3.5"
- "3.6"
- "3.7"
- "3.8"
- "nightly"
install:
......@@ -45,5 +46,10 @@ jobs:
ls -l dist/
python3 -m twine upload dist/*
- name: flake8
python: "3.6"
install: python3 -m pip install flake8
script: flake8 src/ tests/
allow_failures:
- python: "nightly"
python-dnaio (0.4.1-1) unstable; urgency=medium
* Team upload.
* Build-Depends: python3-setuptools-scm
Closes: #944898
* Set upstream metadata fields: Repository.
* Remove obsolete field Name from debian/upstream/metadata.
-- Andreas Tille <tille@debian.org> Sun, 17 Nov 2019 16:18:47 +0100
python-dnaio (0.4-1) unstable; urgency=medium
* Team upload.
......
......@@ -8,6 +8,7 @@ Build-Depends: debhelper-compat (= 12),
python3,
python3-dev,
python3-setuptools,
python3-setuptools-scm,
python3-pytest,
python3-xopen,
cython3
......
Name: dnaio
Repository: https://github.com/marcelm/dnaio
......@@ -77,7 +77,7 @@ setup(
install_requires=['xopen>=0.8.2'],
python_requires='>=3.4',
classifiers=[
"Development Status :: 3 - Alpha",
"Development Status :: 4 - Beta",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
......
......@@ -16,6 +16,9 @@ __all__ = [
'InterleavedSequenceReader',
'InterleavedSequenceWriter',
'PairedSequenceReader',
'read_chunks',
'read_paired_chunks',
'__version__',
]
import os
......@@ -45,7 +48,9 @@ except ImportError:
return path
def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qualities=None):
def open(
file1, *, file2=None, fileformat=None, interleaved=False, mode="r", qualities=None, opener=xopen
):
"""
Open sequence files in FASTA or FASTQ format for reading or writing. This is
a factory that returns an instance of one of the ...Reader or ...Writer
......@@ -71,32 +76,37 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
appropriately.
* When False (no qualities available), an exception is raised when the
auto-detected output format is FASTQ.
opener -- A function that is used to open file1 and file2 if they are not
already open file-like objects. By default, xopen is used, which can
also open compressed file formats.
"""
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if interleaved and file2 is not None:
raise ValueError("When interleaved is set, file2 must be None")
if file2 is not None:
if mode in "wa" and file1 == file2:
raise ValueError("The paired-end output files are identical")
if mode == "r":
return PairedSequenceReader(file1, file2, fileformat)
return PairedSequenceReader(file1, file2, fileformat, opener=opener)
elif mode == "w":
return PairedSequenceWriter(file1, file2, fileformat, qualities)
return PairedSequenceWriter(file1, file2, fileformat, qualities, opener=opener)
else:
return PairedSequenceAppender(file1, file2, fileformat, qualities)
return PairedSequenceAppender(file1, file2, fileformat, qualities, opener=opener)
if interleaved:
if mode == "r":
return InterleavedSequenceReader(file1, fileformat)
return InterleavedSequenceReader(file1, fileformat, opener=opener)
elif mode == "w":
return InterleavedSequenceWriter(file1, fileformat, qualities)
return InterleavedSequenceWriter(file1, fileformat, qualities, opener=opener)
else:
return InterleavedSequenceAppender(file1, fileformat, qualities)
return InterleavedSequenceAppender(file1, fileformat, qualities, opener=opener)
# The multi-file options have been dealt with, delegate rest to the
# single-file function.
return _open_single(
file1, fileformat=fileformat, mode=mode, qualities=qualities)
file1, opener=opener, fileformat=fileformat, mode=mode, qualities=qualities)
def _detect_format_from_name(name):
......@@ -118,16 +128,16 @@ def _detect_format_from_name(name):
return None
def _open_single(file, *, fileformat=None, mode='r', qualities=None):
def _open_single(file, opener, *, fileformat=None, mode="r", qualities=None):
"""
Open a single sequence file. See description of open() above.
"""
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if isinstance(file, (str, pathlib.Path)):
if isinstance(file, (str, pathlib.Path)): # TODO Use os.PathLike in Python 3.6+
path = fspath(file)
file = xopen(path, mode + 'b')
file = opener(path, mode + "b")
close_file = True
else:
if mode == 'r' and not hasattr(file, 'readinto'):
......@@ -164,30 +174,18 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
fileformat = 'fastq' if qualities else 'fasta'
if mode == 'r' and fileformat is None:
# No format detected so far. Try to read from the file.
if file.seekable():
first_char = file.read(1)
file.seek(-1, 1)
else:
first_char = file.peek(1)[0:1]
formats = {
b'@': 'fastq',
b'>': 'fasta',
b'#': 'fasta', # Some FASTA variants allow comments
b'': 'fastq', # Pretend FASTQ for empty input
}
try:
fileformat = formats[first_char]
except KeyError:
fileformat = _detect_format_from_content(file)
if fileformat is None:
raise UnknownFileFormat(
'Could not determine whether file {!r} is FASTA or FASTQ. The file extension was '
'not available or not recognized and the first character in the file ({!r}) is '
'unexpected.'.format(file, first_char))
'not available or not recognized and the first character in the file is '
'unexpected.'.format(file))
if fileformat is None:
assert mode == 'w'
extra = " because the output file name is not available" if path is None else ""
raise UnknownFileFormat("Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
raise UnknownFileFormat(
"Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
if fileformat == 'fastq' and mode in "wa" and qualities is False:
raise ValueError(
......@@ -196,6 +194,24 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
return handlers[fileformat](file)
def _detect_format_from_content(file):
"""
Return 'fasta', 'fastq' or None
"""
if file.seekable():
first_char = file.read(1)
file.seek(-1, 1)
else:
first_char = file.peek(1)[0:1]
formats = {
b'@': 'fastq',
b'>': 'fasta',
b'#': 'fasta', # Some FASTA variants allow comments
b'': 'fastq', # Pretend FASTQ for empty input
}
return formats.get(first_char, None)
def _sequence_names_match(r1, r2):
"""
Check whether the sequence records r1 and r2 have identical names, ignoring a
......@@ -220,10 +236,10 @@ class PairedSequenceReader:
"""
paired = True
def __init__(self, file1, file2, fileformat=None):
def __init__(self, file1, file2, fileformat=None, opener=xopen):
with ExitStack() as stack:
self.reader1 = stack.enter_context(_open_single(file1, fileformat=fileformat))
self.reader2 = stack.enter_context(_open_single(file2, fileformat=fileformat))
self.reader1 = stack.enter_context(_open_single(file1, opener=opener, fileformat=fileformat))
self.reader2 = stack.enter_context(_open_single(file2, opener=opener, fileformat=fileformat))
self._close = stack.pop_all().close
self.delivers_qualities = self.reader1.delivers_qualities
......@@ -240,7 +256,8 @@ class PairedSequenceReader:
# End of file 1. Make sure that file 2 is also at end.
try:
next(it2)
raise FileFormatError("Reads are improperly paired. There are more reads in "
raise FileFormatError(
"Reads are improperly paired. There are more reads in "
"file 2 than in file 1.", line=None) from None
except StopIteration:
pass
......@@ -248,10 +265,12 @@ class PairedSequenceReader:
try:
r2 = next(it2)
except StopIteration:
raise FileFormatError("Reads are improperly paired. There are more reads in "
raise FileFormatError(
"Reads are improperly paired. There are more reads in "
"file 1 than in file 2.", line=None) from None
if not _sequence_names_match(r1, r2):
raise FileFormatError("Reads are improperly paired. Read name '{}' "
raise FileFormatError(
"Reads are improperly paired. Read name '{}' "
"in file 1 does not match '{}' in file 2.".format(r1.name, r2.name), line=None) from None
yield (r1, r2)
......@@ -271,8 +290,8 @@ class InterleavedSequenceReader:
"""
paired = True
def __init__(self, file, fileformat=None):
self.reader = _open_single(file, fileformat=fileformat)
def __init__(self, file, fileformat=None, opener=xopen):
self.reader = _open_single(file, opener=opener, fileformat=fileformat)
self.delivers_qualities = self.reader.delivers_qualities
def __iter__(self):
......@@ -281,10 +300,12 @@ class InterleavedSequenceReader:
try:
r2 = next(it)
except StopIteration:
raise FileFormatError("Interleaved input file incomplete: Last record "
raise FileFormatError(
"Interleaved input file incomplete: Last record "
"{!r} has no partner.".format(r1.name), line=None) from None
if not _sequence_names_match(r1, r2):
raise FileFormatError("Reads are improperly paired. Name {!r} "
raise FileFormatError(
"Reads are improperly paired. Name {!r} "
"(first) does not match {!r} (second).".format(r1.name, r2.name), line=None)
yield (r1, r2)
......@@ -301,12 +322,14 @@ class InterleavedSequenceReader:
class PairedSequenceWriter:
_mode = "w"
def __init__(self, file1, file2, fileformat='fastq', qualities=None):
def __init__(self, file1, file2, fileformat='fastq', qualities=None, opener=xopen):
with ExitStack() as stack:
self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode=self._mode,
qualities=qualities))
self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode=self._mode,
qualities=qualities))
self._writer1 = stack.enter_context(
_open_single(
file1, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities))
self._writer2 = stack.enter_context(
_open_single(
file2, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities))
self._close = stack.pop_all().close
def write(self, read1, read2):
......@@ -334,10 +357,10 @@ class InterleavedSequenceWriter:
"""
_mode = "w"
def __init__(self, file, fileformat='fastq', qualities=None):
def __init__(self, file, fileformat='fastq', qualities=None, opener=xopen):
self._writer = _open_single(
file, fileformat=fileformat, mode=self._mode, qualities=qualities)
file, opener=opener, fileformat=fileformat, mode=self._mode, qualities=qualities)
def write(self, read1, read2):
self._writer.write(read1)
......
......@@ -103,7 +103,8 @@ def read_paired_chunks(f, f2, buffer_size=4*1024**2):
start1 = f.readinto(memoryview(buf1)[0:1])
start2 = f2.readinto(memoryview(buf2)[0:1])
if (start1 == 1 and buf1[0:1] != b'@') or (start2 == 1 and buf2[0:1] != b'@'):
raise FileFormatError('Paired-end data must be in FASTQ format when using multiple cores', line=None)
raise FileFormatError(
"Paired-end data must be in FASTQ format when using multiple cores", line=None)
while True:
if start1 == len(buf1) or start2 == len(buf2):
......
......@@ -18,13 +18,13 @@ class BinaryFileReader:
paired = False
mode = 'rb'
def __init__(self, file, _close_file=None):
def __init__(self, file, opener=xopen, _close_file=None):
"""
The file is a path or a file-like object. In both cases, the file may
be compressed (.gz, .bz2, .xz).
"""
if isinstance(file, str):
file = xopen(file, self.mode)
file = opener(file, self.mode)
self._close_on_exit = True
elif _close_file:
self._close_on_exit = True
......@@ -49,14 +49,14 @@ class FastaReader(BinaryFileReader):
Reader for FASTA files.
"""
def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, _close_file=None):
def __init__(self, file, keep_linebreaks=False, sequence_class=Sequence, opener=xopen, _close_file=None):
"""
file is a path or a file-like object. In both cases, the file may
be compressed (.gz, .bz2, .xz).
keep_linebreaks -- whether to keep newline characters in the sequence
"""
super().__init__(file, _close_file=_close_file)
super().__init__(file, opener=opener, _close_file=_close_file)
self.sequence_class = sequence_class
self.delivers_qualities = False
self._delimiter = '\n' if keep_linebreaks else ''
......@@ -83,8 +83,9 @@ class FastaReader(BinaryFileReader):
elif name is not None:
seq.append(line)
else:
raise FastaFormatError("Expected '>' at beginning of "
"record, but got {!r}.".format(_shorten(line)), line=i)
raise FastaFormatError(
"Expected '>' at beginning of record, but got {!r}."
.format(_shorten(line)), line=i)
if name is not None:
yield self.sequence_class(name, self._delimiter.join(seq), None)
......@@ -97,12 +98,12 @@ class FastqReader(BinaryFileReader):
Reader for FASTQ files. Does not support multi-line FASTQ files.
"""
def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, _close_file=None):
def __init__(self, file, sequence_class=Sequence, buffer_size=1048576, opener=xopen, _close_file=None):
"""
file is a filename or a file-like object.
If file is a filename, then .gz files are supported.
"""
super().__init__(file, _close_file=_close_file)
super().__init__(file, opener=opener, _close_file=_close_file)
self.sequence_class = sequence_class
self.delivers_qualities = True
self.buffer_size = buffer_size
......
......@@ -2,10 +2,10 @@ from xopen import xopen
class FileWriter:
def __init__(self, file, _close_file=None):
def __init__(self, file, opener=xopen, _close_file=None):
self._file = file
if isinstance(file, str):
self._file = xopen(file, 'wb')
self._file = opener(file, "wb")
self._close_on_exit = True
else:
self._close_on_exit = bool(_close_file)
......@@ -28,12 +28,12 @@ class FastaWriter(FileWriter):
Write FASTA-formatted sequences to a file.
"""
def __init__(self, file, line_length=None, _close_file=None):
def __init__(self, file, line_length=None, opener=xopen, _close_file=None):
"""
If line_length is not None, the lines will
be wrapped after line_length characters.
"""
super().__init__(file, _close_file=_close_file)
super().__init__(file, opener=opener, _close_file=_close_file)
self.line_length = line_length if line_length != 0 else None
def write(self, name_or_record, sequence=None):
......@@ -78,8 +78,8 @@ class FastqWriter(FileWriter):
"""
file_mode = 'wb'
def __init__(self, file, two_headers=False, _close_file=None):
super().__init__(file, _close_file=_close_file)
def __init__(self, file, two_headers=False, opener=xopen, _close_file=None):
super().__init__(file, opener=opener, _close_file=_close_file)
self._two_headers = two_headers
def write(self, record):
......
......@@ -82,18 +82,18 @@ class TestFastaReader:
filename = "tests/data/simple.fasta"
with open(filename, 'rb') as f:
assert not f.closed
reads = list(dnaio.open(f))
_ = list(dnaio.open(f))
assert not f.closed
assert f.closed
with FastaReader(filename) as sr:
tmp_sr = sr
assert not sr._file.closed
reads = list(sr)
_ = list(sr)
assert not sr._file.closed
assert tmp_sr._file is None
# Open it a second time
with FastaReader(filename) as sr:
with FastaReader(filename):
pass
......@@ -112,7 +112,7 @@ class TestFastqReader:
def test_fastqreader_buffersize_too_small(self):
with raises(ValueError):
with FastqReader("tests/data/simple.fastq", buffer_size=0) as f:
reads = list(f) # pragma: no cover
_ = list(f) # pragma: no cover
def test_fastqreader_dos(self):
# DOS line breaks
......@@ -212,7 +212,7 @@ class TestFastqReader:
with FastqReader(filename) as sr:
tmp_sr = sr
assert not sr._file.closed
reads = list(sr)
_ = list(sr)
assert not sr._file.closed
assert tmp_sr._file is None
......@@ -445,7 +445,12 @@ class TestInterleavedWriter:
with InterleavedSequenceWriter(bio) as writer:
for read1, read2 in reads:
writer.write(read1, read2)
assert bio.getvalue() == b'@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
assert bio.getvalue() == (
b'@A/1 comment\nTTA\n+\n##H\n'
b'@A/2 comment\nGCT\n+\nHH#\n'
b'@B/1\nCC\n+\nHH\n'
b'@B/2\nTG\n+\n#H\n'
)
class TestPairedSequenceReader:
......
......@@ -62,6 +62,49 @@ def test_read_pathlib_path(fileformat, extension):
assert records == SIMPLE_RECORDS[fileformat]
def test_read_opener(fileformat, extension):
    """
    Reading through a user-supplied opener yields the record it serves.

    fileformat and extension are presumably pytest fixtures defined elsewhere
    in the test suite (not visible here).
    """
    import io

    fasta_payload = b">read\nACG\n"
    fastq_payload = b"@read\nACG\n+\nHHH\n"

    def my_opener(path, mode):
        # Path and mode are deliberately ignored: the data comes from memory,
        # which shows that dnaio.open() really delegates to the opener.
        payload = fasta_payload if fileformat == "fasta" else fastq_payload
        return io.BytesIO(payload)

    filename = "totally-ignored-filename." + fileformat + extension
    with dnaio.open(filename, opener=my_opener) as f:
        records = list(f)

    assert len(records) == 1
    only_record = records[0]
    assert only_record.name == "read"
    assert only_record.sequence == "ACG"
@pytest.mark.parametrize("interleaved", [False, True])
def test_paired_opener(fileformat, extension, interleaved):
    """
    A custom opener is honoured for paired-end input, both when reading two
    separate files and when reading one interleaved file.

    fileformat and extension are presumably pytest fixtures defined elsewhere
    in the test suite (not visible here).
    """
    def my_opener(_path, _mode):
        # Path and mode are ignored; every "file" holds the same record twice.
        import io
        if fileformat == "fasta":
            data = b">read\nACG\n"
        else:
            data = b"@read\nACG\n+\nHHH\n"
        return io.BytesIO(data + data)

    path1 = "ignored-filename." + fileformat + extension
    path2 = "also-ignored-filename." + fileformat + extension
    if interleaved:
        # One file with two records forms a single pair.
        with dnaio.open(path1, interleaved=True, opener=my_opener) as f:
            records = list(f)
        expected = 1
    else:
        # Two files with two records each form two pairs.
        with dnaio.open(path1, file2=path2, opener=my_opener) as f:
            records = list(f)
        expected = 2
    assert len(records) == expected
    assert records[0][0].name == "read"
    assert records[0][0].sequence == "ACG"
    assert records[0][1].name == "read"
    assert records[0][1].sequence == "ACG"
def test_detect_fastq_from_content():
"""FASTQ file that is not named .fastq"""
with dnaio.open('tests/data/missingextension') as f:
......@@ -115,8 +158,8 @@ def test_write_pathlib(tmpdir, fileformat, extension):
def test_write_paired_same_path(tmpdir):
path1 = str(tmpdir / "same.fastq")
path2 = str(tmpdir / "same.fastq")
with pytest.raises(ValueError) as e:
with dnaio.open(file1=path1, file2=path2, mode="w") as f:
with pytest.raises(ValueError):
with dnaio.open(file1=path1, file2=path2, mode="w"):
pass
......
[tox]
envlist = py34,py35,py36,py37
envlist = flake8,py34,py35,py36,py37,py38
[testenv]
deps =
......@@ -10,6 +10,11 @@ commands =
coverage combine
coverage report
[testenv:flake8]
basepython = python3.6
deps = flake8
commands = flake8 src/ tests/
[coverage:run]
parallel = True
include =
......@@ -20,3 +25,7 @@ include =
source =
src/
*/site-packages/
[flake8]
max-line-length = 110
max-complexity = 15