Skip to content
Commits on Source (4)
[*.py]
[*.{py,pyx}]
charset=utf-8
end_of_line=lf
insert_final_newline=true
......
*.fastq -crlf
*.fasta -crlf
src/dnaio/_version.py export-subst
......@@ -9,3 +9,4 @@ __pycache__
/src/*/*.so
/src/*.egg-info/
/.tox/
/src/dnaio/_version.py
sudo: false
language: python
dist: xenial
cache:
directories:
- $HOME/.cache/pip
......@@ -8,13 +10,19 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
- "nightly"
install:
- pip install --upgrade coverage codecov
- pip install .[dev]
script:
- pytest
- coverage run -m pytest
after_success:
- coverage combine
- codecov
env:
global:
......@@ -24,13 +32,7 @@ env:
jobs:
include:
- stage: test
python: "3.7"
sudo: true # This may possibly be removed in the future
dist: xenial
- stage: deploy
sudo: required
services:
- docker
python: "3.6"
......
[![Travis](https://travis-ci.org/marcelm/dnaio.svg?branch=master)](https://travis-ci.org/marcelm/dnaio)
[![PyPI](https://img.shields.io/pypi/v/dnaio.svg?branch=master)](https://pypi.python.org/pypi/dnaio)
[![Codecov](https://codecov.io/gh/marcelm/dnaio/branch/master/graph/badge.svg)](https://codecov.io/gh/marcelm/dnaio)
# dnaio parses FASTQ and FASTA
`dnaio` is a Python 3 library for fast parsing of FASTQ and also FASTA files. The code was previously part of the
[cutadapt](https://cutadapt.readthedocs.io/) tool and has been improved since it has been split out.
[Cutadapt](https://cutadapt.readthedocs.io/) tool and has been improved since it has been split out.
## Example usage
......
python-dnaio (0.4-1) unstable; urgency=medium
* Team upload.
* New upstream version
-- Steffen Moeller <moeller@debian.org> Wed, 30 Oct 2019 11:30:18 +0100
python-dnaio (0.3-2) unstable; urgency=medium
* Team upload.
......
......@@ -11,7 +11,7 @@ Build-Depends: debhelper-compat (= 12),
python3-pytest,
python3-xopen,
cython3
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/python-dnaio
Vcs-Git: https://salsa.debian.org/med-team/python-dnaio.git
Homepage: https://github.com/marcelm/dnaio
......
......@@ -11,6 +11,12 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@ --with python3 --buildsystem=pybuild
override_dh_auto_clean:
dh_auto_clean
rm -rf .pytest_cache
rm -f src/dnaio/_core.c
rm -f src/dnaio/_version.py
### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected
#override_dh_auto_test:
#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
......
[build-system]
requires = ["setuptools", "wheel", "setuptools_scm", "Cython"]
[versioneer]
VCS = git
style = pep440
versionfile_source = src/dnaio/_version.py
versionfile_build = dnaio/_version.py
tag_prefix = v
parentdir_prefix = dnaio-
import sys
import os.path
from setuptools import setup, Extension
from setuptools import setup, Extension, find_packages
from distutils.command.sdist import sdist as _sdist
from distutils.command.build_ext import build_ext as _build_ext
import versioneer
if sys.version_info[:2] < (3, 4):
sys.stdout.write('Python 3.4 or later is required\n')
......@@ -30,12 +29,8 @@ extensions = [
Extension('dnaio._core', sources=['src/dnaio/_core.pyx']),
]
cmdclass = versioneer.get_cmdclass()
versioneer_build_ext = cmdclass.get('build_ext', _build_ext)
versioneer_sdist = cmdclass.get('sdist', _sdist)
class build_ext(versioneer_build_ext):
class BuildExt(_build_ext):
def run(self):
# If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip
# file retrieved from PyPI that already includes the pre-cythonized
......@@ -47,26 +42,24 @@ class build_ext(versioneer_build_ext):
# only sensible thing is to require Cython to be installed.
from Cython.Build import cythonize
self.extensions = cythonize(self.extensions)
versioneer_build_ext.run(self)
super().run()
class sdist(versioneer_sdist):
class SDist(_sdist):
def run(self):
# Make sure the compiled Cython files in the distribution are up-to-date
from Cython.Build import cythonize
cythonize(extensions)
versioneer_sdist.run(self)
super().run()
cmdclass['build_ext'] = build_ext
cmdclass['sdist'] = sdist
with open('README.md', encoding='utf-8') as f:
long_description = f.read()
setup(
name='dnaio',
version=versioneer.get_version(),
setup_requires=['setuptools_scm'], # Support pip versions that don't know about pyproject.toml
use_scm_version={'write_to': 'src/dnaio/_version.py'},
author='Marcel Martin',
author_email='marcel.martin@scilifelab.se',
url='https://github.com/marcelm/dnaio/',
......@@ -74,14 +67,15 @@ setup(
long_description=long_description,
long_description_content_type='text/markdown',
license='MIT',
packages=['dnaio'],
package_dir={'': 'src'},
packages=find_packages('src'),
extras_require={
'dev': ['Cython', 'pytest'],
},
ext_modules=extensions,
cmdclass=cmdclass,
install_requires=['xopen'],
cmdclass={'build_ext': BuildExt, 'sdist': SDist},
install_requires=['xopen>=0.8.2'],
python_requires='>=3.4',
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research",
......
......@@ -21,6 +21,7 @@ __all__ = [
import os
from contextlib import ExitStack
import functools
import pathlib
from xopen import xopen
......@@ -29,10 +30,19 @@ from .readers import FastaReader, FastqReader
from .writers import FastaWriter, FastqWriter
from .exceptions import UnknownFileFormat, FileFormatError, FastaFormatError, FastqFormatError
from .chunks import read_chunks, read_paired_chunks
from ._version import version as __version__
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
try:
from os import fspath # Exists in Python 3.6+
except ImportError:
def fspath(path):
if hasattr(path, "__fspath__"):
return path.__fspath__()
# Python 3.4 and 3.5 do not support the file system path protocol
if isinstance(path, pathlib.Path):
return str(path)
return path
def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qualities=None):
......@@ -42,10 +52,10 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
classes also defined in this module.
file1, file2 -- Paths to regular or compressed files or file-like
objects. Use file1 if data is single-end. If also file2 is provided,
sequences are paired.
objects (as str or as pathlib.Path). Use only file1 if data is single-end.
If sequences are paired, use also file2.
mode -- Either 'r' for reading or 'w' for writing.
mode -- Either 'r' for reading, 'w' for writing or 'a' for appending.
interleaved -- If True, then file1 contains interleaved paired-end data.
file2 must be None in this case.
......@@ -62,20 +72,26 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
* When False (no qualities available), an exception is raised when the
auto-detected output format is FASTQ.
"""
if mode not in ('r', 'w'):
raise ValueError("Mode must be 'r' or 'w'")
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if interleaved and file2 is not None:
raise ValueError("When interleaved is set, file2 must be None")
if file2 is not None:
if mode == 'r':
if mode in "wa" and file1 == file2:
raise ValueError("The paired-end output files are identical")
if mode == "r":
return PairedSequenceReader(file1, file2, fileformat)
else:
elif mode == "w":
return PairedSequenceWriter(file1, file2, fileformat, qualities)
else:
return PairedSequenceAppender(file1, file2, fileformat, qualities)
if interleaved:
if mode == 'r':
if mode == "r":
return InterleavedSequenceReader(file1, fileformat)
else:
elif mode == "w":
return InterleavedSequenceWriter(file1, fileformat, qualities)
else:
return InterleavedSequenceAppender(file1, fileformat, qualities)
# The multi-file options have been dealt with, delegate rest to the
# single-file function.
......@@ -106,15 +122,21 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
"""
Open a single sequence file. See description of open() above.
"""
if mode not in ('r', 'w'):
raise ValueError("Mode must be 'r' or 'w'")
if isinstance(file, str):
file = xopen(file, mode + 'b')
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if isinstance(file, (str, pathlib.Path)):
path = fspath(file)
file = xopen(path, mode + 'b')
close_file = True
else:
if mode == 'r' and not hasattr(file, 'readinto'):
raise ValueError(
'When passing in an open file-like object, it must have been opened in binary mode')
if hasattr(file, "name") and isinstance(file.name, str):
path = file.name
else:
path = None
close_file = False
if mode == 'r':
fastq_handler = FastqReader
......@@ -122,59 +144,56 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
else:
fastq_handler = FastqWriter
fasta_handler = FastaWriter
fastq_handler = functools.partial(fastq_handler, _close_file=close_file)
fasta_handler = functools.partial(fasta_handler, _close_file=close_file)
if fileformat: # Explict file format given
fileformat = fileformat.lower()
if fileformat == 'fasta':
return fasta_handler(file)
elif fileformat == 'fastq':
return fastq_handler(file)
else:
handlers = {
'fastq': functools.partial(fastq_handler, _close_file=close_file),
'fasta': functools.partial(fasta_handler, _close_file=close_file),
}
if fileformat:
try:
handler = handlers[fileformat.lower()]
except KeyError:
raise UnknownFileFormat(
"File format {!r} is unknown (expected 'fasta' or 'fastq').".format(fileformat))
return handler(file)
# First, try to detect the file format from the file name only
format = None
if hasattr(file, "name") and isinstance(file.name, str):
format = _detect_format_from_name(file.name)
if format is None and mode == 'w' and qualities is not None:
if path is not None:
fileformat = _detect_format_from_name(path)
if fileformat is None and mode == 'w' and qualities is not None:
# Format not recognized, but we know whether to use a format with or without qualities
format = 'fastq' if qualities else 'fasta'
fileformat = 'fastq' if qualities else 'fasta'
if mode == 'r' and format is None:
if mode == 'r' and fileformat is None:
# No format detected so far. Try to read from the file.
if file.seekable():
first_char = file.read(1)
file.seek(-1, 1)
else:
first_char = file.peek(1)[0:1]
if first_char == b'#':
# A comment char - only valid for some FASTA variants (csfasta)
format = 'fasta'
elif first_char == b'>':
format = 'fasta'
elif first_char == b'@':
format = 'fastq'
elif first_char == b'':
# Empty input. Pretend this is FASTQ
format = 'fastq'
else:
formats = {
b'@': 'fastq',
b'>': 'fasta',
b'#': 'fasta', # Some FASTA variants allow comments
b'': 'fastq', # Pretend FASTQ for empty input
}
try:
fileformat = formats[first_char]
except KeyError:
raise UnknownFileFormat(
'Could not determine whether file {!r} is FASTA or FASTQ. The file extension was '
'not available or not recognized and the first character in the file ({!r}) is '
'unexpected.'.format(file, first_char))
if format is None:
if fileformat is None:
assert mode == 'w'
raise UnknownFileFormat('Cannot determine whether to write in FASTA or FASTQ format')
extra = " because the output file name is not available" if path is None else ""
raise UnknownFileFormat("Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
if format == 'fastq' and mode == 'w' and qualities is False:
if fileformat == 'fastq' and mode in "wa" and qualities is False:
raise ValueError(
'Output format cannot be FASTQ since no quality values are available.')
return fastq_handler(file) if format == 'fastq' else fasta_handler(file)
return handlers[fileformat](file)
def _sequence_names_match(r1, r2):
......@@ -280,11 +299,13 @@ class InterleavedSequenceReader:
class PairedSequenceWriter:
_mode = "w"
def __init__(self, file1, file2, fileformat='fastq', qualities=None):
with ExitStack() as stack:
self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode='w',
self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode=self._mode,
qualities=qualities))
self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode='w',
self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode=self._mode,
qualities=qualities))
self._close = stack.pop_all().close
......@@ -303,15 +324,20 @@ class PairedSequenceWriter:
self.close()
class PairedSequenceAppender(PairedSequenceWriter):
    # Identical to PairedSequenceWriter except that both output files are
    # opened in append mode ("a") instead of write mode.
    _mode = "a"
class InterleavedSequenceWriter:
"""
Write paired-end reads to an interleaved FASTA or FASTQ file
"""
_mode = "w"
def __init__(self, file, fileformat='fastq', qualities=None):
self._writer = _open_single(
file, fileformat=fileformat, mode='w', qualities=qualities)
file, fileformat=fileformat, mode=self._mode, qualities=qualities)
def write(self, read1, read2):
self._writer.write(read1)
......@@ -326,3 +352,7 @@ class InterleavedSequenceWriter:
def __exit__(self, *args):
self.close()
class InterleavedSequenceAppender(InterleavedSequenceWriter):
    # Identical to InterleavedSequenceWriter except that the output file is
    # opened in append mode ("a") instead of write mode.
    _mode = "a"
......@@ -8,59 +8,59 @@ from ._util import shorten
cdef class Sequence:
    """
    A record in a FASTA or FASTQ file. For FASTA, the qualities attribute
    is None. For FASTQ, qualities is a string and it contains the qualities
    encoded as ascii(qual+33).
    """
    cdef:
        public str name       # record header (without the leading '@'/'>')
        public str sequence   # nucleotide sequence
        public str qualities  # ASCII-encoded qualities, or None for FASTA

    def __cinit__(self, str name, str sequence, str qualities=None):
        """Set qualities to None if there are no quality values"""
        self.name = name
        self.sequence = sequence
        self.qualities = qualities
        # A FASTQ record is only valid if sequence and qualities have the
        # same length; enforce that invariant at construction time.
        if qualities is not None and len(qualities) != len(sequence):
            rname = shorten(name)
            raise ValueError("In read named {!r}: length of quality sequence "
                "({}) and length of read ({}) do not match".format(
                    rname, len(qualities), len(sequence)))

    def __getitem__(self, key):
        """slicing"""
        # Slice sequence and qualities in lockstep so the length invariant
        # checked in __cinit__ still holds for the returned record.
        return self.__class__(
            self.name,
            self.sequence[key],
            self.qualities[key] if self.qualities is not None else None)

    def __repr__(self):
        qstr = ''
        if self.qualities is not None:
            qstr = ', qualities={!r}'.format(shorten(self.qualities))
        return '<Sequence(name={!r}, sequence={!r}{})>'.format(
            shorten(self.name), shorten(self.sequence), qstr)

    def __len__(self):
        return len(self.sequence)

    def __richcmp__(self, other, int op):
        # Only Py_EQ (op == 2) and Py_NE (op == 3) are supported; ordering
        # comparisons between records are not meaningful.
        if 2 <= op <= 3:
            eq = self.name == other.name and \
                self.sequence == other.sequence and \
                self.qualities == other.qualities
            if op == 2:
                return eq
            else:
                return not eq
        else:
            raise NotImplementedError()

    def __reduce__(self):
        # Enable pickling: cdef classes are not picklable by default.
        return (Sequence, (self.name, self.sequence, self.qualities))
"""
A record in a FASTA or FASTQ file. For FASTA, the qualities attribute
is None. For FASTQ, qualities is a string and it contains the qualities
encoded as ascii(qual+33).
"""
cdef:
public str name
public str sequence
public str qualities
def __cinit__(self, str name, str sequence, str qualities=None):
"""Set qualities to None if there are no quality values"""
self.name = name
self.sequence = sequence
self.qualities = qualities
if qualities is not None and len(qualities) != len(sequence):
rname = shorten(name)
raise ValueError("In read named {!r}: length of quality sequence "
"({}) and length of read ({}) do not match".format(
rname, len(qualities), len(sequence)))
def __getitem__(self, key):
"""slicing"""
return self.__class__(
self.name,
self.sequence[key],
self.qualities[key] if self.qualities is not None else None)
def __repr__(self):
qstr = ''
if self.qualities is not None:
qstr = ', qualities={!r}'.format(shorten(self.qualities))
return '<Sequence(name={!r}, sequence={!r}{})>'.format(
shorten(self.name), shorten(self.sequence), qstr)
def __len__(self):
return len(self.sequence)
def __richcmp__(self, other, int op):
if 2 <= op <= 3:
eq = self.name == other.name and \
self.sequence == other.sequence and \
self.qualities == other.qualities
if op == 2:
return eq
else:
return not eq
else:
raise NotImplementedError()
def __reduce__(self):
return (Sequence, (self.name, self.sequence, self.qualities))
# It would be nice to be able to have the first parameter be an
......@@ -69,216 +69,216 @@ cdef class Sequence:
# See <https://stackoverflow.com/questions/28203670/>
ctypedef fused bytes_or_bytearray:
bytes
bytearray
bytes
bytearray
def paired_fastq_heads(bytes_or_bytearray buf1, bytes_or_bytearray buf2, Py_ssize_t end1, Py_ssize_t end2):
    """
    Skip forward in the two buffers by multiples of four lines.

    Return a tuple (length1, length2) such that buf1[:length1] and
    buf2[:length2] contain the same number of lines (where the
    line number is divisible by four).
    """
    cdef:
        Py_ssize_t pos1 = 0, pos2 = 0
        Py_ssize_t linebreaks = 0
        unsigned char* data1 = buf1
        unsigned char* data2 = buf2
        # Start offsets of the most recent *complete* four-line (FASTQ
        # record) boundary seen so far in each buffer.
        Py_ssize_t record_start1 = 0
        Py_ssize_t record_start2 = 0

    while True:
        # Advance pos1 to just past the next newline in buf1 (or to end1).
        while pos1 < end1 and data1[pos1] != b'\n':
            pos1 += 1
        if pos1 == end1:
            break
        pos1 += 1
        # Advance pos2 to just past the next newline in buf2 (or to end2).
        while pos2 < end2 and data2[pos2] != b'\n':
            pos2 += 1
        if pos2 == end2:
            break
        pos2 += 1
        linebreaks += 1
        # Every four newlines we have consumed one full FASTQ record in
        # each buffer; remember this position as a safe cut point.
        if linebreaks == 4:
            linebreaks = 0
            record_start1 = pos1
            record_start2 = pos2

    # Hit the end of the data block
    return record_start1, record_start2
"""
Skip forward in the two buffers by multiples of four lines.
Return a tuple (length1, length2) such that buf1[:length1] and
buf2[:length2] contain the same number of lines (where the
line number is divisible by four).
"""
cdef:
Py_ssize_t pos1 = 0, pos2 = 0
Py_ssize_t linebreaks = 0
unsigned char* data1 = buf1
unsigned char* data2 = buf2
Py_ssize_t record_start1 = 0
Py_ssize_t record_start2 = 0
while True:
while pos1 < end1 and data1[pos1] != b'\n':
pos1 += 1
if pos1 == end1:
break
pos1 += 1
while pos2 < end2 and data2[pos2] != b'\n':
pos2 += 1
if pos2 == end2:
break
pos2 += 1
linebreaks += 1
if linebreaks == 4:
linebreaks = 0
record_start1 = pos1
record_start2 = pos2
# Hit the end of the data block
return record_start1, record_start2
def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
    """
    Parse a FASTQ file and yield Sequence objects

    The *first value* that the generator yields is a boolean indicating whether
    the first record in the FASTQ has a repeated header (in the third row
    after the ``+``).

    file -- a file-like object, opened in binary mode (it must have a readinto
    method)

    buffer_size -- size of the initial buffer. This is automatically grown
        if a FASTQ record is encountered that does not fit.
    """
    cdef:
        bytearray buf = bytearray(buffer_size)
        char[:] buf_view = buf
        char* c_buf = buf
        int endskip                  # 1 if the current line ends in '\r\n', else 0
        str name
        char* name_encoded
        Py_ssize_t bufstart, bufend, pos, record_start, sequence_start
        Py_ssize_t second_header_start, sequence_length, qualities_start
        Py_ssize_t second_header_length, name_length
        bint custom_class = sequence_class is not Sequence
        Py_ssize_t n_records = 0
        bint extra_newline = False   # True if we appended a missing final newline

    if buffer_size < 1:
        raise ValueError("Starting buffer size too small")

    # buf is a byte buffer that is re-used in each iteration. Its layout is:
    #
    # |-- complete records --|
    # +---+------------------+---------+-------+
    # |   |                  |         |       |
    # +---+------------------+---------+-------+
    # ^   ^                  ^         ^       ^
    # 0   bufstart           end       bufend  len(buf)
    #
    # buf[0:bufstart] is the 'leftover' data that could not be processed
    # in the previous iteration because it contained an incomplete
    # FASTQ record.

    readinto = file.readinto
    bufstart = 0

    # The input file is processed in chunks that each fit into buf
    while True:
        assert bufstart < len(buf_view)
        bufend = readinto(buf_view[bufstart:]) + bufstart
        if bufstart == bufend:
            # End of file
            if bufstart > 0 and buf_view[bufstart-1] != b'\n':
                # There is still data in the buffer and its last character is
                # not a newline: This is a file that is missing the final
                # newline. Append a newline and continue.
                buf_view[bufstart] = b'\n'
                bufstart += 1
                bufend += 1
                extra_newline = True
            else:
                break

        # Parse all complete FASTQ records in this chunk
        pos = 0
        record_start = 0
        while True:
            # Parse the name (line 0)
            if c_buf[pos] != b'@':
                raise FastqFormatError("Line expected to "
                    "start with '@', but found {!r}".format(chr(c_buf[pos])),
                    line=n_records * 4)
            pos += 1
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                # Incomplete line: defer to the next chunk.
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            name_length = pos - endskip - record_start - 1
            name_encoded = c_buf + record_start + 1
            # .decode('latin-1') is 50% faster than .decode('ascii')
            name = c_buf[record_start+1:pos-endskip].decode('latin-1')
            pos += 1

            # Parse the sequence (line 1)
            sequence_start = pos
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            sequence = c_buf[sequence_start:pos-endskip].decode('latin-1')
            sequence_length = pos - endskip - sequence_start
            pos += 1

            # Parse second header (line 2)
            second_header_start = pos
            if pos == bufend:
                break
            if c_buf[pos] != b'+':
                raise FastqFormatError("Line expected to "
                    "start with '+', but found {!r}".format(chr(c_buf[pos])),
                    line=n_records * 4 + 2)
            pos += 1  # skip over the '+'
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            second_header_length = pos - endskip - second_header_start - 1
            if second_header_length == 0:
                second_header = False
            else:
                # A non-empty second header must repeat the first one exactly.
                if (name_length != second_header_length or
                        strncmp(c_buf+second_header_start+1,
                            name_encoded, second_header_length) != 0):
                    raise FastqFormatError(
                        "Sequence descriptions don't match ('{}' != '{}').\n"
                        "The second sequence description must be either "
                        "empty or equal to the first description.".format(
                            name_encoded[:name_length].decode('latin-1'),
                            c_buf[second_header_start+1:pos-endskip]
                            .decode('latin-1')), line=n_records * 4 + 2)
                second_header = True
            pos += 1

            # Parse qualities (line 3)
            qualities_start = pos
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            qualities = c_buf[qualities_start:pos-endskip].decode('latin-1')
            if pos - endskip - qualities_start != sequence_length:
                raise FastqFormatError("Length of sequence and "
                    "qualities differ", line=n_records * 4 + 3)
            pos += 1
            if n_records == 0:
                yield second_header  # first yielded value is special
            if custom_class:
                yield sequence_class(name, sequence, qualities)
            else:
                # Fast path: bypass normal construction for the built-in class.
                yield Sequence.__new__(Sequence, name, sequence, qualities)
            n_records += 1
            record_start = pos
            if pos == bufend:
                break
        if pos == bufend:
            if record_start == 0 and bufend == len(buf):
                # buffer too small, double it
                buffer_size *= 2
                prev_buf = buf
                buf = bytearray(buffer_size)
                buf[0:bufend] = prev_buf
                del prev_buf
                bufstart = bufend
                buf_view = buf
                c_buf = buf
            else:
                # Move the incomplete trailing record to the front of the
                # buffer so the next read appends right after it.
                bufstart = bufend - record_start
                buf[0:bufstart] = buf[record_start:bufend]
    if pos > record_start:
        # Data remains after EOF: the final record is incomplete.
        if extra_newline:
            pos -= 1
        lines = buf[record_start:pos].count(b'\n')
        raise FastqFormatError(
            'Premature end of file encountered. The incomplete final record was: '
            '{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
            line=n_records * 4 + lines)
"""
Parse a FASTQ file and yield Sequence objects
The *first value* that the generator yields is a boolean indicating whether
the first record in the FASTQ has a repeated header (in the third row
after the ``+``).
file -- a file-like object, opened in binary mode (it must have a readinto
method)
buffer_size -- size of the initial buffer. This is automatically grown
if a FASTQ record is encountered that does not fit.
"""
cdef:
bytearray buf = bytearray(buffer_size)
char[:] buf_view = buf
char* c_buf = buf
int endskip
str name
char* name_encoded
Py_ssize_t bufstart, bufend, pos, record_start, sequence_start
Py_ssize_t second_header_start, sequence_length, qualities_start
Py_ssize_t second_header_length, name_length
bint custom_class = sequence_class is not Sequence
Py_ssize_t n_records = 0
bint extra_newline = False
if buffer_size < 1:
raise ValueError("Starting buffer size too small")
# buf is a byte buffer that is re-used in each iteration. Its layout is:
#
# |-- complete records --|
# +---+------------------+---------+-------+
# | | | | |
# +---+------------------+---------+-------+
# ^ ^ ^ ^ ^
# 0 bufstart end bufend len(buf)
#
# buf[0:bufstart] is the 'leftover' data that could not be processed
# in the previous iteration because it contained an incomplete
# FASTQ record.
readinto = file.readinto
bufstart = 0
# The input file is processed in chunks that each fit into buf
while True:
assert bufstart < len(buf_view)
bufend = readinto(buf_view[bufstart:]) + bufstart
if bufstart == bufend:
# End of file
if bufstart > 0 and buf_view[bufstart-1] != b'\n':
# There is still data in the buffer and its last character is
# not a newline: This is a file that is missing the final
# newline. Append a newline and continue.
buf_view[bufstart] = b'\n'
bufstart += 1
bufend += 1
extra_newline = True
else:
break
# Parse all complete FASTQ records in this chunk
pos = 0
record_start = 0
while True:
# Parse the name (line 0)
if c_buf[pos] != b'@':
raise FastqFormatError("Line expected to "
"start with '@', but found {!r}".format(chr(c_buf[pos])),
line=n_records * 4)
pos += 1
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
name_length = pos - endskip - record_start - 1
name_encoded = c_buf + record_start + 1
# .decode('latin-1') is 50% faster than .decode('ascii')
name = c_buf[record_start+1:pos-endskip].decode('latin-1')
pos += 1
# Parse the sequence (line 1)
sequence_start = pos
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
sequence = c_buf[sequence_start:pos-endskip].decode('latin-1')
sequence_length = pos - endskip - sequence_start
pos += 1
# Parse second header (line 2)
second_header_start = pos
if pos == bufend:
break
if c_buf[pos] != b'+':
raise FastqFormatError("Line expected to "
"start with '+', but found {!r}".format(chr(c_buf[pos])),
line=n_records * 4 + 2)
pos += 1 # skip over the '+'
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
second_header_length = pos - endskip - second_header_start - 1
if second_header_length == 0:
second_header = False
else:
if (name_length != second_header_length or
strncmp(c_buf+second_header_start+1,
name_encoded, second_header_length) != 0):
raise FastqFormatError(
"Sequence descriptions don't match ('{}' != '{}').\n"
"The second sequence description must be either "
"empty or equal to the first description.".format(
name_encoded[:name_length].decode('latin-1'),
c_buf[second_header_start+1:pos-endskip]
.decode('latin-1')), line=n_records * 4 + 2)
second_header = True
pos += 1
# Parse qualities (line 3)
qualities_start = pos
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
qualities = c_buf[qualities_start:pos-endskip].decode('latin-1')
if pos - endskip - qualities_start != sequence_length:
raise FastqFormatError("Length of sequence and "
"qualities differ", line=n_records * 4 + 3)
pos += 1
if n_records == 0:
yield second_header # first yielded value is special
if custom_class:
yield sequence_class(name, sequence, qualities)
else:
yield Sequence.__new__(Sequence, name, sequence, qualities)
n_records += 1
record_start = pos
if pos == bufend:
break
if pos == bufend:
if record_start == 0 and bufend == len(buf):
# buffer too small, double it
buffer_size *= 2
prev_buf = buf
buf = bytearray(buffer_size)
buf[0:bufend] = prev_buf
del prev_buf
bufstart = bufend
buf_view = buf
c_buf = buf
else:
bufstart = bufend - record_start
buf[0:bufstart] = buf[record_start:bufend]
if pos > record_start:
if extra_newline:
pos -= 1
lines = buf[record_start:pos].count(b'\n')
raise FastqFormatError(
'Premature end of file encountered. The incomplete final record was: '
'{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
line=n_records * 4 + lines)
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by GitHub's download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain. Generated by
# versioneer-0.18 (https://github.com/warner/python-versioneer)
"""Git implementation of _version.py."""
import errno
import os
import re
import subprocess
import sys
def get_keywords():
    """Return the git keyword strings used to look up version information."""
    # These strings are substituted by git during git-archive. The
    # setup.py/versioneer.py machinery greps for the variable names, so each
    # must stay defined on a line of its own; _version.py simply calls
    # get_keywords().
    git_refnames = " (tag: v0.3)"
    git_full = "a16a94708e8b7b974a8598e56da4ffbf892f8898"
    git_date = "2018-10-03 14:11:31 +0200"
    return {"refnames": git_refnames, "full": git_full, "date": git_date}
class VersioneerConfig:
    """Container for Versioneer configuration parameters.

    Instances carry plain attributes (VCS, style, tag_prefix,
    parentdir_prefix, versionfile_source, verbose) assigned by get_config().
    """
def get_config():
    """Create, populate and return the VersioneerConfig() object."""
    # These values were filled in when 'setup.py versioneer' generated
    # this _version.py file.
    config = VersioneerConfig()
    config.VCS = "git"
    config.style = "pep440"
    config.tag_prefix = "v"
    config.parentdir_prefix = "dnaio-"
    config.versionfile_source = "src/dnaio/_version.py"
    config.verbose = False
    return config
# Raised by the individual version-discovery strategies (git keywords,
# parent-directory name, ...) so the caller can fall through to the next one.
class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""
LONG_VERSION_PY = {}
HANDLERS = {}
def register_vcs_handler(vcs, method):  # decorator
    """Decorator to mark a method as the handler for a particular VCS."""
    def decorate(f):
        """Store f in HANDLERS[vcs][method]."""
        # Create the per-VCS dict on first registration, then record f.
        HANDLERS.setdefault(vcs, {})[method] = f
        return f
    return decorate
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
                env=None):
    """Call the given command(s).

    Try each candidate executable name in *commands* until one can be
    launched; return (stdout, returncode), or (None, None) if no command
    could be run at all.
    """
    assert isinstance(commands, list)
    process = None
    for command in commands:
        dispcmd = str([command] + args)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen([command] + args, cwd=cwd, env=env,
                                       stdout=subprocess.PIPE,
                                       stderr=(subprocess.PIPE if hide_stderr
                                               else None))
        except EnvironmentError:
            e = sys.exc_info()[1]
            if e.errno == errno.ENOENT:
                # Executable not found under this name: try the next one.
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(e)
            return None, None
        break
    else:
        # Exhausted all candidate names without launching anything.
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None
    stdout = process.communicate()[0].strip()
    if sys.version_info[0] >= 3:
        stdout = stdout.decode()
    if process.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, process.returncode
    return stdout, process.returncode
def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory
    """
    tried = []
    for _ in range(3):
        dirname = os.path.basename(root)
        if dirname.startswith(parentdir_prefix):
            # Everything after the prefix is taken to be the version string.
            return {"version": dirname[len(parentdir_prefix):],
                    "full-revisionid": None,
                    "dirty": False, "error": None, "date": None}
        tried.append(root)
        root = os.path.dirname(root)  # up a level
    if verbose:
        print("Tried directories %s but none started with prefix %s" %
              (str(tried), parentdir_prefix))
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract version information from the given file."""
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    keywords = {}
    # Map the variable-assignment prefix in the file to the keyword name
    # under which its quoted value is stored.
    wanted = (("git_refnames =", "refnames"),
              ("git_full =", "full"),
              ("git_date =", "date"))
    try:
        f = open(versionfile_abs, "r")
        for line in f.readlines():
            stripped = line.strip()
            for prefix, key in wanted:
                if stripped.startswith(prefix):
                    mo = re.search(r'=\s*"(.*)"', line)
                    if mo:
                        keywords[key] = mo.group(1)
        f.close()
    except EnvironmentError:
        # Missing/unreadable file: return whatever was collected (possibly {}).
        pass
    return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords.

    Picks the best-matching tag out of the expanded git-archive refnames
    and returns a version dict; raises NotThisMethod when the keywords
    are absent or unexpanded.
    """
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = {ref.strip() for ref in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    tag_marker = "tag: "
    tags = {ref[len(tag_marker):] for ref in refs if ref.startswith(tag_marker)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {ref for ref in refs if re.search(r'\d', ref)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for candidate in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if not candidate.startswith(tag_prefix):
            continue
        version = candidate[len(tag_prefix):]
        if verbose:
            print("picking %s" % version)
        return {"version": version,
                "full-revisionid": keywords["full"].strip(),
                "dirty": False, "error": None,
                "date": date}
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {"version": "0+unknown",
            "full-revisionid": keywords["full"].strip(),
            "dirty": False, "error": "no suitable tags", "date": None}
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    """Get version from 'git describe' in the root of the source tree.
    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Returns a "pieces" dict with keys: long, short, error, dirty,
    closest-tag, distance, date. On an unparseable/mismatched describe
    output, returns early with only long/short/dirty and a non-None
    "error". Raises NotThisMethod when git itself is unavailable.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    # Cheap probe: if this isn't a git checkout, stop before running describe.
    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
                          hide_stderr=True)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")
    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
                                          "--always", "--long",
                                          "--match", "%s*" % tag_prefix],
                                   cwd=root)
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()
    pieces = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None
    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out
    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        # strip the suffix so the TAG-NUM-gHEX parse below isn't confused
        git_describe = git_describe[:git_describe.rindex("-dirty")]
    # now we have TAG-NUM-gHEX or HEX
    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
        if not mo:
            # unparseable. Maybe git-describe is misbehaving?
            pieces["error"] = ("unable to parse git-describe output: '%s'"
                               % describe_out)
            return pieces
        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            # record the mismatch instead of raising; render() will surface it
            if verbose:
                fmt = "tag '%s' doesn't start with prefix '%s'"
                print(fmt % (full_tag, tag_prefix))
            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
                               % (full_tag, tag_prefix))
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix):]
        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))
        # commit: short hex revision ID
        pieces["short"] = mo.group(3)
    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        # NOTE(review): count_out may be None if rev-list fails, which would
        # make int() raise TypeError here — confirm against upstream versioneer
        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
                                    cwd=root)
        pieces["distance"] = int(count_out)  # total number of commits
    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    # NOTE(review): run_command may return (None, None); [0].strip() would
    # then raise AttributeError — presumed unreachable once git worked above
    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
                       cwd=root)[0].strip()
    pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    return pieces
def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a ."""
    # A PEP 440 local version segment starts with "+"; parts after the
    # first are joined with "." — the closest tag may already carry a "+".
    return "." if "+" in pieces.get("closest-tag", "") else "+"
def render_pep440(pieces):
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: no tag to anchor on, synthesize a "0+untagged" base
        version = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            version += ".dirty"
        return version
    version = tag
    if pieces["distance"] or pieces["dirty"]:
        # append the local version segment only when we're past the tag
        version = "%s%s%d.g%s" % (version, plus_or_dot(pieces),
                                  pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            version += ".dirty"
    return version
def render_pep440_pre(pieces):
    """TAG[.post.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post.devDISTANCE
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        return "0.post.dev%d" % pieces["distance"]
    if pieces["distance"]:
        return "%s.post.dev%d" % (tag, pieces["distance"])
    return tag
def render_pep440_post(pieces):
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: anchor on "0" and always append the hash
        version = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            version += ".dev0"
        return version + "+g%s" % pieces["short"]
    version = tag
    if pieces["distance"] or pieces["dirty"]:
        version += ".post%d" % pieces["distance"]
        if pieces["dirty"]:
            version += ".dev0"
        version += plus_or_dot(pieces) + "g%s" % pieces["short"]
    return version
def render_pep440_old(pieces):
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    # (fixed docstring typo: "Eexceptions" -> "Exceptions")
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
    return rendered
def render_git_describe(pieces):
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.
    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        # distance/hash are appended only when we're past the tag
        rendered = tag
        if pieces["distance"]:
            rendered = "%s-%d-g%s" % (tag, pieces["distance"],
                                      pieces["short"])
    else:
        # exception #1: bare hex, as 'git describe --always' prints it
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered
def render_git_describe_long(pieces):
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always -long'.
    The distance/hash is unconditional.
    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        # unlike render_git_describe, always append distance and hash
        rendered = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered
def render(pieces, style):
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        # upstream parsing failed: report the error instead of a fake version
        return {"version": "unknown",
                "full-revisionid": pieces.get("long"),
                "dirty": None,
                "error": pieces["error"],
                "date": None}
    if not style or style == "default":
        style = "pep440"  # the default
    # dispatch table instead of an if/elif chain
    renderers = {
        "pep440": render_pep440,
        "pep440-pre": render_pep440_pre,
        "pep440-post": render_pep440_post,
        "pep440-old": render_pep440_old,
        "git-describe": render_git_describe,
        "git-describe-long": render_git_describe_long,
    }
    if style not in renderers:
        raise ValueError("unknown style '%s'" % style)
    rendered = renderers[style](pieces)
    return {"version": rendered, "full-revisionid": pieces["long"],
            "dirty": pieces["dirty"], "error": None,
            "date": pieces.get("date")}
def get_versions():
    """Get version information or return default if unable to do so."""
    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
    # __file__, we can work backwards from there to the root. Some
    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
    # case we can only use expanded keywords.
    cfg = get_config()
    verbose = cfg.verbose
    # Strategy 1: expanded git-archive keywords (works in exported tarballs).
    try:
        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
                                          verbose)
    except NotThisMethod:
        pass
    # Locate the source-tree root by stripping one path component per
    # segment of versionfile_source (the relative path from root to here).
    try:
        root = os.path.realpath(__file__)
        for _ in cfg.versionfile_source.split('/'):
            root = os.path.dirname(root)
    except NameError:
        # no __file__ (py2exe/bbfreeze/...): can't find the tree at all
        return {"version": "0+unknown", "full-revisionid": None,
                "dirty": None,
                "error": "unable to find root of source tree",
                "date": None}
    # Strategy 2: ask git directly (checked-out source tree).
    try:
        return render(git_pieces_from_vcs(cfg.tag_prefix, root, verbose),
                      cfg.style)
    except NotThisMethod:
        pass
    # Strategy 3: parse the version out of the parent directory name.
    try:
        if cfg.parentdir_prefix:
            return versions_from_parentdir(cfg.parentdir_prefix, root,
                                           verbose)
    except NotThisMethod:
        pass
    # All strategies exhausted: report an explicit failure.
    return {"version": "0+unknown", "full-revisionid": None,
            "dirty": None,
            "error": "unable to compute version", "date": None}
"""Chunked reading of FASTA and FASTQ files"""
from ._core import paired_fastq_heads as _paired_fastq_heads
from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat
......@@ -53,6 +54,9 @@ def read_chunks(f, buffer_size=4*1024**2):
head = _fastq_head
elif start == 1 and (buf[0:1] == b'#' or buf[0:1] == b'>'):
head = _fasta_head
elif start == 0:
# Empty file
return
else:
raise UnknownFileFormat('Input file format unknown')
......
import io
from xopen import xopen
......