Skip to content
Commits on Source (4)
[*.py]
[*.{py,pyx}]
charset=utf-8
end_of_line=lf
insert_final_newline=true
......
*.fastq -crlf
*.fasta -crlf
src/dnaio/_version.py export-subst
......@@ -9,3 +9,4 @@ __pycache__
/src/*/*.so
/src/*.egg-info/
/.tox/
/src/dnaio/_version.py
sudo: false
language: python
dist: xenial
cache:
directories:
- $HOME/.cache/pip
......@@ -8,13 +10,19 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
- "nightly"
install:
- pip install --upgrade coverage codecov
- pip install .[dev]
script:
- pytest
- coverage run -m pytest
after_success:
- coverage combine
- codecov
env:
global:
......@@ -24,13 +32,7 @@ env:
jobs:
include:
- stage: test
python: "3.7"
sudo: true # This may possibly be removed in the future
dist: xenial
- stage: deploy
sudo: required
services:
- docker
python: "3.6"
......
[![Travis](https://travis-ci.org/marcelm/dnaio.svg?branch=master)](https://travis-ci.org/marcelm/dnaio)
[![PyPI](https://img.shields.io/pypi/v/dnaio.svg?branch=master)](https://pypi.python.org/pypi/dnaio)
[![Codecov](https://codecov.io/gh/marcelm/dnaio/branch/master/graph/badge.svg)](https://codecov.io/gh/marcelm/dnaio)
# dnaio parses FASTQ and FASTA
`dnaio` is a Python 3 library for fast parsing of FASTQ and also FASTA files. The code was previously part of the
[cutadapt](https://cutadapt.readthedocs.io/) tool and has been improved since it has been split out.
[Cutadapt](https://cutadapt.readthedocs.io/) tool and has been improved since it has been split out.
## Example usage
......
python-dnaio (0.4-1) unstable; urgency=medium
* Team upload.
* New upstream version
-- Steffen Moeller <moeller@debian.org> Wed, 30 Oct 2019 11:30:18 +0100
python-dnaio (0.3-2) unstable; urgency=medium
* Team upload.
......
......@@ -11,7 +11,7 @@ Build-Depends: debhelper-compat (= 12),
python3-pytest,
python3-xopen,
cython3
Standards-Version: 4.4.0
Standards-Version: 4.4.1
Vcs-Browser: https://salsa.debian.org/med-team/python-dnaio
Vcs-Git: https://salsa.debian.org/med-team/python-dnaio.git
Homepage: https://github.com/marcelm/dnaio
......
......@@ -11,6 +11,12 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@ --with python3 --buildsystem=pybuild
override_dh_auto_clean:
dh_auto_clean
rm -rf .pytest_cache
rm -f src/dnaio/_core.c
rm -f src/dnaio/_version.py
### When overriding auto_test make sure DEB_BUILD_OPTIONS will be respected
#override_dh_auto_test:
#ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS)))
......
[build-system]
requires = ["setuptools", "wheel", "setuptools_scm", "Cython"]
[versioneer]
VCS = git
style = pep440
versionfile_source = src/dnaio/_version.py
versionfile_build = dnaio/_version.py
tag_prefix = v
parentdir_prefix = dnaio-
import sys
import os.path
from setuptools import setup, Extension
from setuptools import setup, Extension, find_packages
from distutils.command.sdist import sdist as _sdist
from distutils.command.build_ext import build_ext as _build_ext
import versioneer
if sys.version_info[:2] < (3, 4):
sys.stdout.write('Python 3.4 or later is required\n')
......@@ -30,12 +29,8 @@ extensions = [
Extension('dnaio._core', sources=['src/dnaio/_core.pyx']),
]
cmdclass = versioneer.get_cmdclass()
versioneer_build_ext = cmdclass.get('build_ext', _build_ext)
versioneer_sdist = cmdclass.get('sdist', _sdist)
class build_ext(versioneer_build_ext):
class BuildExt(_build_ext):
def run(self):
# If we encounter a PKG-INFO file, then this is likely a .tar.gz/.zip
# file retrieved from PyPI that already includes the pre-cythonized
......@@ -47,26 +42,24 @@ class build_ext(versioneer_build_ext):
# only sensible thing is to require Cython to be installed.
from Cython.Build import cythonize
self.extensions = cythonize(self.extensions)
versioneer_build_ext.run(self)
super().run()
class sdist(versioneer_sdist):
class SDist(_sdist):
def run(self):
# Make sure the compiled Cython files in the distribution are up-to-date
from Cython.Build import cythonize
cythonize(extensions)
versioneer_sdist.run(self)
super().run()
cmdclass['build_ext'] = build_ext
cmdclass['sdist'] = sdist
with open('README.md', encoding='utf-8') as f:
long_description = f.read()
setup(
name='dnaio',
version=versioneer.get_version(),
setup_requires=['setuptools_scm'], # Support pip versions that don't know about pyproject.toml
use_scm_version={'write_to': 'src/dnaio/_version.py'},
author='Marcel Martin',
author_email='marcel.martin@scilifelab.se',
url='https://github.com/marcelm/dnaio/',
......@@ -74,14 +67,15 @@ setup(
long_description=long_description,
long_description_content_type='text/markdown',
license='MIT',
packages=['dnaio'],
package_dir={'': 'src'},
packages=find_packages('src'),
extras_require={
'dev': ['Cython', 'pytest'],
},
ext_modules=extensions,
cmdclass=cmdclass,
install_requires=['xopen'],
cmdclass={'build_ext': BuildExt, 'sdist': SDist},
install_requires=['xopen>=0.8.2'],
python_requires='>=3.4',
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research",
......
......@@ -21,6 +21,7 @@ __all__ = [
import os
from contextlib import ExitStack
import functools
import pathlib
from xopen import xopen
......@@ -29,10 +30,19 @@ from .readers import FastaReader, FastqReader
from .writers import FastaWriter, FastqWriter
from .exceptions import UnknownFileFormat, FileFormatError, FastaFormatError, FastqFormatError
from .chunks import read_chunks, read_paired_chunks
from ._version import version as __version__
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
try:
from os import fspath # Exists in Python 3.6+
except ImportError:
def fspath(path):
if hasattr(path, "__fspath__"):
return path.__fspath__()
# Python 3.4 and 3.5 do not support the file system path protocol
if isinstance(path, pathlib.Path):
return str(path)
return path
def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qualities=None):
......@@ -42,10 +52,10 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
classes also defined in this module.
file1, file2 -- Paths to regular or compressed files or file-like
objects. Use file1 if data is single-end. If also file2 is provided,
sequences are paired.
objects (as str or as pathlib.Path). Use only file1 if data is single-end.
If sequences are paired, use also file2.
mode -- Either 'r' for reading or 'w' for writing.
mode -- Either 'r' for reading, 'w' for writing or 'a' for appending.
interleaved -- If True, then file1 contains interleaved paired-end data.
file2 must be None in this case.
......@@ -62,20 +72,26 @@ def open(file1, *, file2=None, fileformat=None, interleaved=False, mode='r', qua
* When False (no qualities available), an exception is raised when the
auto-detected output format is FASTQ.
"""
if mode not in ('r', 'w'):
raise ValueError("Mode must be 'r' or 'w'")
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if interleaved and file2 is not None:
raise ValueError("When interleaved is set, file2 must be None")
if file2 is not None:
if mode == 'r':
if mode in "wa" and file1 == file2:
raise ValueError("The paired-end output files are identical")
if mode == "r":
return PairedSequenceReader(file1, file2, fileformat)
else:
elif mode == "w":
return PairedSequenceWriter(file1, file2, fileformat, qualities)
else:
return PairedSequenceAppender(file1, file2, fileformat, qualities)
if interleaved:
if mode == 'r':
if mode == "r":
return InterleavedSequenceReader(file1, fileformat)
else:
elif mode == "w":
return InterleavedSequenceWriter(file1, fileformat, qualities)
else:
return InterleavedSequenceAppender(file1, fileformat, qualities)
# The multi-file options have been dealt with, delegate rest to the
# single-file function.
......@@ -106,15 +122,21 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
"""
Open a single sequence file. See description of open() above.
"""
if mode not in ('r', 'w'):
raise ValueError("Mode must be 'r' or 'w'")
if isinstance(file, str):
file = xopen(file, mode + 'b')
if mode not in ("r", "w", "a"):
raise ValueError("Mode must be 'r', 'w' or 'a'")
if isinstance(file, (str, pathlib.Path)):
path = fspath(file)
file = xopen(path, mode + 'b')
close_file = True
else:
if mode == 'r' and not hasattr(file, 'readinto'):
raise ValueError(
'When passing in an open file-like object, it must have been opened in binary mode')
if hasattr(file, "name") and isinstance(file.name, str):
path = file.name
else:
path = None
close_file = False
if mode == 'r':
fastq_handler = FastqReader
......@@ -122,59 +144,56 @@ def _open_single(file, *, fileformat=None, mode='r', qualities=None):
else:
fastq_handler = FastqWriter
fasta_handler = FastaWriter
fastq_handler = functools.partial(fastq_handler, _close_file=close_file)
fasta_handler = functools.partial(fasta_handler, _close_file=close_file)
if fileformat: # Explict file format given
fileformat = fileformat.lower()
if fileformat == 'fasta':
return fasta_handler(file)
elif fileformat == 'fastq':
return fastq_handler(file)
else:
handlers = {
'fastq': functools.partial(fastq_handler, _close_file=close_file),
'fasta': functools.partial(fasta_handler, _close_file=close_file),
}
if fileformat:
try:
handler = handlers[fileformat.lower()]
except KeyError:
raise UnknownFileFormat(
"File format {!r} is unknown (expected 'fasta' or 'fastq').".format(fileformat))
return handler(file)
# First, try to detect the file format from the file name only
format = None
if hasattr(file, "name") and isinstance(file.name, str):
format = _detect_format_from_name(file.name)
if format is None and mode == 'w' and qualities is not None:
if path is not None:
fileformat = _detect_format_from_name(path)
if fileformat is None and mode == 'w' and qualities is not None:
# Format not recognized, but we know whether to use a format with or without qualities
format = 'fastq' if qualities else 'fasta'
fileformat = 'fastq' if qualities else 'fasta'
if mode == 'r' and format is None:
if mode == 'r' and fileformat is None:
# No format detected so far. Try to read from the file.
if file.seekable():
first_char = file.read(1)
file.seek(-1, 1)
else:
first_char = file.peek(1)[0:1]
if first_char == b'#':
# A comment char - only valid for some FASTA variants (csfasta)
format = 'fasta'
elif first_char == b'>':
format = 'fasta'
elif first_char == b'@':
format = 'fastq'
elif first_char == b'':
# Empty input. Pretend this is FASTQ
format = 'fastq'
else:
formats = {
b'@': 'fastq',
b'>': 'fasta',
b'#': 'fasta', # Some FASTA variants allow comments
b'': 'fastq', # Pretend FASTQ for empty input
}
try:
fileformat = formats[first_char]
except KeyError:
raise UnknownFileFormat(
'Could not determine whether file {!r} is FASTA or FASTQ. The file extension was '
'not available or not recognized and the first character in the file ({!r}) is '
'unexpected.'.format(file, first_char))
if format is None:
if fileformat is None:
assert mode == 'w'
raise UnknownFileFormat('Cannot determine whether to write in FASTA or FASTQ format')
extra = " because the output file name is not available" if path is None else ""
raise UnknownFileFormat("Auto-detection of the output file format (FASTA/FASTQ) failed" + extra)
if format == 'fastq' and mode == 'w' and qualities is False:
if fileformat == 'fastq' and mode in "wa" and qualities is False:
raise ValueError(
'Output format cannot be FASTQ since no quality values are available.')
return fastq_handler(file) if format == 'fastq' else fasta_handler(file)
return handlers[fileformat](file)
def _sequence_names_match(r1, r2):
......@@ -280,11 +299,13 @@ class InterleavedSequenceReader:
class PairedSequenceWriter:
_mode = "w"
def __init__(self, file1, file2, fileformat='fastq', qualities=None):
with ExitStack() as stack:
self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode='w',
self._writer1 = stack.enter_context(_open_single(file1, fileformat=fileformat, mode=self._mode,
qualities=qualities))
self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode='w',
self._writer2 = stack.enter_context(_open_single(file2, fileformat=fileformat, mode=self._mode,
qualities=qualities))
self._close = stack.pop_all().close
......@@ -303,15 +324,20 @@ class PairedSequenceWriter:
self.close()
class PairedSequenceAppender(PairedSequenceWriter):
    # Identical to PairedSequenceWriter except that both output files are
    # opened in append mode ("a") instead of write mode.
    _mode = "a"
class InterleavedSequenceWriter:
"""
Write paired-end reads to an interleaved FASTA or FASTQ file
"""
_mode = "w"
def __init__(self, file, fileformat='fastq', qualities=None):
self._writer = _open_single(
file, fileformat=fileformat, mode='w', qualities=qualities)
file, fileformat=fileformat, mode=self._mode, qualities=qualities)
def write(self, read1, read2):
self._writer.write(read1)
......@@ -326,3 +352,7 @@ class InterleavedSequenceWriter:
def __exit__(self, *args):
self.close()
class InterleavedSequenceAppender(InterleavedSequenceWriter):
    # Identical to InterleavedSequenceWriter except that the output file is
    # opened in append mode ("a") instead of write mode.
    _mode = "a"
......@@ -8,59 +8,59 @@ from ._util import shorten
cdef class Sequence:
    """
    A record in a FASTA or FASTQ file. For FASTA, the qualities attribute
    is None. For FASTQ, qualities is a string and it contains the qualities
    encoded as ascii(qual+33).
    """
    cdef:
        public str name       # record header (without the leading '@'/'>')
        public str sequence   # nucleotide sequence
        public str qualities  # ASCII-encoded qualities, or None for FASTA

    def __cinit__(self, str name, str sequence, str qualities=None):
        """Set qualities to None if there are no quality values"""
        self.name = name
        self.sequence = sequence
        self.qualities = qualities
        # A FASTQ record is only valid if sequence and qualities have the
        # same length; enforce that invariant at construction time.
        if qualities is not None and len(qualities) != len(sequence):
            rname = shorten(name)
            raise ValueError("In read named {!r}: length of quality sequence "
                "({}) and length of read ({}) do not match".format(
                    rname, len(qualities), len(sequence)))

    def __getitem__(self, key):
        """slicing"""
        # Slice sequence and qualities in lockstep so the length invariant
        # checked in __cinit__ still holds for the returned record.
        return self.__class__(
            self.name,
            self.sequence[key],
            self.qualities[key] if self.qualities is not None else None)

    def __repr__(self):
        qstr = ''
        if self.qualities is not None:
            qstr = ', qualities={!r}'.format(shorten(self.qualities))
        return '<Sequence(name={!r}, sequence={!r}{})>'.format(
            shorten(self.name), shorten(self.sequence), qstr)

    def __len__(self):
        return len(self.sequence)

    def __richcmp__(self, other, int op):
        # Only Py_EQ (op == 2) and Py_NE (op == 3) are supported; ordering
        # comparisons between records are not meaningful.
        if 2 <= op <= 3:
            eq = self.name == other.name and \
                self.sequence == other.sequence and \
                self.qualities == other.qualities
            if op == 2:
                return eq
            else:
                return not eq
        else:
            raise NotImplementedError()

    def __reduce__(self):
        # Enable pickling: cdef classes are not picklable by default.
        return (Sequence, (self.name, self.sequence, self.qualities))
"""
A record in a FASTA or FASTQ file. For FASTA, the qualities attribute
is None. For FASTQ, qualities is a string and it contains the qualities
encoded as ascii(qual+33).
"""
cdef:
public str name
public str sequence
public str qualities
def __cinit__(self, str name, str sequence, str qualities=None):
"""Set qualities to None if there are no quality values"""
self.name = name
self.sequence = sequence
self.qualities = qualities
if qualities is not None and len(qualities) != len(sequence):
rname = shorten(name)
raise ValueError("In read named {!r}: length of quality sequence "
"({}) and length of read ({}) do not match".format(
rname, len(qualities), len(sequence)))
def __getitem__(self, key):
"""slicing"""
return self.__class__(
self.name,
self.sequence[key],
self.qualities[key] if self.qualities is not None else None)
def __repr__(self):
qstr = ''
if self.qualities is not None:
qstr = ', qualities={!r}'.format(shorten(self.qualities))
return '<Sequence(name={!r}, sequence={!r}{})>'.format(
shorten(self.name), shorten(self.sequence), qstr)
def __len__(self):
return len(self.sequence)
def __richcmp__(self, other, int op):
if 2 <= op <= 3:
eq = self.name == other.name and \
self.sequence == other.sequence and \
self.qualities == other.qualities
if op == 2:
return eq
else:
return not eq
else:
raise NotImplementedError()
def __reduce__(self):
return (Sequence, (self.name, self.sequence, self.qualities))
# It would be nice to be able to have the first parameter be an
......@@ -69,216 +69,216 @@ cdef class Sequence:
# See <https://stackoverflow.com/questions/28203670/>
ctypedef fused bytes_or_bytearray:
bytes
bytearray
bytes
bytearray
def paired_fastq_heads(bytes_or_bytearray buf1, bytes_or_bytearray buf2, Py_ssize_t end1, Py_ssize_t end2):
    """
    Skip forward in the two buffers by multiples of four lines.

    Return a tuple (length1, length2) such that buf1[:length1] and
    buf2[:length2] contain the same number of lines (where the
    line number is divisible by four).
    """
    cdef:
        Py_ssize_t pos1 = 0, pos2 = 0
        Py_ssize_t linebreaks = 0
        unsigned char* data1 = buf1
        unsigned char* data2 = buf2
        # Start offsets of the most recent *complete* four-line (FASTQ
        # record) boundary seen so far in each buffer.
        Py_ssize_t record_start1 = 0
        Py_ssize_t record_start2 = 0

    while True:
        # Advance pos1 to just past the next newline in buf1 (or to end1).
        while pos1 < end1 and data1[pos1] != b'\n':
            pos1 += 1
        if pos1 == end1:
            break
        pos1 += 1
        # Advance pos2 to just past the next newline in buf2 (or to end2).
        while pos2 < end2 and data2[pos2] != b'\n':
            pos2 += 1
        if pos2 == end2:
            break
        pos2 += 1
        linebreaks += 1
        # Every four newlines we have consumed one full FASTQ record in
        # each buffer; remember this position as a safe cut point.
        if linebreaks == 4:
            linebreaks = 0
            record_start1 = pos1
            record_start2 = pos2

    # Hit the end of the data block
    return record_start1, record_start2
"""
Skip forward in the two buffers by multiples of four lines.
Return a tuple (length1, length2) such that buf1[:length1] and
buf2[:length2] contain the same number of lines (where the
line number is divisible by four).
"""
cdef:
Py_ssize_t pos1 = 0, pos2 = 0
Py_ssize_t linebreaks = 0
unsigned char* data1 = buf1
unsigned char* data2 = buf2
Py_ssize_t record_start1 = 0
Py_ssize_t record_start2 = 0
while True:
while pos1 < end1 and data1[pos1] != b'\n':
pos1 += 1
if pos1 == end1:
break
pos1 += 1
while pos2 < end2 and data2[pos2] != b'\n':
pos2 += 1
if pos2 == end2:
break
pos2 += 1
linebreaks += 1
if linebreaks == 4:
linebreaks = 0
record_start1 = pos1
record_start2 = pos2
# Hit the end of the data block
return record_start1, record_start2
def fastq_iter(file, sequence_class, Py_ssize_t buffer_size):
    """
    Parse a FASTQ file and yield Sequence objects

    The *first value* that the generator yields is a boolean indicating whether
    the first record in the FASTQ has a repeated header (in the third row
    after the ``+``).

    file -- a file-like object, opened in binary mode (it must have a readinto
    method)

    buffer_size -- size of the initial buffer. This is automatically grown
        if a FASTQ record is encountered that does not fit.
    """
    cdef:
        bytearray buf = bytearray(buffer_size)
        char[:] buf_view = buf
        char* c_buf = buf
        int endskip                  # 1 if the current line ends in '\r\n', else 0
        str name
        char* name_encoded
        Py_ssize_t bufstart, bufend, pos, record_start, sequence_start
        Py_ssize_t second_header_start, sequence_length, qualities_start
        Py_ssize_t second_header_length, name_length
        bint custom_class = sequence_class is not Sequence
        Py_ssize_t n_records = 0
        bint extra_newline = False   # True if we appended a missing final newline

    if buffer_size < 1:
        raise ValueError("Starting buffer size too small")

    # buf is a byte buffer that is re-used in each iteration. Its layout is:
    #
    # |-- complete records --|
    # +---+------------------+---------+-------+
    # |   |                  |         |       |
    # +---+------------------+---------+-------+
    # ^   ^                  ^         ^       ^
    # 0   bufstart           end       bufend  len(buf)
    #
    # buf[0:bufstart] is the 'leftover' data that could not be processed
    # in the previous iteration because it contained an incomplete
    # FASTQ record.

    readinto = file.readinto
    bufstart = 0

    # The input file is processed in chunks that each fit into buf
    while True:
        assert bufstart < len(buf_view)
        bufend = readinto(buf_view[bufstart:]) + bufstart
        if bufstart == bufend:
            # End of file
            if bufstart > 0 and buf_view[bufstart-1] != b'\n':
                # There is still data in the buffer and its last character is
                # not a newline: This is a file that is missing the final
                # newline. Append a newline and continue.
                buf_view[bufstart] = b'\n'
                bufstart += 1
                bufend += 1
                extra_newline = True
            else:
                break

        # Parse all complete FASTQ records in this chunk
        pos = 0
        record_start = 0
        while True:
            # Parse the name (line 0)
            if c_buf[pos] != b'@':
                raise FastqFormatError("Line expected to "
                    "start with '@', but found {!r}".format(chr(c_buf[pos])),
                    line=n_records * 4)
            pos += 1
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                # Incomplete line: defer to the next chunk.
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            name_length = pos - endskip - record_start - 1
            name_encoded = c_buf + record_start + 1
            # .decode('latin-1') is 50% faster than .decode('ascii')
            name = c_buf[record_start+1:pos-endskip].decode('latin-1')
            pos += 1

            # Parse the sequence (line 1)
            sequence_start = pos
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            sequence = c_buf[sequence_start:pos-endskip].decode('latin-1')
            sequence_length = pos - endskip - sequence_start
            pos += 1

            # Parse second header (line 2)
            second_header_start = pos
            if pos == bufend:
                break
            if c_buf[pos] != b'+':
                raise FastqFormatError("Line expected to "
                    "start with '+', but found {!r}".format(chr(c_buf[pos])),
                    line=n_records * 4 + 2)
            pos += 1  # skip over the '+'
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            second_header_length = pos - endskip - second_header_start - 1
            if second_header_length == 0:
                second_header = False
            else:
                # A non-empty second header must repeat the first one exactly.
                if (name_length != second_header_length or
                        strncmp(c_buf+second_header_start+1,
                            name_encoded, second_header_length) != 0):
                    raise FastqFormatError(
                        "Sequence descriptions don't match ('{}' != '{}').\n"
                        "The second sequence description must be either "
                        "empty or equal to the first description.".format(
                            name_encoded[:name_length].decode('latin-1'),
                            c_buf[second_header_start+1:pos-endskip]
                            .decode('latin-1')), line=n_records * 4 + 2)
                second_header = True
            pos += 1

            # Parse qualities (line 3)
            qualities_start = pos
            while pos < bufend and c_buf[pos] != b'\n':
                pos += 1
            if pos == bufend:
                break
            endskip = 1 if c_buf[pos-1] == b'\r' else 0
            qualities = c_buf[qualities_start:pos-endskip].decode('latin-1')
            if pos - endskip - qualities_start != sequence_length:
                raise FastqFormatError("Length of sequence and "
                    "qualities differ", line=n_records * 4 + 3)
            pos += 1
            if n_records == 0:
                yield second_header  # first yielded value is special
            if custom_class:
                yield sequence_class(name, sequence, qualities)
            else:
                # Fast path: bypass normal construction for the built-in class.
                yield Sequence.__new__(Sequence, name, sequence, qualities)
            n_records += 1
            record_start = pos
            if pos == bufend:
                break
        if pos == bufend:
            if record_start == 0 and bufend == len(buf):
                # buffer too small, double it
                buffer_size *= 2
                prev_buf = buf
                buf = bytearray(buffer_size)
                buf[0:bufend] = prev_buf
                del prev_buf
                bufstart = bufend
                buf_view = buf
                c_buf = buf
            else:
                # Move the incomplete trailing record to the front of the
                # buffer so the next read appends right after it.
                bufstart = bufend - record_start
                buf[0:bufstart] = buf[record_start:bufend]
    if pos > record_start:
        # Data remains after EOF: the final record is incomplete.
        if extra_newline:
            pos -= 1
        lines = buf[record_start:pos].count(b'\n')
        raise FastqFormatError(
            'Premature end of file encountered. The incomplete final record was: '
            '{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
            line=n_records * 4 + lines)
"""
Parse a FASTQ file and yield Sequence objects
The *first value* that the generator yields is a boolean indicating whether
the first record in the FASTQ has a repeated header (in the third row
after the ``+``).
file -- a file-like object, opened in binary mode (it must have a readinto
method)
buffer_size -- size of the initial buffer. This is automatically grown
if a FASTQ record is encountered that does not fit.
"""
cdef:
bytearray buf = bytearray(buffer_size)
char[:] buf_view = buf
char* c_buf = buf
int endskip
str name
char* name_encoded
Py_ssize_t bufstart, bufend, pos, record_start, sequence_start
Py_ssize_t second_header_start, sequence_length, qualities_start
Py_ssize_t second_header_length, name_length
bint custom_class = sequence_class is not Sequence
Py_ssize_t n_records = 0
bint extra_newline = False
if buffer_size < 1:
raise ValueError("Starting buffer size too small")
# buf is a byte buffer that is re-used in each iteration. Its layout is:
#
# |-- complete records --|
# +---+------------------+---------+-------+
# | | | | |
# +---+------------------+---------+-------+
# ^ ^ ^ ^ ^
# 0 bufstart end bufend len(buf)
#
# buf[0:bufstart] is the 'leftover' data that could not be processed
# in the previous iteration because it contained an incomplete
# FASTQ record.
readinto = file.readinto
bufstart = 0
# The input file is processed in chunks that each fit into buf
while True:
assert bufstart < len(buf_view)
bufend = readinto(buf_view[bufstart:]) + bufstart
if bufstart == bufend:
# End of file
if bufstart > 0 and buf_view[bufstart-1] != b'\n':
# There is still data in the buffer and its last character is
# not a newline: This is a file that is missing the final
# newline. Append a newline and continue.
buf_view[bufstart] = b'\n'
bufstart += 1
bufend += 1
extra_newline = True
else:
break
# Parse all complete FASTQ records in this chunk
pos = 0
record_start = 0
while True:
# Parse the name (line 0)
if c_buf[pos] != b'@':
raise FastqFormatError("Line expected to "
"start with '@', but found {!r}".format(chr(c_buf[pos])),
line=n_records * 4)
pos += 1
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
name_length = pos - endskip - record_start - 1
name_encoded = c_buf + record_start + 1
# .decode('latin-1') is 50% faster than .decode('ascii')
name = c_buf[record_start+1:pos-endskip].decode('latin-1')
pos += 1
# Parse the sequence (line 1)
sequence_start = pos
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
sequence = c_buf[sequence_start:pos-endskip].decode('latin-1')
sequence_length = pos - endskip - sequence_start
pos += 1
# Parse second header (line 2)
second_header_start = pos
if pos == bufend:
break
if c_buf[pos] != b'+':
raise FastqFormatError("Line expected to "
"start with '+', but found {!r}".format(chr(c_buf[pos])),
line=n_records * 4 + 2)
pos += 1 # skip over the '+'
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
second_header_length = pos - endskip - second_header_start - 1
if second_header_length == 0:
second_header = False
else:
if (name_length != second_header_length or
strncmp(c_buf+second_header_start+1,
name_encoded, second_header_length) != 0):
raise FastqFormatError(
"Sequence descriptions don't match ('{}' != '{}').\n"
"The second sequence description must be either "
"empty or equal to the first description.".format(
name_encoded[:name_length].decode('latin-1'),
c_buf[second_header_start+1:pos-endskip]
.decode('latin-1')), line=n_records * 4 + 2)
second_header = True
pos += 1
# Parse qualities (line 3)
qualities_start = pos
while pos < bufend and c_buf[pos] != b'\n':
pos += 1
if pos == bufend:
break
endskip = 1 if c_buf[pos-1] == b'\r' else 0
qualities = c_buf[qualities_start:pos-endskip].decode('latin-1')
if pos - endskip - qualities_start != sequence_length:
raise FastqFormatError("Length of sequence and "
"qualities differ", line=n_records * 4 + 3)
pos += 1
if n_records == 0:
yield second_header # first yielded value is special
if custom_class:
yield sequence_class(name, sequence, qualities)
else:
yield Sequence.__new__(Sequence, name, sequence, qualities)
n_records += 1
record_start = pos
if pos == bufend:
break
if pos == bufend:
if record_start == 0 and bufend == len(buf):
# buffer too small, double it
buffer_size *= 2
prev_buf = buf
buf = bytearray(buffer_size)
buf[0:bufend] = prev_buf
del prev_buf
bufstart = bufend
buf_view = buf
c_buf = buf
else:
bufstart = bufend - record_start
buf[0:bufstart] = buf[record_start:bufend]
if pos > record_start:
if extra_newline:
pos -= 1
lines = buf[record_start:pos].count(b'\n')
raise FastqFormatError(
'Premature end of file encountered. The incomplete final record was: '
'{!r}'.format(shorten(buf[record_start:pos].decode('latin-1'), 500)),
line=n_records * 4 + lines)
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by GitHub's download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain. Generated by
# versioneer-0.18 (https://github.com/warner/python-versioneer)
"""Git implementation of _version.py."""
import errno
import os
import re
import subprocess
import sys
def get_keywords():
    """Return the git keyword strings used to look up version information."""
    # These strings are substituted by git during git-archive. The
    # setup.py/versioneer.py machinery greps for the variable names, so each
    # must stay defined on a line of its own; _version.py simply calls
    # get_keywords().
    git_refnames = " (tag: v0.3)"
    git_full = "a16a94708e8b7b974a8598e56da4ffbf892f8898"
    git_date = "2018-10-03 14:11:31 +0200"
    return {"refnames": git_refnames, "full": git_full, "date": git_date}
class VersioneerConfig:
    """Container for Versioneer configuration parameters.

    Instances carry plain attributes (VCS, style, tag_prefix,
    parentdir_prefix, versionfile_source, verbose) assigned by get_config().
    """
def get_config():
    """Create, populate and return the VersioneerConfig() object."""
    # These values were filled in when 'setup.py versioneer' generated
    # this _version.py file.
    config = VersioneerConfig()
    config.VCS = "git"
    config.style = "pep440"
    config.tag_prefix = "v"
    config.parentdir_prefix = "dnaio-"
    config.versionfile_source = "src/dnaio/_version.py"
    config.verbose = False
    return config
# Raised by the individual version-discovery strategies (git keywords,
# parent-directory name, ...) so the caller can fall through to the next one.
class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario."""
LONG_VERSION_PY = {}
HANDLERS = {}
def register_vcs_handler(vcs, method):  # decorator
    """Decorator to mark a method as the handler for a particular VCS."""
    def decorate(f):
        """Store f in HANDLERS[vcs][method]."""
        # Create the per-VCS dict on first registration, then record f.
        HANDLERS.setdefault(vcs, {})[method] = f
        return f
    return decorate
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
                env=None):
    """Call the given command(s).

    Try each candidate executable name in *commands* until one can be
    launched; return (stdout, returncode), or (None, None) if no command
    could be run at all.
    """
    assert isinstance(commands, list)
    process = None
    for command in commands:
        dispcmd = str([command] + args)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen([command] + args, cwd=cwd, env=env,
                                       stdout=subprocess.PIPE,
                                       stderr=(subprocess.PIPE if hide_stderr
                                               else None))
        except EnvironmentError:
            e = sys.exc_info()[1]
            if e.errno == errno.ENOENT:
                # Executable not found under this name: try the next one.
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(e)
            return None, None
        break
    else:
        # Exhausted all candidate names without launching anything.
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None
    stdout = process.communicate()[0].strip()
    if sys.version_info[0] >= 3:
        stdout = stdout.decode()
    if process.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, process.returncode
    return stdout, process.returncode
def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory
    """
    tried = []
    for _ in range(3):
        dirname = os.path.basename(root)
        if dirname.startswith(parentdir_prefix):
            # Everything after the prefix is taken to be the version string.
            return {"version": dirname[len(parentdir_prefix):],
                    "full-revisionid": None,
                    "dirty": False, "error": None, "date": None}
        tried.append(root)
        root = os.path.dirname(root)  # up a level
    if verbose:
        print("Tried directories %s but none started with prefix %s" %
              (str(tried), parentdir_prefix))
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Extract version information from the given file."""
    # the code embedded in _version.py can just fetch the value of these
    # keywords. When used from setup.py, we don't want to import _version.py,
    # so we do it with a regexp instead. This function is not used from
    # _version.py.
    keywords = {}
    # Map the variable-assignment prefix in the file to the keyword name
    # under which its quoted value is stored.
    wanted = (("git_refnames =", "refnames"),
              ("git_full =", "full"),
              ("git_date =", "date"))
    try:
        f = open(versionfile_abs, "r")
        for line in f.readlines():
            stripped = line.strip()
            for prefix, key in wanted:
                if stripped.startswith(prefix):
                    mo = re.search(r'=\s*"(.*)"', line)
                    if mo:
                        keywords[key] = mo.group(1)
        f.close()
    except EnvironmentError:
        # Missing/unreadable file: return whatever was collected (possibly {}).
        pass
    return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
    """Get version information from git keywords.

    Picks the best-matching tag out of the expanded git-archive refnames
    and returns a version dict; raises NotThisMethod when the keywords
    are absent or unexpanded.
    """
    if not keywords:
        raise NotThisMethod("no keywords at all, weird")
    date = keywords.get("date")
    if date is not None:
        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = {ref.strip() for ref in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    tag_marker = "tag: "
    tags = {ref[len(tag_marker):] for ref in refs if ref.startswith(tag_marker)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {ref for ref in refs if re.search(r'\d', ref)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for candidate in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if not candidate.startswith(tag_prefix):
            continue
        version = candidate[len(tag_prefix):]
        if verbose:
            print("picking %s" % version)
        return {"version": version,
                "full-revisionid": keywords["full"].strip(),
                "dirty": False, "error": None,
                "date": date}
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {"version": "0+unknown",
            "full-revisionid": keywords["full"].strip(),
            "dirty": False, "error": "no suitable tags", "date": None}
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
    """Get version from 'git describe' in the root of the source tree.
    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Returns a "pieces" dict with keys: long, short, error, dirty,
    closest-tag, distance, date. On an unparseable/mismatched describe
    output, returns early with only long/short/dirty and a non-None
    "error". Raises NotThisMethod when git itself is unavailable.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    # Cheap probe: if this isn't a git checkout, stop before running describe.
    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
                          hide_stderr=True)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")
    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
                                          "--always", "--long",
                                          "--match", "%s*" % tag_prefix],
                                   cwd=root)
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()
    pieces = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None
    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out
    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        # strip the suffix so the TAG-NUM-gHEX parse below isn't confused
        git_describe = git_describe[:git_describe.rindex("-dirty")]
    # now we have TAG-NUM-gHEX or HEX
    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
        if not mo:
            # unparseable. Maybe git-describe is misbehaving?
            pieces["error"] = ("unable to parse git-describe output: '%s'"
                               % describe_out)
            return pieces
        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            # record the mismatch instead of raising; render() will surface it
            if verbose:
                fmt = "tag '%s' doesn't start with prefix '%s'"
                print(fmt % (full_tag, tag_prefix))
            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
                               % (full_tag, tag_prefix))
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix):]
        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))
        # commit: short hex revision ID
        pieces["short"] = mo.group(3)
    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        # NOTE(review): count_out may be None if rev-list fails, which would
        # make int() raise TypeError here — confirm against upstream versioneer
        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
                                    cwd=root)
        pieces["distance"] = int(count_out)  # total number of commits
    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    # NOTE(review): run_command may return (None, None); [0].strip() would
    # then raise AttributeError — presumed unreachable once git worked above
    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
                       cwd=root)[0].strip()
    pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    return pieces
def plus_or_dot(pieces):
    """Return a + if we don't already have one, else return a ."""
    # A PEP 440 local version segment starts with "+"; parts after the
    # first are joined with "." — the closest tag may already carry a "+".
    return "." if "+" in pieces.get("closest-tag", "") else "+"
def render_pep440(pieces):
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: no tag to anchor on, synthesize a "0+untagged" base
        version = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            version += ".dirty"
        return version
    version = tag
    if pieces["distance"] or pieces["dirty"]:
        # append the local version segment only when we're past the tag
        version = "%s%s%d.g%s" % (version, plus_or_dot(pieces),
                                  pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            version += ".dirty"
    return version
def render_pep440_pre(pieces):
    """TAG[.post.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post.devDISTANCE
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        return "0.post.dev%d" % pieces["distance"]
    if pieces["distance"]:
        return "%s.post.dev%d" % (tag, pieces["distance"])
    return tag
def render_pep440_post(pieces):
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: anchor on "0" and always append the hash
        version = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            version += ".dev0"
        return version + "+g%s" % pieces["short"]
    version = tag
    if pieces["distance"] or pieces["dirty"]:
        version += ".post%d" % pieces["distance"]
        if pieces["dirty"]:
            version += ".dev0"
        version += plus_or_dot(pieces) + "g%s" % pieces["short"]
    return version
def render_pep440_old(pieces):
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    # (fixed docstring typo: "Eexceptions" -> "Exceptions")
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
    return rendered
def render_git_describe(pieces):
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.
    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        # distance/hash are appended only when we're past the tag
        rendered = tag
        if pieces["distance"]:
            rendered = "%s-%d-g%s" % (tag, pieces["distance"],
                                      pieces["short"])
    else:
        # exception #1: bare hex, as 'git describe --always' prints it
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered
def render_git_describe_long(pieces):
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always -long'.
    The distance/hash is unconditional.
    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        # unlike render_git_describe, always append distance and hash
        rendered = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered
def render(pieces, style):
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        # upstream parsing failed: report the error instead of a fake version
        return {"version": "unknown",
                "full-revisionid": pieces.get("long"),
                "dirty": None,
                "error": pieces["error"],
                "date": None}
    if not style or style == "default":
        style = "pep440"  # the default
    # dispatch table instead of an if/elif chain
    renderers = {
        "pep440": render_pep440,
        "pep440-pre": render_pep440_pre,
        "pep440-post": render_pep440_post,
        "pep440-old": render_pep440_old,
        "git-describe": render_git_describe,
        "git-describe-long": render_git_describe_long,
    }
    if style not in renderers:
        raise ValueError("unknown style '%s'" % style)
    rendered = renderers[style](pieces)
    return {"version": rendered, "full-revisionid": pieces["long"],
            "dirty": pieces["dirty"], "error": None,
            "date": pieces.get("date")}
def get_versions():
    """Get version information or return default if unable to do so."""
    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
    # __file__, we can work backwards from there to the root. Some
    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
    # case we can only use expanded keywords.
    cfg = get_config()
    verbose = cfg.verbose
    # Strategy 1: expanded git-archive keywords (works in exported tarballs).
    try:
        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
                                          verbose)
    except NotThisMethod:
        pass
    # Locate the source-tree root by stripping one path component per
    # segment of versionfile_source (the relative path from root to here).
    try:
        root = os.path.realpath(__file__)
        for _ in cfg.versionfile_source.split('/'):
            root = os.path.dirname(root)
    except NameError:
        # no __file__ (py2exe/bbfreeze/...): can't find the tree at all
        return {"version": "0+unknown", "full-revisionid": None,
                "dirty": None,
                "error": "unable to find root of source tree",
                "date": None}
    # Strategy 2: ask git directly (checked-out source tree).
    try:
        return render(git_pieces_from_vcs(cfg.tag_prefix, root, verbose),
                      cfg.style)
    except NotThisMethod:
        pass
    # Strategy 3: parse the version out of the parent directory name.
    try:
        if cfg.parentdir_prefix:
            return versions_from_parentdir(cfg.parentdir_prefix, root,
                                           verbose)
    except NotThisMethod:
        pass
    # All strategies exhausted: report an explicit failure.
    return {"version": "0+unknown", "full-revisionid": None,
            "dirty": None,
            "error": "unable to compute version", "date": None}
"""Chunked reading of FASTA and FASTQ files"""
from ._core import paired_fastq_heads as _paired_fastq_heads
from .exceptions import FileFormatError, FastaFormatError, UnknownFileFormat
......@@ -53,6 +54,9 @@ def read_chunks(f, buffer_size=4*1024**2):
head = _fastq_head
elif start == 1 and (buf[0:1] == b'#' or buf[0:1] == b'>'):
head = _fasta_head
elif start == 0:
# Empty file
return
else:
raise UnknownFileFormat('Input file format unknown')
......
import io
from xopen import xopen
......