[run]
omit =
tests/*
setup.py
*site-packages/*
[report]
exclude_lines =
pragma: no cover
pragma: no-cover
def __repr__
raise NotImplementedError
if __name__ == .__main__.:
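For illustration, the exclusion patterns above would exempt lines like these from the coverage report (a hypothetical module, not part of Atropos):

    class Widget:
        def __repr__(self):  # excluded: matches 'def __repr__'
            return "Widget()"

        def run(self):
            raise NotImplementedError  # excluded: matches 'raise NotImplementedError'

    if __name__ == "__main__":  # excluded: matches the __main__ pattern
        print(Widget())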
@@ -14,7 +14,6 @@ atropos/**/*.so
doc/_build
*.pyo
.idea/
tests/testtmp
virtualenv
*.egg-info
*.c
@@ -50,4 +49,7 @@ paper/workflow/results
*.ipynb
*.tar
.pytest_cache/
.coverage
coverage.xml
venv/
.eggs/
\ No newline at end of file
@@ -5,32 +5,17 @@ cache:
directories:
- "$HOME/.cache/pip"
python:
- "3.4.5"
- "3.5"
- "3.6"
- "3.7"
- "3.8"
- 3.6
- 3.7
- 3.8
install:
- pip install --upgrade pip wheel
- pip install Cython
- pip install pytest-cov
- pip install coveralls
- make install
script:
- make test
after_success:
- coveralls
- pylint atropos
# push Docker images for full releases
- >
test -n "$TRAVIS_TAG" &&
docker login -e $DOCKER_EMAIL -u $DOCKER_USER -p $DOCKER_PASS &&
export REPO=jdidion/atropos &&
docker build -f Dockerfile -t $REPO:$TRAVIS_TAG . &&
docker tag $REPO:$TRAVIS_TAG $REPO:latest &&
docker tag $REPO:$TRAVIS_TAG $REPO:$COMMIT &&
docker tag $REPO:$TRAVIS_TAG $REPO:travis-$TRAVIS_BUILD_NUMBER &&
docker push $REPO
env:
global:
- secure: BQFHrr0r2qmehz3uVcKsEV3G4ollqYoSqyxF02W9OsApiAihry14Gi3yFKU8knZsqCzXPHlYShHahqiVTIU9QqGi8xlYSVJ8P0cV7xqapgO1HJhKd/quiRRpwGOHXf3yCbEJvGq/9PQ0bRkT3+4Khu8GHKLWrCfepT6LPpNYCzAQTEhsRCAO1MhOEM3x0Req82UMp5fndJfAnla+XcbhCsurZCaNoxmfm5i4Jv3tAUozWipL1vy2E65P/whyvz9U19CxaO+hvuFWTwHivMRtTVoF4fInvKWcf7XxgJ0tOFeDQBQWxTLS852kPq1W3uDbg084CwmlM89tt/zhCMNFSs6nOfO1TysN+FGFRvP7dW8bq7ht+Vzj+sFR62DaPJn48KU3X1TBqYm+/k9xMnPe+njNPioMp4aDKarSV8mVm1NGCQhJ6Feg7ebpKtcSeBCzUEUHUd3YM7xoV06OLNi4D+qClYYLPP91Z/c0/yGcT/rLo8cX+PdeDQiQ13lSIfWi00mdUdW/mdYnpwgJ9JPe2eS6/8+G3bXDCazGuFjlNDlkw/jDiG/tC6Wbgr5rUN2xLC6ZJB9LCP3JhtCZLR/IHIEOVAQHk/awqgKIK+8V9JFhKc753unFpMFnuL3KreGiVEjX/6UdChnMt0OdadaCU/a8CCtNxnSSlyJ+rEy++Fc= # DOCKER_EMAIL
- secure: dDu/K3lqfEmHYGiJO7rdp+RUHqhv76c26u3/I/La3Li9VHuF7rEye3/b7D7PLCZsh49V7VuZ6XVsgVN0mfSkFQV/U+51cmRPDx9poj4IPmUP0Nda04BKWuHjJ7izDZby7aWHPZx6psdLaaBnpw+7zEWiPvDo3Q4Qo8o0ZeD4k3BWQWXh+vSz6TlWWKY0gj6ex/sx+EX/3c/6QRBNYl8IApmOgmUyCzkaXq4sBrjoU4Pm69fSlHge/LU44cvM3++HMYkcaI4M6WUOPuwdjeRZ8wRW/uOJZ5xXV0LQQscB/D82z9OwRirmYVS+O/UXD3r440HIWDm8KPpCFhW1+W3tRytZKt0xm/MeAHEJuvLb3XHa4Tto69SOmgxvLs3CDnJraq4Q8x5T07KYatea/1ChAmiS6mqf6bhcDMNyGaanMTmtfVtnBdTEbvl24rAiu5pxfyW4tHGvptwaSlB8PCH/cLjSJZa5XdiPimld2pbL1MKKplG8JgxHJ8nS/RD9GlT95/nGekOOs49w/zzZZ8wMdoBHOqL/Ln3AwidSC/wxCbOGERm7LyT7lCdQBFA75smQUO43Fmrdjf8mlVZtO6QCAAgyjzOyPaCw9Hhf3bDEEreMIi4g9kBsl+ZWd4W3i1rD64dPNDo24THzxWKqtWI5ufTialA5N+iqms4vjXAs7W4= # DOCKER_USER
- secure: r/IVzZCA/wFZlS7nPEbiv7UGpH1e88egGmKLPfOqx1zWy8B5UH+hEjP8nZQo8mTN5vQpWz3FhCLvS3Dg6FMAci+VpUNdjLfbwgPXw5ukioQHaVaX/4+aut4XqWTsWsgcQ+8zL7I7/GR5hL55VnZrX2R7vh6iEPV3942P2dNiaJ4PDx6XKQeSrfEwQGjRxTK4jfaZZDMr8QpWZuVY4KqynfDnY4UTG+udqJaJi2AYbh39JKN1aGpo7QXj1+kXK7T+XOR5YW+a2hC8d2f6INHothjLG5r/SdgZS8Ma3lGuVq+llEiqJnAW6C4ldSvkk9H54R1T5tO0HhBiiOi4bhglxzD0F976nsxbzV6SAKoOAANR+TR5par1BUlnFO2uIrIVvdhvj806M/dz1ZLkMnHSpF+Fs5z+uwZwJIWjw8I2+R/DiZL/fbUx6Smvqw8ZHYlXjXZG1PhSxyyu00EQfFCjPHGtW1N9CcEoexNuXj+qjY/e/rs9kCYaBczLV8X41807BvxSkwyXUH5DUjTMraW/6b7tParp754cw2jGAvSG5ceqZHKqoZXa+OAY7Bck7oP5nu/xNfG1upv2vjgzCaJ5gErOBP4e/4wrBnmcZkAgLVFtz9yNuL0zBaE3ubnoN+8DyHTsXQVO9f/p6x6ODAH4N2MS6PWYWvLssKKh7DrOPVQ= # DOCKER_PASS
- COMMIT=${TRAVIS_COMMIT::8}
# Changes
v1.1.24 (2019.11.27)
* Fix #87 - Python 3.8 compatibility, time.clock removed
v1.1.23 (2019.11.22)
* Set minimum Python version to 3.4.5
* Fixed #86 - no trimming performed for single-end BAM file
v1.1.22 (2019.05.20)
## 2.0.0-alpha.3 (dev)
## 2.0.0-alpha.2 (2020.01.02)
* *Breaking changes:*
* Renamed `--read1_umi` and `--read2-umi` options to `--read1-umi` and `--read2-umi`
* `detect --fasta` option now accepts only a single value, either 'perinput', 'union', or 'both'
* Replace pysam with bamnostic
* Re-enable testing on Python 3.8
* Added '--umi-input' option to obtain UMIs from a separate input file.
* This addresses #66: Support UMIs from a third input file.
* When writing SAM output for reads with UMIs, write the UMIs using the `BC` tag (see the usage sketch after this list).
* Refactored test code to use pytest fixtures and generate coverage metrics
* Upgrade to xphyle 4.2.2
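As an illustration, a hypothetical invocation combining the UMI options above might look like the following (file names invented; the exact option values are assumptions rather than documented usage):

    atropos trim -a AGATCGGAAGAGC --umi-input umis.fq.gz --output-format sam -o out.sam -se reads.fq.gz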
## 2.0.0-alpha.1 (2019.12.25)
* *Breaking changes:*
* Dropped support for python 3.3-3.5 in order to take advantage of many new features in 3.6 (such as type annotations), and to migrate to xphyle for file management. The 1.1.x branch will maintain 3.3-3.5 support and will receive any new bug fixes (but no new features).
* The `--compression` argument has been renamed to `--compression-mode`, to avoid confusion with the new `--compression-format` option (see below).
* The `--format` option has been renamed to `--input-format`, to avoid confusion with the new `--output-format` option.
* The `--stats` option has been renamed to `--metrics`.
* The `--nextseq-trim` option has been renamed to `--twocolor-trim`.
* The `--sra-accession` option has been renamed to `--accession` and now accepts a protocol prefix.
* Dropped the `--discard` alias for `--discard-trimmed`.
* Dropped the `--trimmed-only` alias for `--discard-untrimmed`.
* Merged PR #63: Implementation of UMI support. Thanks @wckdouglas!
* Eliminated the bin/ folder; switched to using entry points for the atropos executable.
* Fix #32: SAM output.
* Fix #36: Progress bars don't increment correctly when batch size > 1 used.
* Moved all file management code to use [xphyle](https://github.com/jdidion/atropos/tree/xphyle)
* Added --compression-format option to override filename-based detection of the compression format, and to enable compressed output to stdout (see the example after this list).
* Added --output-format option to manually specify output format instead of determining the format from the output file name.
* Added --query option to specify a URL for a supported streaming protocol (e.g. htsget).
* Enabled output to stdout by default with single-end and interleaved reads.
* Migrated to setuptools_scm for version management.
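For example, the new options make it possible to stream compressed trimmed reads to stdout (a sketch; the option values are assumptions based on the entries above):

    # single-end output goes to stdout by default; write it as gzip-compressed FASTQ
    atropos trim -a AGATCGGAAGAGC --output-format fastq --compression-format gzip -se reads.fq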
## v1.1.24 (2019.11.17)
* Fix #87 - Python 3.8 incompatibility - change time.clock() to time.process_time()
## v1.1.23 (2019.11.22)
* Set minimum Python version to 3.4.5
* Fixed #86 - no trimming performed for single-end BAM file
## v1.1.22 (2019.05.20)
* Documentation fixes (#79)
* Fix for index error when running `detect` command (#80)
v1.1.21 (2018.11.24)
## v1.1.21 (2018.11.24)
* Added long options for paired adapter parameters.
v1.1.20 (2018.11.21)
## v1.1.20 (2018.11.21)
* Fix #74: Make pysam open SAM/BAM files with check_sq=False by default
* Fix #74: Make pysam open SAM/BAM files with `check_sq=False` by default
* Fixed setup.py to correctly include README for PyPI.
v1.1.19 (2018.05.16)
## v1.1.19 (2018.05.16)
* Fix #68: Error when using insert aligner with adapters of different lengths
v1.1.18 (2018.03.16)
## v1.1.18 (2018.03.16)
* Added two new tools to the benchmarks:
* fastp
@@ -35,78 +72,76 @@ v1.1.18 (2018.03.16)
* Updated versions of several tools used in the paper. Rebuilt containers and pushed them to Dockerhub.
* Fix #64: InsertAligner not respecting match_adapter_wildcards and match_read_wildcards options.
v1.1.17 (2018.01.13)
## v1.1.17 (2018.01.13)
* Fix #51: Reads of different lengths not error corrected.
v1.1.16 (2018.01.07)
## v1.1.16 (2018.01.07)
* Fix for #57: LegacyReport stops on adapter with no trimmed reads, and LegacyReport errors when histogram data is None. Thanks to @cokelaer and @pyMyt1!
* Fix for #58: NextSeqTrimmer not trimming from both ends. Thanks to @pkMyt1!
v1.1.15 (2017.09.28)
## v1.1.15 (2017.09.28)
* Fix for #41: Error when using progress bar.
* Fix for #42: Discordance between Cutadapt and Atropos in number of expected events.
* Added '--alphabet' option. Set to 'dna' to validate input sequences against the allowed DNA characters (A/C/G/T/N). This fixes #43 and partially fixes #44. (See the usage example after this list.)
* Fixed #44: Uncaught errors not being logged.
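For example, DNA-alphabet validation can be enabled as follows (a sketch; file names invented):

    atropos trim --alphabet dna -a AGATCGGAAGAGC -se reads.fq -o trimmed.fq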
v1.1.14 (2017.09.19)
## v1.1.14 (2017.09.19)
* Fix for #39: miRNA option error (thanks to @mottodora)
* Fix for #37: fixes overflow error when computing RandomMatchProbability on long reads (>150 bp)
v1.1.13 (2017.09.14)
## v1.1.13 (2017.09.14)
* Fix for #38: Atropos fails with MultiCore error when using OrderPreservingWriterResultsHandler (thanks to @cshenanigans)
v1.1.12 (2017.08.15)
## v1.1.12 (2017.08.15)
* Expose --min-frequency and --min-contaminant-match-frac options to 'detect -d heuristic' command.
* Expose --min-kmer-match-frac option to 'detect -d known' command.
* Fixed #35: using incorrect metric to determine match fraction in 'detect -d known' command.
v1.1.11 (2017.08.15)
## v1.1.11 (2017.08.15)
* Fixed #34: JSON report output not working with SRA streaming.
v1.1.10 (2017.08.09)
## v1.1.10 (2017.08.09)
* Improve debugging messages
v1.1.9 (2017.08.01)
## v1.1.9 (2017.08.01)
* Fix #30: failure when using --preserve-order option
v1.1.8 (2017.07.10)
## v1.1.8 (2017.07.10)
* Add --config option for specifying options in a config file.
* Fix for #29: allow paired-end quality and N trimming without adapter trimming.
* Removed twine register command from make release
v1.1.7 (2017.06.01)
## v1.1.7 (2017.06.01)
* Stream reads directly from an SRA accession for any atropos command using the
-sra option.
* Add detect option to specify the bases that signify when the sequencer has
read past the end of a fragment.
* Stream reads directly from an SRA accession for any atropos command using the -sra option.
* Add detect option to specify the bases that signify when the sequencer has read past the end of a fragment.
v1.1.6 (2017.05.30)
## v1.1.6 (2017.05.30)
* Add FASTA output for detect command, and enable json, yaml, and pickle output for all commands.
v1.1.5 (2017.05.18)
## v1.1.5 (2017.05.18)
* Major update to the documentation.
* Fixed error messages in multi-threaded mode.
* Fixed bug when generating reports for runs involving error correction.
v1.1.4 (2017.05.02)
## v1.1.4 (2017.05.02)
* Exposed option to set PRNG seed when subsampling reads.
* Fixed issue #14: 'detect' and 'error' commands were broken. This involved rewriting those commands to use the same pipeline and reporting frameworks as the 'trim' and 'qc' commands.
v1.1.3 (2017.05.01)
## v1.1.3 (2017.05.01)
* Updated Dockerfile to use smaller, Alpine-based image.
* Added Docker image for v1.1.2 to Docker Hub.
@@ -115,7 +150,7 @@ v1.1.3 (2017.05.01)
* Fixed #12: tqdm progress bar not working.
* Fixed #13: unnecessary differences in summary output between Cutadapt and Atropos.
v1.1.2 (2017.04.12)
## v1.1.2 (2017.04.12)
* New 'qc' command computes read-level statistics.
* The 'trim' command can also compute read-level statistic pre- and/or post-trimming using the new '--stats' option.
@@ -132,46 +167,45 @@ v1.1.2 (2017.04.12)
* Implemented Atropos module for MultiQC, which reads reports in JSON format. This is currently available [here](http://github.com/jdidion/atropos-multiqc) and will hopefully soon be merged into MultiQC.
* Ported some recent enhancements over from Cutadapt.
v1.0.23 (2016.12.07)
## v1.0.23 (2016.12.07)
* Identified a subtle bug having to do with insufficient memory in multi-threaded mode. The main thread appears to hang waiting for the next read from the input file. This appears to occur only under a strictly-regulated memory cap, such as in a cluster environment. This bug is not fixed, but I added the following:
* Set the default batch size based on the queue sizes
* Warn the user if their combination of batch and queue sizes might lead to excessive memory usage.
* Bug fixes
v1.0.22 (2016.12.02)
## v1.0.22 (2016.12.02)
* Abstracted the ErrorEstimator class to enable alternate implementations.
* Added a new ShadowRegressionErrorEstimator that uses the ShadowRegression R package (Wang et al.) to more accurately estimate sequencing error rate. This requires that R and the [ShadowRegression package](http://bcb.dfci.harvard.edu/~vwang/shadowRegression.html) and its dependencies be installed -- MASS and ReadCount, which in turn depend on a bunch of Bioconductor packages. At some point, this dependency will go away when I reimplement the method in pure python.
* The error command now reports the longest matching read fragment, which is usually a closer match for the actual adapter sequence than the longest matching k-mer.
v1.0.21 (2016.11.23)
## v1.0.21 (2016.11.23)
* Bugfixes
v1.0.20 (2016.11.22)
## v1.0.20 (2016.11.22)
* Changed the order of trimming operations - OverwriteReadModifier is now after read and quality trimming.
* Refactored the main Atropos interface to improve testability.
* Added more unit tests.
v1.0.19 (2016.11.21)
## v1.0.19 (2016.11.21)
* Fixed a major bug in OverwriteReadModifier, and in the unit tests for paired-end trimmers.
v1.0.18 (2016.11.20)
## v1.0.18 (2016.11.20)
* Added OverwriteReadModifier, a paired-end modifier that overwrites one read end with the other if the mean quality over the first N bases (where N is user-specified) of one is below a threshold value and the mean quality of the other is above a second threshold. This dramatically improves the number of high-quality read mappings in data sets where there are systematic problems with one read-end.
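A minimal sketch of the decision rule described above (names and structure invented for illustration; this is not the actual OverwriteReadModifier code):

    def choose_overwrite(quals1, quals2, n, low, high):
        # quals1/quals2: per-base qualities of the two read ends; n: number of
        # leading bases to average; low/high: the two mean-quality thresholds.
        mean1 = sum(quals1[:n]) / n
        mean2 = sum(quals2[:n]) / n
        if mean1 < low and mean2 > high:
            return "overwrite read1 with read2"
        if mean2 < low and mean1 > high:
            return "overwrite read2 with read1"
        return None  # leave both reads unchanged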
v1.0.17 (2016.11.18)
## v1.0.17 (2016.11.18)
* Perform error correction when insert match fails but adapter matches are complementary
* Improvements to handling of cached adapter lists
* Merged reads are no longer written to --too-short-output by default
* Many bugfixes and improvements in deployment (including a Makefile)
v1.0.16 (2016.09.20)
## v1.0.16 (2016.09.20)
* Migrate to Versioneer for version management.
* Enable stderr as a valid output using the '_' shortcut.
@@ -181,13 +215,13 @@ v1.0.16 (2016.09.20)
* We are beginning to move towards the use of commands for all operations, and read-trimming now falls under the 'trim' command. Currently, calling atropos without any command will default to the 'trim' command.
* When InsertAdapterCutter.symmetric is True, mismatch_action is not None, the insert match fails, at least one adapter match succeeds, and the adapter matches (if there are two) are complementary, the reads are treated as overlapping and error correction is performed. This leads to substantial improvements when one read is of good quality while the other is of poor quality.
v1.0.15 (2016.09.14)
## v1.0.15 (2016.09.14)
* Fixed missing import bug in 'detect' command.
* Added estimate of fraction of contaminated reads to output of 'detect' command.
* Optionally cache list of known contaminants rather than re-download it every time.
v1.0.14 (2016.09.13)
## v1.0.14 (2016.09.13)
* Implemented _align.MultiAligner, which returns all matches that satisfy the overlap and error thresholds. align.InsertAligner now uses MultiAligner for insert matching, and tests all matches in decreasing size order until it finds one with adapter matches (if any).
* Major improvements to the accuracy of the 'detect' command.
@@ -198,53 +232,54 @@ v1.0.14 (2016.09.13)
* Fixed a segmentation fault that occurs when trying to trim zero-length reads with the insert aligner.
* Several other bugfixes.
v1.0.13 (2016.08.31)
## v1.0.13 (2016.08.31)
* Add options to specify max error rates for insert and adapter matching within insert aligner.
* Add new command to estimate empirical error rate in data set from base qualities.
v1.0.12 (2016.08.30)
## v1.0.12 (2016.08.30)
* Add ability to correct errors during insert-match adapter trimming.
* Implement additional adapter-detection algorithms.
* Fix bug where default output file is force-created in parallel-write mode
v1.0.11 (2016.08.24)
## v1.0.11 (2016.08.24)
* Clarify and fix issues with bisulfite trimming. Notably, rrbs and non-directional are now allowed independently or in combination.
v1.0.10 (2016.08.23)
## v1.0.10 (2016.08.23)
* Introduced new 'detect' command for automatically detecting adapter sequences.
* Options are now required to specify input files.
* Major updates to documentation.
v1.0.9 (2016.08.22)
## v1.0.9 (2016.08.22)
* Bugfix release
v1.0.8 (2016.08.19)
## v1.0.8 (2016.08.19)
* Reverted the previously introduced (and no longer necessary) dependency on bitarray.
* Switched the insert aligner back to the default implementation, as the one that ignores indels is not any faster.
v1.0.7 (2016.08.18)
## v1.0.7 (2016.08.18)
* Re-engineered modifiers.py (and all dependent code) to enable use of modifiers that simultaneously edit both reads in a pair.
* Add --op-order option to enable the user to specify the order of the first four trimming operations.
* Implemented insert-based alignment for paired-end adapter trimming. This is currently experimental. Benchmarking against SeqPurge and Skewer using simulated reads showed that the method Cutadapt uses to align adapters, while optimal for single-end reads, is much less sensitive and specific than the insert match algorithms used by SeqPurge and Skewer. Our algorithm is similar to the one used by SeqPurge but leverages the dynamic programming model of Cutadapt.
v1.0.6 (2016.08.08)
## v1.0.6 (2016.08.08)
* Based on tests, worker compression is faster than writer compression when more than 8 threads are available, so worker compression is now the default in that case.
v1.0.5 (2016.08.06)
## v1.0.5 (2016.08.06)
* Internal code reorganization - compression code moved to a separate module
* Eliminated the --worker-compression option in favor of --compression (whose value is either 'worker' or 'writer')
* More documentation improvements
v1.0.3 (2016.08.05)
## v1.0.3 (2016.08.05)
* Significant performance improvements:
* Start an extra worker once the main process is finished loading reads
@@ -254,11 +289,11 @@ v1.0.3 (2016.08.05)
* Disable quality trimming if all cutoffs are set to 0
* Eliminated the --parallel-environment option
v1.0.1 (2016.08.04)
## v1.0.1 (2016.08.04)
* Fix documentation bugs associated with migration from optparse to argparse
v1.0 (2016.07.29)
## v1.0 (2016.07.29)
* Initial release (forked from cutadapt 1.10)
* Re-wrote much of filters.py and modifiers.py to separate modifying/filtering from file writing.
......
John P Didion, Marcel Martin, and Francis S Collins. "Atropos: specific, sensitive, and speedy trimming of sequencing reads." In preparation.
Didion JP, Martin M, Collins FS. (2017) Atropos: specific, sensitive, and speedy trimming of sequencing reads. PeerJ 5:e3720 https://doi.org/10.7717/peerj.3720
@ARTICLE{Didion2016Cutadapt,
author = {Didion, John P and Martin, Marcel and Collins, Francis S},
title = {Atropos: specific, sensitive, and speedy trimming of sequencing reads.},
journal = {submitted},
year = 2017
journal = {PeerJ},
volume = {5},
pages = {e3720},
year = {2017},
doi = {10.7717/peerj.3720}
}
@@ -2,7 +2,7 @@
# Dockerfile
#
# Software: Atropos
# Software Version: 1.1.0
# Software Version: 2.0.0
# Description: Atropos image
# Website: https://github.com/jdidion/atropos
# Provides: atropos
@@ -28,7 +28,7 @@ RUN export NPROC=$(grep -c ^processor /proc/cpuinfo 2>/dev/null || 1) \
zlib-dev bzip2-dev xz-dev
# Additional Atropos dependencies
RUN pip install tqdm pysam jinja2
RUN pip install tqdm bamnostic jinja2
# Attach project directory and install
ADD . /atropos/
......
# documentation
include README.rst
include README.md
include CHANGES.md
include CITATION
include LICENSE.txt
@@ -7,15 +7,12 @@ include doc/*.rst
include doc/conf.py
include doc/Makefile
include atropos/**/*.pyx
include atropos/align/_align.c
include atropos/aligner/_aligner.c
include atropos/commands/trim/_qualtrim.c
include atropos/io/_seqio.c
include atropos/io/readers/_readers.c
include atropos/io/sequence/_sequence.c
include atropos/adapters/*.fa
include atropos/report/templates/*
include bin/_preamble.py
include tests/test*.py
include tests/utils.py
graft tests/data
graft tests/cut
include versioneer.py
include atropos/_version.py
graft tests/data/input
graft tests/data/expected
\ No newline at end of file
tests = tests
module = atropos
#pytestops = --full-trace
#pytestops = -v -s
repo = jdidion/$(module)
package = atropos
#pytestopts = -s -vv --show-capture=all
repo = jdidion/$(package)
desc = Release $(version)
BUILD =
TEST =
all: clean install install_extra_requirements install_test_requirements test test_release_setup
all: clean install test
clean:
rm -Rf __pycache__
rm -Rf .pytest_cache
rm -Rf .eggs
rm -Rf **/__pycache__/*
rm -Rf **/*.c
rm -Rf **/*.so
rm -Rf **/*.pyc
rm -Rf dist
rm -Rf build
rm -Rf atropos.egg-info
rm -f .adapters
rm -f .coverage
rm -f coverage.xml
rm -f MANIFEST
build:
python setup.py build_ext -i
python setup.py sdist bdist_wheel
install: clean build
python setup.py install $(installargs)
pip install --upgrade dist/*.whl $(installargs)
install_test_requirements:
pip install -r requirements-test.txt
install_extra_requirements:
pip install -r requirements-extra.txt
test:
py.test $(pytestops) $(tests)
test: install install_extra_requirements install_test_requirements
coverage run -m pytest $(pytestopts) $(tests)
coverage report -m
coverage xml
test_release_setup:
twine check dist/*
docs:
make -C doc html
readme:
pandoc --from=markdown --to=rst --output=README.rst README.md
pandoc --from=markdown --to=rst --output=CHANGES.rst CHANGES.md
lint:
pylint $(module)
pylint $(package)
clean:
rm -Rf __pycache__
rm -Rf **/__pycache__/*
rm -Rf **/*.c
rm -Rf **/*.so
rm -Rf **/*.pyc
rm -Rf dist
rm -Rf build
rm -Rf .adapters
rm -Rf atropos.egg-info
reformat:
black $(package)
black $(tests)
docker:
# build
@@ -47,22 +59,36 @@ docker:
# add alternate tags
docker tag $(repo):$(version) $(repo):latest
# push to Docker Hub
docker login -u jdidion && \
docker push $(repo)
docker login && docker push $(repo)
tag:
git tag $(version)
release: clean tag install test
twine upload dist/*
push_tag:
git push origin --tags
del_tag:
git tag -d $(version)
pypi_release:
twine upload dist/*
release: clean tag
${MAKE} install test pypi_release push_tag || (${MAKE} del_tag && exit 1)
# github release
curl -v -i -X POST \
-H "Content-Type:application/json" \
-H "Authorization: token $(token)" \
https://api.github.com/repos/$(repo)/releases \
-d '{"tag_name":"$(version)","target_commitish": "master","name": "$(version)","body": "$(desc)","draft": false,"prerelease": false}'
-d '{"tag_name": "$(version)", "target_commitish": "master", "name": "$(version)", "body": "$(desc)", "draft": false, "prerelease": false}'
# build a package with the files needed to run the workflows
workflow:
......
[![Travis CI](https://travis-ci.org/jdidion/atropos.svg?branch=1.1)](https://travis-ci.org/jdidion/atropos)
[![Travis CI](https://travis-ci.org/jdidion/atropos.svg?branch=develop)](https://travis-ci.org/jdidion/atropos)
[![PyPi](https://img.shields.io/pypi/v/atropos.svg)](https://pypi.python.org/pypi/atropos)
[![DOI](https://zenodo.org/badge/61393086.svg)](https://zenodo.org/badge/latestdoi/61393086)
[![Coverage Status](https://img.shields.io/coveralls/jdidion/atropos/master.svg)](https://coveralls.io/github/jdidion/atropos?branch=develop)
# Atropos
@@ -11,11 +12,11 @@ Atropos is a tool for specific, sensitive, and speedy trimming of NGS reads. It is
3. Options for trimming specific types of data (miRNA, bisulfite-seq).
4. A new command ('detect') that will detect adapter sequences and other potential contaminants.
5. A new command ('error') that will estimate the sequencing error rate, which helps to select the appropriate adapter- and quality- trimming parameter values.
6. A new command ('qc') that generates read statistics similar to FastQC. The trim command can also compute read statistics both before and after trimming (using the '--stats' option).
6. A new command ('qc') that generates read statistics similar to FastQC. The trim command can also compute read metrics both before and after trimming (using the '--metrics' option).
7. Improved summary reports, including support for serialization formats (JSON, YAML, pickle), support for user-defined templates (via the optional Jinja2 dependency), and integration with [MultiQC](http://multiqc.info).
8. The ability to merge overlapping reads (this is experimental and the functionality is limited).
9. The ability to write the summary report and log messages to separate files.
10. The ability to read SAM/BAM files and read/write interleaved FASTQ files.
10. The ability to read/write SAM, BAM, and interleaved FASTQ files.
11. Direct trimming of reads from an SRA accession.
12. A progress bar, and other minor usability enhancements.
@@ -26,19 +27,21 @@ Atropos is available from [pypi](https://pypi.python.org/pypi/atropos) and can b
First install dependencies:
* Required
* Python 3.4.5+ (python 2.x is NOT supported)
* Cython 0.25.2+ (`pip install Cython`)
* Required python libraries
* Python 3.6+ (python 2.x is NOT supported)
* Cython 0.25.2+ for Python 3.6, 0.29+ for Python 3.7, or 0.29.14+ for Python 3.8 (`pip install Cython`)
* [loguru](https://github.com/Delgan/loguru)
* [pokrok](https://github.com/jdidion/pokrok) 0.2.0+
* [xphyle](https://github.com/jdidion/xphyle) 4.2.1+
* Optional
* pytest (for running unit tests)
* progressbar2 or tqdm (progressbar support)
* pysam (SAM/BAM input)
* bamnostic or pysam (SAM/BAM support)
* khmer 2.0+ (for detecting low-frequency adapter contamination)
* jinja2 (for user-defined report formats)
* ngstream (for SRA streaming), which requires [ngs](https://github.com/ncbi/ngs)
Pip can be used to install atropos and optional dependencies, e.g.:
pip install atropos[tqdm,pysam,ngstream]
`pip install atropos[tqdm,bamnostic,ngstream]`
## Conda
@@ -76,6 +79,10 @@ atropos --aligner insert -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCC
See the [Documentation](https://atropos.readthedocs.org/) for more complete usage information.
## Using Atropos as a library
While we consider the command-line interface to be stable, the internal code organization of Atropos is likely to change. At this time, we recommend not interfacing directly with Atropos as a library (or being prepared for your code to break).
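If you do call Atropos from Python anyway, a minimal sketch using the `atropos.console.run_atropos` entry point (shown elsewhere in this changeset; it takes a `sys.argv`-style argument list and returns an exit code, and the argument values here are invented):

    from atropos.console import run_atropos

    # equivalent to: atropos trim -a AGATCGGAAGAGC -se reads.fq -o trimmed.fq
    retcode = run_atropos(["trim", "-a", "AGATCGGAAGAGC", "-se", "reads.fq", "-o", "trimmed.fq"])
    if retcode != 0:
        raise RuntimeError("atropos exited with code {}".format(retcode))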
## Publications
Atropos is [published](https://peerj.com/articles/3720/) in PeerJ.
@@ -90,7 +97,6 @@ The citation for the original Cutadapt paper is:
> Marcel Martin. "Cutadapt removes adapter sequences from high-throughput sequencing reads." EMBnet.Journal, 17(1):10-12, May 2011. http://dx.doi.org/10.14806/ej.17.1.200
## Links
* [Documentation](https://atropos.readthedocs.org/)
@@ -98,106 +104,3 @@ The citation for the original Cutadapt paper is:
* [Report an issue](https://github.com/jdidion/atropos/issues)
* [Code of conduct](https://github.com/jdidion/atropos/CODE_OF_CONDUCT.md)
* [Contributing](https://github.com/jdidion/atropos/CONTRIBUTING.md)
## Roadmap
### 1.2
* Migrate to [xphyle](https://github.com/jdidion/xphyle) for file management.
* Migrate to [pokrok](https://github.com/jdidion/pokrok) for progress bar management.
* Accept multiple input files.
* Support SAM output (including #33).
* Direct streaming and trimming of reads from SRA and htsget using [ngstream](https://github.com/jdidion/ngstream).
* Read "cropping" (#50)
* Support for ThruPlex-style adapters (in which barcode is part of query sequence; #55)
* Accessibility:
* Create recipe for homebrew.
* Automatically update conda and homebrew recipes for each release.
* Create Galaxy tool description using [argparse2tool](https://github.com/erasche/argparse2tool#cwl-specific-functionality).
* Improve documentation (#24)
* Port over improvements in latest versions of Cutadapt https://cutadapt.readthedocs.io/en/stable/
* Switch to using entry point instead of Atropos executable.
### 1.3
* Add auto-trimming mode for paired-end reads.
* Support for UMIs.
* Provide PacBio- and nanopore-specific options (https://github.com/marcelm/cutadapt/issues/120).
* Provide option for RNA-seq data that will trim polyA sequence.
* Add formal config file support (#53)
* Automate crash reporting using [sentry](https://github.com/getsentry/raven-python).
* Look at [NGmerge](https://github.com/harvardinformatics/NGmerge) for improving read merging.
* Look at replacing pysam with [pybam](https://github.com/JohnLonginotto/pybam)
### 1.4
* Currently, InsertAligner requires a single 3' adapter for each end. Adapter trimming will later be generalized so that A) the InsertAligner can handle multiple matched pairs of adapters and/or B) multiple different aligners can be used for different adapters.
* Integrate with [AdapterBase](https://github.com/NCBI-Hackathons/OnlineAdapterDatabase) for improved matching of detected contaminants to known adapters, automated trimming of datasets with known adapters, and (opt-in) submission of adapter information for novel datasets.
* Migrate to seqio (https://github.com/jdidion/seqio) for reading/writing sequence files.
* Also look at using HTSeq instead: https://github.com/simon-anders/htseq
* General-purpose read filtering based on read ID: https://github.com/marcelm/cutadapt/issues/107.
* Currently, SAM/BAM input files must be name sorted; add an option to 1) pre-sort reads inline using samtools or sambamba, or 2) cache each read in memory until its mate is found.
### 1.5
* Provide more user control over anchoring of adapters: https://github.com/marcelm/cutadapt/issues/53.
* Enable user to define custom read structure: https://github.com/nh13/read-structure-examples
* Support for paired-end demultiplexing
* Demultiplexing based on barcodes: https://github.com/marcelm/cutadapt/issues/118.
* Consider supporting different error rates for read1 vs read2.
* Add a ClipOverlapping modifier that will remove read overlaps (as opposed to merging).
* Look more closely at providing solutions to the Illumina two-color chemistry issue:
* Provide an option to exempt G calls from the assessment of quality
* Trim 3′ Gs from reads
* Also look at addressing any issues with one-color chemistry (iSeq).
* Consider whether to support trimming/QC of raw IonTorrent data.
### 1.6
* Switch to using Click for CLI.
* Implement a public plugin API.
* Add more logging and convert log messages from old-style to new-style format strings.
* Add option to estimate bisulfite conversion rate from filled-in cytosine methylation status in reads that were MspI-digested.
* CPU and memory profiling. Try out:
* https://github.com/nvdv/vprof
* https://github.com/what-studio/profiling
* https://github.com/fabianp/memory_profiler
* https://github.com/rkern/line_profiler#line-profiler
* Look at some new trimming/qc programs; decide whether to add to benchmarks and/or incorporate any of their features
* https://github.com/OpenGene/fastp/issues
* http://tagcleaner.sourceforge.net/
* https://github.com/mdshw5/fastqp/blob/master/README.md
### 2.0
* Simplification of command line options, perhaps by further splitting functionality up into different sub-commands, but also by more intelligent choices for default option values based on context.
* Consider adding additional report formats
* Add fuzzy matching: https://github.com/Daniel-Liu-c0deb0t/Java-Fuzzy-Search
* https://github.com/marcelm/cutadapt/issues/112
* Performance enhancements using
* http://numba.pydata.org/
* https://github.com/undefx/vecpy
* https://github.com/serge-sans-paille/pythran
* https://github.com/IntelLabs/hpat
* https://github.com/cupy/cupy
* >90% test coverage
* Fuzz testing using AFL
* http://lcamtuf.coredump.cx/afl/
* https://github.com/jwilk/python-afl
### Beyond 2.0
* Implement additional alternate alignment algorithms.
* Implement the error detection algorithm in ADEPT: https://github.com/LANL-Bioinformatics/ADEPT
* Explore new quality trimming algorithms
* UrQt: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4450468/
* InfoTrim: github.com/JacobPorter/InfoTrim
* Scythe is an interesting new trimmer. Depending on how the benchmarks look in the forthcoming paper, we will add it to the list of tools we compare against Atropos, and perhaps implement their Bayesian approach for adapter match.
* Experiment with replacing the multicore implementation with an asyncio-based implementation (using ProcessPoolExecutor and uvloop).
* Automatic adaptive tuning of queue sizes to maximize the balance between memory usage and latency.
* FastProNGS has some nice visualizations that could be included, rather than relying on MultiQC: https://github.com/Megagenomics/FastProNGS
While we consider the command-line interface to be stable, the internal code organization of Atropos is likely to change. At this time, we recommend not interfacing directly with Atropos as a library (or being prepared for your code to break). The internal code organization will be stabilized as of version 2.0, which is planned for sometime in 2017.
If you would like to suggest additional enhancements, you can submit issues and/or pull requests at our GitHub page.
# coding: utf-8
"""Top-level Atropos package.
"""
import os
import sys
import pkg_resources
from atropos._version import get_versions
__version__ = get_versions()['version']
del get_versions
class AtroposError(Exception):
"""Base class for Atropos-specific errors.
"""
pass
UNKNOWN_VERSION = "Unknown"
def check_importability(): # pragma: no cover
"""Check that cython modules haev been compile.
"""
try:
import atropos.align._align # pylint: disable=unused-variable
except ImportError as err:
if 'undefined symbol' in str(err):
print("""
ERROR: A required extension module could not be imported because it is
incompatible with your system. A quick fix is to recompile the extension
modules with the following command:
{0} setup.py build_ext -i
See the documentation for alternative ways of installing the program.
The original error message follows.
""".format(sys.executable))
raise
try:
__version__ = pkg_resources.get_distribution(__name__).version
except pkg_resources.DistributionNotFound:
__version__ = UNKNOWN_VERSION
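A quick way to check which branch applied (the installed distribution's version, or "Unknown" when Atropos is not installed as a distribution):

    import atropos
    print(atropos.__version__)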
import sys
from typing import Optional, Sequence
from atropos.console import run_atropos
def main(args: Optional[Sequence[str]] = None):
"""
Main method, designed to be called by the system.
Always calls `sys.exit()`. If you want to run Atropos programmatically, call
`run_atropos()` instead.
Args:
args: Command-line arguments. If None, `sys.argv[1:]` is used.
"""
sys.exit(run_atropos(args))
if __name__ == "__main__":
main()
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain. Generated by
# versioneer-0.16 (https://github.com/warner/python-versioneer)
"""Git implementation of _version.py."""
import errno
import os
import re
import subprocess
import sys
def get_keywords():
"""Get the keywords needed to look up the version information."""
# these strings will be replaced by git during git-archive.
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
git_refnames = " (HEAD -> 1.1, tag: 1.1.24)"
git_full = "9281be92f0e52a14085841344a509f7808efcfe1"
keywords = {"refnames": git_refnames, "full": git_full}
return keywords
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
def get_config():
"""Create, populate and return the VersioneerConfig() object."""
# these strings are filled in when 'setup.py versioneer' creates
# _version.py
cfg = VersioneerConfig()
cfg.VCS = "git"
cfg.style = "pep440"
cfg.tag_prefix = ""
cfg.parentdir_prefix = "atropos-"
cfg.versionfile_source = "atropos/_version.py"
cfg.verbose = False
return cfg
class NotThisMethod(Exception):
"""Exception raised if a method is not valid for the current scenario."""
LONG_VERSION_PY = {}
HANDLERS = {}
def register_vcs_handler(vcs, method): # decorator
"""Decorator to mark a method as the handler for a particular VCS."""
def decorate(f):
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
HANDLERS[vcs][method] = f
return f
return decorate
def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
"""Call the given command(s)."""
assert isinstance(commands, list)
p = None
for c in commands:
try:
dispcmd = str([c] + args)
# remember shell=False, so use git.cmd on windows, not just git
p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
else None))
break
except EnvironmentError:
e = sys.exc_info()[1]
if e.errno == errno.ENOENT:
continue
if verbose:
print("unable to run %s" % dispcmd)
print(e)
return None
else:
if verbose:
print("unable to find command, tried %s" % (commands,))
return None
stdout = p.communicate()[0].strip()
if sys.version_info[0] >= 3:
stdout = stdout.decode()
if p.returncode != 0:
if verbose:
print("unable to run %s (error)" % dispcmd)
return None
return stdout
def versions_from_parentdir(parentdir_prefix, root, verbose):
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes
both the project name and a version string.
"""
dirname = os.path.basename(root)
if not dirname.startswith(parentdir_prefix):
if verbose:
print("guessing rootdir is '%s', but '%s' doesn't start with "
"prefix '%s'" % (root, dirname, parentdir_prefix))
raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
return {"version": dirname[len(parentdir_prefix):],
"full-revisionid": None,
"dirty": False, "error": None}
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
keywords = {}
try:
f = open(versionfile_abs, "r")
for line in f.readlines():
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["refnames"] = mo.group(1)
if line.strip().startswith("git_full ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["full"] = mo.group(1)
f.close()
except EnvironmentError:
pass
return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(keywords, tag_prefix, verbose):
"""Get version information from git keywords."""
if not keywords:
raise NotThisMethod("no keywords at all, weird")
refnames = keywords["refnames"].strip()
if refnames.startswith("$Format"):
if verbose:
print("keywords are unexpanded, not using")
raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
refs = set([r.strip() for r in refnames.strip("()").split(",")])
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
# expansion behaves like git log --decorate=short and strips out the
# refs/heads/ and refs/tags/ prefixes that would let us distinguish
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
tags = set([r for r in refs if re.search(r'\d', r)])
if verbose:
print("discarding '%s', no digits" % ",".join(refs-tags))
if verbose:
print("likely tags: %s" % ",".join(sorted(tags)))
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix):]
if verbose:
print("picking %s" % r)
return {"version": r,
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": None
}
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
return {"version": "0+unknown",
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": "no suitable tags"}
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
"""
if not os.path.exists(os.path.join(root, ".git")):
if verbose:
print("no .git in %s" % root)
raise NotThisMethod("no .git directory")
GITS = ["git"]
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
describe_out = run_command(GITS, ["describe", "--tags", "--dirty",
"--always", "--long",
"--match", "%s*" % tag_prefix],
cwd=root)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
describe_out = describe_out.strip()
full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
if full_out is None:
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
pieces = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
# parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
# TAG might have hyphens.
git_describe = describe_out
# look for -dirty suffix
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
git_describe = git_describe[:git_describe.rindex("-dirty")]
# now we have TAG-NUM-gHEX or HEX
if "-" in git_describe:
# TAG-NUM-gHEX
mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
if not mo:
# unparseable. Maybe git-describe is misbehaving?
pieces["error"] = ("unable to parse git-describe output: '%s'"
% describe_out)
return pieces
# tag
full_tag = mo.group(1)
if not full_tag.startswith(tag_prefix):
if verbose:
fmt = "tag '%s' doesn't start with prefix '%s'"
print(fmt % (full_tag, tag_prefix))
pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
% (full_tag, tag_prefix))
return pieces
pieces["closest-tag"] = full_tag[len(tag_prefix):]
# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
# commit: short hex revision ID
pieces["short"] = mo.group(3)
else:
# HEX: no tags
pieces["closest-tag"] = None
count_out = run_command(GITS, ["rev-list", "HEAD", "--count"],
cwd=root)
pieces["distance"] = int(count_out) # total number of commits
return pieces
def plus_or_dot(pieces):
"""Return a + if we don't already have one, else return a ."""
if "+" in pieces.get("closest-tag", ""):
return "."
return "+"
def render_pep440(pieces):
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
Exceptions:
1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += plus_or_dot(pieces)
rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0+untagged.%d.g%s" % (pieces["distance"],
pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
def render_pep440_pre(pieces):
"""TAG[.post.devDISTANCE] -- No -dirty.
Exceptions:
1: no tags. 0.post.devDISTANCE
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"]:
rendered += ".post.dev%d" % pieces["distance"]
else:
# exception #1
rendered = "0.post.dev%d" % pieces["distance"]
return rendered
def render_pep440_post(pieces):
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
(a dirty tree will appear "older" than the corresponding clean one),
but you shouldn't be releasing software with -dirty anyways.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "g%s" % pieces["short"]
else:
# exception #1
rendered = "0.post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
rendered += "+g%s" % pieces["short"]
return rendered
def render_pep440_old(pieces):
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
else:
# exception #1
rendered = "0.post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
return rendered
def render_git_describe(pieces):
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"]:
rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
else:
# exception #1
rendered = pieces["short"]
if pieces["dirty"]:
rendered += "-dirty"
return rendered
def render_git_describe_long(pieces):
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always --long'.
The distance/hash is unconditional.
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
else:
# exception #1
rendered = pieces["short"]
if pieces["dirty"]:
rendered += "-dirty"
return rendered
def render(pieces, style):
"""Render the given version pieces into the requested style."""
if pieces["error"]:
return {"version": "unknown",
"full-revisionid": pieces.get("long"),
"dirty": None,
"error": pieces["error"]}
if not style or style == "default":
style = "pep440" # the default
if style == "pep440":
rendered = render_pep440(pieces)
elif style == "pep440-pre":
rendered = render_pep440_pre(pieces)
elif style == "pep440-post":
rendered = render_pep440_post(pieces)
elif style == "pep440-old":
rendered = render_pep440_old(pieces)
elif style == "git-describe":
rendered = render_git_describe(pieces)
elif style == "git-describe-long":
rendered = render_git_describe_long(pieces)
else:
raise ValueError("unknown style '%s'" % style)
return {"version": rendered, "full-revisionid": pieces["long"],
"dirty": pieces["dirty"], "error": None}
def get_versions():
"""Get version information or return default if unable to do so."""
# I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
# __file__, we can work backwards from there to the root. Some
# py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
# case we can only use expanded keywords.
cfg = get_config()
verbose = cfg.verbose
try:
return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
verbose)
except NotThisMethod:
pass
try:
root = os.path.realpath(__file__)
# versionfile_source is the relative path from the top of the source
# tree (where the .git directory might live) to this file. Invert
# this to find the root from __file__.
for i in cfg.versionfile_source.split('/'):
root = os.path.dirname(root)
except NameError:
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to find root of source tree"}
try:
pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
return render(pieces, cfg.style)
except NotThisMethod:
pass
try:
if cfg.parentdir_prefix:
return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
except NotThisMethod:
pass
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to compute version"}
# coding: utf-8
"""
Alignment module.
"""
from collections import namedtuple
from atropos.align._align import Aligner, MultiAligner, compare_prefixes, locate
from atropos.util import RandomMatchProbability, reverse_complement
# flags for global alignment
# The interpretation of the first flag is:
# An initial portion of seq1 may be skipped at no cost.
# This is equivalent to saying that in the alignment,
# gaps in the beginning of seq2 are free.
#
# The other flags have an equivalent meaning.
START_WITHIN_SEQ1 = 1
START_WITHIN_SEQ2 = 2
STOP_WITHIN_SEQ1 = 4
STOP_WITHIN_SEQ2 = 8
# Use this to get regular semiglobal alignment
# (all gaps in the beginning or end are free)
SEMIGLOBAL = (
START_WITHIN_SEQ1 | START_WITHIN_SEQ2 |
STOP_WITHIN_SEQ1 | STOP_WITHIN_SEQ2)
def compare_suffixes(
suffix_ref, suffix_query, wildcard_ref=False, wildcard_query=False):
"""Find out whether one string is the suffix of the other one, allowing
mismatches. Used to find an anchored 3' adapter when no indels are allowed.
Args:
suffix_ref, suffix_query: The suffices to compare.
wildcard_ref, wildcard_query: Whether wildcards are valid in either of
the suffices.
"""
suffix_ref = suffix_ref[::-1]
suffix_query = suffix_query[::-1]
_, length, _, _, matches, errors = compare_prefixes(
suffix_ref, suffix_query, wildcard_ref, wildcard_query)
return (
len(suffix_ref) - length, len(suffix_ref), len(suffix_query) - length,
len(suffix_query), matches, errors)
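# Example (illustrative): the returned tuple mirrors compare_prefixes but is
# counted from the 3' ends, laid out as
# (ref_start, ref_stop, query_start, query_stop, matches, errors):
#
#     compare_suffixes("ACGTACGT", "TTTTACGT")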
# Common match-result object returned by aligners
# TODO creating instances of this class is relatively slow and responsible for
# quite some runtime.
class Match(object):
"""An alignment match.
Args:
astart: Starting position of the match within the adapter.
astop: Ending position of the match within the adapter.
rstart: Starting position of the match within the read.
rstop: Ending position of the match within the read.
matches: Number of matching bases.
errors: Number of mismatching bases.
front: Whether the match is to the front of the read.
adapter: The :class:`Adapter`.
read: The :class:`Sequence`.
length: The length of the match.
"""
__slots__ = [
'astart', 'astop', 'rstart', 'rstop', 'matches', 'errors', 'front',
'adapter', 'read', 'length']
def __init__(
self, astart, astop, rstart, rstop, matches, errors,
front=None, adapter=None, read=None):
self.astart = astart
self.astop = astop
self.rstart = rstart
self.rstop = rstop
self.matches = matches
self.errors = errors
self.front = self._guess_is_front() if front is None else front
self.adapter = adapter
self.read = read
# Number of aligned characters in the adapter. If there are indels,
# this may be different from the number of characters in the read.
self.length = self.astop - self.astart
if self.length <= 0:
raise ValueError('Match length must be > 0')
if self.length - self.errors <= 0:
raise ValueError('A Match requires at least one matching position.')
# TODO: this assertion may not always hold now that we use both error
# rates and probabilities
#if self.adapter:
# assert self.errors / self.length <= self.adapter.max_error_rate
def __str__(self):
return (
'Match(astart={0}, astop={1}, rstart={2}, rstop={3}, matches={4}, '
'errors={5})').format(
self.astart, self.astop, self.rstart, self.rstop, self.matches,
self.errors)
def copy(self):
"""Create a copy of this Match.
"""
return Match(
self.astart, self.astop, self.rstart, self.rstop, self.matches,
self.errors, self.front, self.adapter, self.read)
def _guess_is_front(self):
"""Return whether this is guessed to be a front adapter.
The match is assumed to be a front adapter when the first base of
the read is involved in the alignment to the adapter.
"""
return self.rstart == 0
def wildcards(self, wildcard_char='N'):
"""Return a string that contains, for each wildcard character,
the character that it matches. For example, if the adapter
ATNGNA matches ATCGTA, then the string 'CT' is returned.
If there are indels, this is not reliable as the full alignment
is not available.
"""
wildcards = [
self.read.sequence[self.rstart + i]
for i in range(self.length)
if (self.adapter.sequence[self.astart + i] == wildcard_char and
self.rstart + i < len(self.read.sequence))
]
return ''.join(wildcards)
def rest(self):
"""Returns the part of the read before this match if this is a
'front' (5') adapter, or the part after the match if this is not
a 'front' adapter (3'). This can be an empty string.
"""
if self.front:
return self.read.sequence[:self.rstart]
else:
return self.read.sequence[self.rstop:]
# TODO: write test
def get_info_record(self):
"""Returns a :class:`MatchInfo`, which contains information about the
match to write into the info file.
"""
seq = self.read.sequence
qualities = self.read.qualities
if qualities is None:
qualities = ''
rsize = rsize_total = self.rstop - self.rstart
if self.front and self.rstart > 0:
rsize_total = self.rstop
elif not self.front and self.rstop < len(seq):
rsize_total = len(seq) - self.rstart
return MatchInfo(
self.read.name,
self.errors,
self.rstart,
self.rstop,
seq[0:self.rstart],
seq[self.rstart:self.rstop],
seq[self.rstop:],
self.adapter.name,
qualities[0:self.rstart],
qualities[self.rstart:self.rstop],
qualities[self.rstop:],
self.front,
self.astop - self.astart,
rsize, rsize_total)
MatchInfo = namedtuple("MatchInfo", (
"read_name", "errors", "rstart", "rstop", "seq_before", "seq_adapter",
"seq_after", "adapter_name", "qual_before", "qual_adapter", "qual_after",
"is_front", "asize", "rsize_adapter", "rsize_total"))
class InsertAligner(object):
"""Implementation of an insert matching algorithm.
If the inserts align, the overhangs are searched for the adapter sequences.
Otherwise, each read is searched for its adapter separately.
This only works with paired-end reads with 3' adapters.
Args:
adapter1, adapter2: read1, read2 adapters.
match_probability: Callable that calculates random match probability
given arguments (num_matches, match_len).
insert_max_rmp: Max random match probability for the insert match.
adapter_max_rmp: Max random match probability for the adapter match.
min_insert_overlap: Minimum number of bases the inserts must overlap
to be considered matching.
max_insert_mismatch_frac: Maximum fraction of mismatching bases between
the inserts to be considered matching.
min_adapter_overlap: Minimum number of bases the adapter must overlap
the read to be considered matching.
max_adapter_mismatch_frac: Maximum fraction of mismatching bases between
the adapter and the read to be considered matching.
adapter_check_cutoff: Threshold number of matching bases required before
adapter matches are checked against random match probability.
base_probs: Dict of (match_prob, mismatch_prob), which are the
probabilities passed to
:method:`atropos.util.RandomMatchProbability.__call__()`.
"""
def __init__(
self, adapter1, adapter2,
match_probability=RandomMatchProbability(),
insert_max_rmp=1E-6, adapter_max_rmp=0.001,
min_insert_overlap=1, max_insert_mismatch_frac=0.2,
min_adapter_overlap=1, max_adapter_mismatch_frac=0.2,
adapter_check_cutoff=9, base_probs=None,
adapter_wildcards=True, read_wildcards=False):
self.adapter1 = adapter1
self.adapter1_len = len(adapter1)
self.adapter2 = adapter2
self.adapter2_len = len(adapter2)
self.match_probability = match_probability
self.insert_max_rmp = insert_max_rmp
self.adapter_max_rmp = adapter_max_rmp
self.min_insert_overlap = min_insert_overlap
self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
self.min_adapter_overlap = min_adapter_overlap
self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
self.adapter_check_cutoff = adapter_check_cutoff
self.base_probs = base_probs or dict(
match_prob=0.25, mismatch_prob=0.75)
self.adapter_wildcards = adapter_wildcards
self.read_wildcards = read_wildcards
self.aligner = MultiAligner(
max_insert_mismatch_frac,
START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
min_insert_overlap)
def match_insert(self, seq1, seq2):
"""Use cutadapt aligner for insert and adapter matching.
Args:
seq1, seq2: Sequences to match.
Returns:
A :class:`Match` object, or None if there is no match.
"""
seq_len1 = len(seq1)
seq_len2 = len(seq2)
seq_len = min(seq_len1, seq_len2)
if seq_len1 > seq_len2:
seq1 = seq1[:seq_len2]
elif seq_len2 > seq_len1:
seq2 = seq2[:seq_len1]
seq2_rc = reverse_complement(seq2)
def _match(_insert_match, _offset, _insert_match_size, _):
if _offset < self.min_adapter_overlap:
# The reads are mostly overlapping, to the point where
# there's not enough overhang to do a confident adapter
# match. We return just the insert match to signal that
# error correction can be done even though no adapter
# trimming is required.
return (_insert_match, None, None)
# TODO: this is very sensitive to the exact correct choice of
# adapter. For example, if you specify GATCGGAA... and the correct
# adapter is AGATCGGAA..., the prefixes will not match exactly and
# the alignment will fail. We need to use a comparison that is a bit
# more forgiving.
def _adapter_match(insert_seq, adapter_seq, adapter_len):
amatch = compare_prefixes(
insert_seq[_insert_match_size:], adapter_seq,
wildcard_ref=self.adapter_wildcards,
wildcard_query=self.read_wildcards)
alen = min(_offset, adapter_len)
return amatch, alen, round(alen * self.max_adapter_mismatch_frac)
a1_match, a1_length, a1_max_mismatches = _adapter_match(
seq1, self.adapter1, self.adapter1_len)
a2_match, a2_length, a2_max_mismatches = _adapter_match(
seq2, self.adapter2, self.adapter2_len)
if (
a1_match[5] > a1_max_mismatches and
a2_match[5] > a2_max_mismatches):
return None
if min(a1_length, a2_length) > self.adapter_check_cutoff:
a1_prob = self.match_probability(a1_match[4], a1_length)
a2_prob = self.match_probability(a2_match[4], a2_length)
if (a1_prob * a2_prob) > self.adapter_max_rmp:
return None
mismatches = min(a1_match[5], a2_match[5])
def _create_match(alen, slen):
alen = min(alen, slen - _insert_match_size)
_mismatches = min(alen, mismatches)
_matches = alen - _mismatches
return Match(0, alen, _insert_match_size, slen, _matches, _mismatches)
return (
_insert_match,
_create_match(a1_length, seq_len1),
_create_match(a2_length, seq_len2)
)
# # This is the old way of doing things, where we use the built-in
# # Aligner to do a single match.
# aligner = Aligner(
# seq2_rc,
# self.max_insert_mismatch_frac,
# START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
# False, False)
# aligner.min_overlap = self.min_insert_overlap
# aligner.indel_cost = 100000
#
# insert_match = aligner.locate(seq1)
#
# if not insert_match:
# return None
#
# offset = min(insert_match[0], seq_len - insert_match[3])
# insert_match_size = seq_len - offset
# prob = self.match_probability(insert_match[4], insert_match_size)
#
# if prob > self.insert_max_rmp:
# return None
#
# return _match(insert_match, offset, insert_match_size, prob)
        # Use an aligner that returns all matches that satisfy the
        # overlap and error rate thresholds. We filter the candidates by
        # random-match probability, sort the remainder, and then check each
        # in turn until we find one with an adapter match (if any).
insert_matches = self.aligner.locate(seq2_rc, seq1)
if insert_matches:
# Filter by random-match probability
filtered_matches = []
for insert_match in insert_matches:
offset = min(insert_match[0], seq_len - insert_match[3])
insert_match_size = seq_len - offset
prob = self.match_probability(insert_match[4], insert_match_size, **self.base_probs)
if prob <= self.insert_max_rmp:
filtered_matches.append((insert_match, offset, insert_match_size, prob))
if filtered_matches:
if len(filtered_matches) == 1:
return _match(*filtered_matches[0])
else:
# Test matches in order of random-match probability.
# TODO: compare against sorting by length (which is how
# SeqPurge essentially does it).
# filtered_matches.sort(key=lambda x: x[2], reverse=True)
filtered_matches.sort(key=lambda x: x[3])
for match_args in filtered_matches:
match = _match(*match_args)
if match:
return match
return None
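# A minimal usage sketch of InsertAligner (hedged: the adapter sequence and
# the read variables below are illustrative, not part of this module):
#
#     matcher = InsertAligner("AGATCGGAAGAGC", "AGATCGGAAGAGC")
#     result = matcher.match_insert(read1_seq, read2_seq)
#     if result:
#         insert_match, adapter_match1, adapter_match2 = result
#
# A None second/third element means the reads overlap almost fully, so error
# correction is possible even though no adapter trimming is required.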
from enum import IntFlag
from typing import Tuple
from ._aligner import Aligner, MultiAligner, compare_prefixes, locate
MatchTuple = Tuple[int, int, int, int, int, int]
class GapRule(IntFlag):
START_WITHIN_SEQ1 = 1
"""Gaps at begining of seq2 have no penalty."""
START_WITHIN_SEQ2 = 2
"""Gaps at begining of seq1 have no penalty."""
STOP_WITHIN_SEQ1 = 4
"""Gaps at end of seq2 have no penalty."""
STOP_WITHIN_SEQ2 = 8
"""Gaps at end of seq1 have no penalty"""
SEMIGLOBAL = (
START_WITHIN_SEQ1
| START_WITHIN_SEQ2
| STOP_WITHIN_SEQ1
| STOP_WITHIN_SEQ2
)
"""Typical semiglobal alignment (all gaps in the beginning or end are free)"""
def compare_suffixes(
suffix_ref: str,
suffix_query: str,
wildcard_ref: bool = False,
wildcard_query: bool = False,
) -> MatchTuple:
"""
Find out whether one string is the suffix of the other one, allowing mismatches.
Used to find an anchored 3' adapter when no indels are allowed.
Args:
        suffix_ref, suffix_query: The suffixes to compare.
        wildcard_ref, wildcard_query: Whether wildcards are valid in either of
            the suffixes.
"""
suffix_ref = suffix_ref[::-1]
suffix_query = suffix_query[::-1]
_, length, _, _, matches, errors = compare_prefixes(
suffix_ref, suffix_query, wildcard_ref, wildcard_query
)
return (
len(suffix_ref) - length,
len(suffix_ref),
len(suffix_query) - length,
len(suffix_query),
matches,
errors,
)
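# Worked example (assumes the compiled compare_prefixes is importable): "SSI"
# is an exact 3-base suffix of "MISSI". Reversed, "ISS" is compared against
# "ISSIM", giving 3 matches and 0 errors over the 3-base overlap, so
# compare_suffixes("SSI", "MISSI") returns (0, 3, 2, 5, 3, 0).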
# cython: profile=False, emit_code_comments=False
# TODO: Flouri et al. report errors in Gotoh's original paper that persist
# in most implementations of NW alignment
# (http://biorxiv.org/content/biorxiv/early/2015/11/12/031500.full.pdf).
# They provide a correct implementation
# (qalign: http://www.exelixis-lab.org/web/software/alignment/).
from cpython.mem cimport PyMem_Free, PyMem_Realloc
from cpython.array cimport array
cdef array ld_array = array('d', [])
from libc.math cimport ceil
DEF START_WITHIN_SEQ1 = 1
DEF START_WITHIN_SEQ2 = 2
......@@ -18,8 +19,11 @@ DEF SEMIGLOBAL = 15
# structure for a DP matrix entry
ctypedef struct _Entry:
int cost
    # no. of matches in this alignment
    int matches
    # where the alignment originated: negative for positions within seq1, positive for
    # positions within seq2
    int origin
ctypedef struct _Match:
int origin
......@@ -112,11 +116,11 @@ class DPMatrix:
"""
Return a representation of the matrix as a string.
"""
        rows = [" " + " ".join(c.rjust(2) for c in self.query)]
        for c, row in zip(" " + self.reference, self._rows):
            r = c + " " + " ".join("  " if v is None else f"{v:2d}" for v in row)
            rows.append(r)
        return "\n".join(rows)
cdef class Aligner:
"""
......@@ -125,8 +129,8 @@ cdef class Aligner:
Locate one string within another by computing an optimal semiglobal
alignment between string1 and string2.
    The alignment uses unit costs, which means that mismatches, insertions and deletions
    are counted as one error.
flags is a bitwise 'or' of the allowed flags.
To allow skipping of a prefix of string1 at no cost, set the
......@@ -143,7 +147,8 @@ cdef class Aligner:
The skipped parts are described with two intervals (start1, stop1),
(start2, stop2).
    For example, an optimal semiglobal alignment of SISSI and MISSISSIPPI looks like
    this:
---SISSI---
MISSISSIPPI
......@@ -167,9 +172,9 @@ cdef class Aligner:
leftmost position within the read.
The alignment itself is not returned, only the tuple
(start1, stop1, start2, stop2, matches, errors), where the first four fields have
the meaning as described, matches is the number of matches and errors is the
number of errors in the alignment.
It is always the case that at least one of start1 and start2 is zero.
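    For example (an illustrative sketch; ``aligner`` and ``query`` are
    hypothetical names)::

        start1, stop1, start2, stop2, matches, errors = aligner.locate(query)
        # reference[start1:stop1] is aligned to query[start2:stop2], with
        # `matches` matching bases and `errors` unit-cost errors.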
......@@ -194,8 +199,16 @@ cdef class Aligner:
cdef bytes _reference # TODO rename to translated_reference or so
cdef str str_reference
def __cinit__(
self,
str reference,
double max_error_rate,
int flags=SEMIGLOBAL,
bint wildcard_ref=False,
bint wildcard_query=False,
int min_overlap=1,
int indel_cost=1
):
self.max_error_rate = max_error_rate
self.flags = flags
self.wildcard_ref = wildcard_ref
......@@ -232,7 +245,9 @@ cdef class Aligner:
return self._reference
def __set__(self, str reference):
mem = <_Entry*> PyMem_Realloc(
self.column, (len(reference) + 1) * sizeof(_Entry)
)
if not mem:
raise MemoryError()
self.column = mem
......@@ -461,7 +476,14 @@ cdef class Aligner:
length = i + min(column[i].origin, 0)
cost = column[i].cost
matches = column[i].matches
if (
length >= self._min_overlap and
cost <= length * max_error_rate and (
matches > best.matches or (
matches == best.matches and cost < best.cost
)
)
):
# update best
best.matches = matches
best.cost = cost
......@@ -484,17 +506,27 @@ cdef class Aligner:
start2 = 0
assert best.ref_stop - start1 > 0 # Do not return empty alignments.
return start1, best.ref_stop, start2, best.query_stop, best.matches, best.cost
def __dealloc__(self):
PyMem_Free(self.column)
def locate(
str reference,
str query,
double max_error_rate,
int flags=SEMIGLOBAL,
bint wildcard_ref=False,
bint wildcard_query=False,
int min_overlap=1
):
aligner = Aligner(reference, max_error_rate, flags, wildcard_ref, wildcard_query)
aligner.min_overlap = min_overlap
return aligner.locate(query)
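# A hypothetical one-off call through the wrapper above (the adapter sequence
# is just an example; a falsy result means no alignment met the thresholds):
#
#     result = locate("AGATCGGAAGAGC", read_seq, 0.1, min_overlap=3)
#     if result:
#         start1, stop1, start2, stop2, matches, errors = result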
def compare_prefixes(
str ref, str query, bint wildcard_ref=False, bint wildcard_query=False
):
"""
Find out whether one string is the prefix of the other one, allowing
IUPAC wildcards in ref and/or query if the appropriate flag is set.
......@@ -537,7 +569,7 @@ def compare_prefixes(str ref, str query, bint wildcard_ref=False, bint wildcard_
matches += 1
# length - matches = no. of errors
return 0, length, 0, length, matches, length - matches
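# Worked example: comparing "AGAT" against "AGCTC" scans min(4, 5) = 4 leading
# bases, of which 3 match (the A/C pair mismatches), so the result is
# (0, 4, 0, 4, 3, 1).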
DEF OVERHANG_MULTIPLIER = 100000
......@@ -562,6 +594,11 @@ cdef class MultiAligner:
self._num_cols = 0
self._num_matches = 0
def __repr__(self):
return f"MultiAligner<max_error_rate={self.max_error_rate}, " \
f"flags={self.flags}, min_overlap={self._min_overlap}, " \
f"_num_cols={self._num_cols}, _num_matches={self._num_matches}>"
def _resize_matrix(self, size):
if size > self._num_cols:
mem = <_Entry*> PyMem_Realloc(self.column, (size + 1) * sizeof(_Entry))
......@@ -741,7 +778,10 @@ cdef class MultiAligner:
continue
length = i + min(column[i].origin, 0)
if (
length >= self._min_overlap and
cost <= length * max_error_rate
):
# update best
match_array[num_matches].ref_stop = i
match_array[num_matches].query_stop = n
......@@ -768,7 +808,14 @@ cdef class MultiAligner:
start1 = -_match.origin
start2 = 0
assert _match.ref_stop - start1 > 0 # Do not return empty alignments.
return (
start1,
_match.ref_stop,
start2,
_match.query_stop,
_match.matches,
_match.cost
)
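    # Usage sketch (names are illustrative): unlike Aligner.locate, this
    # returns every candidate alignment that meets the min_overlap and
    # max_error_rate thresholds, which is how InsertAligner consumes it:
    #
    #     multi = MultiAligner(0.2, START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2, 1)
    #     for start1, stop1, start2, stop2, matches, cost in multi.locate(ref, query):
    #         ...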
def __dealloc__(self):
PyMem_Free(self.column)