Commit 6c56f54b authored by Liubov Chuprikova

New upstream version 2019.7.0

parent 62b201c9
[run]
branch = True
omit =
*/tests*
*/__init__.py
q2_cutadapt/_version.py
versioneer.py
[report]
omit =
*/tests*
*/__init__.py
q2_cutadapt/_version.py
versioneer.py
q2cli/_version.py export-subst
q2_cutadapt/_version.py export-subst
@@ -63,3 +63,8 @@ target/
# vi
.*.swp
# other
*~
.DS_store
@@ -14,12 +14,12 @@ install:
- wget -q https://raw.githubusercontent.com/qiime2/environment-files/master/latest/staging/qiime2-latest-py36-linux-conda.yml
- conda env create -q -n test-env --file qiime2-latest-py36-linux-conda.yml
- source activate test-env
- conda install -q nose
- pip install -q flake8
- conda install -q pytest-cov
- pip install flake8 coveralls
- pip install -q https://github.com/qiime2/q2lint/archive/master.zip
- make install
script:
- make lint
- make test
- QIIMETEST= source tab-qiime
- QIIMETEST= qiime info
- make test-cov
after_success:
- coveralls
BSD 3-Clause License
Copyright (c) 2016-2019, QIIME 2 development team.
Copyright (c) 2017-2019, QIIME 2 development team.
All rights reserved.
Redistribution and use in source and binary forms, with or without
include versioneer.py
include q2cli/_version.py
include q2_cutadapt/_version.py
.PHONY: all lint test install dev clean distclean
.PHONY: all lint test test-cov install dev clean distclean
PYTHON ?= python
PREFIX ?= $(CONDA_PREFIX)
all: ;
@@ -10,17 +9,16 @@ lint:
flake8
test: all
QIIMETEST= nosetests
py.test
test-cov: all
py.test --cov=q2_cutadapt
install: all
$(PYTHON) setup.py install && \
mkdir -p $(PREFIX)/etc/conda/activate.d && \
cp hooks/50_activate_q2cli_tab_completion.sh $(PREFIX)/etc/conda/activate.d/
$(PYTHON) setup.py install
dev: all
pip install -e . && \
mkdir -p $(PREFIX)/etc/conda/activate.d && \
cp hooks/50_activate_q2cli_tab_completion.sh $(PREFIX)/etc/conda/activate.d/
pip install -e .
clean: distclean
# q2cli
A [click-based](http://click.pocoo.org/) command line interface for [QIIME 2](https://github.com/qiime2/qiime2).
# q2-cutadapt
## Installation and getting help
[![Build Status](https://travis-ci.org/qiime2/q2-cutadapt.svg?branch=master)](https://travis-ci.org/qiime2/q2-cutadapt) [![Coverage Status](https://coveralls.io/repos/github/qiime2/q2-cutadapt/badge.svg?branch=master)](https://coveralls.io/github/qiime2/q2-cutadapt?branch=master)
Visit https://qiime2.org to learn more about q2cli and the QIIME 2 project.
## Enabling tab completion
### Bash
To enable tab completion in Bash, run the following command or add it to your `.bashrc`/`.bash_profile`:
```bash
source tab-qiime
```
### ZSH
To enable tab completion in ZSH, run the following commands or add them to your `.zshrc`:
```bash
autoload bashcompinit && bashcompinit && source tab-qiime
```
This is a QIIME 2 plugin. For details on QIIME 2 and tutorials demonstrating how to use this plugin, see the [QIIME 2 documentation](https://qiime2.org/).
#!/usr/bin/env bash
# Bash completion script that defers to a cached completion script representing
# the state of the current QIIME 2 deployment.
#
# This script is intended to be executed on the command-line or in
# .bashrc/.bash_profile:
#
# source tab-qiime
#
_qiime_completion()
{
# Attempt to find the cached completion script. If q2cli isn't installed, or
# is an incompatible version, don't attempt completion.
# Split the declaration from the assignment so that $? below reflects the
# python call rather than the `local` builtin.
local completion_path
completion_path="$(python -c "import q2cli.util; print(q2cli.util.get_completion_path())" 2> /dev/null)"
if [[ $? != 0 ]]; then
unset COMPREPLY
return 0
fi
# If the completion script exists, attempt completion by invoking the script
# in a subshell, supplying COMP_WORDS and COMP_CWORD. Capture the output as
# the completion reply. If the completion script failed, don't attempt
# completion.
if [[ -f "$completion_path" ]] ; then
COMPREPLY=( $(COMP_WORDS="${COMP_WORDS[*]}" COMP_CWORD="${COMP_CWORD}" "$completion_path" 2> /dev/null) )
if [[ $? != 0 ]]; then
unset COMPREPLY
return 0
fi
else
unset COMPREPLY
return 0
fi
return 0
}
# Enable default readline and bash completion behavior when `_qiime_completion`
# doesn't have a reply.
complete -F _qiime_completion -o default -o bashdefault qiime
# Execute a `qiime` command (any command will do) so that tab-completion will
# work out-of-the-box (e.g. with a fresh installation of q2cli). Running a
# command will create or refresh the cache if necessary, which contains the
# actual completion script.
#
# Ignore stdout to avoid displaying help text to users enabling tab-completion.
# stderr displays the note about cache refreshing, as that can take a few
# moments to complete.
qiime > /dev/null
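
The completion function above locates the cached completion script with `q2cli.util.get_completion_path()`. A quick way to see where that cache lives from Python (a minimal sketch, assuming q2cli is installed in the active environment):

```python
# Print the path of the cached completion script that tab-qiime sources; the
# cache is created or refreshed whenever a `qiime` command runs.
import q2cli.util

print(q2cli.util.get_completion_path())
```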
@@ -3,7 +3,7 @@
{% set release = '.'.join(version.split('.')[:2]) %}
package:
name: q2cli
name: q2-cutadapt
version: {{ version }}
source:
@@ -11,26 +11,25 @@ source:
build:
script: make install
entry_points:
- qiime=q2cli.__main__:qiime
requirements:
host:
- python {{ python }}
- python {{ python }}
- setuptools
run:
- python {{ python }}
- pip
- click
- python {{ python }}
- cutadapt
- pigz
- pandas
- numpy
- qiime2 {{ release }}.*
- q2-types {{ release }}.*
test:
imports:
- q2cli
commands:
- QIIMETEST= qiime --help
- q2_cutadapt
- qiime2.plugins.cutadapt
about:
home: https://qiime2.org
if [ -n "${ZSH_VERSION-}" ]; then
autoload bashcompinit && bashcompinit && source tab-qiime
elif [ -n "${BASH_VERSION-}" ]; then
source tab-qiime
fi
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2019, QIIME 2 development team.
# Copyright (c) 2017-2019, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -8,5 +8,6 @@
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
# ----------------------------------------------------------------------------
# Copyright (c) 2017-2019, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import gzip
import os
import subprocess
import tempfile
import qiime2
from q2_types.per_sample_sequences import (
CasavaOneEightSingleLanePerSampleDirFmt,
FastqGzFormat,
)
from q2_types.multiplexed_sequences import (
MultiplexedSingleEndBarcodeInSequenceDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt,
)
import pandas as pd
import numpy as np
def run_command(cmd, verbose=True):
print('Running external command line application. This may print '
'messages to stdout and/or stderr.')
print('The command being run is below. This command cannot '
'be manually re-run as it will depend on temporary files that '
'no longer exist.')
print('\nCommand:', end=' ')
print(' '.join(cmd), end='\n\n')
subprocess.run(cmd, check=True)
def _build_demux_command(seqs_dir_fmt, barcode_fhs, per_sample_dir_fmt,
untrimmed_dir_fmt, error_rate, minimum_length):
cmd = ['cutadapt',
'--front', 'file:%s' % barcode_fhs['fwd'].name,
'--error-rate', str(error_rate),
'--minimum-length', str(minimum_length),
# {name} is a cutadapt convention for interpolating the sample id
# into the filename.
'-o', os.path.join(str(per_sample_dir_fmt), '{name}.1.fastq.gz'),
'--untrimmed-output',
os.path.join(str(untrimmed_dir_fmt), 'forward.fastq.gz'),
]
if isinstance(seqs_dir_fmt, MultiplexedPairedEndBarcodeInSequenceDirFmt):
# PAIRED-END
if barcode_fhs['rev'] is not None:
# Dual indices
cmd += [
'--pair-adapters',
'-G', 'file:%s' % barcode_fhs['rev'].name,
]
cmd += [
'-p', os.path.join(str(per_sample_dir_fmt), '{name}.2.fastq.gz'),
'--untrimmed-paired-output',
os.path.join(str(untrimmed_dir_fmt), 'reverse.fastq.gz'),
str(seqs_dir_fmt.forward_sequences.view(FastqGzFormat)),
str(seqs_dir_fmt.reverse_sequences.view(FastqGzFormat)),
]
else:
# SINGLE-END
cmd += [str(seqs_dir_fmt.file.view(FastqGzFormat))]
return cmd
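
For orientation, here is roughly the single-end command the builder above produces. This is an illustrative, self-contained sketch: the paths and the default parameter values (error rate 0.1, minimum length 1) are hypothetical stand-ins for what the real call derives from the QIIME 2 directory formats and the temporary barcode FASTA.

```python
# Hypothetical inputs; in _build_demux_command these come from the directory
# formats and a NamedTemporaryFile holding the per-batch barcode FASTA.
barcode_fasta = '/tmp/barcodes.fa'
per_sample_dir = '/tmp/per-sample'
untrimmed_dir = '/tmp/untrimmed'
muxed_reads = '/tmp/muxed/forward.fastq.gz'

cmd = ['cutadapt',
       '--front', 'file:%s' % barcode_fasta,
       '--error-rate', '0.1',
       '--minimum-length', '1',
       # cutadapt expands {name} to the id of the matching barcode record.
       '-o', '%s/{name}.1.fastq.gz' % per_sample_dir,
       '--untrimmed-output', '%s/forward.fastq.gz' % untrimmed_dir,
       muxed_reads]
print(' '.join(cmd))
```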
def _rename_files(seqs_dir_fmt, per_sample_dir_fmt, barcode_series):
read_directions = [1]
if isinstance(seqs_dir_fmt, MultiplexedPairedEndBarcodeInSequenceDirFmt):
# PAIRED-END
read_directions.append(2)
for (sample_id, barcode_id) in barcode_series.iteritems():
for read_direction in read_directions:
out_fp = per_sample_dir_fmt.sequences.path_maker(
sample_id=sample_id, barcode_id=barcode_id,
lane_number=1, read_number=read_direction)
src = os.path.join(str(per_sample_dir_fmt),
'%s.%d.fastq.gz' % (sample_id,
read_direction))
if os.path.isfile(src):
os.rename(src, str(out_fp))
def _write_barcode_fasta(barcode_series, barcode_fasta):
with open(barcode_fasta.name, 'w') as fh:
for (sample_id, barcode) in barcode_series.iteritems():
fh.write('>%s\n%s\n' % (sample_id, barcode))
def _write_empty_fastq_to_mux_barcode_in_seq_fmt(seqs_dir_fmt):
fastq = FastqGzFormat()
with gzip.open(str(fastq), 'w') as fh:
fh.write(b'')
# PAIRED-END
if isinstance(seqs_dir_fmt, MultiplexedPairedEndBarcodeInSequenceDirFmt):
seqs_dir_fmt.forward_sequences.write_data(fastq, FastqGzFormat)
seqs_dir_fmt.reverse_sequences.write_data(fastq, FastqGzFormat)
# SINGLE-END
else:
seqs_dir_fmt.file.write_data(fastq, FastqGzFormat)
def _demux(seqs, forward_barcodes, reverse_barcodes, error_tolerance,
mux_fmt, batch_size, minimum_length):
fwd_barcode_name = forward_barcodes.name
forward_barcodes = forward_barcodes.drop_missing_values()
barcodes = forward_barcodes.to_series().to_frame()
if reverse_barcodes is not None:
barcode_pairs = set()
samples_w_missing_barcodes = set()
samples_w_dup_barcode_pairs = set()
rev_barcode_name = reverse_barcodes.name
rev_barcodes = reverse_barcodes.to_series()
# 'sort=False' below prevents a warning about future behavior changes
# by selecting the future behavior explicitly
barcodes = pd.concat([barcodes, rev_barcodes], axis=1, sort=False)
for sample_id, f_barcode, r_barcode in barcodes.itertuples():
if pd.isnull(f_barcode) or pd.isnull(r_barcode):
samples_w_missing_barcodes.add(sample_id)
if (f_barcode, r_barcode) in barcode_pairs:
samples_w_dup_barcode_pairs.add(sample_id)
barcode_pairs.add((f_barcode, r_barcode))
if samples_w_missing_barcodes:
raise ValueError('The following samples do not have both '
'forward and reverse barcodes: %s'
% ', '.join(sorted(samples_w_missing_barcodes)))
if samples_w_dup_barcode_pairs:
raise ValueError('The following samples have duplicate barcode'
' pairs: %s' %
', '.join(sorted(samples_w_dup_barcode_pairs)))
per_sample_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
n_samples = len(barcodes)
if batch_size > n_samples:
raise ValueError('The batch_size (%d) cannot be greater than the '
'number of samples (%d).' % (
batch_size, n_samples))
batch_size = n_samples if batch_size == 0 else batch_size
batches = np.arange(n_samples) // batch_size
previous_untrimmed = seqs
for _, barcode_batch in barcodes.groupby(batches):
current_untrimmed = mux_fmt()
_write_empty_fastq_to_mux_barcode_in_seq_fmt(current_untrimmed)
open_fhs = {'fwd': tempfile.NamedTemporaryFile(), 'rev': None}
_write_barcode_fasta(barcode_batch[fwd_barcode_name], open_fhs['fwd'])
if reverse_barcodes is not None:
open_fhs['rev'] = tempfile.NamedTemporaryFile()
_write_barcode_fasta(barcode_batch[rev_barcode_name],
open_fhs['rev'])
cmd = _build_demux_command(previous_untrimmed, open_fhs,
per_sample_sequences,
current_untrimmed, error_tolerance,
minimum_length)
run_command(cmd)
open_fhs['fwd'].close()
if reverse_barcodes is not None:
open_fhs['rev'].close()
previous_untrimmed = current_untrimmed
# Only use the forward barcode in the renamed files
_rename_files(seqs, per_sample_sequences, barcodes[fwd_barcode_name])
muxed = len(list(per_sample_sequences.sequences.iter_views(FastqGzFormat)))
if muxed == 0:
raise ValueError('No samples were demultiplexed.')
return per_sample_sequences, previous_untrimmed
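
The batching logic above rewards a worked example: `np.arange(n_samples) // batch_size` labels consecutive samples with an integer batch number, and each batch is demultiplexed against the reads left untrimmed by the previous batch. A small, self-contained illustration (sample ids and barcodes are hypothetical; `batch_size=0` in the real function means one batch containing every sample):

```python
import numpy as np
import pandas as pd

# Five hypothetical samples with a batch size of two: the labels come out as
# [0, 0, 1, 1, 2], so cutadapt would be invoked three times.
barcodes = pd.DataFrame({'barcode': list('ACGTA')},
                        index=['s1', 's2', 's3', 's4', 's5'])
batch_size = 2
batches = np.arange(len(barcodes)) // batch_size
for label, batch in barcodes.groupby(batches):
    print(label, list(batch.index))
```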
def demux_single(seqs: MultiplexedSingleEndBarcodeInSequenceDirFmt,
barcodes: qiime2.CategoricalMetadataColumn,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedSingleEndBarcodeInSequenceDirFmt):
mux_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt
return _demux(seqs, barcodes, None, error_rate, mux_fmt, batch_size,
minimum_length)
def demux_paired(seqs: MultiplexedPairedEndBarcodeInSequenceDirFmt,
forward_barcodes: qiime2.CategoricalMetadataColumn,
reverse_barcodes: qiime2.CategoricalMetadataColumn = None,
error_rate: float = 0.1,
batch_size: int = 0,
minimum_length: int = 1) -> \
(CasavaOneEightSingleLanePerSampleDirFmt,
MultiplexedPairedEndBarcodeInSequenceDirFmt):
mux_fmt = MultiplexedPairedEndBarcodeInSequenceDirFmt
return _demux(seqs, forward_barcodes, reverse_barcodes, error_rate,
mux_fmt, batch_size, minimum_length)
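
A hedged sketch of calling `demux_single` directly from Python. The paths, sample ids, and barcodes are hypothetical, and the module path in the import is assumed for illustration; in practice this action is usually driven through the QIIME 2 framework (e.g. `qiime cutadapt demux-single` on the command line), which handles the directory formats and metadata for you.

```python
import pandas as pd
import qiime2
from q2_types.multiplexed_sequences import (
    MultiplexedSingleEndBarcodeInSequenceDirFmt)
# Module path assumed for illustration; demux_single is defined above.
from q2_cutadapt._demux import demux_single

# Hypothetical barcode metadata column: one barcode sequence per sample id.
barcodes = qiime2.CategoricalMetadataColumn(pd.Series(
    ['ACGTACGT', 'TGCATGCA'],
    index=pd.Index(['sample_a', 'sample_b'], name='id'),
    name='barcode-sequence'))

# Hypothetical directory already containing the multiplexed forward.fastq.gz.
seqs = MultiplexedSingleEndBarcodeInSequenceDirFmt('/data/muxed', mode='r')

per_sample_seqs, untrimmed = demux_single(seqs, barcodes, error_rate=0.1)
```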
# ----------------------------------------------------------------------------
# Copyright (c) 2017-2019, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import subprocess
import pandas as pd
from q2_types.per_sample_sequences import (
CasavaOneEightSingleLanePerSampleDirFmt,
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt,
)
def run_commands(cmds, verbose=True):
print('Running external command line application. This may print '
'messages to stdout and/or stderr.')
print('The commands to be run are below. These commands cannot '
'be manually re-run as they will depend on temporary files that '
'no longer exist.')
for cmd in cmds:
print('\nCommand:', end=' ')
print(' '.join(cmd), end='\n\n')
subprocess.run(cmd, check=True)
_trim_defaults = {
'cores': 1,
'adapter_f': None,
'adapter_r': None,
'front_f': None,
'front_r': None,
'anywhere_f': None,
'anywhere_r': None,
'error_rate': 0.1,
'indels': True,
'times': 1,
'overlap': 3,
'match_read_wildcards': False,
'match_adapter_wildcards': True,
'minimum_length': 1,
'discard_untrimmed': False,
}
def _build_trim_command(f_read, r_read, trimmed_seqs,
cores=_trim_defaults['cores'],
adapter_f=_trim_defaults['adapter_f'],
front_f=_trim_defaults['front_f'],
anywhere_f=_trim_defaults['anywhere_f'],
adapter_r=_trim_defaults['adapter_r'],
front_r=_trim_defaults['front_r'],
anywhere_r=_trim_defaults['anywhere_r'],
error_rate=_trim_defaults['error_rate'],
indels=_trim_defaults['indels'],
times=_trim_defaults['times'],
overlap=_trim_defaults['overlap'],
match_read_wildcards=_trim_defaults[
'match_read_wildcards'],
match_adapter_wildcards=_trim_defaults[
'match_adapter_wildcards'],
minimum_length=_trim_defaults['minimum_length'],
discard_untrimmed=_trim_defaults['discard_untrimmed'],
):
cmd = [
'cutadapt',
'--cores', str(cores),
'--error-rate', str(error_rate),
'--times', str(times),
'--overlap', str(overlap),
'--minimum-length', str(minimum_length),
'-o', str(trimmed_seqs.path / os.path.basename(f_read)),
]
if r_read is not None:
cmd += ['-p', str(trimmed_seqs.path / os.path.basename(r_read))]
if adapter_f is not None:
for adapter in adapter_f:
cmd += ['--adapter', adapter]
if front_f is not None:
for adapter in front_f:
cmd += ['--front', adapter]
if anywhere_f is not None:
for adapter in anywhere_f:
cmd += ['--anywhere', adapter]
if adapter_r is not None:
for adapter in adapter_r:
cmd += ['-A', adapter] # cutadapt doesn't have a long-form flag
if front_r is not None:
for adapter in front_r:
cmd += ['-G', adapter] # cutadapt doesn't have a long-form flag
if anywhere_r is not None:
for adapter in anywhere_r:
cmd += ['-B', adapter] # cutadapt doesn't have a long-form flag
if not indels:
cmd += ['--no-indels']
if match_read_wildcards:
cmd += ['--match-read-wildcards']
if not match_adapter_wildcards:
cmd += ['--no-match-adapter-wildcards']
if discard_untrimmed:
cmd += ['--discard-untrimmed']
cmd += [f_read]
if r_read is not None:
cmd += [r_read]
return cmd
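
To make the flag mapping concrete, here is roughly the paired-end command the builder above assembles for a single sample with one 3' adapter on each read. Everything in it is hypothetical (file names, output directory, adapter sequence); in `trim_paired` the read paths come from the demultiplexed manifest and the outputs are written into the Casava-format result directory.

```python
# Hypothetical per-sample inputs/outputs and a single 3' adapter per read.
f_read = '/data/sample1_S01_L001_R1_001.fastq.gz'
r_read = '/data/sample1_S01_L001_R2_001.fastq.gz'
out_dir = '/tmp/trimmed'

cmd = ['cutadapt',
       '--cores', '1',
       '--error-rate', '0.1',
       '--times', '1',
       '--overlap', '3',
       '--minimum-length', '1',
       '-o', '%s/sample1_S01_L001_R1_001.fastq.gz' % out_dir,
       '-p', '%s/sample1_S01_L001_R2_001.fastq.gz' % out_dir,
       '--adapter', 'AGATCGGAAGAGC',  # forward-read 3' adapter
       '-A', 'AGATCGGAAGAGC',         # reverse-read 3' adapter
       f_read, r_read]
print(' '.join(cmd))
```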
def trim_single(demultiplexed_sequences:
SingleLanePerSampleSingleEndFastqDirFmt,
cores: int = _trim_defaults['cores'],
adapter: str = _trim_defaults['adapter_f'],
front: str = _trim_defaults['front_f'],
anywhere: str = _trim_defaults['anywhere_f'],
error_rate: float = _trim_defaults['error_rate'],
indels: bool = _trim_defaults['indels'],
times: int = _trim_defaults['times'],
overlap: int = _trim_defaults['overlap'],
match_read_wildcards:
bool = _trim_defaults['match_read_wildcards'],
match_adapter_wildcards:
bool = _trim_defaults['match_adapter_wildcards'],
minimum_length: int = _trim_defaults['minimum_length'],
discard_untrimmed:
bool = _trim_defaults['discard_untrimmed']) -> \
CasavaOneEightSingleLanePerSampleDirFmt:
trimmed_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
cmds = []
df = demultiplexed_sequences.manifest.view(pd.DataFrame)
for _, fwd in df.itertuples():
cmd = _build_trim_command(fwd, None,
trimmed_sequences, cores, adapter, front,
anywhere, None, None, None, error_rate,
indels, times, overlap, match_read_wildcards,
match_adapter_wildcards, minimum_length,
discard_untrimmed)
cmds.append(cmd)
run_commands(cmds)
return trimmed_sequences
def trim_paired(demultiplexed_sequences:
SingleLanePerSamplePairedEndFastqDirFmt,
cores: int = _trim_defaults['cores'],
adapter_f: str = _trim_defaults['adapter_f'],
front_f: str = _trim_defaults['front_f'],
anywhere_f: str = _trim_defaults['anywhere_f'],
adapter_r: str = _trim_defaults['adapter_r'],
front_r: str = _trim_defaults['front_r'],
anywhere_r: str = _trim_defaults['anywhere_r'],
error_rate: float = _trim_defaults['error_rate'],
indels: bool = _trim_defaults['indels'],
times: int = _trim_defaults['times'],
overlap: int = _trim_defaults['overlap'],
match_read_wildcards:
bool = _trim_defaults['match_read_wildcards'],
match_adapter_wildcards:
bool = _trim_defaults['match_adapter_wildcards'],
minimum_length: int = _trim_defaults['minimum_length'],
discard_untrimmed:
bool = _trim_defaults['discard_untrimmed']) -> \
CasavaOneEightSingleLanePerSampleDirFmt:
trimmed_sequences = CasavaOneEightSingleLanePerSampleDirFmt()
cmds = []
df = demultiplexed_sequences.manifest.view(pd.DataFrame)
for _, fwd, rev in df.itertuples():
cmd = _build_trim_command(fwd, rev, trimmed_sequences, cores,
adapter_f, front_f,
anywhere_f, adapter_r, front_r, anywhere_r,
error_rate, indels, times, overlap,
match_read_wildcards,
match_adapter_wildcards, minimum_length,
discard_untrimmed)
cmds.append(cmd)
run_commands(cmds)
return trimmed_sequences
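
Both `trim_single` and `trim_paired` walk the demultiplexed manifest row by row and queue one cutadapt command per sample before handing the whole list to `run_commands`. A self-contained sketch of that iteration with a hypothetical two-sample manifest:

```python
import pandas as pd

# Hypothetical manifest view: the index is the sample id, the columns are the
# absolute paths to the forward and reverse reads.
manifest = pd.DataFrame(
    {'forward': ['/data/a_R1.fastq.gz', '/data/b_R1.fastq.gz'],
     'reverse': ['/data/a_R2.fastq.gz', '/data/b_R2.fastq.gz']},
    index=pd.Index(['sample_a', 'sample_b'], name='sample-id'))

cmds = []
for _, fwd, rev in manifest.itertuples():
    # The real loop calls _build_trim_command(fwd, rev, ...) here.
    cmds.append(['cutadapt', '-o', '<out>', '-p', '<out>', fwd, rev])
print('%d commands queued' % len(cmds))
```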
@@ -24,8 +24,8 @@ def get_keywords():
# each be defined on a line of their own. _version.py will just call
# get_keywords().
git_refnames = " (tag: 2019.7.0)"
git_full = "06b978c96c8efce8be0c8213e744cb4b389f2bc6"
git_date = "2019-07-30 18:15:53 +0000"
git_full = "8dd928fc233deafbb705310875e9a32a8e323efe"
git_date = "2019-07-30 18:15:55 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
@@ -42,8 +42,8 @@ def get_config():
cfg.VCS = "git"
cfg.style = "pep440"
cfg.tag_prefix = ""
cfg.parentdir_prefix = "q2cli-"
cfg.versionfile_source = "q2cli/_version.py"
cfg.parentdir_prefix = "q2-quality-control-"
cfg.versionfile_source = "q2_quality_control/_version.py"
cfg.verbose = False
return cfg
@article{martin2011cutadapt,
title={Cutadapt removes adapter sequences from high-throughput sequencing reads},
author={Martin, Marcel},
journal={EMBnet. journal},
volume={17},
number={1},
pages={pp--10},
year={2011},
doi={10.14806/ej.17.1.200}
}
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2019, QIIME 2 development team.
# Copyright (c) 2017-2019, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
sample-id,filename,direction
# direction is not meaningful in this file as these
# data may be derived from forward, reverse, or
# joined reads
sample_a,sample_a_S01_L001_R1_001.fastq.gz,forward
sample_a,sample_a_S01_L001_R2_001.fastq.gz,reverse
sample_b,sample_b_S02_L001_R1_001.fastq.gz,forward
sample_b,sample_b_S00_L001_R2_001.fastq.gz,reverse
sample-id,filename,direction
# direction is not meaningful in this file as these
# data may be derived from forward, reverse, or
# joined reads
sample_a,sample_a_S01_L001_R1_001.fastq.gz,forward
sample_a,sample_a_S01_L001_R2_001.fastq.gz,reverse
sample_b,sample_b_S02_L001_R1_001.fastq.gz,forward
sample_b,sample_b_S02_L001_R2_001.fastq.gz,reverse
sample_c,sample_c_S03_L001_R1_001.fastq.gz,forward
sample_c,sample_c_S03_L001_R2_001.fastq.gz,reverse
\ No newline at end of file
sample-id,filename,direction
# direction is not meaningful in this file as these