Commits on Source (3)
@@ -19,7 +19,8 @@ requirements:
run:
- python {{ python }}
- - scikit-learn 0.20.2
+ - scikit-learn 0.21.2
+ - joblib
- scikit-bio
- biom-format >=2.1.5,<2.2.0
- blast >=2.6.0
@@ -27,6 +28,9 @@ requirements:
- vsearch <=2.7.0
- qiime2 {{ release }}.*
- q2-types {{ release }}.*
+ - q2-quality-control {{ release }}.*
+ - q2-taxa {{ release }}.*
+ - q2-feature-table {{ release }}.*
test:
imports:
......
q2-feature-classifier (2019.7.0-1) UNRELEASED; urgency=medium
* New upstream version
-- Liubov Chuprikova <chuprikovalv@gmail.com> Sat, 07 Sep 2019 13:41:46 +0200
q2-feature-classifier (2019.4.0-1) unstable; urgency=medium
* Initial release (Closes: #931887)
......
@@ -11,6 +11,7 @@ import subprocess
import pandas as pd
from os.path import isfile
from collections import Counter, defaultdict
+ import qiime2
def _get_default_unassignable_label():
@@ -40,7 +41,7 @@ def _consensus_assignments(
consensus = {'': ('', '')}
result = pd.DataFrame.from_dict(consensus, 'index')
result.index.name = 'Feature ID'
- result.columns = ['Taxon', 'Confidence']
+ result.columns = ['Taxon', 'Consensus']
return result
@@ -242,3 +243,9 @@ def _compute_consensus_annotation(annotations, min_consensus,
consensus_fraction_result = 1.0
return annotation, consensus_fraction_result
+ def _annotate_method(taxa, method):
+     taxa = taxa.view(pd.DataFrame)
+     taxa['Method'] = method
+     return qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)
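For illustration, a hedged sketch of what this new helper does for the hybrid pipeline further down: it views a FeatureData[Taxonomy] artifact as a DataFrame, appends a Method column naming the classification pass, and re-imports the result. The feature ID and taxon below are invented, not taken from the test data.

import pandas as pd
import qiime2

taxa = pd.DataFrame(
    {'Taxon': ['k__Bacteria; p__Proteobacteria'], 'Consensus': [1.0]},
    index=pd.Index(['feat1'], name='Feature ID'))
taxa['Method'] = 'VSEARCH'  # the only change the helper makes
annotated = qiime2.Artifact.import_data('FeatureData[Taxonomy]', taxa)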
@@ -9,7 +9,7 @@
from itertools import islice, repeat
from copy import deepcopy
- from sklearn.externals.joblib import Parallel, delayed
+ from joblib import Parallel, delayed
_specific_fitters = [
['naive_bayes',
@@ -38,7 +38,7 @@ def _extract_reads(reads):
def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
- pre_dispatch='2*n_jobs', confidence=-1.):
+ pre_dispatch='2*n_jobs', confidence='disable'):
return (m for c in Parallel(n_jobs=n_jobs, batch_size=1,
pre_dispatch=pre_dispatch)
(delayed(_predict_chunk)(pipeline, separator, confidence, chunk)
@@ -46,7 +46,7 @@ def predict(reads, pipeline, separator=';', chunk_size=262144, n_jobs=1,
def _predict_chunk(pipeline, separator, confidence, chunk):
- if confidence < 0.:
+ if confidence == 'disable':
return _predict_chunk_without_conf(pipeline, chunk)
else:
return _predict_chunk_with_conf(pipeline, separator, confidence, chunk)
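The move from a negative-float sentinel to the literal string 'disable' makes the intent explicit: under the old signature a typo such as confidence=-0.7 (intended as 0.7) silently skipped the confidence calculation, while the new union type registered further down rejects any float outside [0, 1]. A minimal sketch of the resulting dispatch rule, with illustrative return values:

def dispatch(confidence):
    # 'disable' short-circuits to the fast, confidence-free path; any float
    # in [0, 1] takes the confidence path (0 computes confidence without
    # truncating taxonomic depth).
    if confidence == 'disable':
        return 'no-confidence path'
    return 'confidence path, threshold={}'.format(confidence)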
......
@@ -11,7 +11,7 @@ import tarfile
import os
import sklearn
- from sklearn.externals import joblib
+ import joblib
from sklearn.pipeline import Pipeline
import qiime2.plugin
import qiime2.plugin.model as model
......
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
git_refnames = " (tag: 2019.4.0)"
git_full = "e2ba97ea232a02f4c489b51ff0481583d394a339"
git_date = "2019-05-03 04:14:49 +0000"
git_refnames = " (tag: 2019.7.0)"
git_full = "bc7ed3a741779ccb32114a229af65cc8c1510743"
git_date = "2019-07-30 18:15:57 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
......
@@ -6,13 +6,19 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
+ import tempfile
import pandas as pd
+ import qiime2
from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, DNAFASTAFormat)
from .plugin_setup import plugin, citations
- from qiime2.plugin import Int, Str, Float, Choices, Range
- from ._consensus_assignment import (_consensus_assignments,
- _get_default_unassignable_label)
+ from qiime2.plugin import Int, Str, Float, Choices, Range, Bool
+ from ._consensus_assignment import (_consensus_assignments, _run_command,
+ _get_default_unassignable_label,
+ _annotate_method)
+ from ._taxonomic_classifier import TaxonomicClassifier
+ from .classifier import _classify_parameters, _parameter_descriptions
def classify_consensus_vsearch(query: DNAFASTAFormat,
@@ -25,53 +31,164 @@ def classify_consensus_vsearch(query: DNAFASTAFormat,
min_consensus: float = 0.51,
unassignable_label: str =
_get_default_unassignable_label(),
+ search_exact: bool = False,
+ top_hits_only: bool = False,
threads: str = 1) -> pd.DataFrame:
seqs_fp = str(query)
ref_fp = str(reference_reads)
+ if maxaccepts == 'all':
+     maxaccepts = 0
cmd = ['vsearch', '--usearch_global', seqs_fp, '--id', str(perc_identity),
'--query_cov', str(query_cov), '--strand', strand, '--maxaccepts',
str(maxaccepts), '--maxrejects', '0', '--output_no_hits', '--db',
- ref_fp, '--threads', str(threads), '--blast6out']
+ ref_fp, '--threads', str(threads)]
+ if search_exact:
+     cmd[1] = '--search_exact'
+ if top_hits_only:
+     cmd.append('--top_hits_only')
+ cmd.append('--blast6out')
consensus = _consensus_assignments(
cmd, reference_taxonomy, min_consensus=min_consensus,
unassignable_label=unassignable_label)
return consensus
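A hedged usage sketch for the two new flags, assuming query, ref_reads, and ref_taxa are pre-imported FeatureData artifacts (the Actions API mirrors the test suite further down):

from qiime2.plugins import feature_classifier

# search_exact=True reports only 100% full-length matches and ignores
# perc_identity/query_cov; top_hits_only keeps only the best-scoring hits.
classification, = feature_classifier.methods.classify_consensus_vsearch(
    query=query, reference_reads=ref_reads, reference_taxonomy=ref_taxa,
    search_exact=True, threads=4)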
- plugin.methods.register_function(
- function=classify_consensus_vsearch,
- inputs={'query': FeatureData[Sequence],
- 'reference_reads': FeatureData[Sequence],
- 'reference_taxonomy': FeatureData[Taxonomy]},
- parameters={'maxaccepts': Int % Range(0, None),
def classify_hybrid_vsearch_sklearn(ctx,
query,
reference_reads,
reference_taxonomy,
classifier,
maxaccepts=10,
perc_identity=0.5,
query_cov=0.8,
strand='both',
min_consensus=0.51,
reads_per_batch=0,
confidence=0.7,
read_orientation='auto',
threads=1,
prefilter=True,
sample_size=1000,
randseed=0):
exclude = ctx.get_action('quality_control', 'exclude_seqs')
ccv = ctx.get_action('feature_classifier', 'classify_consensus_vsearch')
cs = ctx.get_action('feature_classifier', 'classify_sklearn')
filter_seqs = ctx.get_action('taxa', 'filter_seqs')
merge = ctx.get_action('feature_table', 'merge_taxa')
# randomly subsample reference sequences for rough positive filter
if prefilter:
ref = str(reference_reads.view(DNAFASTAFormat))
with tempfile.NamedTemporaryFile() as output:
cmd = ['vsearch', '--fastx_subsample', ref, '--sample_size',
str(sample_size), '--randseed', str(randseed),
'--fastaout', output.name]
_run_command(cmd)
sparse_reference = qiime2.Artifact.import_data(
'FeatureData[Sequence]', output.name)
# perform rough positive filter on query sequences
query, misses, = exclude(
query_sequences=query, reference_sequences=sparse_reference,
method='vsearch', perc_identity=perc_identity,
perc_query_aligned=query_cov, threads=threads)
# find exact matches, perform LCA consensus classification
taxa1, = ccv(query=query, reference_reads=reference_reads,
reference_taxonomy=reference_taxonomy, maxaccepts=maxaccepts,
strand=strand, min_consensus=min_consensus,
search_exact=True, threads=threads)
# Annotate taxonomic assignments with classification method
taxa1 = _annotate_method(taxa1, 'VSEARCH')
# perform second pass classification on unassigned taxa
# filter out unassigned seqs
try:
query, = filter_seqs(sequences=query, taxonomy=taxa1,
include=_get_default_unassignable_label())
except ValueError:
# get ValueError if all sequences are filtered out.
# so if no sequences are unassigned, return exact match results
return taxa1
# classify with sklearn classifier
taxa2, = cs(reads=query, classifier=classifier,
reads_per_batch=reads_per_batch, n_jobs=threads,
confidence=confidence, read_orientation=read_orientation)
# Annotate taxonomic assignments with classification method
taxa2 = _annotate_method(taxa2, 'sklearn')
# merge into one big happy result
taxa, = merge(data=[taxa2, taxa1])
return taxa
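classify_hybrid_vsearch_sklearn is registered as a Pipeline rather than a Method, so it receives a ctx object and composes other registered actions instead of calling view-level functions directly. A minimal sketch of that pattern, reusing one of the actions looked up above:

def toy_pipeline(ctx, query, reference_reads, reference_taxonomy):
    # look up a registered action by (plugin, action) name at runtime
    ccv = ctx.get_action('feature_classifier', 'classify_consensus_vsearch')
    # actions return a Results tuple even for a single output, hence the comma
    taxa, = ccv(query=query, reference_reads=reference_reads,
                reference_taxonomy=reference_taxonomy)
    return taxa  # pipelines return Artifacts, never raw views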
+ output_descriptions = {
+ 'classification': 'The resulting taxonomy classifications.'}
+ parameters = {'maxaccepts': Int % Range(1, None) | Str % Choices(['all']),
'perc_identity': Float % Range(0.0, 1.0, inclusive_end=True),
'query_cov': Float % Range(0.0, 1.0, inclusive_end=True),
'strand': Str % Choices(['both', 'plus']),
'min_consensus': Float % Range(0.5, 1.0, inclusive_end=True,
inclusive_start=False),
- 'unassignable_label': Str,
- 'threads': Int},
- outputs=[('classification', FeatureData[Taxonomy])],
+ 'threads': Int % Range(1, None)}
inputs = {'query': FeatureData[Sequence],
'reference_reads': FeatureData[Sequence],
'reference_taxonomy': FeatureData[Taxonomy]}
input_descriptions = {'query': 'Sequences to classify taxonomically.',
'reference_reads': 'reference sequences.',
- 'reference_taxonomy': 'reference taxonomy labels.'},
+ 'reference_taxonomy': 'reference taxonomy labels.'}
parameter_descriptions = {
- 'strand': ('Align against reference sequences in forward ("plus") '
- 'or both directions ("both").'),
- 'maxaccepts': ('Maximum number of hits to keep for each query. Set to '
- '0 to keep all hits > perc_identity similarity. Must '
- 'be in range [0, infinity].'),
- 'perc_identity': ('Reject match if percent identity to query is '
- 'lower. Must be in range [0.0, 1.0].'),
+ 'strand': 'Align against reference sequences in forward ("plus") '
+ 'or both directions ("both").',
+ 'maxaccepts': 'Maximum number of hits to keep for each query. Set to '
+ '"all" to keep all hits > perc_identity similarity.',
+ 'perc_identity': 'Reject match if percent identity to query is '
+ 'lower.',
'query_cov': 'Reject match if query alignment coverage per high-'
- 'scoring pair is lower. Must be in range [0.0, 1.0].',
- 'min_consensus': ('Minimum fraction of assignments must match top '
- 'hit to be accepted as consensus assignment. Must '
- 'be in range (0.5, 1.0].')
+ 'scoring pair is lower.',
+ 'min_consensus': 'Minimum fraction of assignments must match top '
+ 'hit to be accepted as consensus assignment.',
+ 'threads': 'Number of threads to use for job parallelization.'}
outputs = [('classification', FeatureData[Taxonomy])]
ignore_prefilter = ' This parameter is ignored if `prefilter` is disabled.'
plugin.methods.register_function(
function=classify_consensus_vsearch,
inputs=inputs,
parameters={**parameters,
'unassignable_label': Str,
'search_exact': Bool,
'top_hits_only': Bool},
outputs=outputs,
input_descriptions=input_descriptions,
parameter_descriptions={
**parameter_descriptions,
'search_exact': 'Search for exact full-length matches to the query '
'sequences. Only 100% exact matches are reported and '
'this command is much faster than the default. If '
'True, the perc_identity and query_cov settings are '
'ignored. Note: query and reference reads must be '
'trimmed to the exact same DNA locus (e.g., primer '
'site) because only exact matches will be reported.',
'top_hits_only': 'Only the top hits between the query and reference '
'sequence sets are reported. For each query, the top '
'hit is the one presenting the highest percentage of '
'identity. Multiple equally scored top hits will be '
'used for consensus taxonomic assignment if '
'maxaccepts is greater than 1.',
},
- output_descriptions={'classification': 'The resulting taxonomy '
- 'classifications.'},
- name='VSEARCH consensus taxonomy classifier',
+ output_descriptions=output_descriptions,
+ name='VSEARCH-based consensus taxonomy classifier',
description=('Assign taxonomy to query sequences using VSEARCH. Performs '
'VSEARCH global alignment between query and reference_reads, '
'then assigns consensus taxonomy to each query sequence from '
@@ -81,3 +198,68 @@ plugin.methods.register_function(
'choosing the top N hits, not the first N hits.'),
citations=[citations['rognes2016vsearch']]
)
plugin.pipelines.register_function(
function=classify_hybrid_vsearch_sklearn,
inputs={**inputs, 'classifier': TaxonomicClassifier},
parameters={**parameters,
'reads_per_batch': _classify_parameters['reads_per_batch'],
'confidence': _classify_parameters['confidence'],
'read_orientation': _classify_parameters['read_orientation'],
'prefilter': Bool,
'sample_size': Int % Range(1, None),
'randseed': Int % Range(0, None)},
outputs=outputs,
input_descriptions={**input_descriptions,
'classifier': 'Pre-trained sklearn taxonomic '
'classifier for classifying the reads.'},
parameter_descriptions={
**{k: parameter_descriptions[k] for k in [
'strand', 'maxaccepts', 'min_consensus', 'threads']},
'perc_identity': 'Percent sequence similarity to use for PREFILTER. ' +
parameter_descriptions['perc_identity'] + ' Set to a '
'lower value to perform a rough pre-filter.' +
ignore_prefilter,
'query_cov': 'Query coverage threshold to use for PREFILTER. ' +
parameter_descriptions['query_cov'] + ' Set to a '
'lower value to perform a rough pre-filter.' +
ignore_prefilter,
'confidence': _parameter_descriptions['confidence'],
'read_orientation': 'Direction of reads with respect to reference '
'sequences in pre-trained sklearn classifier. '
'same will cause reads to be classified unchanged'
'; reverse-complement will cause reads to be '
'reversed and complemented prior to '
'classification. "auto" will autodetect '
'orientation based on the confidence estimates '
'for the first 100 reads.',
'reads_per_batch': 'Number of reads to process in each batch for '
'sklearn classification. If "auto", this parameter '
'is autoscaled to min(number of query sequences / '
'threads, 20000).',
'prefilter': 'Toggle positive filter of query sequences on or off.',
'sample_size': 'Randomly extract the given number of sequences from '
'the reference database to use for prefiltering.' +
ignore_prefilter,
'randseed': 'Use integer as a seed for the pseudo-random generator '
'used during prefiltering. A given seed always produces '
'the same output, which is useful for replicability. Set '
'to 0 to use a pseudo-random seed.' + ignore_prefilter,
},
output_descriptions=output_descriptions,
name='ALPHA Hybrid classifier: VSEARCH exact match + sklearn classifier',
description=('NOTE: THIS PIPELINE IS AN ALPHA RELEASE. Please report bugs '
'to https://forum.qiime2.org!\n'
'Assign taxonomy to query sequences using hybrid classifier. '
'First performs rough positive filter to remove artifact and '
'low-coverage sequences (use "prefilter" parameter to toggle '
'this step on or off). Second, performs VSEARCH exact match '
'between query and reference_reads to find exact matches, '
'followed by least common ancestor consensus taxonomy '
'assignment from among maxaccepts top hits, min_consensus of '
'which share that taxonomic assignment. Query sequences '
'without an exact match are then classified with a pre-'
'trained sklearn taxonomy classifier to predict the most '
'likely taxonomic lineage.'),
)
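A hedged end-to-end sketch of invoking the new pipeline, assuming pre-imported query, ref_reads, and ref_taxa artifacts plus a pre-trained classifier, as in the tests below:

import pandas as pd
from qiime2.plugins import feature_classifier

taxonomy, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
    query=query, reference_reads=ref_reads, reference_taxonomy=ref_taxa,
    classifier=classifier,
    prefilter=True,  # rough positive filter against a random reference subsample
    randseed=42)     # fixed seed makes the prefilter reproducible
df = taxonomy.view(pd.DataFrame)  # includes the new 'Method' column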
@@ -23,6 +23,7 @@ import sklearn
from numpy import median, array, ceil
import biom
import skbio
+ import joblib
from ._skl import fit_pipeline, predict, _specific_fitters
from ._taxonomic_classifier import TaxonomicClassifier
@@ -182,7 +183,7 @@ def _autotune_reads_per_batch(reads, n_jobs):
raise ValueError("Value other than zero must be specified as number "
"of jobs to run.")
else:
- n_jobs = sklearn.externals.joblib.effective_n_jobs(n_jobs)
+ n_jobs = joblib.effective_n_jobs(n_jobs)
# we really only want to calculate this if running in parallel
if n_jobs != 1:
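joblib.effective_n_jobs resolves the sklearn-style n_jobs convention against the active parallel backend; roughly (exact values depend on the machine and backend):

import joblib

joblib.effective_n_jobs(1)   # -> 1; the autotuner then skips batch scaling
joblib.effective_n_jobs(-1)  # -> typically the number of CPUs
joblib.effective_n_jobs(-2)  # -> typically all CPUs but one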
@@ -200,7 +201,7 @@ def _autotune_reads_per_batch(reads, n_jobs):
def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
reads_per_batch: int = 0, n_jobs: int = 1,
pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
- read_orientation: str = None
+ read_orientation: str = 'auto'
) -> pd.DataFrame:
# autotune reads per batch
if reads_per_batch == 0:
@@ -226,25 +227,14 @@ _classify_parameters = {
'reads_per_batch': Int % Range(0, None),
'n_jobs': Int,
'pre_dispatch': Str,
- 'confidence': Float,
- 'read_orientation': Str % Choices(['same', 'reverse-complement'])}
+ 'confidence': Float % Range(
+ 0, 1, inclusive_start=True, inclusive_end=True) | Str % Choices(
+ ['disable']),
+ 'read_orientation': Str % Choices(['same', 'reverse-complement', 'auto'])}
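With the union type above, a single parameter validates either a float threshold or the literal string, so both calls below are accepted while the old sentinel confidence=-1 now fails validation at the type level (hedged sketch; assumes reads and clf artifacts already exist):

from qiime2.plugins import feature_classifier

taxa, = feature_classifier.methods.classify_sklearn(
    reads=reads, classifier=clf, confidence=0.7)        # numeric threshold
taxa, = feature_classifier.methods.classify_sklearn(
    reads=reads, classifier=clf, confidence='disable')  # skip confidence entirely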
- plugin.methods.register_function(
- function=classify_sklearn,
- inputs={'reads': FeatureData[Sequence],
- 'classifier': TaxonomicClassifier},
- parameters=_classify_parameters,
- outputs=[('classification', FeatureData[Taxonomy])],
- name='Pre-fitted sklearn-based taxonomy classifier',
- description='Classify reads by taxon using a fitted classifier.',
- input_descriptions={
- 'reads': 'The feature data to be classified.',
- 'classifier': 'The taxonomic classifier for classifying the reads.'
- },
- parameter_descriptions={
+ _parameter_descriptions = {
'confidence': 'Confidence threshold for limiting '
- 'taxonomic depth. Provide -1 to disable '
+ 'taxonomic depth. Set to "disable" to disable '
'confidence calculation, or 0 to calculate '
'confidence but not apply it to limit the '
'taxonomic depth of the assignments.',
@@ -253,9 +243,9 @@ plugin.methods.register_function(
'reads to be classified unchanged; reverse-'
'complement will cause reads to be reversed '
'and complemented prior to classification. '
- 'Default is to autodetect based on the '
+ '"auto" will autodetect orientation based on the '
'confidence estimates for the first 100 reads.',
- 'reads_per_batch': 'Number of reads to process in each batch. If 0, '
+ 'reads_per_batch': 'Number of reads to process in each batch. If "auto", '
'this parameter is autoscaled to '
'min( number of query sequences / n_jobs, 20000).',
'n_jobs': 'The maximum number of concurrently worker processes. If -1 '
@@ -265,7 +255,21 @@ plugin.methods.register_function(
'n_jobs = -2, all CPUs but one are used.',
'pre_dispatch': '"all" or expression, as in "3*n_jobs". The number of '
'batches (of tasks) to be pre-dispatched.'
}
plugin.methods.register_function(
function=classify_sklearn,
inputs={'reads': FeatureData[Sequence],
'classifier': TaxonomicClassifier},
parameters=_classify_parameters,
outputs=[('classification', FeatureData[Taxonomy])],
name='Pre-fitted sklearn-based taxonomy classifier',
description='Classify reads by taxon using a fitted classifier.',
input_descriptions={
'reads': 'The feature data to be classified.',
'classifier': 'The taxonomic classifier for classifying the reads.'
},
parameter_descriptions={**_parameter_descriptions},
citations=[citations['pedregosa2011scikit']]
)
......
@@ -59,8 +59,7 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
# should populate the class weight of a pipeline
weights = Artifact.import_data(
'FeatureTable[RelativeFrequency]',
- self.get_data_path('class_weight.biom'),
- view_type='BIOMV100Format')
+ self.get_data_path('class_weight.biom'))
table = weights.view(biom.Table)
svc_spec = [['feat_ext',
@@ -89,8 +88,7 @@ class ClassifierTests(FeatureClassifierTestPluginBase):
# we should be able to input class_weight to fit_classifier
weights = Artifact.import_data(
'FeatureTable[RelativeFrequency]',
- self.get_data_path('class_weight.biom'),
- view_type='BIOMV100Format')
+ self.get_data_path('class_weight.biom'))
reads = Artifact.import_data(
'FeatureData[Sequence]',
self.get_data_path('se-dna-sequences.fasta'))
......
@@ -9,6 +9,8 @@
import pandas as pd
from qiime2.sdk import Artifact
from qiime2.plugins import feature_classifier
from q2_feature_classifier._skl import _specific_fitters
from q2_feature_classifier._blast import classify_consensus_blast
from q2_feature_classifier._vsearch import classify_consensus_vsearch
from q2_feature_classifier._consensus_assignment import (
@@ -55,6 +57,88 @@ class ConsensusAssignmentsTests(FeatureClassifierTestPluginBase):
right += tax[taxon].startswith(res[taxon])
self.assertGreater(right/len(res), 0.5)
def test_vsearch_search_exact(self):
result = classify_consensus_vsearch(self.reads, self.reads,
self.taxonomy, search_exact=True)
res = result.Taxon.to_dict()
tax = self.taxonomy.to_dict()
right = 0.
for taxon in res:
right += tax[taxon].startswith(res[taxon])
self.assertGreater(right/len(res), 0.5)
def test_vsearch_top_hits_only(self):
result = classify_consensus_vsearch(self.reads, self.reads,
self.taxonomy, top_hits_only=True)
res = result.Taxon.to_dict()
tax = self.taxonomy.to_dict()
right = 0.
for taxon in res:
right += tax[taxon].startswith(res[taxon])
self.assertGreater(right/len(res), 0.5)
class HybridClassiferTests(FeatureClassifierTestPluginBase):
package = 'q2_feature_classifier.tests'
def setUp(self):
super().setUp()
taxonomy = Artifact.import_data(
'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
self.taxonomy = taxonomy.view(pd.Series)
self.taxartifact = taxonomy
# TODO: use `Artifact.import_data` here once we have a transformer
# for DNASequencesDirectoryFormat -> DNAFASTAFormat
reads_fp = self.get_data_path('se-dna-sequences.fasta')
reads = DNAFASTAFormat(reads_fp, mode='r')
self.reads = Artifact.import_data('FeatureData[Sequence]', reads)
fitter = getattr(feature_classifier.methods,
'fit_classifier_' + _specific_fitters[0][0])
self.classifier = fitter(self.reads, self.taxartifact).classifier
self.query = Artifact.import_data('FeatureData[Sequence]', pd.Series(
{'A': 'GCCTAACACATGCAAGTCGAACGGCAGCGGGGGAAAGCTTGCTTTCCTGCCGGCGA',
'B': 'TAACACATGCAAGTCAACGATGCTTATGTAGCAATATGTAAGTAGAGTGGCGCACG',
'C': 'ATACATGCAAGTCGTACGGTATTCCGGTTTCGGCCGGGAGAGAGTGGCGGATGGGT',
'D': 'GACGAACGCTGGCGACGTGCTTAACACATGCAAGTCGTGCGAGGACGGGCGGTGCT'
'TGCACTGCTCGAGCCGAGCGGCGGACGGGTGAGTAACACGTGAGCAACCTATCTCC'
'GTGCGGGGGACAACCCGGGGAAACCCGGGCTAATACCG'}))
def test_classify_hybrid_vsearch_sklearn_all_exact_match(self):
result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
query=self.reads, reference_reads=self.reads,
reference_taxonomy=self.taxartifact, classifier=self.classifier,
prefilter=False)
result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
query=self.reads, reference_reads=self.reads,
reference_taxonomy=self.taxartifact, classifier=self.classifier)
result = result.view(pd.DataFrame)
res = result.Taxon.to_dict()
tax = self.taxonomy.to_dict()
right = 0.
for taxon in res:
right += tax[taxon].startswith(res[taxon])
self.assertGreater(right/len(res), 0.5)
def test_classify_hybrid_vsearch_sklearn_mixed_query(self):
result, = feature_classifier.actions.classify_hybrid_vsearch_sklearn(
query=self.query, reference_reads=self.reads,
reference_taxonomy=self.taxartifact, classifier=self.classifier,
prefilter=True, read_orientation='same', randseed=1001)
result = result.view(pd.DataFrame)
obs = result.Taxon.to_dict()
exp = {'A': 'k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; '
'o__Legionellales; f__; g__; s__',
'B': 'k__Bacteria; p__Chlorobi; c__; o__; f__; g__; s__',
'C': 'k__Bacteria; p__Bacteroidetes; c__Cytophagia; '
'o__Cytophagales; f__Cyclobacteriaceae; g__; s__',
'D': 'k__Bacteria; p__Gemmatimonadetes; c__Gemm-5; o__; f__; '
'g__; s__'}
self.assertDictEqual(obs, exp)
class ImportBlastAssignmentTests(FeatureClassifierTestPluginBase):
......
@@ -14,7 +14,7 @@ import os
import shutil
import sklearn
- from sklearn.externals import joblib
+ import joblib
from sklearn.pipeline import Pipeline
from qiime2.sdk import Artifact
from qiime2.plugins.feature_classifier.methods import \
......