Skip to content
Commits on Source (4)
Mash is normally distributed as a dependency-free binary for Linux or OSX (see
https://github.com/marbl/Mash/releases). This source distribution is intended
for other operating systems or for development. Mash requires c++11 to build,
which is available in and GCC >= 4.8 and OSX >= 10.7.
for other operating systems or for development. Mash requires c++14 to build,
which is available in and GCC >= 5 and XCode >= 6.
See http://mash.readthedocs.org for more information.
mash (2.2+dfsg-1) unstable; urgency=medium
* New upstream release.
* Bump Standards-Version.
-- Sascha Steinbiss <satta@debian.org> Wed, 31 Jul 2019 17:08:42 +0200
mash (2.1.1+dfsg-1) unstable; urgency=medium
[ Michael R. Crusoe ]
......
......@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 12~),
libjs-mathjax,
asciidoctor,
libmurmurhash-dev
Standards-Version: 4.3.0
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/mash
Vcs-Git: https://salsa.debian.org/med-team/mash.git
Homepage: https://mash.readthedocs.io
......
......@@ -43,7 +43,7 @@ source_suffix = '.rst'
master_doc = 'index'
# General information about the project.
project = u'mash'
project = u'Mash'
copyright = u'2015, Brian Ondov, Todd Treangen, Adam Phillippy'
# The version info for the project you're documenting, acts as replacement for
......
......@@ -37,24 +37,6 @@ Figure 5:
* `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version
* `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data
Public data sources
~~~~~~~~~~~~~~~~~~~
The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
Public data products
~~~~~~~~~~~~~~~~~~~~
Quebec Polyomavirus is submitted to GenBank as BK010702.
Screen of SRA metagenomes vs. RefSeq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......@@ -83,3 +65,24 @@ The files are tab separated, with each line beginning with a RefSeq assembly acc
GCF_000001215.4 SRR3401361 SRR3540373
GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081
GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081
We also provide simple scripts for searching these files: `search.tar <https://obj.umiacs.umd.edu/mash/screen/search.tar>`_
Public data sources
~~~~~~~~~~~~~~~~~~~
The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
Public data products
~~~~~~~~~~~~~~~~~~~~
Quebec Polyomavirus is submitted to GenBank as BK010702.
......@@ -17,6 +17,8 @@ Publication
===========
`Mash: fast genome and metagenome distance estimation using MinHash. Ondov BD, Treangen TJ, Melsted P, Mallonee AB, Bergman NH, Koren S, Phillippy AM. Genome Biol. 2016 Jun 20;17(1):132. doi: 10.1186/s13059-016-0997-x. <http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x>`_
`Mash Screen: High-throughput sequence containment estimation for genome discovery. Ondov BD, Starrett GJ, Sappington A, Kostic A, Koren S, Buck CB, Phillippy AM. BioRxiv. 2019 Mar. doi: 10.1101/557314 <https://doi.org/10.1101/557314>`_
.. toctree::
:maxdepth: 1
......
......@@ -85,7 +85,8 @@ int CommandBounds::run() const
if ( cont )
{
m2j = exp(-k * dists[j]);
//m2j = exp(-k * dists[j]);
m2j = pow(1.0 - dists[j], k); // binomial model
}
else
{
......@@ -114,7 +115,8 @@ int CommandBounds::run() const
if ( cont )
{
j2m = -1.0 / k * log(je);
//j2m = -1.0 / k * log(je);
j2m = 1.0 - pow(je, 1. / k);
}
else
{
......
......@@ -37,6 +37,7 @@ CommandDistance::CommandDistance()
//addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", ""));
addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report.", "1.0", 0., 1.));
addOption("distance", Option(Option::Number, "d", "Output", "Maximum distance to report.", "1.0", 0., 1.));
addOption("comment", Option(Option::Boolean, "C", "Output", "Show comment fields with reference/query names (denoted with ':').", "1.0", 0., 1.));
useSketchOptions();
}
......@@ -51,6 +52,7 @@ int CommandDistance::run() const
int threads = options.at("threads").getArgumentAsNumber();
bool list = options.at("list").active;
bool table = options.at("table").active;
bool comment = options.at("comment").active;
//bool log = options.at("log").active;
double pValueMax = options.at("pvalue").getArgumentAsNumber();
double distanceMax = options.at("distance").getArgumentAsNumber();
......@@ -225,13 +227,13 @@ int CommandDistance::run() const
while ( threadPool.outputAvailable() )
{
writeOutput(threadPool.popOutputWhenAvailable(), table);
writeOutput(threadPool.popOutputWhenAvailable(), table, comment);
}
}
while ( threadPool.running() )
{
writeOutput(threadPool.popOutputWhenAvailable(), table);
writeOutput(threadPool.popOutputWhenAvailable(), table, comment);
}
if ( warningCount > 0 && ! parameters.reads )
......@@ -242,7 +244,7 @@ int CommandDistance::run() const
return 0;
}
void CommandDistance::writeOutput(CompareOutput * output, bool table) const
void CommandDistance::writeOutput(CompareOutput * output, bool table, bool comment) const
{
uint64_t i = output->indexQuery;
uint64_t j = output->indexRef;
......@@ -267,7 +269,21 @@ void CommandDistance::writeOutput(CompareOutput * output, bool table) const
}
else if ( pair->pass )
{
cout << output->sketchRef.getReference(j).name << '\t' << output->sketchQuery.getReference(i).name << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl;
cout << output->sketchRef.getReference(j).name;
if ( comment )
{
cout << ':' << output->sketchRef.getReference(j).comment;
}
cout << '\t' << output->sketchQuery.getReference(i).name;
if ( comment )
{
cout << ':' << output->sketchQuery.getReference(i).comment;
}
cout << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl;
}
j++;
......
......@@ -85,7 +85,7 @@ public:
private:
void writeOutput(CompareOutput * output, bool table) const;
void writeOutput(CompareOutput * output, bool table, bool comment) const;
};
CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * input);
......
......@@ -39,9 +39,9 @@ CommandScreen::CommandScreen()
: Command()
{
name = "screen";
summary = "Determine whether query sequences are within a larger pool of sequences.";
description = "Determine how well query sequences are contained within a pool of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <pool> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <pool> to read from standard input. The <pool> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment], where median-multiplicity is computed for shared hashes, based on the number of observations of those hashes within the pool.";
argumentString = "<queries>.msh <pool> [<pool>] ...";
summary = "Determine whether query sequences are within a larger mixture of sequences.";
description = "Determine how well query sequences are contained within a mixture of sequences. The queries must be formatted as a single Mash sketch file (.msh), created with the `mash sketch` command. The <mixture> files can be contigs or reads, in fasta or fastq, gzipped or not, and \"-\" can be given for <mixture> to read from standard input. The <mixture> sequences are assumed to be nucleotides, and will be 6-frame translated if the <queries> are amino acids. The output fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment], where median-multiplicity is computed for shared hashes, based on the number of observations of those hashes within the mixture.";
argumentString = "<queries>.msh <mixture> [<mixture>] ...";
useOption("help");
useOption("threads");
......@@ -322,7 +322,7 @@ int CommandScreen::run() const
*/
uint64_t setSize = minHashHeap.estimateSetSize();
cerr << " Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in pool: " << setSize << endl;
cerr << " Estimated distinct" << (trans ? " (translated)" : "") << " k-mers in mixture: " << setSize << endl;
if ( setSize == 0 )
{
......@@ -606,7 +606,7 @@ double pValueWithin(uint64_t x, uint64_t setSize, double kmerSpace, uint64_t ske
return 1.;
}
double r = 1. / (1. + kmerSpace / setSize);
double r = double(setSize) / kmerSpace;
#ifdef USE_BOOST
return cdf(complement(binomial(sketchSize, r), x - 1));
......
......@@ -19,6 +19,15 @@
namespace mash {
struct HashTableEntry
{
HashTableEntry() : count(0) {}
uint32_t count;
std::unordered_set<uint64_t> indices;
};
//typedef std::unordered_map< uint64_t, HashTableEntry > HashTable;
typedef std::unordered_map< uint64_t, std::unordered_set<uint64_t> > HashTable;
static const std::unordered_map< std::string, char > codons =
......
......@@ -35,6 +35,9 @@ CommandTriangle::CommandTriangle()
useOption("help");
addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <query> specify paths to sequence files, one per line. The reference file is not affected.", ""));
addOption("comment", Option(Option::Boolean, "C", "Output", "Use comment fields for sequence names instead of IDs.", ""));
addOption("edge", Option(Option::Boolean, "E", "Output", "Output edge list instead of Phylip matrix, with fields [seq1, seq2, dist, p-val, shared-hashes].", ""));
addOption("pvalue", Option(Option::Number, "v", "Output", "Maximum p-value to report in edge list. Implies -" + getOption("edge").identifier + ".", "1.0", 0., 1.));
addOption("distance", Option(Option::Number, "d", "Output", "Maximum distance to report in edge list. Implies -" + getOption("edge").identifier + ".", "1.0", 0., 1.));
//addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", ""));
useSketchOptions();
}
......@@ -50,8 +53,16 @@ int CommandTriangle::run() const
int threads = options.at("threads").getArgumentAsNumber();
bool list = options.at("list").active;
//bool log = options.at("log").active;
double pValueMax = 0;
bool comment = options.at("comment").active;
bool edge = options.at("edge").active;
double pValueMax = options.at("pvalue").getArgumentAsNumber();
double distanceMax = options.at("distance").getArgumentAsNumber();
double pValuePeakToSet = 0;
if ( options.at("pvalue").active || options.at("distance").active )
{
edge = true;
}
Sketch::Parameters parameters;
......@@ -109,27 +120,33 @@ int CommandTriangle::run() const
}
}
if ( !edge )
{
cout << '\t' << sketch.getReferenceCount() << endl;
cout << (comment ? sketch.getReference(0).comment : sketch.getReference(0).name) << endl;
}
ThreadPool<TriangleInput, TriangleOutput> threadPool(compare, threads);
for ( uint64_t i = 1; i < sketch.getReferenceCount(); i++ )
{
threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters));
threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters, distanceMax, pValueMax));
while ( threadPool.outputAvailable() )
{
writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax);
writeOutput(threadPool.popOutputWhenAvailable(), comment, edge, pValuePeakToSet);
}
}
while ( threadPool.running() )
{
writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax);
writeOutput(threadPool.popOutputWhenAvailable(), comment, edge, pValuePeakToSet);
}
cerr << "Max p-value: " << pValueMax << endl;
if ( !edge )
{
cerr << "Max p-value: " << pValuePeakToSet << endl;
}
if ( warningCount > 0 && ! parameters.reads )
{
......@@ -139,23 +156,43 @@ int CommandTriangle::run() const
return 0;
}
void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const
void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, bool edge, double & pValuePeakToSet) const
{
const Sketch & sketch = output->sketch;
const Sketch::Reference & ref = sketch.getReference(output->index);
if ( !edge )
{
cout << (comment ? ref.comment : ref.name);
}
for ( uint64_t i = 0; i < output->index; i++ )
{
const CommandDistance::CompareOutput::PairOutput * pair = &output->pairs[i];
if ( edge )
{
if ( pair->pass )
{
const Sketch::Reference & qry = sketch.getReference(i);
cout << (comment ? ref.comment : ref.name) << '\t'<< (comment ? qry.comment : qry.name) << '\t' << pair->distance << '\t' << pair->pValue << '\t' << pair->numer << '/' << pair->denom << endl;
}
}
else
{
cout << '\t' << pair->distance;
if ( pair->pValue > pValueMax )
}
if ( pair->pValue > pValuePeakToSet )
{
pValueMax = pair->pValue;
pValuePeakToSet = pair->pValue;
}
}
if ( !edge )
{
cout << endl;
}
delete output;
}
......@@ -170,7 +207,7 @@ CommandTriangle::TriangleOutput * compare(CommandTriangle::TriangleInput * input
for ( uint64_t i = 0; i < input->index; i++ )
{
compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), -1., -1.);
compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), input->maxDistance, input->maxPValue);
}
return output;
......
......@@ -19,16 +19,20 @@ public:
struct TriangleInput
{
TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew)
TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew, double maxDistanceNew, double maxPValueNew)
:
sketch(sketchNew),
index(indexNew),
parameters(parametersNew)
parameters(parametersNew),
maxDistance(maxDistanceNew),
maxPValue(maxPValueNew)
{}
const Sketch & sketch;
uint64_t index;
const Sketch::Parameters & parameters;
double maxDistance;
double maxPValue;
};
struct TriangleOutput
......@@ -61,7 +65,7 @@ private:
double pValueMax;
bool comment;
void writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const;
void writeOutput(TriangleOutput * output, bool comment, bool edge, double & pValuePeakToSet) const;
};
CommandTriangle::TriangleOutput * compare(CommandTriangle::TriangleInput * input);
......
......@@ -4,4 +4,4 @@
//
// See the LICENSE.txt file included with this software for license information.
static const char * version = "2.1.1";
static const char * version = "2.2";
0.861792 44/1000 1 5.00739e-229 genome1.fna gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655, complete genome
0.853596 36/1000 1 1.70479e-184 genome2.fna gi|47118301|dbj|BA000007.2| Escherichia coli O157:H7 str. Sakai DNA, complete genome
0.861792 44/1000 1 5.00739e-229 genome3.fna gi|682117612|gb|CP009273.1| Escherichia coli BW25113, complete genome
0.861792 44/1000 1 5.00742e-229 genome1.fna gi|49175990|ref|NC_000913.2| Escherichia coli str. K-12 substr. MG1655, complete genome
0.853596 36/1000 1 1.7048e-184 genome2.fna gi|47118301|dbj|BA000007.2| Escherichia coli O157:H7 str. Sakai DNA, complete genome
0.861792 44/1000 1 5.00742e-229 genome3.fna gi|682117612|gb|CP009273.1| Escherichia coli BW25113, complete genome