Skip to content
Commits on Source (4)
CXXFLAGS += -O3 -std=c++11 -Isrc -I@capnp@/include -I@mathinc@
CXXFLAGS += -O3 -std=c++14 -Isrc -I@capnp@/include -I@mathinc@
CPPFLAGS += @amcppflags@
UNAME_S=$(shell uname -s)
......
......@@ -33,7 +33,7 @@ then
AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.])
fi
CPPFLAGS="-I$with_capnp/include -std=c++11"
CPPFLAGS="-I$with_capnp/include -std=c++14"
AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0])
......
mash (2.1+dfsg-3) UNRELEASED; urgency=medium
mash (2.1.1+dfsg-1) unstable; urgency=medium
* Team upload.
[ Michael R. Crusoe ]
* Mark the -doc package as Multi-Arch: foreign
-- Michael R. Crusoe <michael.crusoe@gmail.com> Sat, 16 Feb 2019 01:04:02 -0800
[ Sascha Steinbiss ]
* New upstream release.
* Remove patch applied upstream.
-- Sascha Steinbiss <satta@debian.org> Mon, 08 Jul 2019 19:33:37 +0200
mash (2.1+dfsg-2) unstable; urgency=medium
......
Description: raise C++ standard to allow building with capnp 0.7
Author: Sascha Steinbiss <satta@debian.org>
Bug: https://github.com/marbl/Mash/issues/98
Last-Update: 2018-12-26
--- a/configure.ac
+++ b/configure.ac
@@ -33,7 +33,7 @@
AC_MSG_ERROR([Cap'n Proto compiler (capnp) not found.])
fi
-CPPFLAGS="-I$with_capnp/include -std=c++11"
+CPPFLAGS="-I$with_capnp/include -std=c++14"
AC_CHECK_HEADER(capnp/common.h, [result=1], [result=0])
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,4 +1,4 @@
-CXXFLAGS += -O3 -std=c++11 -Isrc -I@capnp@/include -I@mathinc@
+CXXFLAGS += -O3 -std=c++14 -Isrc -I@capnp@/include -I@mathinc@
CPPFLAGS += @amcppflags@
UNAME_S=$(shell uname -s)
......@@ -3,5 +3,4 @@ use_debian_mathjax.patch
drop_memcpy_wrapper.patch
link_dynamically_against_capnp.patch
parallel.patch
capnp0_7.patch
use_debian_packaged_libmurmurhash.patch
......@@ -15,3 +15,71 @@ Mash: fast genome and metagenome distance estimation using MinHash
`SRR2671867.BaAmes.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671867.BaAmes.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (157MB)
`SRR2671868.Bc10987.poretools.fastq.gz <http://gembox.cbcb.umd.edu/mash/SRR2671868.Bc10987.poretools.fastq.gz>`_: Nanopore 1D + 2D sequences generated by poretools (250MB)
Mash Screen: High-throughput sequence containment estimation for genome discovery
---------------------------------------------------------------------------------
Custom scripts and intermediate data:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`MashScreen_supp.tar.gz <https://obj.umiacs.umd.edu/mash/screen/MashScreen_supp.tar.gz>`_
Data files:
~~~~~~~~~~~
Mash Sketch databases for RefSeq release 88:
* `RefSeq88n.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88n.msh.gz>`_: Genomes (k=21, s=1000), 1.2Gb uncompressed
* `RefSeq88p.msh.gz <https://obj.umiacs.umd.edu/mash/screen/RefSeq88p.msh.gz>`_: Proteomes (k=9, s=1000), 1.1Gb uncompressed
`art.fastq.gz <https://obj.umiacs.umd.edu/mash/screen/art.fastq.gz>`_: Simulated reads for Shakya experiment
Figure 5:
* `fig5.html <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.html>`_: Interactive version
* `fig5.tsv <https://obj.umiacs.umd.edu/mash/screen/fig5/fig5.tsv>`_: Source data
Public data sources
~~~~~~~~~~~~~~~~~~~
The BLAST ``nr`` database was downloaded from ``ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.*``.
HMP data were downloaded from ``ftp://public-ftp.ihmpdcc.org/``, reads from the ``Ilumina/`` directory
and coding sequences from the ``HMGI/`` directory. Within these folders, sample SRS015937 resides in
``tongue_dorsum/`` and SRS020263 in ``right_retroauricular_crease/``.
SRA runs downloaded with the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_.
RefSeq genomes downloaded from the ``genomes/refseq/`` directory of ``ftp.ncbi.nlm.nih.gov``.
Public data products
~~~~~~~~~~~~~~~~~~~~
Quebec Polyomavirus is submitted to GenBank as BK010702.
Screen of SRA metagenomes vs. RefSeq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* `sra_meta_nucl_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_95idy.tsv.gz>`_ (2.3Gb uncompressed)
* `sra_meta_nucl_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_nucl_80idy_3x.tsv.gz>`_ (6.7Gb uncompressed)
* `sra_meta_prot_95idy.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_95idy.tsv.gz>`_ (2.1Gb uncompressed)
* `sra_meta_prot_80idy_3x.tsv.gz <https://obj.umiacs.umd.edu/mash/screen/tables/sra_meta_prot_80idy_3x.tsv.gz>`_ (8.3Gb uncompressed)
These files have a line for each RefSeq genome listing all metagenomic SRA runs
(as of August 2018) with Mash Containment Scores above the specified threshold.
They are provided for two screen modes:
* ``nucl``: Genomic RefSeq sequences
* ``prot``: Proteomic RefSeq sequences (combined amino acid sequences per organism). **NOTE:** Protein tables above are not p-value filtered and thus large (> ~50Gb) runs may have spurious hits. They also do not contain plasmids. Updates coming soon!
...and at two thresholds:
* ``95idy``: 95% Mash Containment Score, any coverage. Useful for finding runs containing a specific genome.
* ``80idy_3x``: 80% Mash Containment Score, at least 3x median k-mer multiplicity.
Useful for finding related, but novel, sequences.
The files are tab separated, with each line beginning with a RefSeq assembly accession, followed by SRA accessions, for example:
::
GCF_000001215.4 SRR3401361 SRR3540373
GCF_000001405.36 SRR5127794 ERR1539652 SRR413753 ERR206081
GCF_000001405.38 SRR5127794 ERR1539652 ERR1711677 SRR413753 ERR206081
......@@ -291,7 +291,7 @@ int CommandScreen::run() const
for ( unordered_set<MinHashHeap *>::const_iterator i = minHashHeaps.begin(); i != minHashHeaps.end(); i++ )
{
HashList hashList(parameters.kmerSize);
HashList hashList(parameters.use64);
(*i)->toHashList(hashList);
......@@ -550,38 +550,6 @@ CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
break;
}
//kmersTotal++; TODO
if ( ! noncanonical )
{
bool debug = false;
useRevComp = true;
bool prefixEqual = true;
if ( debug ) {for ( uint64_t k = j; k < j + kmerSize; k++ ) { cout << *(seq + k); } cout << endl;}
for ( uint64_t k = 0; k < kmerSize; k++ )
{
char base = seq[j + k];
char baseMinus = seqRev[l - j - kmerSize + k];
if ( debug ) cout << baseMinus;
if ( prefixEqual && baseMinus > base )
{
useRevComp = false;
break;
}
if ( prefixEqual && baseMinus < base )
{
prefixEqual = false;
}
}
if ( debug ) cout << endl;
}
const char * kmer;
if ( trans )
......@@ -590,14 +558,15 @@ CommandScreen::HashOutput * hashSequence(CommandScreen::HashInput * input)
}
else
{
kmer = useRevComp ? seqRev + l - j - kmerSize : seq + j;
const char *kmer_fwd = seq + j;
const char *kmer_rev = seqRev + length - j - kmerSize;
kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
}
//cout << kmer << '\t' << kmerSize << endl;
hash_u hash = getHash(kmer, kmerSize, seed, use64);
//cout << kmer << '\t' << hash.hash64 << endl;
input->minHashHeap->tryInsert(hash);
uint64_t key = use64 ? hash.hash64 : hash.hash32;
if ( input->hashCounts.count(key) == 1 )
......
......@@ -28,7 +28,7 @@ CommandSketch::CommandSketch()
addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", ""));
addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", ""));
addOption("id", Option(Option::File, "I", "Sketch", "ID field for sketch of reads (instead of first sequence ID).", ""));
addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment.", ""));
addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment).", ""));
useSketchOptions();
}
......
......@@ -15,7 +15,7 @@ class HashList
public:
HashList() {use64 = true;}
HashList(int kmerSize) {use64 = kmerSize > 16;}
HashList(bool use64new) {use64 = use64new;}
hash_u at(int index) const;
void clear();
......
......@@ -536,13 +536,10 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
for ( uint64_t i = 0; i < length - kmerSize + 1; i++ )
{
bool useRevComp = false;
bool debug = false;
// repeatedly skip kmers with bad characters
//
bool bad = false;
//
for ( uint64_t j = i; j < i + kmerSize && i + kmerSize <= length; j++ )
{
if ( ! parameters.alphabet[seq[j]] )
......@@ -552,56 +549,25 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
break;
}
}
//
if ( bad )
{
continue;
}
//
if ( i + kmerSize > length )
{
// skipped to end
break;
}
if ( ! noncanonical )
{
useRevComp = true;
bool prefixEqual = true;
if ( debug ) {for ( uint64_t j = i; j < i + kmerSize; j++ ) { cout << *(seq + j); } cout << endl;}
for ( uint64_t j = 0; j < kmerSize; j++ )
{
char base = seq[i + j];
char baseMinus = seqRev[length - i - kmerSize + j];
if ( debug ) cout << baseMinus;
if ( prefixEqual && baseMinus > base )
{
useRevComp = false;
break;
}
if ( prefixEqual && baseMinus < base )
{
prefixEqual = false;
}
}
if ( debug ) cout << endl;
}
const char *kmer_fwd = seq + i;
const char *kmer_rev = seqRev + length - i - kmerSize;
const char * kmer = memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0 ? kmer_fwd : kmer_rev;
const char * kmer = (noncanonical || memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0) ? kmer_fwd : kmer_rev;
bool filter = false;
hash_u hash = getHash(kmer, kmerSize, parameters.seed, parameters.use64);
if ( debug ) cout << endl;
minHashHeap.tryInsert(hash);
}
......
......@@ -61,8 +61,7 @@ int sketchParameterSetup(Sketch::Parameters & parameters, const Command & comman
if ( parameters.reads && command.getOption("threads").active )
{
cerr << "ERROR: The option " << command.getOption("threads").identifier << " cannot be used with " << command.getOption("reads").identifier << "." << endl;
return 1;
cerr << "WARNING: The option " << command.getOption("threads").identifier << " will be ignored with " << command.getOption("reads").identifier << "." << endl;
}
if ( parameters.reads && ! parameters.concatenated )
......
......@@ -4,4 +4,4 @@
//
// See the LICENSE.txt file included with this software for license information.
static const char * version = "2.1";
static const char * version = "2.1.1";