Skip to content
Commits on Source (8)
......@@ -23,6 +23,7 @@ Steps:
[--with-boost=...]
make
[sudo] make install
[make test]
Products:
......
......@@ -16,6 +16,7 @@ SOURCES=\
src/mash/CommandContain.cpp \
src/mash/CommandDistance.cpp \
src/mash/CommandScreen.cpp \
src/mash/CommandTriangle.cpp \
src/mash/CommandFind.cpp \
src/mash/CommandInfo.cpp \
src/mash/CommandPaste.cpp \
......@@ -67,6 +68,12 @@ install : mash
cp `pwd`/src/mash/*.h @prefix@/include/mash/
cp `pwd`/src/mash/capnp/MinHash.capnp.h @prefix@/include/mash/capnp/
.PHONY: uninstall
uninstall:
rm -f @prefix@/bin/mash
rm -f @prefix@/lib/libmash.a
rm -rf @prefix@/include/mash
clean :
-rm mash
-rm libmash.a
......@@ -74,3 +81,26 @@ clean :
-rm src/mash/capnp/*.o
-rm src/mash/capnp/*.c++
-rm src/mash/capnp/*.h
.PHONY: test
test : testSketch testDist testScreen
testSketch : mash test/genomes.msh test/reads.msh
./mash info -d test/genomes.msh > test/genomes.json
./mash info -d test/reads.msh > test/reads.json
diff test/genomes.json test/ref/genomes.json
diff test/reads.json test/ref/reads.json
test/genomes.msh : mash
cd test ; ../mash sketch -o genomes.msh genome1.fna genome2.fna genome3.fna
test/reads.msh : mash
cd test ; ../mash sketch -r -I reads reads1.fastq reads2.fastq -o reads.msh
testDist : mash test/genomes.msh test/reads.msh
./mash dist test/genomes.msh test/reads.msh > test/genomes.dist
diff test/genomes.dist test/ref/genomes.dist
testScreen : mash test/genomes.msh
cd test ; ../mash screen genomes.msh reads1.fastq reads2.fastq > screen
diff test/screen test/ref/screen
Mash is normally distributed as a dependency-free binary for Linux or OSX (see
https://github.com/marbl/Mash/releases). This source distribution is intended
for other operating systems or for development. Mash requires c++11 to build,
which is available in and GCC >= 4.8 and OSX >= 10.7.
See http://mash.readthedocs.org for more information.
mash (2.0-3) UNRELEASED; urgency=medium
mash (2.1-1) unstable; urgency=medium
[ Jelmer Vernooij ]
* Remove unnecessary 'Testsuite: autopkgtest' header.
-- Jelmer Vernooij <jelmer@debian.org> Sat, 15 Sep 2018 11:06:34 +0100
[ Sascha Steinbiss ]
* New upstream release.
* Use debhelper 11.
* Remove obsolete dependencies and options.
* Adjust doc-base paths.
* Bump Standards-Version.
* Update VCS fields to Salsa URLs.
-- Sascha Steinbiss <satta@debian.org> Thu, 11 Oct 2018 15:33:17 +0200
mash (2.0-2) unstable; urgency=medium
......
......@@ -3,18 +3,17 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Sascha Steinbiss <satta@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 9),
Build-Depends: debhelper (>= 11),
capnproto (>= 0.6.1),
libcapnp-dev (>= 0.6.1),
libgsl-dev,
dh-autoreconf,
zlib1g-dev,
python-sphinx,
python3-sphinx,
libjs-mathjax,
asciidoctor
Standards-Version: 4.0.0
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/mash.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/mash.git
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/mash
Vcs-Git: https://salsa.debian.org/med-team/mash.git
Homepage: https://mash.readthedocs.io
Package: mash
......
......@@ -7,5 +7,5 @@ Abstract: Additional documentation for Mash
Section: Science/Biology
Format: HTML
Files: /usr/share/doc/mash-doc/*
Index: /usr/share/doc/mash-doc/index.html
Files: /usr/share/doc/mash/*
Index: /usr/share/doc/mash/index.html
......@@ -2,25 +2,10 @@
# DH_VERBOSE := 1
export LC_ALL=C.UTF-8
# some helpful variables - uncomment them if needed
# shamelessly stolen from http://jmtd.net/log/awk/
#DEBVERS := $(shell dpkg-parsechangelog | awk '/^Version:/ {print $$2}')
#VERSION := $(shell echo '$(DEBVERS)' | sed -e 's/^[0-9]*://' -e 's/-.*//')
#DEBFLAVOR := $(shell dpkg-parsechangelog | awk '/^Distribution:/ {print $$2}')
#DEBPKGNAME := $(shell dpkg-parsechangelog | awk '/^Source:/ {print $$2}')
#DEBIAN_BRANCH := $(shell awk 'BEGIN{FS="[= ]+"} /debian-branch/ {print $$2}' debian/gbp.conf)
#GIT_TAG := $(subst ~,_,$(VERSION))
# alternatively to manually set those variables, you can
# include /usr/share/dpkg/default.mk
# and use what is set there.
# for hardening you might like to uncomment this:
export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@ --with autoreconf,sphinxdoc --parallel
dh $@ --with autoreconf,sphinxdoc
override_dh_auto_clean:
dh_auto_clean
......
......@@ -51,9 +51,9 @@ copyright = u'2015, Brian Ondov, Todd Treangen, Adam Phillippy'
# built documents.
#
# The short X.Y version.
version = '1.0'
version = '2.0'
# The full version, including alpha/beta/rc tags.
release = '1.0'
release = '2.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
......@@ -26,6 +26,7 @@ Downloads
=========
* `Linux/OSX binaries and source <https://github.com/marbl/Mash/releases>`_
* `RefSeq sketch database <https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh>`_
Documentation
=============
......
......@@ -59,34 +59,41 @@ reference (which there are two of in the sketch file):
Querying read sets against an existing RefSeq sketch
----------------------------------------------------
Download and gunzip the pre-sketched RefSeq archive (reads not provided here;
Download the pre-sketched RefSeq archive (reads not provided here;
10x-100x coverage of a single genome with any sequencing technology should
work):
.. download::
`RefSeqSketches.msh.gz <http://gembox.cbcb.umd.edu/mash/RefSeqSketches.msh.gz>`_
`refseq.genomes.k21.s1000.msh <https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh>`_
Plasmids also available:
.. download::
`refseq.genomes+plasmids.k21.s1000.msh <https://gembox.cbcb.umd.edu/mash/refseq.genomes%2Bplasmid.k21s1000.msh>`_
`refseq.plasmids.k21.s1000.msh <https://gembox.cbcb.umd.edu/mash/refseq.plasmid.k21s1000.msh>`_
Concatenate paired ends (this could also be piped to :code:`mash` to save space by
specifying :code:`-` for standard input, zipped or unzipped):
.. code::
cat reads_1.fastq read_2.fastq > reads.fastq
cat reads_1.fastq reads_2.fastq > reads.fastq
Sketch the reads, using :code:`-m 2` to improve results
by ignoring single-copy k-mers, which are more likely to be erroneous:
.. code::
mash sketch -m 2 -k 16 -s 400 reads.fastq
mash sketch -m 2 reads.fastq
Run :code:`mash dist` with the RefSeq archive as the reference and the read
sketch as the query:
.. code::
mash dist RefSeqSketches.msh reads.fastq.msh > distances.tab
mash dist refseq.genomes.k21.s1000.msh reads.fastq.msh > distances.tab
Sort the results to see the top hits and their p-values:
......@@ -94,14 +101,62 @@ Sort the results to see the top hits and their p-values:
sort -gk3 distances.tab | head
Building a custom RefSeq database
---------------------------------
To create the RefSeq Mash database, genomes were downloaded from NCBI
(:code:`ftp.ncbi.nlm.nih.gov/refseq/release/complete`, fasta sequence and
GenBank annotations for :code:`genomic`), and the
`refseqCollate <https://github.com/ondovb/refseqCollate/releases>`_ utility was
used to collate contigs/chromosomes into individual fasta files per genome.
Groups of these files were sketched in parallel and then pasted together with
:code:`mash paste`. This process could be repeated for more current or custom
databases.
Screening a read set for containment of RefSeq genomes
------------------------------------------------------
(new in `Mash v2.0 <https://github.com/marbl/Mash/releases>`_)
If a read set potentially has multiple genomes, it can be "screened" against the
database to estimate how well each genome is contained in the read set. We can
use the `SRA Toolkit <https://www.ncbi.nlm.nih.gov/sra/docs/toolkitsoft/>`_ to
download ERR024951:
.. code::
fastq-dump ERR024951
...and screen it against Refseq Genomes (link above), sorting the results:
.. code::
mash screen refseq.genomes.k21s1000.msh ERR024951.fastq > screen.tab
sort -gr screen.tab | head
We see the expected organism, *Salmonella enterica*, but also an apparent contaminant, *Klebsiella pneumoniae*. The fields are [identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment]:
.. code::
0.99957 991/1000 26 0 GCF_000841985.1_ViralProj14228_genomic.fna.gz NC_004313.1 Salmonella phage ST64B, complete genome
0.99957 991/1000 24 0 GCF_002054545.1_ASM205454v1_genomic.fna.gz [57 seqs] NZ_MYON01000010.1 Salmonella enterica strain BCW_4905 NODE_10_length_152932_cov_1.77994, whole genome shotgun sequence [...]
0.999522 990/1000 102 0 GCF_900086185.1_12082_4_85_genomic.fna.gz [51 seqs] NZ_FLIP01000001.1 Klebsiella pneumoniae strain k1037, whole genome shotgun sequence [...]
0.999329 986/1000 24 0 GCF_002055205.1_ASM205520v1_genomic.fna.gz [72 seqs] NZ_MYOO01000010.1 Salmonella enterica strain BCW_4904 NODE_10_length_177558_cov_3.07217, whole genome shotgun sequence [...]
0.999329 986/1000 24 0 GCF_002054075.1_ASM205407v1_genomic.fna.gz [88 seqs] NZ_MYNK01000010.1 Salmonella enterica strain BCW_4936 NODE_10_length_177385_cov_3.78874, whole genome shotgun sequence [...]
0.999329 986/1000 24 0 GCF_000474475.1_CFSAN001184_01.0_genomic.fna.gz [45 seqs] NZ_AUQM01000001.1 Salmonella enterica subsp. enterica serovar Typhimurium str. CDC_2009K1158 isolate 2009K-1158 SEET1158_1, whole genome shotgun sequence [...]
0.999329 986/1000 24 0 GCF_000474355.1_CFSAN001186_01.0_genomic.fna.gz [46 seqs] NZ_AUQN01000001.1 Salmonella enterica subsp. enterica serovar Typhimurium str. CDC_2009K1283 isolate 2009K1283 (Typo) SEET1283_1, whole genome shotgun sequence [...]
0.999329 986/1000 24 0 GCF_000213635.1_ASM21363v1_genomic.fna.gz [2 seqs] NC_016863.1 Salmonella enterica subsp. enterica serovar Typhimurium str. UK-1, complete genome [...]
0.999281 985/1000 24 0 GCF_001271965.1_Salmonella_enterica_CVM_N43825_v1.0_genomic.fna.gz [67 seqs] NZ_LIMN01000001.1 Salmonella enterica subsp. enterica serovar Typhimurium strain CVM N43825 N43825_contig_1, whole genome shotgun sequence [...]
0.999281 985/1000 24 0 GCF_000974215.1_SALF-297-3.id2_v1.0_genomic.fna.gz [90 seqs] NZ_LAPO01000001.1 Salmonella enterica subsp. enterica serovar Typhimurium strain SALF-297-3 NODE_1, whole genome shotgun sequence [...]
Note, however, that multiple strains of *Salmonella enterica* have good identity. This is because they are each contained well when considered independently. For this reason :code:`mash screen` is not a true classifier. However, we can remove much of the redundancy
for interpreting the results using the winner-take-all strategy (:code:`-w`). And while we're at it, let's throw some more cores at
the task to speed it up (:code:`-p 4`):
.. code::
mash screen -w -p 4 refseq.genomes.k21s1000.msh ERR024951.fastq > screen.tab
sort -gr screen.tab | head
The output is now much cleaner, with just the two whole genomes, plus phages (a lot of other hits to viruses and assembly contigs would appear further down):
.. code::
0.99957 991/1000 24 0 GCF_002054545.1_ASM205454v1_genomic.fna.gz [57 seqs] NZ_MYON01000010.1 Salmonella enterica strain BCW_4905 NODE_10_length_152932_cov_1.77994, whole genome shotgun sequence [...]
0.99899 979/1000 26 0 GCF_000841985.1_ViralProj14228_genomic.fna.gz NC_004313.1 Salmonella phage ST64B, complete genome
0.998844 976/1000 101 0 GCF_900086185.1_12082_4_85_genomic.fna.gz [51 seqs] NZ_FLIP01000001.1 Klebsiella pneumoniae strain k1037, whole genome shotgun sequence [...]
0.923964 190/1000 40 0 GCF_000900935.1_ViralProj181984_genomic.fna.gz NC_019545.1 Salmonella phage SPN3UB, complete genome
0.900615 111/1000 100 0 GCF_001876675.1_ASM187667v1_genomic.fna.gz [137 seqs] NZ_MOXK01000132.1 Klebsiella pneumoniae strain AWD5 Contig_(1-18003), whole genome shotgun sequence [...]
0.887722 82/1000 31 3.16322e-233 GCF_001470135.1_ViralProj306294_genomic.fna.gz NC_028699.1 Salmonella phage SEN34, complete genome
0.873204 58/1000 22 1.8212e-156 GCF_000913735.1_ViralProj227000_genomic.fna.gz NC_022749.1 Shigella phage SfIV, complete genome
0.868675 52/1000 57 6.26251e-138 GCF_001744215.1_ViralProj344312_genomic.fna.gz NC_031129.1 Salmonella phage SJ46, complete genome
0.862715 45/1000 1 1.05185e-116 GCF_001882095.1_ViralProj353688_genomic.fna.gz NC_031940.1 Salmonella phage 118970_sal3, complete genome
0.856856 39/1000 21 6.70643e-99 GCF_000841165.1_ViralProj14230_genomic.fna.gz NC_004348.1 Enterobacteria phage ST64T, complete genome
......@@ -383,9 +383,14 @@ void compareSketches(CommandDistance::CompareOutput::PairOutput * output, const
{
//distance = log(double(common + 1) / (denom + 1)) / log(1. / (denom + 1));
distance = -log(2 * jaccard / (1. + jaccard)) / kmerSize;
if ( distance > 1 )
{
distance = 1;
}
}
if ( distance > maxDistance )
if ( maxDistance >= 0 && distance > maxDistance )
{
return;
}
......@@ -395,7 +400,7 @@ void compareSketches(CommandDistance::CompareOutput::PairOutput * output, const
output->distance = distance;
output->pValue = pValue(common, refRef.length, refQry.length, kmerSpace, denom);
if ( output->pValue > maxPValue )
if ( maxPValue >= 0 && output->pValue > maxPValue )
{
return;
}
......
......@@ -175,6 +175,12 @@ int CommandScreen::run() const
else
{
fps[f - 1] = gzopen(arguments[f].c_str(), "r");
if ( fps[f - 1] == 0 )
{
cerr << "ERROR: could not open " << arguments[f] << endl;
exit(1);
}
}
kseqs.push_back(kseq_init(fps[f - 1]));
......@@ -470,7 +476,6 @@ double estimateIdentity(uint64_t common, uint64_t denom, int kmerSize, double km
}
else
{
//distance = log(double(common + 1) / (denom + 1)) / log(1. / (denom + 1));
identity = pow(jaccard, 1. / kmerSize);
}
......
......@@ -27,6 +27,8 @@ CommandSketch::CommandSketch()
useOption("help");
addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <input> specify paths to sequence files, one per line.", ""));
addOption("prefix", Option(Option::File, "o", "Output", "Output prefix (first input file used if unspecified). The suffix '.msh' will be appended.", ""));
addOption("id", Option(Option::File, "I", "Sketch", "ID field for sketch of reads (instead of first sequence ID).", ""));
addOption("comment", Option(Option::File, "C", "Sketch", "Comment for a sketch of reads (instead of first sequence comment.", ""));
useSketchOptions();
}
......@@ -79,7 +81,32 @@ int CommandSketch::run() const
}
}
if ( getOption("id").active || getOption("comment").active )
{
if ( files.size() > 1 && ! parameters.reads )
{
cerr << "WARNING: -I and -C will only apply to first sketch" << endl;
}
}
if ( parameters.reads )
{
sketch.initFromReads(files, parameters);
}
else
{
sketch.initFromFiles(files, parameters, verbosity);
}
if ( getOption("id").active )
{
sketch.setReferenceName(0, getOption("id").argument);
}
if ( getOption("comment").active )
{
sketch.setReferenceComment(0, getOption("comment").argument);
}
double lengthThreshold = (parameters.warning * sketch.getKmerSpace()) / (1. - parameters.warning);
......
// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
// Sergey Koren, and Adam Phillippy
//
// See the LICENSE.txt file included with this software for license information.
#include "CommandDistance.h"
#include "CommandTriangle.h"
#include "Sketch.h"
#include <iostream>
#include <zlib.h>
#include "ThreadPool.h"
#include "sketchParameterSetup.h"
#include <math.h>
#ifdef USE_BOOST
#include <boost/math/distributions/binomial.hpp>
using namespace::boost::math;
#else
#include <gsl/gsl_cdf.h>
#endif
using namespace::std;
namespace mash {
CommandTriangle::CommandTriangle()
: Command()
{
name = "triangle";
summary = "Estimate a lower-triangular distance matrix.";
description = "Estimate the distance of each input sequence to every other input sequence. Outputs a lower-triangular distance matrix in relaxed Phylip format. The input sequences can be fasta or fastq, gzipped or not, or Mash sketch files (.msh) with matching k-mer sizes. Input files can also be files of file names (see -l). If more than one input file is provided, whole files are compared by default (see -i).";
argumentString = "<seq1> [<seq2>] ...";
useOption("help");
addOption("list", Option(Option::Boolean, "l", "Input", "List input. Lines in each <query> specify paths to sequence files, one per line. The reference file is not affected.", ""));
addOption("comment", Option(Option::Boolean, "C", "Output", "Use comment fields for sequence names instead of IDs.", ""));
//addOption("log", Option(Option::Boolean, "L", "Output", "Log scale distances and divide by k-mer size to provide a better analog to phylogenetic distance. The special case of zero shared min-hashes will result in a distance of 1.", ""));
useSketchOptions();
}
int CommandTriangle::run() const
{
if ( arguments.size() < 1 || options.at("help").active )
{
print();
return 0;
}
int threads = options.at("threads").getArgumentAsNumber();
bool list = options.at("list").active;
//bool log = options.at("log").active;
double pValueMax = 0;
bool comment = options.at("comment").active;
Sketch::Parameters parameters;
if ( sketchParameterSetup(parameters, *(Command *)this) )
{
return 1;
}
if ( arguments.size() == 1 )
{
parameters.concatenated = false;
}
Sketch sketch;
uint64_t lengthMax;
double randomChance;
int kMin;
string lengthMaxName;
int warningCount = 0;
vector<string> queryFiles;
for ( int i = 0; i < arguments.size(); i++ )
{
if ( list )
{
splitFile(arguments[i], queryFiles);
}
else
{
queryFiles.push_back(arguments[i]);
}
}
sketch.initFromFiles(queryFiles, parameters);
double lengthThreshold = (parameters.warning * sketch.getKmerSpace()) / (1. - parameters.warning);
for ( uint64_t i = 0; i < sketch.getReferenceCount(); i++ )
{
uint64_t length = sketch.getReference(i).length;
if ( length > lengthThreshold )
{
if ( warningCount == 0 || length > lengthMax )
{
lengthMax = length;
lengthMaxName = sketch.getReference(i).name;
randomChance = sketch.getRandomKmerChance(i);
kMin = sketch.getMinKmerSize(i);
}
warningCount++;
}
}
cout << '\t' << sketch.getReferenceCount() << endl;
cout << (comment ? sketch.getReference(0).comment : sketch.getReference(0).name) << endl;
ThreadPool<TriangleInput, TriangleOutput> threadPool(compare, threads);
for ( uint64_t i = 1; i < sketch.getReferenceCount(); i++ )
{
threadPool.runWhenThreadAvailable(new TriangleInput(sketch, i, parameters));
while ( threadPool.outputAvailable() )
{
writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax);
}
}
while ( threadPool.running() )
{
writeOutput(threadPool.popOutputWhenAvailable(), comment, pValueMax);
}
cerr << "Max p-value: " << pValueMax << endl;
if ( warningCount > 0 && ! parameters.reads )
{
warnKmerSize(parameters, *this, lengthMax, lengthMaxName, randomChance, kMin, warningCount);
}
return 0;
}
void CommandTriangle::writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const
{
const Sketch & sketch = output->sketch;
const Sketch::Reference & ref = sketch.getReference(output->index);
cout << (comment ? ref.comment : ref.name);
for ( uint64_t i = 0; i < output->index; i++ )
{
const CommandDistance::CompareOutput::PairOutput * pair = &output->pairs[i];
cout << '\t' << pair->distance;
if ( pair->pValue > pValueMax )
{
pValueMax = pair->pValue;
}
}
cout << endl;
delete output;
}
CommandTriangle::TriangleOutput * compare(CommandTriangle::TriangleInput * input)
{
const Sketch & sketch = input->sketch;
CommandTriangle::TriangleOutput * output = new CommandTriangle::TriangleOutput(input->sketch, input->index);
uint64_t sketchSize = sketch.getMinHashesPerWindow();
for ( uint64_t i = 0; i < input->index; i++ )
{
compareSketches(&output->pairs[i], sketch.getReference(input->index), sketch.getReference(i), sketchSize, sketch.getKmerSize(), sketch.getKmerSpace(), -1., -1.);
}
return output;
}
} // namespace mash
// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
// Sergey Koren, and Adam Phillippy
//
// See the LICENSE.txt file included with this software for license information.
#ifndef INCLUDED_CommandTriangle
#define INCLUDED_CommandTriangle
#include "Command.h"
#include "CommandDistance.h"
#include "Sketch.h"
namespace mash {
class CommandTriangle : public Command
{
public:
struct TriangleInput
{
TriangleInput(const Sketch & sketchNew, uint64_t indexNew, const Sketch::Parameters & parametersNew)
:
sketch(sketchNew),
index(indexNew),
parameters(parametersNew)
{}
const Sketch & sketch;
uint64_t index;
const Sketch::Parameters & parameters;
};
struct TriangleOutput
{
TriangleOutput(const Sketch & sketchNew, uint64_t indexNew)
:
sketch(sketchNew),
index(indexNew)
{
pairs = new CommandDistance::CompareOutput::PairOutput[index];
}
~TriangleOutput()
{
delete [] pairs;
}
const Sketch & sketch;
uint64_t index;
CommandDistance::CompareOutput::PairOutput * pairs;
};
CommandTriangle();
int run() const; // override
private:
double pValueMax;
bool comment;
void writeOutput(TriangleOutput * output, bool comment, double & pValueMax) const;
};
CommandTriangle::TriangleOutput * compare(CommandTriangle::TriangleInput * input);
} // namespace mash
#endif
......@@ -23,6 +23,8 @@
#include <capnp/serialize.h>
#include <sys/mman.h>
#include <math.h>
#include <list>
#include <string.h>
#define SET_BINARY_MODE(file)
#define CHUNK 16384
......@@ -91,6 +93,15 @@ uint64_t Sketch::getReferenceIndex(string id) const
}
}
void Sketch::initFromReads(const vector<string> & files, const Parameters & parametersNew)
{
parameters = parametersNew;
useThreadOutput(sketchFile(new SketchInput(files, 0, 0, "", "", parameters)));
createIndex();
}
int Sketch::initFromFiles(const vector<string> & files, const Parameters & parametersNew, int verbosity, bool enforceParameters, bool contain)
{
parameters = parametersNew;
......@@ -155,7 +166,9 @@ int Sketch::initFromFiles(const vector<string> & files, const Parameters & param
// init fully
//
threadPool.runWhenThreadAvailable(new SketchInput(files[i], 0, 0, "", "", parameters), loadCapnp);
vector<string> file;
file.push_back(files[i]);
threadPool.runWhenThreadAvailable(new SketchInput(file, 0, 0, "", "", parameters), loadCapnp);
}
else
{
......@@ -193,7 +206,9 @@ int Sketch::initFromFiles(const vector<string> & files, const Parameters & param
fclose(inStream);
}
threadPool.runWhenThreadAvailable(new SketchInput(files[i], 0, 0, "", "", parameters), sketchFile);
vector<string> file;
file.push_back(files[i]);
threadPool.runWhenThreadAvailable(new SketchInput(file, 0, 0, "", "", parameters), sketchFile);
}
else
{
......@@ -257,6 +272,14 @@ uint64_t Sketch::initParametersFromCapnp(const char * file)
void * data = mmap(NULL, fileInfo.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED)
{
uint64_t fileSize = fileInfo.st_size;
std::cerr << "Error: could not memory-map file " << file << " of size " << fileSize;
std::cerr << std::endl;
exit(1);
}
capnp::ReaderOptions readerOptions;
readerOptions.traversalLimitInWords = 1000000000000;
......@@ -327,7 +350,7 @@ bool Sketch::sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, S
//
memcpy(seqCopy, seq->seq.s, l);
threadPool->runWhenThreadAvailable(new SketchInput("", seqCopy, l, string(seq->name.s, seq->name.l), string(seq->comment.s, seq->comment.l), parameters), sketchSequence);
threadPool->runWhenThreadAvailable(new SketchInput(vector<string>(), seqCopy, l, string(seq->name.s, seq->name.l), string(seq->comment.s, seq->comment.l), parameters), sketchSequence);
while ( threadPool->outputAvailable() )
{
......@@ -570,10 +593,12 @@ void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const
if ( debug ) cout << endl;
}
const char * kmer = useRevComp ? seqRev + length - i - kmerSize : seq + i;
const char *kmer_fwd = seq + i;
const char *kmer_rev = seqRev + length - i - kmerSize;
const char * kmer = memcmp(kmer_fwd, kmer_rev, kmerSize) <= 0 ? kmer_fwd : kmer_rev;
bool filter = false;
hash_u hash = getHash(useRevComp ? seqRev + length - i - kmerSize : seq + i, kmerSize, parameters.seed, parameters.use64);
hash_u hash = getHash(kmer, kmerSize, parameters.seed, parameters.use64);
if ( debug ) cout << endl;
......@@ -910,7 +935,7 @@ bool hasSuffix(string const & whole, string const & suffix)
Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input)
{
const char * file = input->fileName.c_str();
const char * file = input->fileNames[0].c_str();
int fd = open(file, O_RDONLY);
struct stat fileInfo;
......@@ -1129,19 +1154,6 @@ void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap &
Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
{
gzFile fp;
if ( input->fileName == "-" )
{
fp = gzdopen(fileno(stdin), "r");
}
else
{
fp = gzopen(input->fileName.c_str(), "r");
}
kseq_t *seq = kseq_init(fp);
const Sketch::Parameters & parameters = input->parameters;
Sketch::SketchOutput * output = new Sketch::SketchOutput();
......@@ -1149,11 +1161,6 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
output->references.resize(1);
Sketch::Reference & reference = output->references[0];
if ( input->fileName != "-" )
{
reference.name = input->fileName;
}
MinHashHeap minHashHeap(parameters.use64, parameters.minHashesPerWindow, parameters.reads ? parameters.minCov : 1, parameters.memoryBound);
reference.length = 0;
......@@ -1163,16 +1170,87 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
int count = 0;
bool skipped = false;
while ((l = kseq_read(seq)) >= 0)
int fileCount = input->fileNames.size();
gzFile fps[fileCount];
list<kseq_t *> kseqs;
//
for ( int f = 0; f < fileCount; f++ )
{
if ( input->fileNames[f] == "-" )
{
if ( f > 1 )
{
cerr << "ERROR: '-' for stdin must be first input" << endl;
exit(1);
}
fps[f] = gzdopen(fileno(stdin), "r");
}
else
{
if ( reference.name == "" && input->fileNames[f] != "-" )
{
reference.name = input->fileNames[f];
}
fps[f] = gzopen(input->fileNames[f].c_str(), "r");
if ( fps[f] == 0 )
{
cerr << "ERROR: could not open " << input->fileNames[f] << endl;
exit(1);
}
}
kseqs.push_back(kseq_init(fps[f]));
}
list<kseq_t *>::iterator it = kseqs.begin();
while ( kseqs.begin() != kseqs.end() )
{
l = kseq_read(*it);
if ( l < -1 ) // error
{
break;
}
if ( l == -1 ) // eof
{
kseq_destroy(*it);
it = kseqs.erase(it);
if ( it == kseqs.end() )
{
it = kseqs.begin();
}
continue;
}
if ( l < parameters.kmerSize )
{
skipped = true;
continue;
}
if ( count == 0 )
{
if ( input->fileNames[0] == "-" )
{
reference.name = (*it)->name.s;
reference.comment = (*it)->comment.s ? (*it)->comment.s : "";
}
else
{
reference.comment = (*it)->name.s;
reference.comment.append(" ");
reference.comment.append((*it)->comment.s ? (*it)->comment.s : "");
}
}
count++;
//if ( verbosity > 0 && parameters.windowed ) cout << '>' << seq->name.s << " (" << l << "nt)" << endl << endl;
//if (seq->comment.l) printf("comment: %s\n", seq->comment.s);
//printf("seq: %s\n", seq->seq.s);
......@@ -1181,20 +1259,22 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
if ( ! parameters.reads )
{
reference.length += l;
if ( input->fileName == "-" && reference.name == "" )
{
reference.name = seq->name.s;
}
}
addMinHashes(minHashHeap, seq->seq.s, l, parameters);
addMinHashes(minHashHeap, (*it)->seq.s, l, parameters);
if ( parameters.reads && parameters.targetCov > 0 && minHashHeap.estimateMultiplicity() >= parameters.targetCov )
{
l = -1; // success code
break;
}
it++;
if ( it == kseqs.end() )
{
it = kseqs.begin();
}
}
if ( parameters.reads )
......@@ -1209,9 +1289,19 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
}
}
if ( count > 1 )
{
reference.comment.insert(0, " seqs] ");
reference.comment.insert(0, to_string(count));
reference.comment.insert(0, "[");
reference.comment.append(" [...]");
//reference.comment.append(to_string(count - 1));
//reference.comment.append(" more]");
}
if ( l != -1 )
{
cerr << "\nERROR: reading " << input->fileName << "." << endl;
cerr << "\nERROR: reading " << (input->fileNames.size() > 0 ? "input files" : input->fileNames[0]) << "." << endl;
exit(1);
}
......@@ -1219,11 +1309,11 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
{
if ( skipped )
{
cerr << "\nWARNING: All fasta records in " << input->fileName << "were shorter than the k-mer size (" << parameters.kmerSize << ")." << endl;
cerr << "\nWARNING: All fasta records in " << (input->fileNames.size() > 0 ? "input files" : input->fileNames[0]) << " were shorter than the k-mer size (" << parameters.kmerSize << ")." << endl;
}
else
{
cerr << "\nERROR: Did not find fasta records in \"" << input->fileName << "\"." << endl;
cerr << "\nERROR: Did not find fasta records in \"" << (input->fileNames.size() > 0 ? "input files" : input->fileNames[0]) << "\"." << endl;
}
exit(1);
......@@ -1245,8 +1335,10 @@ Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input)
}
}
kseq_destroy(seq);
gzclose(fp);
for ( int i = 0; i < fileCount; i++ )
{
gzclose(fps[i]);
}
return output;
}
......
......@@ -141,9 +141,9 @@ public:
struct SketchInput
{
SketchInput(std::string fileNameNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
:
fileName(fileNameNew),
fileNames(fileNamesNew),
seq(seqNew),
length(lengthNew),
name(nameNew),
......@@ -159,7 +159,7 @@ public:
}
}
std::string fileName;
std::vector<std::string> fileNames;
char * seq;
......@@ -200,7 +200,10 @@ public:
bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew);
uint64_t initParametersFromCapnp(const char * file);
void setReferenceName(int i, const std::string name) {references[i].name = name;}
void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;}
bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
void useThreadOutput(SketchOutput * output);
void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
......
......@@ -10,6 +10,7 @@
#include "CommandFind.h"
#include "CommandDistance.h"
#include "CommandScreen.h"
#include "CommandTriangle.h"
#include "CommandContain.h"
#include "CommandInfo.h"
#include "CommandPaste.h"
......@@ -22,6 +23,7 @@ int main(int argc, const char ** argv)
//commandList.addCommand(new CommandFind());
commandList.addCommand(new mash::CommandDistance());
commandList.addCommand(new mash::CommandScreen());
commandList.addCommand(new mash::CommandTriangle());
#ifdef COMMAND_WITHIN
commandList.addCommand(new mash::CommandContain());
#endif
......