Commit 60b54479 authored by Andreas Tille's avatar Andreas Tille

New upstream version 2.1.5

parent bd7747be
version: 2
aliases:
- &docker_image ubuntu:bionic
- &install_common |
apt-get update -qq
apt-get install -qq software-properties-common
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update -qq
apt-get install -qq autoconf automake gcc g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
jobs:
build:
linux_clang6:
docker:
- image: ubuntu:bionic
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get update -qq
apt-get install -qq autoconf automake clang g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
apt-get install -qq clang-6.0
- checkout
- run: |
./autogen.sh
./configure CC=clang CXX=clang++ --with-mpi=/usr/lib/openmpi
./configure CC=clang-6.0 CXX=clang++-6.0 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc5:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-5 g++-5
- checkout
- run: |
./autogen.sh
./configure CC=gcc-5 CXX=g++-5 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc6:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-6 g++-6
- checkout
- run: |
./autogen.sh
./configure CC=gcc-6 CXX=g++-6 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc7:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-7 g++-7
- checkout
- run: |
./autogen.sh
./configure CC=gcc-7 CXX=g++-7 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc8:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-8 g++-8
- checkout
- run: |
./autogen.sh
./configure CC=gcc-8 CXX=g++-8 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
workflows:
version: 2
# test compiler versions in priority order and abort on first failure
abyss_builds:
jobs:
- linux_gcc8
- linux_gcc7:
requires:
- linux_gcc8
- linux_gcc6:
requires:
- linux_gcc7
- linux_gcc5:
requires:
- linux_gcc6
- linux_clang6:
requires:
- linux_gcc5
......@@ -210,9 +210,4 @@ isAmbiguous(const SequenceCollectionHash& g) const
BranchGroupStatus m_status;
};
namespace std {
template <>
inline void swap(BranchGroup&, BranchGroup&) NOEXCEPT { assert(false); }
}
#endif
......@@ -17,4 +17,5 @@ abyss_bloom_SOURCES = bloom.cc \
BloomFilterWindow.h \
ConcurrentBloomFilter.h \
CascadingBloomFilter.h \
CascadingBloomFilterWindow.h
CascadingBloomFilterWindow.h \
RollingBloomDBGVisitor.h
#ifndef _ROLLING_BLOOM_DBG_VISITOR_H_
#define _ROLLING_BLOOM_DBG_VISITOR_H_ 1
#include "Common/UnorderedMap.h"
#include "Graph/BreadthFirstSearch.h"
#include <iostream>
/**
* Visitor class that outputs visited nodes/edges in GraphViz format during
* a bounded breadth first traversal. An instance of this class may be passed
* as an argument to the `breadthFirstSearch` function.
*/
template <typename GraphT>
class RollingBloomDBGVisitor : public DefaultBFSVisitor<GraphT>
{
public:
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
typedef unordered_map<VertexT, unsigned> DepthMap;
typedef typename DepthMap::const_iterator DepthMapConstIt;
typedef typename DepthMap::iterator DepthMapIt;
typedef std::vector<std::pair<std::string, unordered_set<VertexT> > >
KmerProperties;
typedef typename KmerProperties::const_iterator KmerPropertiesIt;
/** Constructor */
RollingBloomDBGVisitor(const VertexT& root, unsigned maxDepth,
const KmerProperties& kmerProperties, std::ostream& out)
: m_root(root), m_maxDepth(maxDepth),
m_kmerProperties(kmerProperties), m_out(out)
{
m_depthMap[root] = 0;
/* start directed graph (GraphViz) */
m_out << "digraph " << root.kmer().c_str() << " {\n";
}
/** Called when graph search completes */
void post_processing()
{
/* end directed graph (GraphViz) */
m_out << "}\n";
}
/** Invoked on edges that lead to an undiscovered vertex */
BFSVisitorResult tree_edge(const EdgeT& e, const GraphT& g)
{
VertexT parent = source(e, g);
VertexT child = target(e, g);
if (!isForwardEdge(e, g))
std::swap(parent, child);
DepthMapConstIt parentIt = m_depthMap.find(parent);
assert(parentIt != m_depthMap.end());
unsigned parentDepth = parentIt->second;
if (parentDepth >= m_maxDepth)
return BFS_SKIP_ELEMENT;
DepthMapIt childIt;
bool inserted;
boost::tie(childIt, inserted) = m_depthMap.insert(
std::make_pair(child, parentDepth + 1));
assert(inserted);
/*
* since we use this visitor class with an undirected BFS,
* each edge is traversed twice (once forward, once reverse)
*/
if (isForwardEdge(e, g))
outputEdge(e, g);
return BFS_SUCCESS;
}
/** Invoked when a vertex is visited for the first time */
BFSVisitorResult discover_vertex(const VertexT& v, const GraphT&)
{
/* declare vertex (GraphViz) */
m_out << '\t' << v.kmer().c_str();
m_out << " [";
DepthMapConstIt depthIt = m_depthMap.find(v);
assert(depthIt != m_depthMap.end());
unsigned depth = depthIt->second;
m_out << "depth=" << depth;
for (KmerPropertiesIt it = m_kmerProperties.begin();
it != m_kmerProperties.end(); ++it) {
if (it->second.find(v) != it->second.end())
m_out << "," << it->first;
}
m_out << "];\n";
return BFS_SUCCESS;
}
/**
* Invoked when an edge is traversed. (Each edge
* in the graph is traversed exactly once.)
*/
BFSVisitorResult non_tree_edge(const EdgeT& e, const GraphT& g)
{
/*
* since we use this visitor class with an undirected BFS,
* each edge is traversed twice (once forward, once reverse)
*/
if (isForwardEdge(e, g))
outputEdge(e, g);
return BFS_SUCCESS;
}
BFSVisitorResult examine_vertex(const VertexT& v, const GraphT&)
{
m_currentVertex = v;
return BFS_SUCCESS;
}
/** Return if the current edge is a forward edge */
bool isForwardEdge(const EdgeT& e, const GraphT& g)
{
assert(source(e, g) == m_currentVertex
|| target(e, g) == m_currentVertex);
return source(e, g) == m_currentVertex;
}
/** Output and edge */
void outputEdge(const EdgeT& e, const GraphT& g)
{
const VertexT& u = source(e, g);
const VertexT& v = target(e, g);
/* declare edge (GraphViz) */
m_out << '\t' << u.kmer().c_str() << " -> "
<< v.kmer().c_str() << ";\n";
}
protected:
VertexT m_currentVertex;
DepthMap m_depthMap;
const VertexT& m_root;
const unsigned m_maxDepth;
const KmerProperties& m_kmerProperties;
std::ostream& m_out;
};
#endif
......@@ -7,6 +7,7 @@
#include "Common/Kmer.h"
#include "Common/BitUtil.h"
#include "Common/KmerIterator.h"
#include "Common/UnorderedSet.h"
#include "Graph/Path.h"
#include "Graph/ExtendPath.h"
#include "Konnector/DBGBloom.h"
......@@ -18,8 +19,11 @@
#include "Bloom/CascadingBloomFilter.h"
#include "Bloom/BloomFilterWindow.h"
#include "Bloom/CascadingBloomFilterWindow.h"
#include "Bloom/RollingBloomDBGVisitor.h"
#include "BloomDBG/BloomIO.h"
#include "BloomDBG/HashAgnosticCascadingBloom.h"
#include "BloomDBG/RollingBloomDBG.h"
#include "BloomDBG/RollingHashIterator.h"
#include "lib/bloomfilter/BloomFilter.hpp"
#include <cstdlib>
......@@ -51,9 +55,11 @@ static const char USAGE_MESSAGE[] =
"Usage 3: " PROGRAM " intersect [GLOBAL_OPTS] [COMMAND_OPTS] <OUTPUT_BLOOM_FILE> <BLOOM_FILE_1> <BLOOM_FILE_2> [BLOOM_FILE_3]...\n"
"Usage 4: " PROGRAM " info [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE>\n"
"Usage 5: " PROGRAM " compare [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE_1> <BLOOM_FILE_2>\n"
"Usage 6: " PROGRAM " kmers [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE>\n"
"Usage 7: " PROGRAM " trim [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE> [READS_FILE_2]... > trimmed.fq\n"
"Build and manipulate bloom filter files.\n"
"Usage 6: " PROGRAM " graph [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE>\n"
"Usage 7: " PROGRAM " kmers [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE>\n"
"Usage 8: " PROGRAM " trim [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE> [READS_FILE_2]... > trimmed.fq\n"
"\n"
"Build and manipulate Bloom filter files.\n"
"\n"
" Global options:\n"
"\n"
......@@ -98,6 +104,13 @@ static const char USAGE_MESSAGE[] =
" -m, --method=`String' choose distance calculation method \n"
" [`jaccard'(default), `forbes', `czekanowski']\n"
"\n"
" Options for `" PROGRAM " graph':\n"
"\n"
" -d, --depth=N depth of neighbouring from --root [k]\n"
" -R, --root=KMER root k-mer from graph traversal [required]\n"
" -a, --node-attr=STR:FILE assign a node attribute (e.g. 'color=blue')\n"
" k-mers in the given FASTA\n"
"\n"
" Options for `" PROGRAM " kmers':\n"
"\n"
" -r, --inverse get k-mers that are *NOT* in the bloom filter\n"
......@@ -112,6 +125,13 @@ static const char USAGE_MESSAGE[] =
enum BloomFilterType { BT_KONNECTOR, BT_ROLLING_HASH, BT_UNKNOWN };
enum OutputFormat { BED, FASTA, RAW };
/* types related to --node-attr option */
typedef string KmerProperty;
typedef string FastaPath;
typedef vector<pair<KmerProperty, FastaPath> > KmerProperties;
typedef KmerProperties::iterator KmerPropertiesIt;
namespace opt {
/** The size of the bloom filter in bytes. */
......@@ -120,6 +140,9 @@ namespace opt {
/** The size of the I/O buffer of each thread, in bytes */
size_t bufferSize = 100000;
/** Depth of graph traversal */
size_t depth = 0;
/** The number of parallel threads. */
unsigned threads = 1;
......@@ -150,6 +173,13 @@ namespace opt {
/** The type of Bloom filter to build */
BloomFilterType bloomType = BT_KONNECTOR;
/**
* For the "graph" command: assign node attribute
* (e.g. "color=blue") to k-mers contained in the
* associated FASTA file
*/
KmerProperties kmerProperties;
/** Index of bloom filter window.
("M" for -w option) */
unsigned windowIndex = 0;
......@@ -168,10 +198,13 @@ namespace opt {
*/
bool inverse = false;
/** Root node (k-mer) for `graph` subcommand */
string root;
OutputFormat format = FASTA;
}
static const char shortopts[] = "b:B:h:H:j:k:l:L:m:n:q:rvt:w:";
static const char shortopts[] = "a:b:B:d:h:H:j:k:l:L:m:n:q:rR:vt:w:";
enum { OPT_HELP = 1, OPT_VERSION, OPT_BED, OPT_FASTA, OPT_RAW };
......@@ -179,6 +212,7 @@ static const struct option longopts[] = {
{ "bloom-size", required_argument, NULL, 'b' },
{ "bloom-type", required_argument, NULL, 't' },
{ "buffer-size", required_argument, NULL, 'B' },
{ "depth", required_argument, NULL, 'd' },
{ "hash-seed", required_argument, NULL, 'h' },
{ "num-hashes", required_argument, NULL, 'H' },
{ "threads", required_argument, NULL, 'j' },
......@@ -193,12 +227,14 @@ static const struct option longopts[] = {
{ "trim-quality", required_argument, NULL, 'q' },
{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
{ "node-attr", required_argument, NULL, 'a' },
{ "verbose", no_argument, NULL, 'v' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ "window", required_argument, NULL, 'w' },
{ "method", required_argument, NULL, 'm' },
{ "inverse", required_argument, NULL, 'r' },
{ "root", required_argument, NULL, 'R' },
{ "bed", no_argument, NULL, OPT_BED },
{ "fasta", no_argument, NULL, OPT_FASTA },
{ "raw", no_argument, NULL, OPT_RAW },
......@@ -844,6 +880,111 @@ int compare(int argc, char ** argv){
return 1;
}
int graph(int argc, char** argv)
{
parseGlobalOpts(argc, argv);
// default graph neighbourhood depth
opt::depth = opt::k;
for (int c; (c = getopt_long(argc, argv,
shortopts, longopts, NULL)) != -1;) {
istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case '?':
dieWithUsageError();
case 'a':
{
string s;
arg >> s;
size_t pos = s.find(":");
if (pos < s.length())
opt::kmerProperties.push_back(make_pair(
s.substr(0, pos), s.substr(pos + 1)));
else
arg.setstate(ios::failbit);
}
break;
case 'd':
arg >> opt::depth; break;
case 'R':
arg >> opt::root; break;
}
if (optarg != NULL && (!arg.eof() || arg.fail())) {
cerr << PROGRAM ": invalid option: `-"
<< (char)c << optarg << "'\n";
exit(EXIT_FAILURE);
}
}
if (opt::root.empty()) {
cerr << PROGRAM ": missing required option --root <KMER>\n";
dieWithUsageError();
}
if (opt::root.size() != opt::k) {
cerr << PROGRAM ": --root arg must be a k-mer of length "
<< opt::k << "\n";
dieWithUsageError();
}
if (argc - optind != 1) {
cerr << PROGRAM ": missing arguments\n";
dieWithUsageError();
}
typedef RollingBloomDBG<HashAgnosticCascadingBloom> Graph;
typedef boost::graph_traits<Graph>::vertex_descriptor V;
string bloomPath(argv[optind]);
optind++;
vector<pair<string, unordered_set<V> > > kmerProperties;
for (KmerPropertiesIt it = opt::kmerProperties.begin();
it != opt::kmerProperties.end(); ++it) {
if (opt::verbose)
cerr << "Loading k-mers from `" << it->second << "', to be "
<< "annotated with '" << it->first << "'\n";
kmerProperties.push_back(make_pair(it->first, unordered_set<V>()));
size_t count = 0;
size_t checkpoint = 0;
const size_t step = 10000;
FastaReader in(it->second.c_str(), FastaReader::FOLD_CASE);
for (FastaRecord rec; in >> rec;) {
for (RollingHashIterator it(rec.seq, 1, opt::k);
it != RollingHashIterator::end(); ++it, ++count) {
V v(it.kmer().c_str(), it.rollingHash());
kmerProperties.back().second.insert(v);
}
while (opt::verbose && count >= checkpoint) {
cerr << "Loaded " << checkpoint << " k-mers\n";
checkpoint += step;
}
}
if (opt::verbose)
cerr << "Loaded " << count << " k-mers in total\n";
}
if (opt::verbose)
cerr << "Loading Bloom filter from `" << bloomPath << "'..." << endl;
HashAgnosticCascadingBloom bloom(bloomPath);
assert(opt::k == bloom.getKmerSize());
Graph g(bloom);
V root(opt::root.c_str(), RollingHash(opt::root.c_str(),
bloom.getHashNum(), bloom.getKmerSize()));
RollingBloomDBGVisitor<Graph> visitor(root, opt::depth, kmerProperties, cout);
breadthFirstSearch(root, g, true, visitor);
return 0;
}
int memberOf(int argc, char ** argv){
// Initalise bloom and get globals
Konnector::BloomFilter bloom;
......@@ -941,6 +1082,9 @@ int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter&
// of the sequence
bool firstKmerMatch = true;
// longest branch of Bloom filter false positives
const unsigned fpTrim = 5;
KmerIterator it(seq, k);
for (; it != KmerIterator::end(); ++it) {
......@@ -951,21 +1095,26 @@ int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter&
if (!bloom[kmer])
continue;
// in degree, disregarding false branches
unsigned inDegree = trueBranches(kmer, REVERSE, g,
minBranchLen).size();
// out degree, disregarding false branches
unsigned outDegree = trueBranches(kmer, FORWARD, g,
minBranchLen).size();
PathExtensionResultCode leftResult, rightResult;
boost::tie(boost::tuples::ignore, leftResult)
= successor(kmer, REVERSE, g, minBranchLen, fpTrim);
boost::tie(boost::tuples::ignore, rightResult)
= successor(kmer, FORWARD, g, minBranchLen, fpTrim);
if (firstKmerMatch) {
bool leftTip = (inDegree == 0 && outDegree == 1);
bool rightTip = (inDegree == 1 && outDegree == 0);
bool leftTip = leftResult == ER_DEAD_END
&& rightResult == ER_LENGTH_LIMIT;
bool rightTip = leftResult == ER_LENGTH_LIMIT
&& rightResult == ER_DEAD_END;
if (!leftTip && !rightTip)
break;
} else if (inDegree != 1 || outDegree != 1) {
// end of linear path
break;
} else {
bool leftFork = leftResult == ER_AMBI_IN
|| leftResult == ER_AMBI_OUT;
bool rightFork = rightResult == ER_AMBI_IN
|| rightResult == ER_AMBI_OUT;
if (leftFork || rightFork)
break;
}
firstKmerMatch = false;
......@@ -1104,6 +1253,9 @@ int main(int argc, char** argv)
else if (command == "compare") {
return compare(argc, argv);
}
else if (command == "graph") {
return graph(argc, argv);
}
else if (command == "kmers" || command == "getKmers") {
return memberOf(argc, argv);
}
......
......@@ -16,13 +16,17 @@ namespace BloomDBG {
{
/** reads consisting entirely of solid k-mers */
size_t solidReads;
size_t readsExtended;
/**
* reads consisting entirely of k-mers already
* included in output contigs
*/
size_t visitedReads;
size_t readsProcessed;
size_t basesAssembled;
size_t contigID;
AssemblyCounters() : solidReads(0), readsExtended(0), readsProcessed(0),
basesAssembled(0), contigID(0) {}
AssemblyCounters() : solidReads(0), visitedReads(0),
readsProcessed(0), basesAssembled(0), contigID(0) {}
/** serialize counters as a TSV table */
friend std::ostream& operator<<(std::ostream& out,
......@@ -31,7 +35,6 @@ namespace BloomDBG {
/* write headers */
out << "solid_reads"
<< '\t' << "extended_reads"
<< '\t' << "processed_reads"
<< '\t' << "bases_assembled"
<< '\t' << "next_contig_id"
......@@ -40,7 +43,6 @@ namespace BloomDBG {
/* write data */
out << o.solidReads
<< '\t' << o.readsExtended
<< '\t' << o.readsProcessed
<< '\t' << o.basesAssembled
<< '\t' << o.contigID
......@@ -61,7 +63,6 @@ namespace BloomDBG {
/* read data */
in >> o.solidReads
>> expect("\t") >> o.readsExtended
>> expect("\t") >> o.readsProcessed
>> expect("\t") >> o.basesAssembled
>> expect("\t") >> o.contigID
......
......@@ -27,6 +27,9 @@ namespace BloomDBG {
/** minimum k-mer coverage threshold */
unsigned minCov;
/** path to output debugging info about processing of each read */
std::string readLogPath;
/** WIG track containing 0/1 for sufficient k-mer cov */
std::string covTrackPath;
......
......@@ -15,11 +15,14 @@ namespace BloomDBG {
std::ostream& checkpointOut;
/** trace file output for debugging */
std::ostream& traceOut;
/** outcomes of processing each read */
std::ostream& readLogOut;
AssemblyStreams(InputStreamT& in, std::ostream& out,
std::ostream& checkpointOut, std::ostream& traceOut) :
std::ostream& checkpointOut, std::ostream& traceOut,
std::ostream& readLogOut) :
in(in), out(out), checkpointOut(checkpointOut),
traceOut(traceOut) {}
traceOut(traceOut), readLogOut(readLogOut) {}
};
}
......
......@@ -2,8 +2,12 @@
#define LIGHTWEIGHT_KMER_H 1
#include "BloomDBG/MaskedKmer.h"
#include "Common/Kmer.h"
#include "Common/Sequence.h"
#include "Common/Sense.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <boost/shared_array.hpp>
......@@ -78,6 +82,51 @@ public:
return *(m_kmer.get() + pos);
}
/**
* Return true if k-mer is in its lexicographically smallest orientation
*/
bool isCanonical() const
{
const unsigned k = Kmer::length();
for (unsigned i = 0; i < k/2; ++i) {
const char c1 = toupper(m_kmer.get()[i]);
const char c2 = complementBaseChar(
toupper(m_kmer.get()[k-i-1]));
if (c1 > c2)
return false;
else if (c1 < c2)
return true;
}
return true;
}
/**
* Reverse complement the k-mer, if it is not already in its
* canonical (i.e. lexicographically smallest) orientation.
*/
void canonicalize()
{
if (!isCanonical())
reverseComplement();
}
void reverseComplement()
{
const unsigned k = Kmer::length();
for (unsigned i = 0; i < k/2; ++i) {
const char tmp = complementBaseChar(m_kmer.get()[i]);
m_kmer.get()[i] = complementBaseChar(
m_kmer.get()[k-i-1]);
m_kmer.get()[k-i-1] = tmp;
}
/* if k is odd, we need to reverse complement the middle char */
if (k % 2 == 1 && k > 1)
m_kmer.get()[k/2]
= complementBaseChar(m_kmer.get()[k/2]);
}
/** Equality operator */
bool operator==(const LightweightKmer& o) const
{
......@@ -95,6 +144,13 @@ public:
return true;
}
}
/** Inequality operator */
bool operator!=(const LightweightKmer& o) const
{
return !operator==(o);
}
};
#endif
......@@ -69,6 +69,23 @@ public:
kmer().setLastBase(dir, base);
}