Skip to content
Commits on Source (7)
version: 2
aliases:
- &docker_image ubuntu:bionic
- &install_common |
apt-get update -qq
apt-get install -qq software-properties-common
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update -qq
apt-get install -qq autoconf automake gcc g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
jobs:
build:
linux_clang6:
docker:
- image: ubuntu:bionic
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get update -qq
apt-get install -qq autoconf automake clang g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
apt-get install -qq clang-6.0
- checkout
- run: |
./autogen.sh
./configure CC=clang-6.0 CXX=clang++-6.0 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc5:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-5 g++-5
- checkout
- run: |
./autogen.sh
./configure CC=gcc-5 CXX=g++-5 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc6:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-6 g++-6
- checkout
- run: |
./autogen.sh
./configure CC=clang CXX=clang++ --with-mpi=/usr/lib/openmpi
./configure CC=gcc-6 CXX=g++-6 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc7:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-7 g++-7
- checkout
- run: |
./autogen.sh
./configure CC=gcc-7 CXX=g++-7 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
linux_gcc8:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install -qq gcc-8 g++-8
- checkout
- run: |
./autogen.sh
./configure CC=gcc-8 CXX=g++-8 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
workflows:
version: 2
# test compiler versions in priority order and abort on first failure
abyss_builds:
jobs:
- linux_gcc8
- linux_gcc7:
requires:
- linux_gcc8
- linux_gcc6:
requires:
- linux_gcc7
- linux_gcc5:
requires:
- linux_gcc6
- linux_clang6:
requires:
- linux_gcc5
......@@ -210,9 +210,4 @@ isAmbiguous(const SequenceCollectionHash& g) const
BranchGroupStatus m_status;
};
namespace std {
template <>
inline void swap(BranchGroup&, BranchGroup&) NOEXCEPT { assert(false); }
}
#endif
......@@ -17,4 +17,5 @@ abyss_bloom_SOURCES = bloom.cc \
BloomFilterWindow.h \
ConcurrentBloomFilter.h \
CascadingBloomFilter.h \
CascadingBloomFilterWindow.h
CascadingBloomFilterWindow.h \
RollingBloomDBGVisitor.h
#ifndef _ROLLING_BLOOM_DBG_VISITOR_H_
#define _ROLLING_BLOOM_DBG_VISITOR_H_ 1
#include "Common/UnorderedMap.h"
#include "Graph/BreadthFirstSearch.h"
#include <iostream>
/**
* Visitor class that outputs visited nodes/edges in GraphViz format during
* a bounded breadth first traversal. An instance of this class may be passed
* as an argument to the `breadthFirstSearch` function.
*/
template <typename GraphT>
class RollingBloomDBGVisitor : public DefaultBFSVisitor<GraphT>
{
public:
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
typedef unordered_map<VertexT, unsigned> DepthMap;
typedef typename DepthMap::const_iterator DepthMapConstIt;
typedef typename DepthMap::iterator DepthMapIt;
typedef std::vector<std::pair<std::string, unordered_set<VertexT> > >
KmerProperties;
typedef typename KmerProperties::const_iterator KmerPropertiesIt;
/** Constructor */
RollingBloomDBGVisitor(const VertexT& root, unsigned maxDepth,
const KmerProperties& kmerProperties, std::ostream& out)
: m_root(root), m_maxDepth(maxDepth),
m_kmerProperties(kmerProperties), m_out(out)
{
m_depthMap[root] = 0;
/* start directed graph (GraphViz) */
m_out << "digraph " << root.kmer().c_str() << " {\n";
}
/** Called when graph search completes */
void post_processing()
{
/* end directed graph (GraphViz) */
m_out << "}\n";
}
/** Invoked on edges that lead to an undiscovered vertex */
BFSVisitorResult tree_edge(const EdgeT& e, const GraphT& g)
{
VertexT parent = source(e, g);
VertexT child = target(e, g);
if (!isForwardEdge(e, g))
std::swap(parent, child);
DepthMapConstIt parentIt = m_depthMap.find(parent);
assert(parentIt != m_depthMap.end());
unsigned parentDepth = parentIt->second;
if (parentDepth >= m_maxDepth)
return BFS_SKIP_ELEMENT;
DepthMapIt childIt;
bool inserted;
boost::tie(childIt, inserted) = m_depthMap.insert(
std::make_pair(child, parentDepth + 1));
assert(inserted);
/*
* since we use this visitor class with an undirected BFS,
* each edge is traversed twice (once forward, once reverse)
*/
if (isForwardEdge(e, g))
outputEdge(e, g);
return BFS_SUCCESS;
}
/** Invoked when a vertex is visited for the first time */
BFSVisitorResult discover_vertex(const VertexT& v, const GraphT&)
{
/* declare vertex (GraphViz) */
m_out << '\t' << v.kmer().c_str();
m_out << " [";
DepthMapConstIt depthIt = m_depthMap.find(v);
assert(depthIt != m_depthMap.end());
unsigned depth = depthIt->second;
m_out << "depth=" << depth;
for (KmerPropertiesIt it = m_kmerProperties.begin();
it != m_kmerProperties.end(); ++it) {
if (it->second.find(v) != it->second.end())
m_out << "," << it->first;
}
m_out << "];\n";
return BFS_SUCCESS;
}
/**
* Invoked when an edge is traversed. (Each edge
* in the graph is traversed exactly once.)
*/
BFSVisitorResult non_tree_edge(const EdgeT& e, const GraphT& g)
{
/*
* since we use this visitor class with an undirected BFS,
* each edge is traversed twice (once forward, once reverse)
*/
if (isForwardEdge(e, g))
outputEdge(e, g);
return BFS_SUCCESS;
}
BFSVisitorResult examine_vertex(const VertexT& v, const GraphT&)
{
m_currentVertex = v;
return BFS_SUCCESS;
}
/** Return if the current edge is a forward edge */
bool isForwardEdge(const EdgeT& e, const GraphT& g)
{
assert(source(e, g) == m_currentVertex
|| target(e, g) == m_currentVertex);
return source(e, g) == m_currentVertex;
}
/** Output and edge */
void outputEdge(const EdgeT& e, const GraphT& g)
{
const VertexT& u = source(e, g);
const VertexT& v = target(e, g);
/* declare edge (GraphViz) */
m_out << '\t' << u.kmer().c_str() << " -> "
<< v.kmer().c_str() << ";\n";
}
protected:
VertexT m_currentVertex;
DepthMap m_depthMap;
const VertexT& m_root;
const unsigned m_maxDepth;
const KmerProperties& m_kmerProperties;
std::ostream& m_out;
};
#endif
......@@ -7,6 +7,7 @@
#include "Common/Kmer.h"
#include "Common/BitUtil.h"
#include "Common/KmerIterator.h"
#include "Common/UnorderedSet.h"
#include "Graph/Path.h"
#include "Graph/ExtendPath.h"
#include "Konnector/DBGBloom.h"
......@@ -18,8 +19,11 @@
#include "Bloom/CascadingBloomFilter.h"
#include "Bloom/BloomFilterWindow.h"
#include "Bloom/CascadingBloomFilterWindow.h"
#include "Bloom/RollingBloomDBGVisitor.h"
#include "BloomDBG/BloomIO.h"
#include "BloomDBG/HashAgnosticCascadingBloom.h"
#include "BloomDBG/RollingBloomDBG.h"
#include "BloomDBG/RollingHashIterator.h"
#include "lib/bloomfilter/BloomFilter.hpp"
#include <cstdlib>
......@@ -51,9 +55,11 @@ static const char USAGE_MESSAGE[] =
"Usage 3: " PROGRAM " intersect [GLOBAL_OPTS] [COMMAND_OPTS] <OUTPUT_BLOOM_FILE> <BLOOM_FILE_1> <BLOOM_FILE_2> [BLOOM_FILE_3]...\n"
"Usage 4: " PROGRAM " info [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE>\n"
"Usage 5: " PROGRAM " compare [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE_1> <BLOOM_FILE_2>\n"
"Usage 6: " PROGRAM " kmers [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE>\n"
"Usage 7: " PROGRAM " trim [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE> [READS_FILE_2]... > trimmed.fq\n"
"Build and manipulate bloom filter files.\n"
"Usage 6: " PROGRAM " graph [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE>\n"
"Usage 7: " PROGRAM " kmers [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE>\n"
"Usage 8: " PROGRAM " trim [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE> [READS_FILE_2]... > trimmed.fq\n"
"\n"
"Build and manipulate Bloom filter files.\n"
"\n"
" Global options:\n"
"\n"
......@@ -98,6 +104,13 @@ static const char USAGE_MESSAGE[] =
" -m, --method=`String' choose distance calculation method \n"
" [`jaccard'(default), `forbes', `czekanowski']\n"
"\n"
" Options for `" PROGRAM " graph':\n"
"\n"
" -d, --depth=N depth of neighbouring from --root [k]\n"
" -R, --root=KMER root k-mer from graph traversal [required]\n"
" -a, --node-attr=STR:FILE assign a node attribute (e.g. 'color=blue')\n"
" k-mers in the given FASTA\n"
"\n"
" Options for `" PROGRAM " kmers':\n"
"\n"
" -r, --inverse get k-mers that are *NOT* in the bloom filter\n"
......@@ -112,6 +125,13 @@ static const char USAGE_MESSAGE[] =
enum BloomFilterType { BT_KONNECTOR, BT_ROLLING_HASH, BT_UNKNOWN };
enum OutputFormat { BED, FASTA, RAW };
/* types related to --node-attr option */
typedef string KmerProperty;
typedef string FastaPath;
typedef vector<pair<KmerProperty, FastaPath> > KmerProperties;
typedef KmerProperties::iterator KmerPropertiesIt;
namespace opt {
/** The size of the bloom filter in bytes. */
......@@ -120,6 +140,9 @@ namespace opt {
/** The size of the I/O buffer of each thread, in bytes */
size_t bufferSize = 100000;
/** Depth of graph traversal */
size_t depth = 0;
/** The number of parallel threads. */
unsigned threads = 1;
......@@ -150,6 +173,13 @@ namespace opt {
/** The type of Bloom filter to build */
BloomFilterType bloomType = BT_KONNECTOR;
/**
* For the "graph" command: assign node attribute
* (e.g. "color=blue") to k-mers contained in the
* associated FASTA file
*/
KmerProperties kmerProperties;
/** Index of bloom filter window.
("M" for -w option) */
unsigned windowIndex = 0;
......@@ -168,10 +198,13 @@ namespace opt {
*/
bool inverse = false;
/** Root node (k-mer) for `graph` subcommand */
string root;
OutputFormat format = FASTA;
}
static const char shortopts[] = "b:B:h:H:j:k:l:L:m:n:q:rvt:w:";
static const char shortopts[] = "a:b:B:d:h:H:j:k:l:L:m:n:q:rR:vt:w:";
enum { OPT_HELP = 1, OPT_VERSION, OPT_BED, OPT_FASTA, OPT_RAW };
......@@ -179,6 +212,7 @@ static const struct option longopts[] = {
{ "bloom-size", required_argument, NULL, 'b' },
{ "bloom-type", required_argument, NULL, 't' },
{ "buffer-size", required_argument, NULL, 'B' },
{ "depth", required_argument, NULL, 'd' },
{ "hash-seed", required_argument, NULL, 'h' },
{ "num-hashes", required_argument, NULL, 'H' },
{ "threads", required_argument, NULL, 'j' },
......@@ -193,12 +227,14 @@ static const struct option longopts[] = {
{ "trim-quality", required_argument, NULL, 'q' },
{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
{ "node-attr", required_argument, NULL, 'a' },
{ "verbose", no_argument, NULL, 'v' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ "window", required_argument, NULL, 'w' },
{ "method", required_argument, NULL, 'm' },
{ "inverse", required_argument, NULL, 'r' },
{ "root", required_argument, NULL, 'R' },
{ "bed", no_argument, NULL, OPT_BED },
{ "fasta", no_argument, NULL, OPT_FASTA },
{ "raw", no_argument, NULL, OPT_RAW },
......@@ -844,6 +880,111 @@ int compare(int argc, char ** argv){
return 1;
}
int graph(int argc, char** argv)
{
parseGlobalOpts(argc, argv);
// default graph neighbourhood depth
opt::depth = opt::k;
for (int c; (c = getopt_long(argc, argv,
shortopts, longopts, NULL)) != -1;) {
istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case '?':
dieWithUsageError();
case 'a':
{
string s;
arg >> s;
size_t pos = s.find(":");
if (pos < s.length())
opt::kmerProperties.push_back(make_pair(
s.substr(0, pos), s.substr(pos + 1)));
else
arg.setstate(ios::failbit);
}
break;
case 'd':
arg >> opt::depth; break;
case 'R':
arg >> opt::root; break;
}
if (optarg != NULL && (!arg.eof() || arg.fail())) {
cerr << PROGRAM ": invalid option: `-"
<< (char)c << optarg << "'\n";
exit(EXIT_FAILURE);
}
}
if (opt::root.empty()) {
cerr << PROGRAM ": missing required option --root <KMER>\n";
dieWithUsageError();
}
if (opt::root.size() != opt::k) {
cerr << PROGRAM ": --root arg must be a k-mer of length "
<< opt::k << "\n";
dieWithUsageError();
}
if (argc - optind != 1) {
cerr << PROGRAM ": missing arguments\n";
dieWithUsageError();
}
typedef RollingBloomDBG<HashAgnosticCascadingBloom> Graph;
typedef boost::graph_traits<Graph>::vertex_descriptor V;
string bloomPath(argv[optind]);
optind++;
vector<pair<string, unordered_set<V> > > kmerProperties;
for (KmerPropertiesIt it = opt::kmerProperties.begin();
it != opt::kmerProperties.end(); ++it) {
if (opt::verbose)
cerr << "Loading k-mers from `" << it->second << "', to be "
<< "annotated with '" << it->first << "'\n";
kmerProperties.push_back(make_pair(it->first, unordered_set<V>()));
size_t count = 0;
size_t checkpoint = 0;
const size_t step = 10000;
FastaReader in(it->second.c_str(), FastaReader::FOLD_CASE);
for (FastaRecord rec; in >> rec;) {
for (RollingHashIterator it(rec.seq, 1, opt::k);
it != RollingHashIterator::end(); ++it, ++count) {
V v(it.kmer().c_str(), it.rollingHash());
kmerProperties.back().second.insert(v);
}
while (opt::verbose && count >= checkpoint) {
cerr << "Loaded " << checkpoint << " k-mers\n";
checkpoint += step;
}
}
if (opt::verbose)
cerr << "Loaded " << count << " k-mers in total\n";
}
if (opt::verbose)
cerr << "Loading Bloom filter from `" << bloomPath << "'..." << endl;
HashAgnosticCascadingBloom bloom(bloomPath);
assert(opt::k == bloom.getKmerSize());
Graph g(bloom);
V root(opt::root.c_str(), RollingHash(opt::root.c_str(),
bloom.getHashNum(), bloom.getKmerSize()));
RollingBloomDBGVisitor<Graph> visitor(root, opt::depth, kmerProperties, cout);
breadthFirstSearch(root, g, true, visitor);
return 0;
}
int memberOf(int argc, char ** argv){
// Initalise bloom and get globals
Konnector::BloomFilter bloom;
......@@ -941,6 +1082,9 @@ int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter&
// of the sequence
bool firstKmerMatch = true;
// longest branch of Bloom filter false positives
const unsigned fpTrim = 5;
KmerIterator it(seq, k);
for (; it != KmerIterator::end(); ++it) {
......@@ -951,20 +1095,25 @@ int calcLeftTrim(const Sequence& seq, unsigned k, const Konnector::BloomFilter&
if (!bloom[kmer])
continue;
// in degree, disregarding false branches
unsigned inDegree = trueBranches(kmer, REVERSE, g,
minBranchLen).size();
// out degree, disregarding false branches
unsigned outDegree = trueBranches(kmer, FORWARD, g,
minBranchLen).size();
PathExtensionResultCode leftResult, rightResult;
boost::tie(boost::tuples::ignore, leftResult)
= successor(kmer, REVERSE, g, minBranchLen, fpTrim);
boost::tie(boost::tuples::ignore, rightResult)
= successor(kmer, FORWARD, g, minBranchLen, fpTrim);
if (firstKmerMatch) {
bool leftTip = (inDegree == 0 && outDegree == 1);
bool rightTip = (inDegree == 1 && outDegree == 0);
bool leftTip = leftResult == ER_DEAD_END
&& rightResult == ER_LENGTH_LIMIT;
bool rightTip = leftResult == ER_LENGTH_LIMIT
&& rightResult == ER_DEAD_END;
if (!leftTip && !rightTip)
break;
} else if (inDegree != 1 || outDegree != 1) {
// end of linear path
} else {
bool leftFork = leftResult == ER_AMBI_IN
|| leftResult == ER_AMBI_OUT;
bool rightFork = rightResult == ER_AMBI_IN
|| rightResult == ER_AMBI_OUT;
if (leftFork || rightFork)
break;
}
......@@ -1104,6 +1253,9 @@ int main(int argc, char** argv)
else if (command == "compare") {
return compare(argc, argv);
}
else if (command == "graph") {
return graph(argc, argv);
}
else if (command == "kmers" || command == "getKmers") {
return memberOf(argc, argv);
}
......
......@@ -16,13 +16,17 @@ namespace BloomDBG {
{
/** reads consisting entirely of solid k-mers */
size_t solidReads;
size_t readsExtended;
/**
* reads consisting entirely of k-mers already
* included in output contigs
*/
size_t visitedReads;
size_t readsProcessed;
size_t basesAssembled;
size_t contigID;
AssemblyCounters() : solidReads(0), readsExtended(0), readsProcessed(0),
basesAssembled(0), contigID(0) {}
AssemblyCounters() : solidReads(0), visitedReads(0),
readsProcessed(0), basesAssembled(0), contigID(0) {}
/** serialize counters as a TSV table */
friend std::ostream& operator<<(std::ostream& out,
......@@ -31,7 +35,6 @@ namespace BloomDBG {
/* write headers */
out << "solid_reads"
<< '\t' << "extended_reads"
<< '\t' << "processed_reads"
<< '\t' << "bases_assembled"
<< '\t' << "next_contig_id"
......@@ -40,7 +43,6 @@ namespace BloomDBG {
/* write data */
out << o.solidReads
<< '\t' << o.readsExtended
<< '\t' << o.readsProcessed
<< '\t' << o.basesAssembled
<< '\t' << o.contigID
......@@ -61,7 +63,6 @@ namespace BloomDBG {
/* read data */
in >> o.solidReads
>> expect("\t") >> o.readsExtended
>> expect("\t") >> o.readsProcessed
>> expect("\t") >> o.basesAssembled
>> expect("\t") >> o.contigID
......
......@@ -27,6 +27,9 @@ namespace BloomDBG {
/** minimum k-mer coverage threshold */
unsigned minCov;
/** path to output debugging info about processing of each read */
std::string readLogPath;
/** WIG track containing 0/1 for sufficient k-mer cov */
std::string covTrackPath;
......
......@@ -15,11 +15,14 @@ namespace BloomDBG {
std::ostream& checkpointOut;
/** trace file output for debugging */
std::ostream& traceOut;
/** outcomes of processing each read */
std::ostream& readLogOut;
AssemblyStreams(InputStreamT& in, std::ostream& out,
std::ostream& checkpointOut, std::ostream& traceOut) :
std::ostream& checkpointOut, std::ostream& traceOut,
std::ostream& readLogOut) :
in(in), out(out), checkpointOut(checkpointOut),
traceOut(traceOut) {}
traceOut(traceOut), readLogOut(readLogOut) {}
};
}
......
......@@ -2,8 +2,12 @@
#define LIGHTWEIGHT_KMER_H 1
#include "BloomDBG/MaskedKmer.h"
#include "Common/Kmer.h"
#include "Common/Sequence.h"
#include "Common/Sense.h"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <boost/shared_array.hpp>
......@@ -78,6 +82,51 @@ public:
return *(m_kmer.get() + pos);
}
/**
* Return true if k-mer is in its lexicographically smallest orientation
*/
bool isCanonical() const
{
const unsigned k = Kmer::length();
for (unsigned i = 0; i < k/2; ++i) {
const char c1 = toupper(m_kmer.get()[i]);
const char c2 = complementBaseChar(
toupper(m_kmer.get()[k-i-1]));
if (c1 > c2)
return false;
else if (c1 < c2)
return true;
}
return true;
}
/**
* Reverse complement the k-mer, if it is not already in its
* canonical (i.e. lexicographically smallest) orientation.
*/
void canonicalize()
{
if (!isCanonical())
reverseComplement();
}
void reverseComplement()
{
const unsigned k = Kmer::length();
for (unsigned i = 0; i < k/2; ++i) {
const char tmp = complementBaseChar(m_kmer.get()[i]);
m_kmer.get()[i] = complementBaseChar(
m_kmer.get()[k-i-1]);
m_kmer.get()[k-i-1] = tmp;
}
/* if k is odd, we need to reverse complement the middle char */
if (k % 2 == 1 && k > 1)
m_kmer.get()[k/2]
= complementBaseChar(m_kmer.get()[k/2]);
}
/** Equality operator */
bool operator==(const LightweightKmer& o) const
{
......@@ -95,6 +144,13 @@ public:
return true;
}
}
/** Inequality operator */
bool operator!=(const LightweightKmer& o) const
{
return !operator==(o);
}
};
#endif
......@@ -69,6 +69,23 @@ public:
kmer().setLastBase(dir, base);
}
void reverseComplement()
{
m_kmer.reverseComplement();
m_rollingHash.reverseComplement();
}
bool isCanonical() const
{
return m_kmer.isCanonical();
}
void canonicalize()
{
if (!m_kmer.isCanonical())
reverseComplement();
}
/**
* Comparison operator that takes spaced seed bitmask into account.
*/
......@@ -78,7 +95,7 @@ public:
if (m_rollingHash != o.m_rollingHash)
return false;
return m_kmer == o.m_kmer;
return compare(o) == 0;
}
/**
......@@ -88,6 +105,59 @@ public:
{
return !(*this == o);
}
/**
* Comparison operator that is invariant under reverse-complement.
*/
bool operator<(const RollingBloomDBGVertex& o) const
{
return compare(o) < 0;
}
/** Comparison operator that is invariant under reverse complement */
int compare(const RollingBloomDBGVertex& o) const
{
unsigned k = Kmer::length();
const std::string& spacedSeed = MaskedKmer::mask();
const LightweightKmer& kmer1 = kmer();
const LightweightKmer& kmer2 = o.kmer();
bool rc1 = !kmer1.isCanonical();
bool rc2 = !kmer2.isCanonical();
int end1 = rc1 ? -1 : k;
int end2 = rc2 ? -1 : k;
int inc1 = rc1 ? -1 : 1;
int inc2 = rc2 ? -1 : 1;
int pos1 = rc1 ? k-1 : 0;
int pos2 = rc2 ? k-1 : 0;
for (; pos1 != end1 && pos2 != end2; pos1+=inc1, pos2+=inc2) {
char c1 = toupper(kmer1.c_str()[pos1]);
char c2 = toupper(kmer2.c_str()[pos2]);
/* ignore positions masked by spaced seed */
if (!spacedSeed.empty() && spacedSeed.at(pos1) != '1') {
/* spaced seed must be symmetric */
assert(spacedSeed.at(pos2) != '1');
continue;
}
if (rc1)
c1 = complementBaseChar(c1);
if (rc2)
c2 = complementBaseChar(c2);
if (c1 > c2)
return 1;
if (c1 < c2)
return -1;
}
return 0;
}
};
NAMESPACE_STD_HASH_BEGIN
......@@ -362,9 +432,11 @@ struct in_edge_iterator
// Subgraph
/** Return whether this vertex exists in the subgraph. */
template <typename Graph>
template <typename BloomT>
static inline bool
vertex_exists(const typename graph_traits<Graph>::vertex_descriptor& u, const Graph& g)
vertex_exists(
const typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor& u,
const RollingBloomDBG<BloomT>& g)
{
size_t hashes[MAX_HASHES];
u.rollingHash().getHashes(hashes);
......@@ -395,38 +467,40 @@ out_degree(
return std::distance(adj.first, adj.second);
}
template <typename Graph>
template <typename BloomT>
static inline typename
std::pair<typename graph_traits<Graph>::out_edge_iterator,
typename graph_traits<Graph>::out_edge_iterator>
std::pair<typename graph_traits<RollingBloomDBG<BloomT> >::out_edge_iterator,
typename graph_traits<RollingBloomDBG<BloomT> >::out_edge_iterator>
out_edges(
const typename graph_traits<Graph>::vertex_descriptor& u,
const Graph& g)
const typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor& u,
const RollingBloomDBG<BloomT>& g)
{
typedef RollingBloomDBG<BloomT> Graph;
typedef typename graph_traits<Graph>::out_edge_iterator Oit;
return std::make_pair(Oit(g, u), Oit(g));
}
// BidirectionalGraph
template <typename Graph>
template <typename BloomT>
static inline
std::pair<typename graph_traits<Graph>::in_edge_iterator,
typename graph_traits<Graph>::in_edge_iterator>
std::pair<typename graph_traits<RollingBloomDBG<BloomT> >::in_edge_iterator,
typename graph_traits<RollingBloomDBG<BloomT> >::in_edge_iterator>
in_edges(
const typename graph_traits<Graph>::vertex_descriptor& u,
const Graph& g)
const typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor& u,
const RollingBloomDBG<BloomT>& g)
{
typedef RollingBloomDBG<BloomT> Graph;
typedef typename graph_traits<Graph>::in_edge_iterator Iit;
return std::make_pair(Iit(g, u), Iit(g));
}
template <typename Graph>
template <typename BloomT>
static inline
typename graph_traits<Graph>::degree_size_type
in_degree(const typename graph_traits<Graph>::vertex_descriptor& u,
const Graph& g)
typename graph_traits<RollingBloomDBG<BloomT> >::degree_size_type
in_degree(const typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor& u,
const RollingBloomDBG<BloomT>& g)
{
//return out_degree(reverseComplement(u), g);
typedef RollingBloomDBG<BloomT> Graph;
typedef typename graph_traits<Graph>::in_edge_iterator Iit;
std::pair<Iit, Iit> it = in_edges(u, g);
return std::distance(it.first, it.second);
......@@ -435,50 +509,66 @@ in_degree(const typename graph_traits<Graph>::vertex_descriptor& u,
// PropertyGraph
/** Return the reverse complement of the specified k-mer. */
template <typename Graph>
template <typename BloomT>
static inline
typename graph_traits<Graph>::vertex_descriptor
get(vertex_complement_t, const Graph&,
typename graph_traits<Graph>::vertex_descriptor u)
typename graph_traits< RollingBloomDBG<BloomT> >::vertex_descriptor
get(vertex_complement_t, const RollingBloomDBG<BloomT>&,
typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor u)
{
typedef RollingBloomDBG<BloomT> Graph;
typedef typename graph_traits<Graph>::vertex_descriptor V;
return V(reverseComplement(u.first), u.second);
}
/** Return the name of the specified vertex. */
template <typename Graph>
template <typename BloomT>
static inline
MaskedKmer get(vertex_name_t, const Graph&,
typename graph_traits<Graph>::vertex_descriptor u)
MaskedKmer get(vertex_name_t, const RollingBloomDBG<BloomT>&,
typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor u)
{
return u.first;
}
template <typename Graph>
static inline
bool
get(vertex_removed_t, const Graph&,
typename graph_traits<Graph>::vertex_descriptor)
template <typename BloomT>
static inline bool
get(vertex_removed_t, const RollingBloomDBG<BloomT>&,
typename graph_traits<RollingBloomDBG<BloomT> >::vertex_descriptor)
{
return false;
}
template <typename Graph>
static inline
no_property
get(vertex_bundle_t, const Graph&,
typename graph_traits<Graph>::edge_descriptor)
template <typename BloomT>
static inline no_property
get(vertex_bundle_t, const RollingBloomDBG<BloomT>&,
typename graph_traits<RollingBloomDBG<BloomT> >::edge_descriptor)
{
return no_property();
}
template <typename BloomT>
static inline no_property
get(edge_bundle_t, const RollingBloomDBG<BloomT>&,
typename graph_traits<RollingBloomDBG<BloomT> >::edge_descriptor)
{
return no_property();
}
template <typename Graph>
static inline
no_property
get(edge_bundle_t, const Graph&,
typename graph_traits<Graph>::edge_descriptor)
std::pair<typename boost::graph_traits<Graph>::edge_descriptor, bool>
edge(const typename boost::graph_traits<Graph>::vertex_descriptor& u,
const typename boost::graph_traits<Graph>::vertex_descriptor&v,
const Graph& g)
{
return no_property();
typedef typename boost::graph_traits<Graph>::edge_descriptor E;
typedef typename boost::graph_traits<Graph>::adjacency_iterator AdjIt;
AdjIt it, end;
E e(u, v);
for (boost::tie(it, end) = adjacent_vertices(u, g); it != end; ++it) {
if (*it == v)
return std::make_pair(e, true);
}
return std::make_pair(e, false);
}
#endif
......@@ -189,6 +189,18 @@ public:
kmer, m_k);
}
/**
* Reverse complement the hash state, so that rolling right becomes
* rolling left, and vice versa. This operation is needed
* whenever we reverse-complement a k-mer that has an associated
* `RollingHash` state, so that subsequent rolling operations will
* produce the correct hash value.
*/
void reverseComplement()
{
std::swap(m_hash1, m_rcHash1);
}
private:
/** number of hash functions */
......
......@@ -68,7 +68,7 @@ static const char USAGE_MESSAGE[] =
" for FASTQ and SAM files [default]\n"
" --illumina-quality zero quality is `@' (64), typically\n"
" for qseq and export files\n"
" -t, --trim-length max branch length to trim, in k-mers [k]\n"
" -t, --trim-length=N max branch length to trim, in k-mers [k]\n"
" -v, --verbose display verbose output\n"
" --version output version information and exit\n"
"\n"
......@@ -90,10 +90,11 @@ static const char USAGE_MESSAGE[] =
" -C, --cov-track=FILE WIG track with 0/1 indicating k-mers with\n"
" coverage above the -c threshold. A reference\n"
" must also be specified with -R.\n"
" -T, --trace-file=FILE write debugging info about extension of\n"
" each read to FILE\n"
" --read-log=FILE write outcome of processing each read to FILE\n"
" -R, --ref=FILE specify a reference genome. FILE may be\n"
" FASTA, FASTQ, SAM, or BAM and may be gzipped."
" FASTA, FASTQ, SAM, or BAM and may be gzipped.\n"
" -T, --trace-file=FILE write debugging info about generation of\n"
" each read to FILE\n"
"\n"
"Experimental Options:\n"
"\n"
......@@ -123,7 +124,7 @@ static const char shortopts[] = "b:C:g:H:i:j:k:K:o:q:Q:R:s:t:T:v";
enum {
OPT_HELP = 1, OPT_VERSION, QR_SEED, MIN_KMER_COV,
CHECKPOINT, KEEP_CHECKPOINT, CHECKPOINT_PREFIX
CHECKPOINT, KEEP_CHECKPOINT, CHECKPOINT_PREFIX, READ_LOG,
};
static const struct option longopts[] = {
......@@ -151,10 +152,11 @@ static const struct option longopts[] = {
{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
{ "qr-seed", required_argument, NULL, QR_SEED },
{ "read-log", required_argument, NULL, READ_LOG },
{ "ref", required_argument, NULL, 'R' },
{ "spaced-seed", no_argument, NULL, 's' },
{ "trim-length", no_argument, NULL, 't' },
{ "trace-file", no_argument, NULL, 'T'},
{ "spaced-seed", required_argument, NULL, 's' },
{ "trim-length", required_argument, NULL, 't' },
{ "trace-file", required_argument, NULL, 'T'},
{ "verbose", no_argument, NULL, 'v' },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 }
......@@ -251,13 +253,22 @@ void resumeAssemblyFromCheckpoint(int argc, char** argv,
if (!params.tracePath.empty()) {
traceOut.open(params.tracePath.c_str());
assert_good(traceOut, params.tracePath);
BloomDBG::SeqExtensionResult::printHeaders(traceOut);
BloomDBG::ContigRecord::printHeaders(traceOut);
assert_good(traceOut, params.tracePath);
}
/* logs outcome of processing of each read (`--read-log`) */
std::ofstream readLogOut;
if (!params.readLogPath.empty()) {
readLogOut.open(params.readLogPath.c_str());
assert_good(readLogOut, params.readLogPath);
BloomDBG::ReadRecord::printHeaders(readLogOut);
assert_good(readLogOut, params.readLogPath);
}
/* bundle input/output streams for assembly */
BloomDBG::AssemblyStreams<FastaConcat>
streams(in, out, checkpointOut, traceOut);
streams(in, out, checkpointOut, traceOut, readLogOut);
/* restore state of Bloom filters, counters, and input/output streams */
BloomDBG::resumeFromCheckpoint(solidKmerSet, visitedKmerSet,
......@@ -438,6 +449,9 @@ int main(int argc, char** argv)
case CHECKPOINT_PREFIX:
arg >> params.checkpointPathPrefix;
break;
case READ_LOG:
arg >> params.readLogPath;
break;
}
if (optarg != NULL && (!arg.eof() || arg.fail())) {
......
This diff is collapsed.
2018-12-04 Sauparna Palchowdhury <spchowdhury@bcgsc.ca>
* Release version 2.1.5
* Compiler fixes and increse stack size limits to avoid stack overflows.
abyss-pe:
* Add 'ulimit' statements to the Makefile to increse a thread's
stack size to 64MB.
2018-11-09 Ben Vandervalk <benv@bcgsc.ca>
* Release version 2.1.4
* Major improvements to Bloom filter assembly contiguity and
correctness. Bloom filter assemblies now have equivalent scaffold
contiguity and better correctness than MPI assemblies of the same
data, while still requiring less than 1/10th of the memory. On
human, Bloom filter assembly times are still a few hours longer
than MPI assemblies (e.g. 17 hours vs. 13 hours, using 48
threads).
abyss-pe:
* Change default value of `m` from 50 => 0, which has the effect
of disallowing sequence overlaps < k-1 bp. QUAST tests on E. coli /
C. elegans / H. sapiens showed that both contiguity and
correctness were improved by allowing only overlaps of k-1 bp
between sequence ends.
2018-11-05 Ben Vandervalk <benv@bcgsc.ca>
* Release version 2.1.3
* Bug fix release
abyss-bloom:
* Added `graph` command for visualizing neighbourhoods of the
Bloom filter de Bruijn graph (produces GraphViz)
abyss-fixmate-ssq:
* Fixed missing tab in SAM output which broke ABySS linked reads
pipeline (Tigmint/ARCS)
2018-10-24 Ben Vandervalk <benv@bcgsc.ca>
* Release version 2.1.2
* Improved scaffold N50 on human by ~10% by
implementing a new `DistanceEst --median`
option (thanks to @lcoombe!)
* Added a `--max-cost` option to `konnector` and
`abyss-sealer`. This address a long-standing issue
with indeterminately long running times, particularly
for low values of `k`.
abyss-pe:
* Use the new `DistanceEst --median` option as the
default for the scaffolding stage
Dockerfile:
* Fix OpenMPI setup
DistanceEst:
* Added `--median` option
konnector:
* Added `--max-cost` option to bound running time
sealer:
* Added `--max-cost` option to bound running time
2018-09-11 Ben Vandervalk <benv@bcgsc.ca>
* Release version 2.1.1
......
......@@ -15,7 +15,7 @@ class PMF
public:
/** Construct a PMF from a histogram. */
PMF(const Histogram& h)
: m_dist(h.maximum() + 1), m_mean(h.mean()), m_stdDev(h.sd())
: m_dist(h.maximum() + 1), m_mean(h.mean()), m_stdDev(h.sd()), m_median(h.median())
{
unsigned count = h.size();
m_minp = (double)1 / count;
......@@ -44,6 +44,9 @@ class PMF
return m_dist.size() - 1;
}
/** Return the median of this distribution. */
int median() const { return m_median; }
/** Return the mean of this distribution. */
double mean() const { return m_mean; }
......@@ -60,6 +63,7 @@ class PMF
double m_mean;
double m_stdDev;
double m_minp;
int m_median;
};
namespace std {
......
......@@ -324,9 +324,8 @@ struct SAMRecord : SAMAlignment {
#if SAM_SEQ_QUAL
out << '\t' << o.seq
<< '\t' << o.qual;
// note: leading tab ('\t') is already embedded in o.tags
if (!o.tags.empty())
out << o.tags;
out << '\t' << o.tags;
#else
out << "\t*\t*";
#endif
......
......@@ -8,6 +8,7 @@
typedef std::string Sequence;
char complementBaseChar(char c);
Sequence reverseComplement(const Sequence& s);
Sequence colourToNucleotideSpace(char anchor, const Sequence& seq);
char colourToNucleotideSpace(char anchor, char cs);
......@@ -32,6 +33,16 @@ static inline bool allACGT(const Sequence& seq)
return strspn(seq.c_str(), "acgtACGT") == seq.length();
}
/**
* Transform a sequence in its canonical orientation.
*/
static inline void canonicalize(Sequence& seq)
{
Sequence rc = reverseComplement(seq);
if (rc < seq)
seq = rc;
}
/**
* Convert each ambiguity code to the lexicographically smallest
* matching base.
......
......@@ -59,6 +59,8 @@ static const char USAGE_MESSAGE[] =
" -o, --out=FILE write result to FILE\n"
" --mle use the MLE [default]\n"
" (maximum likelihood estimator)\n"
" --median use the difference of the population median\n"
" and the sample median\n"
" --mean use the difference of the population mean\n"
" and the sample mean\n"
" --dist output the graph in dist format [default]\n"
......@@ -78,7 +80,7 @@ static const char USAGE_MESSAGE[] =
"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
/** Which estimator to use. See opt::method. */
enum { MLE, MEAN };
enum { MLE, MEAN, MEDIAN };
namespace opt {
string db;
......@@ -131,6 +133,7 @@ static const struct option longopts[] = {
{ "mind", required_argument, NULL, OPT_MIND },
{ "maxd", required_argument, NULL, OPT_MAXD },
{ "mle", no_argument, &opt::method, MLE },
{ "median", no_argument, &opt::method, MEDIAN },
{ "mean", no_argument, &opt::method, MEAN },
{ "kmer", required_argument, NULL, 'k' },
{ "npairs", required_argument, NULL, 'n' },
......@@ -175,6 +178,29 @@ static int estimateDistanceUsingMean(
return d;
}
/** Estimate the distance between two contigs using the difference of
* the population median and the sample median.
* @param numPairs [out] the number of pairs that agree with the
* expected distribution
* @return the estimated distance
*/
static int estimateDistanceUsingMedian(
const std::vector<int>& samples, const PMF& pmf,
unsigned& numPairs)
{
Histogram h(samples.begin(), samples.end());
int d = (int)round(pmf.median() - h.median());
// Count the number of samples that agree with the distribution.
unsigned n = 0;
for (Histogram::const_iterator it = h.begin();
it != h.end(); ++it)
if (pmf[it->first + d] > pmf.minProbability())
n += it->second;
numPairs = n;
return d;
}
/** Global variable to track a recommended minAlign parameter */
unsigned g_recMA;
......@@ -257,6 +283,11 @@ static int estimateDistance(unsigned len0, unsigned len1,
// and the sample mean.
return estimateDistanceUsingMean(
fragmentSizes, pmf, numPairs);
case MEDIAN:
// Use the difference of the population median
// and the sample median.
return estimateDistanceUsingMedian(
fragmentSizes, pmf, numPairs);
default:
assert(false);
abort();
......
FROM ubuntu:latest
FROM ubuntu:18.04
MAINTAINER Shaun Jackman <sjackman@gmail.com>
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bsdmainutils libgomp1 make openmpi-bin ssh
bsdmainutils libgomp1 make openmpi-bin ssh sudo \
&& useradd -m -s /bin/bash abyss \
&& echo 'abyss ALL=(ALL) NOPASSWD:ALL' >>/etc/sudoers
ADD . /tmp/abyss
RUN apt-get install -y --no-install-recommends \
automake g++ libboost-dev libopenmpi-dev libsparsehash-dev \
&& cd /tmp/abyss \
&& ./autogen.sh \
&& mkdir build && cd build \
&& ../configure --with-mpi=/usr/lib/openmpi \
&& ../configure --with-mpi=/usr/lib/x86_64-linux-gnu/openmpi \
&& make install-strip \
&& rm -rf /tmp/abyss \
&& apt-get autoremove -y binutils \
automake g++ libboost-dev libopenmpi-dev libsparsehash-dev
ENV SHELL=/bin/bash
USER abyss
WORKDIR /home/abyss
ENV SHELL=/bin/bash USER=abyss
ENTRYPOINT ["abyss-pe"]
CMD ["help"]
......@@ -6,26 +6,32 @@
#include <boost/tuple/tuple.hpp> // for boost::tie
#include <boost/graph/graph_traits.hpp>
#include <boost/graph/graph_concepts.hpp>
#include <vector>
template <class IncidenceGraph>
PathSearchResult allPathsSearch(
const IncidenceGraph& g,
typename boost::graph_traits<IncidenceGraph>::vertex_descriptor start,
typename boost::graph_traits<IncidenceGraph>::vertex_descriptor goal,
std::vector< Path<typename boost::graph_traits<IncidenceGraph>::vertex_descriptor> >& pathsFound)
/** result of exhaustive path search from vertex A to vertex B */
template <class VertexT>
struct AllPathsSearchResult
{
return allPathsSearch(g, start, goal, NO_LIMIT, NO_LIMIT, NO_LIMIT, pathsFound);
}
public:
/** code indicating type of success/failure for search */
PathSearchResult resultCode;
/** number of edges traversed during search */
unsigned cost;
/** all paths from vertex A to vertex B (if successful) */
std::vector< Path<VertexT> > paths;
/** constructor */
AllPathsSearchResult() : resultCode(NO_PATH), cost(0) {}
};
template <class IncidenceGraph>
PathSearchResult allPathsSearch(
AllPathsSearchResult<typename boost::graph_traits<IncidenceGraph>::vertex_descriptor>
allPathsSearch(
const IncidenceGraph& g,
typename boost::graph_traits<IncidenceGraph>::vertex_descriptor start,
typename boost::graph_traits<IncidenceGraph>::vertex_descriptor goal,
unsigned maxPaths,
unsigned minDepth,
unsigned maxDepth,
std::vector< Path<typename boost::graph_traits<IncidenceGraph>::vertex_descriptor> >& pathsFound)
unsigned maxPaths, unsigned minDepth, unsigned maxDepth, unsigned maxCost)
{
typedef typename boost::graph_traits<IncidenceGraph>::vertex_descriptor V;
typedef typename boost::graph_traits<IncidenceGraph>::out_edge_iterator EdgeIter;
......@@ -40,15 +46,21 @@ PathSearchResult allPathsSearch(
visited.insert(start);
eiStack.push_back(out_edges(start, g));
while(!path.empty()) {
AllPathsSearchResult<V> result;
while(!path.empty() && result.cost <= maxCost) {
if (path.back() == goal &&
(minDepth == NO_LIMIT || (path.size() - 1) >= minDepth)) {
if (maxPaths != NO_LIMIT && pathsFound.size() >= maxPaths)
return TOO_MANY_PATHS;
if (!cycleVertices.empty())
return PATH_CONTAINS_CYCLE;
pathsFound.push_back(path);
if (maxPaths != NO_LIMIT && result.paths.size() >= maxPaths) {
result.resultCode = TOO_MANY_PATHS;
return result;
}
if (!cycleVertices.empty()) {
result.resultCode = PATH_CONTAINS_CYCLE;
return result;
}
result.paths.push_back(path);
}
// find next unvisited node and append to path
......@@ -73,6 +85,7 @@ PathSearchResult allPathsSearch(
path.push_back(v);
eiStack.push_back(out_edges(v, g));
visited.insert(v);
result.cost++;
break;
}
} // if ei != ei_end
......@@ -80,10 +93,24 @@ PathSearchResult allPathsSearch(
} // while !path.empty()
if (pathsFound.empty())
return NO_PATH;
if (result.cost > maxCost)
result.resultCode = MAX_COST_EXCEEDED;
else if (result.paths.empty())
result.resultCode = NO_PATH;
else
return FOUND_PATH;
result.resultCode = FOUND_PATH;
return result;
}
template <class IncidenceGraph>
AllPathsSearchResult<typename boost::graph_traits<IncidenceGraph>::vertex_descriptor>
allPathsSearch(
const IncidenceGraph& g,
typename boost::graph_traits<IncidenceGraph>::vertex_descriptor start,
typename boost::graph_traits<IncidenceGraph>::vertex_descriptor goal)
{
return allPathsSearch(g, start, goal, NO_LIMIT, NO_LIMIT, NO_LIMIT, NO_LIMIT);
}
#endif