Skip to content
Commits on Source (4)
......@@ -7,7 +7,7 @@ aliases:
apt-get install -qq software-properties-common
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update -qq
apt-get install -qq autoconf automake gcc g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
apt-get install -qq autoconf automake gcc g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc
jobs:
......@@ -22,9 +22,7 @@ jobs:
- run: |
./autogen.sh
./configure CC=clang-6.0 CXX=clang++-6.0 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
- run: make -j12 distcheck
linux_gcc5:
docker:
......@@ -37,9 +35,7 @@ jobs:
- run: |
./autogen.sh
./configure CC=gcc-5 CXX=g++-5 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
- run: make -j12 distcheck
linux_gcc6:
docker:
......@@ -52,9 +48,7 @@ jobs:
- run: |
./autogen.sh
./configure CC=gcc-6 CXX=g++-6 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
- run: make -j12 distcheck
linux_gcc7:
docker:
......@@ -67,9 +61,7 @@ jobs:
- run: |
./autogen.sh
./configure CC=gcc-7 CXX=g++-7 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
- run: make -j12 distcheck
linux_gcc8:
docker:
......@@ -82,25 +74,34 @@ jobs:
- run: |
./autogen.sh
./configure CC=gcc-8 CXX=g++-8 --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
- run: make -j12 distcheck
clang-format:
docker:
- image: *docker_image
steps:
- run: *install_common
- run: |
apt-get install --yes --force-yes sudo
sudo apt-get install --yes --force-yes curl
curl https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-8 main"
sudo apt-get update
sudo apt-get install -y --no-install-recommends clang-format-8
sudo ln -s clang-format-8 /usr/bin/clang-format
- checkout
- run: |
./autogen.sh
./configure
- run: make clang-format
workflows:
version: 2
# test compiler versions in priority order and abort on first failure
abyss_builds:
jobs:
- linux_gcc8
- linux_gcc7:
requires:
- linux_gcc8
- linux_gcc6:
requires:
- linux_gcc7
- linux_gcc5:
requires:
- linux_gcc6
- linux_clang6:
requires:
- linux_gcc5
- linux_clang6
- clang-format
BasedOnStyle: Mozilla
AlignAfterOpenBracket: AlwaysBreak
AlignOperands: true
ColumnLimit: 100
ContinuationIndentWidth: 4
IndentCaseLabels: false
IndentWidth: 4
TabWidth: 4
UseTab: ForIndentation
......@@ -17,6 +17,9 @@ using namespace std;
#define PROGRAM "ABYSS"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
namespace opt {
static const char VERSION_MESSAGE[] =
......@@ -48,7 +51,7 @@ static const char USAGE_MESSAGE[] =
" --SS assemble in strand-specific mode\n"
" --no-SS do not assemble in strand-specific mode\n"
" -o, --out=FILE write the contigs to FILE\n"
" -k, --kmer=N the length of a k-mer (when -K is not set)\n"
" -k, --kmer=N the length of a k-mer (when -K is not set) [<=" STR(MAX_KMER) "]\n"
" or the span of a k-mer pair (when -K is set)\n"
" -K, --single-kmer=N the length of a single k-mer in a k-mer pair\n"
" -t, --trim-length=N maximum length of blunt contigs to trim [k]\n"
......
......@@ -46,28 +46,28 @@ namespace Bloom {
/** Print a progress message after loading this many seqs */
static const unsigned LOAD_PROGRESS_STEP = 100000;
/** file format version number */
static const unsigned BLOOM_VERSION = 4;
static const unsigned BLOOM_VERSION = 5;
/** Return the hash value of this object. */
inline static size_t hash(const key_type& key)
{
if (key.isCanonical())
return hashmem(&key, sizeof key);
return hashmem(&key, key.bytes());
key_type copy(key);
copy.reverseComplement();
return hashmem(&copy, sizeof copy, 0);
return hashmem(&copy, copy.bytes(), 0);
}
/** Return the hash value of this object given seed. */
inline static size_t hash(const key_type& key, size_t seed)
{
if (key.isCanonical())
return hashmem(&key, sizeof key, seed);
return hashmem(&key, key.bytes(), seed);
key_type copy(key);
copy.reverseComplement();
return hashmem(&copy, sizeof copy, seed);
return hashmem(&copy, copy.bytes(), seed);
}
template <typename BF>
......@@ -190,17 +190,6 @@ namespace Bloom {
return header;
}
//TODO: Bloom filter calculation methods
//static double calcApproxFPR(size_t numBucket, size_t numEntr,
// unsigned numHash) const;
//static double calcRedunancyFPR(size_t numBucket, size_t numEntr,
// unsigned numHash) const;
//static size_t calcOptimalSize(size_t numEle, double fpr) const;
//static size_t calcOptimalSize(size_t numEle, double fpr,
// unsigned numHash) const;
//static unsigned calcOptimalNumHash(double fpr);
//static unsigned calcOptimalNumHash(size_t numBucket, size_t numEle);
};
#endif /* BLOOM_H_ */
......@@ -5,7 +5,7 @@
#ifndef HASH_AGNOSTIC_CASCADING_BLOOM_H
#define HASH_AGNOSTIC_CASCADING_BLOOM_H 1
#include "lib/bloomfilter/BloomFilter.hpp"
#include "vendor/btl_bloomfilter/BloomFilter.hpp"
#include <vector>
/**
......@@ -26,9 +26,11 @@
class HashAgnosticCascadingBloom
{
public:
/** Default constructor */
HashAgnosticCascadingBloom() : m_k(0), m_hashes(0) {}
HashAgnosticCascadingBloom()
: m_k(0)
, m_hashes(0)
{}
/**
* Constructor.
......@@ -37,8 +39,9 @@ class HashAgnosticCascadingBloom
* @param levels number of levels in Cascading Bloom filter
* @param k k-mer size
*/
HashAgnosticCascadingBloom(size_t size, unsigned hashes,
size_t levels, unsigned k) : m_k(k), m_hashes(hashes)
HashAgnosticCascadingBloom(size_t size, unsigned hashes, size_t levels, unsigned k)
: m_k(k)
, m_hashes(hashes)
{
m_data.reserve(levels);
for (unsigned i = 0; i < levels; i++)
......@@ -50,16 +53,10 @@ class HashAgnosticCascadingBloom
* files. This is used to make BloomFilter support the
* same interface as HashAgnosticCascadingBloom.
*/
HashAgnosticCascadingBloom(const string& bloomPath)
{
loadFilter(bloomPath);
}
HashAgnosticCascadingBloom(const string& bloomPath) { loadFilter(bloomPath); }
/** Destructor */
~HashAgnosticCascadingBloom()
{
clear();
}
~HashAgnosticCascadingBloom() { clear(); }
/** Return k-mer size used by Bloom filter. */
unsigned getKmerSize() const { return m_k; }
......@@ -74,6 +71,9 @@ class HashAgnosticCascadingBloom
return m_data.back()->getFilterSize();
}
/** Return the size of the bit array. */
size_t getFilterSize() const { return size(); }
/** Return the number of elements with count >= levels. */
size_t popcount() const
{
......@@ -82,16 +82,10 @@ class HashAgnosticCascadingBloom
}
/** Return number of levels in cascading Bloom filter */
unsigned levels() const
{
return m_data.size();
}
unsigned levels() const { return m_data.size(); }
/** Return the estimated false positive rate */
double FPR() const
{
return pow((double)popcount()/size(), m_hashes);
}
double FPR() const { return pow((double)popcount() / size(), m_hashes); }
/**
* Return true if the element with the given hash values
......@@ -145,8 +139,7 @@ class HashAgnosticCascadingBloom
}
/** Operator for writing the Bloom filter to a stream */
friend std::ostream& operator<<(std::ostream& out,
const HashAgnosticCascadingBloom& o)
friend std::ostream& operator<<(std::ostream& out, const HashAgnosticCascadingBloom& o)
{
assert(o.m_data.size() > 0);
assert(o.m_data.back() != NULL);
......@@ -166,7 +159,6 @@ class HashAgnosticCascadingBloom
}
private:
/** Free all allocated memory and reset parameters to defaults */
void clear()
{
......
......@@ -2,7 +2,8 @@ bin_PROGRAMS = abyss-bloom
abyss_bloom_CPPFLAGS = -I$(top_srcdir) \
-I$(top_srcdir)/Common \
-I$(top_srcdir)/DataLayer
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/vendor
abyss_bloom_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
......@@ -18,4 +19,5 @@ abyss_bloom_SOURCES = bloom.cc \
ConcurrentBloomFilter.h \
CascadingBloomFilter.h \
CascadingBloomFilterWindow.h \
RollingBloomDBGVisitor.h
RollingBloomDBGVisitor.h \
HashAgnosticCascadingBloom.h
......@@ -2,7 +2,9 @@
#define _ROLLING_BLOOM_DBG_VISITOR_H_ 1
#include "Common/UnorderedMap.h"
#include "Common/UnorderedSet.h"
#include "Graph/BreadthFirstSearch.h"
#include "vendor/btl_bloomfilter/BloomFilter.hpp"
#include <iostream>
......@@ -15,7 +17,6 @@ template <typename GraphT>
class RollingBloomDBGVisitor : public DefaultBFSVisitor<GraphT>
{
public:
typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
......@@ -23,20 +24,33 @@ public:
typedef typename DepthMap::const_iterator DepthMapConstIt;
typedef typename DepthMap::iterator DepthMapIt;
typedef std::vector<std::pair<std::string, unordered_set<VertexT> > >
KmerProperties;
typedef std::vector<std::pair<std::string, unordered_set<VertexT>>> KmerProperties;
typedef typename KmerProperties::const_iterator KmerPropertiesIt;
typedef std::vector<std::pair<std::string, BloomFilter*>> BloomProperties;
typedef typename BloomProperties::const_iterator BloomPropertiesIt;
/** Constructor */
RollingBloomDBGVisitor(const VertexT& root, unsigned maxDepth,
const KmerProperties& kmerProperties, std::ostream& out)
: m_root(root), m_maxDepth(maxDepth),
m_kmerProperties(kmerProperties), m_out(out)
template<typename VertexSetT>
RollingBloomDBGVisitor(
const VertexSetT& roots,
unsigned maxDepth,
const KmerProperties& kmerProperties,
const BloomProperties& bloomProperties,
std::ostream& out)
: m_maxDepth(maxDepth)
, m_kmerProperties(kmerProperties)
, m_bloomProperties(bloomProperties)
, m_out(out)
{
m_depthMap[root] = 0;
typedef typename VertexSetT::const_iterator RootIt;
for (RootIt it = roots.begin(); it != roots.end(); ++it)
m_depthMap[*it] = 0;
/* start directed graph (GraphViz) */
m_out << "digraph " << root.kmer().c_str() << " {\n";
m_out << "digraph "
<< " {\n";
}
/** Called when graph search completes */
......@@ -64,8 +78,7 @@ public:
DepthMapIt childIt;
bool inserted;
boost::tie(childIt, inserted) = m_depthMap.insert(
std::make_pair(child, parentDepth + 1));
boost::tie(childIt, inserted) = m_depthMap.insert(std::make_pair(child, parentDepth + 1));
assert(inserted);
/*
......@@ -92,11 +105,20 @@ public:
m_out << "depth=" << depth;
for (KmerPropertiesIt it = m_kmerProperties.begin();
it != m_kmerProperties.end(); ++it) {
for (KmerPropertiesIt it = m_kmerProperties.begin(); it != m_kmerProperties.end(); ++it) {
if (it->second.find(v) != it->second.end())
m_out << "," << it->first;
}
size_t hashes[MAX_HASHES];
v.rollingHash().getHashes(hashes);
for (BloomPropertiesIt it = m_bloomProperties.begin(); it != m_bloomProperties.end();
++it) {
if (it->second->contains(hashes))
m_out << "," << it->first;
}
m_out << "];\n";
return BFS_SUCCESS;
......@@ -126,8 +148,7 @@ public:
/** Return if the current edge is a forward edge */
bool isForwardEdge(const EdgeT& e, const GraphT& g)
{
assert(source(e, g) == m_currentVertex
|| target(e, g) == m_currentVertex);
assert(source(e, g) == m_currentVertex || target(e, g) == m_currentVertex);
return source(e, g) == m_currentVertex;
}
......@@ -138,20 +159,17 @@ public:
const VertexT& v = target(e, g);
/* declare edge (GraphViz) */
m_out << '\t' << u.kmer().c_str() << " -> "
<< v.kmer().c_str() << ";\n";
m_out << '\t' << u.kmer().c_str() << " -> " << v.kmer().c_str() << ";\n";
}
protected:
VertexT m_currentVertex;
DepthMap m_depthMap;
const VertexT& m_root;
const unsigned m_maxDepth;
const KmerProperties& m_kmerProperties;
const BloomProperties& m_bloomProperties;
std::ostream& m_out;
};
#endif
This diff is collapsed.
#ifndef BLOOM_IO_H
#define BLOOM_IO_H 1
#include "BloomDBG/RollingHashIterator.h"
#include "BloomDBG/RollingHash.h"
#include "lib/bloomfilter/BloomFilter.hpp"
#include "BloomDBG/RollingHashIterator.h"
#include "DataLayer/FastaReader.h"
#include "vendor/btl_bloomfilter/BloomFilter.hpp"
namespace BloomDBG {
......@@ -11,7 +12,8 @@ namespace BloomDBG {
* Round up `num` to the nearest multiple of `base`.
*/
template<typename T>
inline static T roundUpToMultiple(T num, T base)
inline static T
roundUpToMultiple(T num, T base)
{
if (base == 0)
return num;
......@@ -28,12 +30,12 @@ namespace BloomDBG {
* @param seq DNA sequence
*/
template<typename BF>
inline static void loadSeq(BF& bloom, const std::string& seq)
inline static void
loadSeq(BF& bloom, const std::string& seq)
{
const unsigned k = bloom.getKmerSize();
const unsigned numHashes = bloom.getHashNum();
for (RollingHashIterator it(seq, numHashes, k);
it != RollingHashIterator::end(); ++it) {
for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); ++it) {
bloom.insert(*it);
}
}
......@@ -46,8 +48,8 @@ namespace BloomDBG {
* @param verbose if true, print progress messages to STDERR
*/
template<typename BF>
inline static void loadFile(BF& bloom, const std::string& path,
bool verbose = false)
inline static void
loadFile(BF& bloom, const std::string& path, bool verbose = false)
{
const size_t BUFFER_SIZE = 1000000;
const size_t LOAD_PROGRESS_STEP = 10000;
......@@ -81,22 +83,20 @@ namespace BloomDBG {
{
readCount++;
if (readCount % LOAD_PROGRESS_STEP == 0)
std::cerr << "Loaded " << readCount
<< " reads into Bloom filter\n";
std::cerr << "Loaded " << readCount << " reads into Bloom filter\n";
}
}
}
assert(in.eof());
if (verbose) {
std::cerr << "Loaded " << readCount << " reads from `"
<< path << "` into Bloom filter\n";
std::cerr << "Loaded " << readCount << " reads from `" << path << "` into Bloom filter\n";
}
}
/** Load FASTQ/FASTA/SAM/BAM files from command line into a Bloom filter */
template<typename BloomFilterT>
static inline void loadBloomFilter(int argc, char** argv,
BloomFilterT& bloom, bool verbose=false)
static inline void
loadBloomFilter(int argc, char** argv, BloomFilterT& bloom, bool verbose = false)
{
/* load reads into Bloom filter */
for (int i = optind; i < argc; ++i) {
......@@ -114,8 +114,7 @@ namespace BloomDBG {
BloomDBG::loadFile(bloom, argv[i], verbose);
}
if (verbose)
cerr << "Bloom filter FPR: " << setprecision(3)
<< bloom.FPR() * 100 << "%" << endl;
cerr << "Bloom filter FPR: " << setprecision(3) << bloom.FPR() * 100 << "%" << endl;
}
} // end namespace 'BloomDBG'
......
......@@ -5,12 +5,12 @@
#include "BloomDBG/AssemblyParams.h"
#include "BloomDBG/AssemblyStreams.h"
#include "Common/IOUtil.h"
#include "DataLayer/FastaReader.h"
#include <cstdio>
#include <iostream>
#include <fstream>
#include <iostream>
namespace BloomDBG
{
namespace BloomDBG {
const std::string CHECKPOINT_FASTA_EXT = ".contigs.fa";
const std::string CHECKPOINT_COUNTERS_EXT = ".counters.tsv";
const std::string CHECKPOINT_BLOOM_DBG_EXT = ".dbg.bloom";
......@@ -29,7 +29,9 @@ namespace BloomDBG
* command line opts)
*/
template<typename BloomDBGT, typename VisitedKmerSetT>
static inline void createCheckpoint(const BloomDBGT& dbg,
static inline void
createCheckpoint(
const BloomDBGT& dbg,
const VisitedKmerSetT& visitedKmerSet,
const AssemblyCounters& counters,
const AssemblyParams& params)
......@@ -47,8 +49,8 @@ namespace BloomDBG
std::string dbgPathTmp = dbgPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing Bloom filter de Bruijn graph to `"
<< dbgPathTmp << "'" << std::endl;
std::cerr << '\t' << "Writing Bloom filter de Bruijn graph to `" << dbgPathTmp << "'"
<< std::endl;
std::ofstream dbgOut;
dbgOut.open(dbgPathTmp.c_str());
......@@ -62,8 +64,8 @@ namespace BloomDBG
std::string visitedPathTmp = visitedPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing visited k-mers Bloom to `"
<< visitedPathTmp << "'" << std::endl;
std::cerr << '\t' << "Writing visited k-mers Bloom to `" << visitedPathTmp << "'"
<< std::endl;
std::ofstream visitedOut;
visitedOut.open(visitedPathTmp.c_str());
......@@ -77,8 +79,8 @@ namespace BloomDBG
std::string countersPathTmp = countersPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing assembly counters to `"
<< countersPathTmp << "'" << std::endl;
std::cerr << '\t' << "Writing assembly counters to `" << countersPathTmp << "'"
<< std::endl;
std::ofstream countersOut;
countersOut.open(countersPathTmp.c_str());
......@@ -92,14 +94,13 @@ namespace BloomDBG
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Copying `" << fastaPathTmp
<< " to `" << fastaPath << "'" << std::endl;
std::cerr << '\t' << "Copying `" << fastaPathTmp << " to `" << fastaPath << "'"
<< std::endl;
copyFile(fastaPathTmp, fastaPath);
if (params.verbose)
std::cerr << '\t' << "Moving `" << dbgPathTmp
<< "' to `" << dbgPath << "'" << std::endl;
std::cerr << '\t' << "Moving `" << dbgPathTmp << "' to `" << dbgPath << "'" << std::endl;
if (rename(dbgPathTmp.c_str(), dbgPath.c_str()) != 0) {
perror("Error renaming file");
......@@ -107,8 +108,8 @@ namespace BloomDBG
}
if (params.verbose)
std::cerr << '\t' << "Moving `" << visitedPathTmp
<< "' to `" << visitedPath << "'" << std::endl;
std::cerr << '\t' << "Moving `" << visitedPathTmp << "' to `" << visitedPath << "'"
<< std::endl;
if (rename(visitedPathTmp.c_str(), visitedPath.c_str()) != 0) {
perror("Error renaming file");
......@@ -116,8 +117,8 @@ namespace BloomDBG
}
if (params.verbose)
std::cerr << '\t' << "Moving `" << countersPathTmp
<< "' to `" << countersPath << "'" << std::endl;
std::cerr << '\t' << "Moving `" << countersPathTmp << "' to `" << countersPath << "'"
<< std::endl;
if (rename(countersPathTmp.c_str(), countersPath.c_str()) != 0) {
perror("Error renaming file");
......@@ -126,7 +127,8 @@ namespace BloomDBG
}
/** Return true if checkpoint files exist and are readable */
static inline bool checkpointExists(const AssemblyParams& params)
static inline bool
checkpointExists(const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
......@@ -137,10 +139,8 @@ namespace BloomDBG
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
return std::ifstream(fastaPath.c_str()).good()
&& std::ifstream(dbgPath.c_str()).good()
&& std::ifstream(visitedPath.c_str()).good()
&& std::ifstream(countersPath.c_str()).good();
return std::ifstream(fastaPath.c_str()).good() && std::ifstream(dbgPath.c_str()).good() &&
std::ifstream(visitedPath.c_str()).good() && std::ifstream(countersPath.c_str()).good();
}
/**
......@@ -154,11 +154,13 @@ namespace BloomDBG
* @param out main output stream for assembled contigs
* command line opts)
*/
template <typename BloomDBGT, typename VisitedKmerSetT,
typename InputReadStreamT>
static inline void resumeFromCheckpoint(
BloomDBGT& dbg, VisitedKmerSetT& visitedKmerSet,
AssemblyCounters& counters, const AssemblyParams& params,
template<typename BloomDBGT, typename VisitedKmerSetT, typename InputReadStreamT>
static inline void
resumeFromCheckpoint(
BloomDBGT& dbg,
VisitedKmerSetT& visitedKmerSet,
AssemblyCounters& counters,
const AssemblyParams& params,
AssemblyStreams<InputReadStreamT>& streams)
{
assert(checkpointExists(params));
......@@ -173,24 +175,24 @@ namespace BloomDBG
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading Bloom filter de Bruijn graph from `"
<< dbgPath << "'" << std::endl;
std::cerr << '\t' << "Reading Bloom filter de Bruijn graph from `" << dbgPath << "'"
<< std::endl;
dbg.loadFilter(dbgPath);
/* load visited k-mers Bloom filter */
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading reading visited k-mers Bloom from `"
<< visitedPath << "'" << std::endl;
std::cerr << '\t' << "Reading reading visited k-mers Bloom from `" << visitedPath << "'"
<< std::endl;
visitedKmerSet.loadFilter(visitedPath);
/* load index for next input read */
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading index of next input read from `"
<< countersPath << "'" << std::endl;
std::cerr << '\t' << "Reading index of next input read from `" << countersPath << "'"
<< std::endl;
std::ifstream countersIn(countersPath.c_str());
assert_good(countersIn, countersPath);
countersIn >> counters;
......@@ -199,8 +201,8 @@ namespace BloomDBG
/* advance to previous position in input reads */
if (params.verbose)
std::cerr << '\t' << "Advancing to read index "
<< counters.readsProcessed << " in input reads..." << std::endl;
std::cerr << '\t' << "Advancing to read index " << counters.readsProcessed
<< " in input reads..." << std::endl;
FastaRecord rec;
for (size_t i = 0; i < counters.readsProcessed && streams.in; ++i)
......@@ -212,8 +214,8 @@ namespace BloomDBG
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Copying `" << fastaPath
<< "' to `" << fastaPathTmp << "'" << std::endl;
std::cerr << '\t' << "Copying `" << fastaPath << "' to `" << fastaPathTmp << "'"
<< std::endl;
copyFile(fastaPath, fastaPathTmp);
if (params.verbose)
......@@ -226,7 +228,8 @@ namespace BloomDBG
}
/** Delete a file if it exists */
static inline void removeFileIfExists(const std::string& path)
static inline void
removeFileIfExists(const std::string& path)
{
if (std::ifstream(path.c_str()).good()) {
if (remove(path.c_str()) != 0) {
......@@ -237,7 +240,8 @@ namespace BloomDBG
}
/** Remove all checkpoint-related files */
static inline void removeCheckpointData(const AssemblyParams& params)
static inline void
removeCheckpointData(const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
......
......@@ -2,7 +2,8 @@ bin_PROGRAMS = abyss-bloom-dbg
abyss_bloom_dbg_CPPFLAGS = -I$(top_srcdir) \
-I$(top_srcdir)/Common \
-I$(top_srcdir)/DataLayer
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/vendor
abyss_bloom_dbg_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
......@@ -18,12 +19,9 @@ abyss_bloom_dbg_SOURCES = \
bloom-dbg.h \
BloomIO.h \
Checkpoint.h \
HashAgnosticCascadingBloom.h \
LightweightKmer.h \
MaskedKmer.h \
RollingBloomDBG.h \
RollingHash.h \
RollingHashIterator.h \
SpacedSeed.h \
$(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \
$(top_srcdir)/lib/nthash/nthash.hpp
SpacedSeed.h
......@@ -12,7 +12,7 @@
#include "Graph/Properties.h"
#include "BloomDBG/RollingHash.h"
#include "BloomDBG/LightweightKmer.h"
#include "lib/bloomfilter/BloomFilter.hpp"
#include "vendor/btl_bloomfilter/BloomFilter.hpp"
#include <algorithm>
#include <cassert>
......
......@@ -6,7 +6,7 @@
#include "BloomDBG/LightweightKmer.h"
#include "BloomDBG/MaskedKmer.h"
#include "Common/Sense.h"
#include "lib/nthash/nthash.hpp"
#include "vendor/nthash/nthash.hpp"
#include <algorithm>
#include <string>
......
#include "config.h"
#include "BloomDBG/bloom-dbg.h"
#include "BloomDBG/AssemblyCounters.h"
#include "BloomDBG/AssemblyParams.h"
#include "BloomDBG/Checkpoint.h"
#include "BloomDBG/HashAgnosticCascadingBloom.h"
#include "BloomDBG/MaskedKmer.h"
#include "BloomDBG/SpacedSeed.h"
#include "Common/StringUtil.h"
#include "BloomDBG/bloom-dbg.h"
#include "Common/Options.h"
#include "Common/StringUtil.h"
#include "DataLayer/Options.h"
#include "lib/bloomfilter/BloomFilter.hpp"
#include "vendor/btl_bloomfilter/CountingBloomFilter.hpp"
#include <getopt.h>
#include <iostream>
#include <sstream>
#include <cstdlib>
#include <iomanip>
#include <cstring>
#include <getopt.h>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#if _OPENMP
#include <omp.h>
#endif
typedef uint8_t BloomCounterType;
typedef CountingBloomFilter<BloomCounterType> CountingBloomFilterType;
using namespace std;
#define PROGRAM "abyss-bloom-dbg"
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
......@@ -57,9 +61,10 @@ static const char USAGE_MESSAGE[] =
" --trim-masked trim masked bases from the ends of reads\n"
" --no-trim-masked do not trim masked bases from the ends\n"
" of reads [default]\n"
" -k, --kmer=N the size of a k-mer [required]\n"
" --kc=N use a cascading Bloom filter with N levels,\n"
" instead of a counting Bloom filter [2]\n"
" -k, --kmer=N the size of a k-mer [<=" STR(
MAX_KMER) "]\n"
" --kc=N ignore k-mers having a count < N,\n"
" using a counting Bloom filter [2]\n"
" -o, --out=FILE write the contigs to FILE [STDOUT]\n"
" -q, --trim-quality=N trim bases from the ends of reads whose\n"
" quality is less than the threshold\n"
......@@ -122,9 +127,16 @@ BloomDBG::AssemblyParams params;
static const char shortopts[] = "b:C:g:H:i:j:k:K:o:q:Q:R:s:t:T:v";
enum {
OPT_HELP = 1, OPT_VERSION, QR_SEED, MIN_KMER_COV,
CHECKPOINT, KEEP_CHECKPOINT, CHECKPOINT_PREFIX, READ_LOG,
enum
{
OPT_HELP = 1,
OPT_VERSION,
QR_SEED,
MIN_KMER_COV,
CHECKPOINT,
KEEP_CHECKPOINT,
CHECKPOINT_PREFIX,
READ_LOG,
};
static const struct option longopts[] = {
......@@ -162,22 +174,26 @@ static const struct option longopts[] = {
{ NULL, 0, NULL, 0 }
};
void printCascadingBloomStats(HashAgnosticCascadingBloom& bloom, ostream& os)
template<typename T>
void
printCountingBloomStats(T& bloom, ostream& os)
{
for (unsigned i = 0; i < bloom.levels(); i++) {
os << "Stats for Bloom filter level " << i+1 << ":\n"
<< "\tBloom size (bits): "
<< bloom.getBloomFilter(i).getFilterSize() << "\n"
<< "\tBloom popcount (bits): "
<< bloom.getBloomFilter(i).getPop() << "\n"
<< "\tBloom filter FPR: " << setprecision(3)
<< 100 * bloom.getBloomFilter(i).getFPR() << "%\n";
}
os << "Counting Bloom filter stats:"
<< "\n\t#counters = " << bloom.size()
<< "\n\t#size (B) = " << bloom.sizeInBytes()
<< "\n\tthreshold = " << bloom.threshold()
<< "\n\tpopcount = " << bloom.filtered_popcount()
<< "\n\tFPR = " << setprecision(3) << 100.f * bloom.filtered_FPR() << "%"
<< "\n";
}
/** Create optional auxiliary output files */
template<typename BloomFilterT>
void writeAuxiliaryFiles(int argc, char** argv, const BloomFilterT& bloom,
void
writeAuxiliaryFiles(
int argc,
char** argv,
const BloomFilterT& bloom,
const BloomDBG::AssemblyParams& params)
{
/* generate wiggle coverage track */
......@@ -196,7 +212,8 @@ void writeAuxiliaryFiles(int argc, char** argv, const BloomFilterT& bloom,
}
/** Initialize global variables for k-mer size and spaced seed pattern */
void initGlobals(const BloomDBG::AssemblyParams& params)
void
initGlobals(const BloomDBG::AssemblyParams& params)
{
/* set global variable for k-mer length */
MaskedKmer::setLength(params.k);
......@@ -218,8 +235,8 @@ void initGlobals(const BloomDBG::AssemblyParams& params)
/**
* Resume assembly from previously saved checkpoint.
*/
void resumeAssemblyFromCheckpoint(int argc, char** argv,
BloomDBG::AssemblyParams& params, ostream& out)
void
resumeAssemblyFromCheckpoint(int argc, char** argv, BloomDBG::AssemblyParams& params, ostream& out)
{
assert(params.checkpointsEnabled() && checkpointExists(params));
......@@ -227,7 +244,7 @@ void resumeAssemblyFromCheckpoint(int argc, char** argv,
initGlobals(params);
/* empty Bloom filter de Bruijn graph */
HashAgnosticCascadingBloom solidKmerSet;
CountingBloomFilterType solidKmerSet;
/* empty visited k-mers Bloom filter */
BloomFilter visitedKmerSet;
......@@ -267,16 +284,13 @@ void resumeAssemblyFromCheckpoint(int argc, char** argv,
}
/* bundle input/output streams for assembly */
BloomDBG::AssemblyStreams<FastaConcat>
streams(in, out, checkpointOut, traceOut, readLogOut);
BloomDBG::AssemblyStreams<FastaConcat> streams(in, out, checkpointOut, traceOut, readLogOut);
/* restore state of Bloom filters, counters, and input/output streams */
BloomDBG::resumeFromCheckpoint(solidKmerSet, visitedKmerSet,
counters, params, streams);
BloomDBG::resumeFromCheckpoint(solidKmerSet, visitedKmerSet, counters, params, streams);
/* resume the assembly */
BloomDBG::assemble(solidKmerSet, visitedKmerSet, counters, params,
streams);
BloomDBG::assemble(solidKmerSet, visitedKmerSet, counters, params, streams);
}
/**
......@@ -284,31 +298,28 @@ void resumeAssemblyFromCheckpoint(int argc, char** argv,
* from file (`-i` option). (The input Bloom filter file
* is constructed using `abyss-bloom build -t rolling-hash`.)
*/
void prebuiltBloomAssembly(int argc, char** argv,
BloomDBG::AssemblyParams& params, ostream& out)
void
prebuiltBloomAssembly(int argc, char** argv, BloomDBG::AssemblyParams& params, ostream& out)
{
/* load prebuilt Bloom filter from file */
assert(!params.bloomPath.empty());
if (params.verbose)
cerr << "Loading prebuilt Bloom filter from `" << params.bloomPath
<< "'" << endl;
cerr << "Loading prebuilt Bloom filter from `" << params.bloomPath << "'" << endl;
/* load the Bloom filter from file */
HashAgnosticCascadingBloom bloom(params.bloomPath);
CountingBloomFilterType bloom(params.bloomPath, params.minCov);
if (params.verbose)
cerr << "Bloom filter FPR: " << setprecision(3)
<< bloom.FPR() * 100 << "%" << endl;
cerr << "Bloom filter FPR: " << setprecision(3) << bloom.FPR() * 100 << "%" << endl;
printCascadingBloomStats(bloom, cerr);
printCountingBloomStats(bloom, cerr);
/* override command line options with values from Bloom file */
params.k = bloom.getKmerSize();
params.numHashes = bloom.getHashNum();
params.bloomSize = bloom.size() + 7 / 8;
params.bloomSize = bloom.sizeInBytes();
if (params.trim == std::numeric_limits<unsigned>::max())
params.trim = params.k;
......@@ -324,8 +335,7 @@ void prebuiltBloomAssembly(int argc, char** argv,
/* do assembly */
BloomDBG::assemble(argc - optind, argv + optind,
bloom, params, out);
BloomDBG::assemble(argc - optind, argv + optind, bloom, params, out);
/* write supplementary files (e.g. GraphViz) */
......@@ -333,10 +343,10 @@ void prebuiltBloomAssembly(int argc, char** argv,
}
/**
* Load the reads into a cascading Bloom filter and do the assembly.
* Load the reads into a counting Bloom filter and do the assembly.
*/
void cascadingBloomAssembly(int argc, char** argv,
const BloomDBG::AssemblyParams& params, ostream& out)
void
countingBloomAssembly(int argc, char** argv, const BloomDBG::AssemblyParams& params, ostream& out)
{
/* init global vars for k-mer size and spaced seed pattern */
......@@ -346,93 +356,99 @@ void cascadingBloomAssembly(int argc, char** argv,
if (params.verbose)
cerr << params;
/*
* Build/load cascading Bloom filetr.
*
* Note 1: We use (params.minCov + 1) here because we use an additional
* Bloom filter in BloomDBG::assemble() to track the set of
* assembled k-mers.
*
* Note 2: BloomFilter class requires size to be a multiple of 64.
*/
const size_t bitsPerByte = 8;
size_t bloomLevelSize = BloomDBG::roundUpToMultiple(
params.bloomSize * bitsPerByte / (params.minCov + 1), (size_t)64);
/* Initialize a counting Bloom filter:
Divide the requested memory in bytes by the byte-size of each counter to determine the number
of counters, and then round up that count to the next multiple of 64.*/
HashAgnosticCascadingBloom cascadingBloom(
bloomLevelSize, params.numHashes, params.minCov, params.k);
size_t counters =
BloomDBG::roundUpToMultiple(params.bloomSize / sizeof(BloomCounterType), (size_t)64);
BloomDBG::loadBloomFilter(argc, argv, cascadingBloom, params.verbose);
CountingBloomFilterType bloom(counters, params.numHashes, params.k, params.minCov);
BloomDBG::loadBloomFilter(argc, argv, bloom, params.verbose);
if (params.verbose)
printCascadingBloomStats(cascadingBloom, cerr);
printCountingBloomStats(bloom, cerr);
/* second pass through FASTA files for assembling */
BloomDBG::assemble(argc - optind, argv + optind,
cascadingBloom, params, out);
BloomDBG::assemble(argc - optind, argv + optind, bloom, params, out);
/* write supplementary files (e.g. GraphViz) */
writeAuxiliaryFiles(argc - optind, argv + optind, cascadingBloom, params);
writeAuxiliaryFiles(argc - optind, argv + optind, bloom, params);
}
/**
* Create a de novo genome assembly using a Bloom filter de
* Bruijn graph.
*/
int main(int argc, char** argv)
int
main(int argc, char** argv)
{
bool die = false;
for (int c; (c = getopt_long(argc, argv,
shortopts, longopts, NULL)) != -1;) {
for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case '?':
die = true; break;
die = true;
break;
case 'b':
params.bloomSize = SIToBytes(arg); break;
params.bloomSize = SIToBytes(arg);
break;
case 'C':
arg >> params.covTrackPath; break;
arg >> params.covTrackPath;
break;
case 'g':
arg >> params.graphPath; break;
arg >> params.graphPath;
break;
case 'H':
arg >> params.numHashes; break;
arg >> params.numHashes;
break;
case 'i':
arg >> params.bloomPath; break;
arg >> params.bloomPath;
break;
case 'j':
arg >> params.threads; break;
arg >> params.threads;
break;
case 'k':
arg >> params.k; break;
arg >> params.k;
break;
case 'K':
params.resetSpacedSeedParams();
arg >> params.K;
break;
case 'o':
arg >> params.outputPath; break;
arg >> params.outputPath;
break;
case 'q':
arg >> opt::qualityThreshold; break;
arg >> opt::qualityThreshold;
break;
case 'R':
arg >> params.refPath; break;
arg >> params.refPath;
break;
case 's':
params.resetSpacedSeedParams();
arg >> params.spacedSeed;
break;
case 't':
arg >> params.trim; break;
arg >> params.trim;
break;
case 'T':
arg >> params.tracePath; break;
arg >> params.tracePath;
break;
case 'Q':
arg >> opt::internalQThreshold; break;
arg >> opt::internalQThreshold;
break;
case 'v':
++params.verbose; break;
++params.verbose;
break;
case OPT_HELP:
cout << USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case MIN_KMER_COV:
arg >> params.minCov; break;
arg >> params.minCov;
break;
case OPT_VERSION:
cout << VERSION_MESSAGE;
exit(EXIT_SUCCESS);
......@@ -455,8 +471,7 @@ int main(int argc, char** argv)
}
if (optarg != NULL && (!arg.eof() || arg.fail())) {
cerr << PROGRAM ": invalid option: `-"
<< (char)c << optarg << "'\n";
cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
exit(EXIT_FAILURE);
}
}
......@@ -478,7 +493,9 @@ int main(int argc, char** argv)
if (params.numHashes > MAX_HASHES) {
cerr << PROGRAM ": number of hash functions (`-H`) must "
"be <= " << MAX_HASHES << " (set by `configure` option "
"be <= "
<< MAX_HASHES
<< " (set by `configure` option "
"--enable-max-hashes=N)\n";
die = true;
}
......@@ -505,8 +522,7 @@ int main(int argc, char** argv)
}
if (die) {
cerr << "Try `" << PROGRAM
<< " --help' for more information.\n";
cerr << "Try `" << PROGRAM << " --help' for more information.\n";
exit(EXIT_FAILURE);
}
......@@ -529,7 +545,7 @@ int main(int argc, char** argv)
else if (!params.bloomPath.empty())
prebuiltBloomAssembly(argc, argv, params, out);
else
cascadingBloomAssembly(argc, argv, params, out);
countingBloomAssembly(argc, argv, params, out);
/* cleanup */
if (!params.outputPath.empty())
......
This diff is collapsed.
......@@ -27,11 +27,11 @@ Files: Layout/*
Copyright: Copyright 2012 Shaun Jackman
License: GPL-3
Files: lib/bloomfilter/*
Files: vendor/btl_bloomfilter/*
Copyright: Copyright 2016 Justin Chu
License: GPL-3
Files: lib/rolling-hash/*
Files: vendor/nthash/*
Copyright: Copyright 2016 Hamid Mohamadi
License: GPL-3
......@@ -128,7 +128,7 @@ License: Expat
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
Files: lib/gtest-*/*
Files: vendor/gtest-*/*
Copyright: Copyright 2008 Google Inc.
License: BSD-3-clause
Redistribution and use in source and binary forms, with or without
......
2019-08-01 Johnathan Wong <jowong@bcgsc.ca>
* Release version 2.2.0
* Construct a counting bloom filter instead of a cascading bloom filter.
abyss-bloom:
* Add 'counting' as valid argument to '-t' option to build a counting
bloom filter
2018-12-04 Sauparna Palchowdhury <spchowdhury@bcgsc.ca>
* Release version 2.1.5
......
......@@ -188,9 +188,9 @@ static Seq load(const uint8_t *src)
Seq seq;
#if MAX_KMER > 96
# if WORDS_BIGENDIAN
const uint64_t *s = reinterpret_cast<const uint64_t*>(src);
uint64_t *d = reinterpret_cast<uint64_t*>(&seq + 1);
copy(s, s + SEQ_WORDS, reverse_iterator<uint64_t*>(d));
const size_t *s = reinterpret_cast<const size_t*>(src);
size_t *d = reinterpret_cast<size_t*>(&seq + 1);
copy(s, s + Kmer::NUM_BYTES/sizeof(size_t), reverse_iterator<size_t*>(d));
# else
uint8_t *d = reinterpret_cast<uint8_t*>(&seq);
memcpy(d, src, sizeof seq);
......@@ -234,10 +234,10 @@ static void storeReverse(uint8_t *dest, const Seq seq)
{
#if MAX_KMER > 96
# if WORDS_BIGENDIAN
const uint64_t *s = reinterpret_cast<const uint64_t*>(&seq);
uint64_t *d = reinterpret_cast<uint64_t*>(dest);
copy(s, s + SEQ_WORDS,
reverse_iterator<uint64_t*>(d + SEQ_WORDS));
const size_t *s = reinterpret_cast<const size_t*>(&seq);
size_t *d = reinterpret_cast<size_t*>(dest);
copy(s, s + Kmer::NUM_BYTES/sizeof(size_t),
reverse_iterator<size_t*>(d + Kmer::NUM_BYTES/sizeof(size_t)));
reverse(dest, dest + Kmer::NUM_BYTES);
# else
memcpy(dest, &seq, Kmer::NUM_BYTES);
......
......@@ -5,10 +5,9 @@
#include "Sense.h"
#include "Sequence.h"
#include "Common/Hash.h"
#include <cassert>
#include <cstring> // for memcpy
#include <iostream>
#include <stdint.h>
#include <ostream>
/** A k-mer. */
class Kmer
......@@ -46,7 +45,12 @@ class Kmer
*/
static void setLength(unsigned length)
{
assert(length <= MAX_KMER);
if (length > MAX_KMER) {
std::cerr << "Error: k is " << length
<< " and must be no more than " << MAX_KMER
<< ". You can recompile ABySS to increase this limit.\n";
exit(EXIT_FAILURE);
}
s_length = length;
s_bytes = (length + 3) / 4;
}
......
......@@ -7,8 +7,14 @@
#include <unistd.h> // for sbrk
#if __MACH__
# ifdef __APPLE__
# include <mach/mach.h> // for mach_task_self
# include <mach/task.h> // for task_info
# else
extern "C" {
# include <mach/mach.h> // for mach_task_self and task_info
}
# endif
#endif
/** Return the number of bytes used by the data and stack segments.
......