Commit b538cfd6 authored by Andreas Tille's avatar Andreas Tille

New upstream version 2.0.3

parent fe441e45
version: 2
jobs:
build:
docker:
- image: ubuntu:xenial
steps:
- run: |
apt-get update -qq
apt-get install -qq autoconf automake clang g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
- checkout
- run: |
./autogen.sh
./configure CC=clang CXX=clang++ --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
# Configuration for probot-stale - https://github.com/probot/stale
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 21
# Number of days of inactivity before a stale Issue or Pull Request is closed
daysUntilClose: 7
# Issues or Pull Requests with these labels will never be considered stale
exemptLabels:
- gsoc-outreachy
- help wanted
- in progress
# Label to use when marking as stale
staleLabel: stale
# Comment to post when marking as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs.
language: cpp
compiler:
- gcc
- clang
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev
......
......@@ -56,8 +56,10 @@ static const char USAGE_MESSAGE[] =
" --adj output the graph in ADJ format [default]\n"
" --asqg output the graph in ASQG format\n"
" --dot output the graph in GraphViz format\n"
" --gfa output the graph in GFA1 format\n"
" --gfa1 output the graph in GFA1 format\n"
" --gfa2 output the graph in GFA2 format\n"
" --gv output the graph in GraphViz format\n"
" --gfa output the graph in GFA format\n"
" --sam output the graph in SAM format\n"
" --SS expect contigs to be oriented correctly\n"
" --no-SS no assumption about contig orientation\n"
......@@ -100,8 +102,10 @@ static const struct option longopts[] = {
{ "adj", no_argument, &opt::format, ADJ },
{ "asqg", no_argument, &opt::format, ASQG },
{ "dot", no_argument, &opt::format, DOT },
{ "gfa", no_argument, &opt::format, GFA1 },
{ "gfa1", no_argument, &opt::format, GFA1 },
{ "gfa2", no_argument, &opt::format, GFA2 },
{ "gv", no_argument, &opt::format, DOT },
{ "gfa", no_argument, &opt::format, GFA },
{ "sam", no_argument, &opt::format, SAM },
{ "SS", no_argument, &opt::ss, 1 },
{ "no-SS", no_argument, &opt::ss, 0 },
......
......@@ -237,7 +237,7 @@ void writeBubble(std::ostream& out, const BranchGroup& group, unsigned id)
<< currBranch.calculateBranchMultiplicity() << '\n'
<< contig.c_str() << '\n';
}
assert(out.good());
assert_good(out, opt::snpPath);
}
/** Collapse a bubble to a single path. */
......
......@@ -13,6 +13,7 @@
#include <boost/graph/graph_traits.hpp>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
......@@ -103,11 +104,21 @@ SequenceCollectionHash()
// architecture and 2 bits per element on a 32-bit architecture.
// The number of elements is rounded up to a power of two.
if (opt::rank >= 0) {
// Make room for 200 million k-mers. Approximately 58 million
// 96-mers fit into 2 GB of ram, which results in a hash load
// of 0.216, and approximately 116 million 32-mers, which
// results in a hash load of 0.432.
m_data.rehash(200000000);
// Initialize sparsehash size to 2^30 (~1 billion) empty buckets.
// Setting the initial sparsehash size to a large
// number avoids the prohibitively slow step of resizing
// the hash table and rehashing *every* element when the
// maximum load factor is exceeded.
//
// The initial memory footprint per CPU core is
// 2^30 * 2.67 / 8 ~= 0.358 GB. The value of 2^30 buckets
// was chosen to accomodate a k=144 32-thread assembly of human
// (NA24143) uncorrected Illumina reads with ~70X coverage and
// 20,317,980,431 distinct 144-mers, without requiring sparsehash
// resizing. For further details on the test dataset, see:
// "ABySS 2.0: Resource-efficient assembly of large genomes
// using a Bloom filter".
m_data.rehash((size_t)pow(2, 30));
m_data.min_load_factor(0.2);
} else {
// Allocate a big hash for a single processor.
......
#ifndef CONCURRENTBLOOMFILTER_H
#define CONCURRENTBLOOMFILTER
#define CONCURRENTBLOOMFILTER_H
#ifndef _OPENMP
# error ConcurrentBloomFilter class requires a compiler that supports OpenMP
......
This diff is collapsed.
#ifndef _ASSEMBLY_COUNTERS_H_
#define _ASSEMBLY_COUNTERS_H_
#include "Common/IOUtil.h"
#include <iostream>
#include <cassert>
#include <string>
namespace BloomDBG {
/**
* Counters for tracking assembly statistics and producing
* progress messages.
*/
struct AssemblyCounters
{
/** reads consisting entirely of solid k-mers */
size_t solidReads;
size_t readsExtended;
size_t readsProcessed;
size_t basesAssembled;
size_t contigID;
AssemblyCounters() : solidReads(0), readsExtended(0), readsProcessed(0),
basesAssembled(0), contigID(0) {}
/** serialize counters as a TSV table */
friend std::ostream& operator<<(std::ostream& out,
const AssemblyCounters& o)
{
/* write headers */
out << "solid_reads"
<< '\t' << "extended_reads"
<< '\t' << "processed_reads"
<< '\t' << "bases_assembled"
<< '\t' << "next_contig_id"
<< '\n';
/* write data */
out << o.solidReads
<< '\t' << o.readsExtended
<< '\t' << o.readsProcessed
<< '\t' << o.basesAssembled
<< '\t' << o.contigID
<< '\n';
return out;
}
/** deserialize counters from a TSV table */
friend std::istream& operator>>(std::istream& in,
AssemblyCounters& o)
{
/* ignore header line */
std::string headers;
getline(in, headers);
/* read data */
in >> o.solidReads
>> expect("\t") >> o.readsExtended
>> expect("\t") >> o.readsProcessed
>> expect("\t") >> o.basesAssembled
>> expect("\t") >> o.contigID
>> expect("\n");
return in;
}
};
} /* end of BloomDBG namespace */
#endif
#ifndef _ASSEMBLY_PARAMS_H_
#define _ASSEMBLY_PARAMS_H_
#include <string>
#include <iostream>
#include <limits>
namespace BloomDBG {
/**
* Parameters controlling assembly.
*/
struct AssemblyParams
{
/** Bloom filter size (in bytes) */
size_t bloomSize;
/** Checkpoint frequency (reads processed per checkpoint) */
size_t readsPerCheckpoint;
/** Do not delete checkpoint files after a successful assembly */
bool keepCheckpoint;
/** Filename prefix for checkpoint files */
std::string checkpointPathPrefix;
/** minimum k-mer coverage threshold */
unsigned minCov;
/** WIG track containing 0/1 for sufficient k-mer cov */
std::string covTrackPath;
/** path for output GraphViz file */
std::string graphPath;
/** num Bloom filter hash functions */
unsigned numHashes;
/** input Bloom filter file (if empty, build Bloom filter from reads)*/
std::string bloomPath;
/** the number of parallel threads. */
unsigned threads;
/** the size of a k-mer. */
unsigned k;
/** the size of a single k-mer in a k-mer pair */
unsigned K;
/** reference genome */
std::string refPath;
/** Quadratic Residue (QR) seed length */
unsigned qrSeedLen;
/** spaced seed */
std::string spacedSeed;
/** maximum length of branches to trim */
unsigned trim;
/** verbose level for progress messages */
int verbose;
/** output contigs path (empty string indicates STDOUT) */
std::string outputPath;
/** output path for trace file (-T) option */
std::string tracePath;
/** Default constructor */
AssemblyParams() : bloomSize(0),
readsPerCheckpoint(std::numeric_limits<size_t>::max()),
keepCheckpoint(false), checkpointPathPrefix("bloom-dbg-checkpoint"),
minCov(2), graphPath(), numHashes(1), threads(1),
k(0), K(0), qrSeedLen(0), spacedSeed(),
trim(std::numeric_limits<unsigned>::max()),
verbose(0), outputPath(), tracePath() {}
/** Return true if all required members are initialized */
bool initialized() const {
return bloomSize > 0 && k > 0 &&
trim != std::numeric_limits<unsigned>::max();
}
/** Return true if checkpoint creation is enabled */
bool checkpointsEnabled() const {
return readsPerCheckpoint != std::numeric_limits<size_t>::max();
}
/** Reset all spaced seed params to their default values */
void resetSpacedSeedParams() {
spacedSeed.clear();
K = 0;
qrSeedLen = 0;
}
/** Report current parameter values (for logging) */
friend std::ostream& operator<<(std::ostream& out,
const AssemblyParams& o)
{
out << "Assembly parameters:" << std::endl
<< '\t' << "K-mer size (-k): " << o.k << std::endl
<< '\t' << "K-mer coverage threshold (--kc): " << o.minCov << std::endl
<< '\t' << "Max branch trim length (-t): " << o.trim << std::endl
<< '\t' << "Bloom size in bytes (-b): " << o.bloomSize << std::endl
<< '\t' << "Bloom hash functions (-H): " << o.numHashes << std::endl;
if (o.K > 0)
out << '\t' << "Spaced k-mer size (-K): " << o.K << std::endl;
if (o.qrSeedLen > 0)
out << '\t' << "Quadratic residue (QR) seed length (--qr-seed): "
<< o.qrSeedLen << std::endl;
return out;
}
};
} /* end of BloomDBG namespace */
#endif
#ifndef _ASSEMBLY_STREAMS_H_
#define _ASSEMBLY_STREAMS_H_
namespace BloomDBG {
/** Bundles together input and output streams used during assembly */
template <typename InputStreamT>
struct AssemblyStreams
{
/** input reads stream */
InputStreamT& in;
/** main FASTA output */
std::ostream& out;
/** duplicated FASTA output for checkpointing */
std::ostream& checkpointOut;
/** trace file output for debugging */
std::ostream& traceOut;
AssemblyStreams(InputStreamT& in, std::ostream& out,
std::ostream& checkpointOut, std::ostream& traceOut) :
in(in), out(out), checkpointOut(checkpointOut),
traceOut(traceOut) {}
};
}
#endif
#ifndef BLOOM_IO_H
#define BLOOM_IO_H 1
#include "BloomDBG/RollingHashIterator.h"
#include "BloomDBG/RollingHash.h"
#include "lib/bloomfilter/BloomFilter.hpp"
namespace BloomDBG {
/**
* Round up `num` to the nearest multiple of `base`.
*/
template <typename T>
inline static T roundUpToMultiple(T num, T base)
{
if (base == 0)
return num;
T remainder = num % base;
if (remainder == 0)
return num;
return num + base - remainder;
}
/**
* Load DNA sequence into Bloom filter using rolling hash.
*
* @param bloom target Bloom filter
* @param seq DNA sequence
*/
template <typename BF>
inline static void loadSeq(BF& bloom, const std::string& seq)
{
const unsigned k = bloom.getKmerSize();
const unsigned numHashes = bloom.getHashNum();
for (RollingHashIterator it(seq, numHashes, k);
it != RollingHashIterator::end(); ++it) {
bloom.insert(*it);
}
}
/**
* Load sequences contents of FASTA file into Bloom filter using
* rolling hash.
* @param bloom target Bloom filter
* @param path path to FASTA file
* @param verbose if true, print progress messages to STDERR
*/
template <typename BF>
inline static void loadFile(BF& bloom, const std::string& path,
bool verbose = false)
{
const size_t BUFFER_SIZE = 1000000;
const size_t LOAD_PROGRESS_STEP = 10000;
assert(!path.empty());
if (verbose)
std::cerr << "Reading `" << path << "'..." << std::endl;
FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
uint64_t readCount = 0;
#pragma omp parallel
for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
buffer.clear();
size_t bufferSize = 0;
bool good = true;
#pragma omp critical(in)
for (; good && bufferSize < BUFFER_SIZE;) {
std::string seq;
good = in >> seq;
if (good) {
buffer.push_back(seq);
bufferSize += seq.length();
}
}
if (buffer.size() == 0)
break;
for (size_t j = 0; j < buffer.size(); j++) {
loadSeq(bloom, buffer.at(j));
if (verbose)
#pragma omp critical(cerr)
{
readCount++;
if (readCount % LOAD_PROGRESS_STEP == 0)
std::cerr << "Loaded " << readCount
<< " reads into Bloom filter\n";
}
}
}
assert(in.eof());
if (verbose) {
std::cerr << "Loaded " << readCount << " reads from `"
<< path << "` into Bloom filter\n";
}
}
/** Load FASTQ/FASTA/SAM/BAM files from command line into a Bloom filter */
template <typename BloomFilterT>
static inline void loadBloomFilter(int argc, char** argv,
BloomFilterT& bloom, bool verbose=false)
{
/* load reads into Bloom filter */
for (int i = optind; i < argc; ++i) {
/*
* Debugging feature: If there is a ':'
* separating the list of input read files into
* two parts, use the first set of files
* to load the Bloom filter and the second
* set of files for the assembly (read extension).
*/
if (strcmp(argv[i],":") == 0) {
optind = i + 1;
break;
}
BloomDBG::loadFile(bloom, argv[i], verbose);
}
if (verbose)
cerr << "Bloom filter FPR: " << setprecision(3)
<< bloom.FPR() * 100 << "%" << endl;
}
} // end namespace 'BloomDBG'
#endif
#ifndef _CHECKPOINT_H_
#define _CHECKPOINT_H_
#include "BloomDBG/AssemblyCounters.h"
#include "BloomDBG/AssemblyParams.h"
#include "BloomDBG/AssemblyStreams.h"
#include "Common/IOUtil.h"
#include <cstdio>
#include <iostream>
#include <fstream>
namespace BloomDBG
{
const std::string CHECKPOINT_FASTA_EXT = ".contigs.fa";
const std::string CHECKPOINT_COUNTERS_EXT = ".counters.tsv";
const std::string CHECKPOINT_BLOOM_DBG_EXT = ".dbg.bloom";
const std::string CHECKPOINT_BLOOM_VISITED_EXT = ".visited.bloom";
const std::string CHECKPOINT_TMP_EXT = ".tmp";
/**
* Save the current assembly state to a set of checkpoint files,
* so that the assembly can be resumed from that point.
*
* @param dbg Bloom filter de Bruijn graph
* @param visitedKmerSet k-mers included in output contigs so far
* @param counters counters capturing assembly state
* (e.g. next input read index, next output contig index)
* @param params various assembly parameters (corresponding to
* command line opts)
*/
template <typename BloomDBGT, typename VisitedKmerSetT>
static inline void createCheckpoint(const BloomDBGT& dbg,
const VisitedKmerSetT& visitedKmerSet,
const AssemblyCounters& counters,
const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
if (params.verbose)
std::cerr << "Writing checkpoint data..." << std::endl;
/* write out Bloom filter de Bruijn graph */
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
std::string dbgPathTmp = dbgPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing Bloom filter de Bruijn graph to `"
<< dbgPathTmp << "'" << std::endl;
std::ofstream dbgOut;
dbgOut.open(dbgPathTmp.c_str());
assert_good(dbgOut, dbgPathTmp);
dbgOut << dbg;
assert_good(dbgOut, dbgPathTmp);
/* write out visited k-mers Bloom filter */
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
std::string visitedPathTmp = visitedPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing visited k-mers Bloom to `"
<< visitedPathTmp << "'" << std::endl;
std::ofstream visitedOut;
visitedOut.open(visitedPathTmp.c_str());
assert_good(visitedOut, visitedPathTmp);
visitedOut << visitedKmerSet;
assert_good(visitedOut, visitedPathTmp);
/* write out index of next input read */
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
std::string countersPathTmp = countersPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing assembly counters to `"
<< countersPathTmp << "'" << std::endl;
std::ofstream countersOut;
countersOut.open(countersPathTmp.c_str());
assert_good(countersOut, countersPathTmp);
countersOut << counters;
assert_good(countersOut, countersPathTmp);
/* copy/move new checkpoint files on top of old ones */
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Copying `" << fastaPathTmp
<< " to `" << fastaPath << "'" << std::endl;
copyFile(fastaPathTmp, fastaPath);
if (params.verbose)
std::cerr << '\t' << "Moving `" << dbgPathTmp
<< "' to `" << dbgPath << "'" << std::endl;
if (rename(dbgPathTmp.c_str(), dbgPath.c_str()) != 0) {
perror("Error renaming file");
abort();
}
if (params.verbose)
std::cerr << '\t' << "Moving `" << visitedPathTmp
<< "' to `" << visitedPath << "'" << std::endl;
if (rename(visitedPathTmp.c_str(), visitedPath.c_str()) != 0) {
perror("Error renaming file");
abort();
}
if (params.verbose)
std::cerr << '\t' << "Moving `" << countersPathTmp
<< "' to `" << countersPath << "'" << std::endl;
if (rename(countersPathTmp.c_str(), countersPath.c_str()) != 0) {
perror("Error renaming file");
abort();
}
}
/** Return true if checkpoint files exist and are readable */
static inline bool checkpointExists(const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
return std::ifstream(fastaPath.c_str()).good()
&& std::ifstream(dbgPath.c_str()).good()
&& std::ifstream(visitedPath.c_str()).good()
&& std::ifstream(countersPath.c_str()).good();
}
/**
* Restore assembly state from checkpoint files.
*
* @param dbg Bloom filter de Bruijn graph
* @param visitedKmerSet k-mers included in output contigs so far
* @param counters counters capturing assembly state
* (e.g. next input read index, next output contig index)
* @param params various assembly parameters (corresponding to
* @param out main output stream for assembled contigs
* command line opts)
*/
template <typename BloomDBGT, typename VisitedKmerSetT,
typename InputReadStreamT>
static inline void resumeFromCheckpoint(
BloomDBGT& dbg, VisitedKmerSetT& visitedKmerSet,
AssemblyCounters& counters, const AssemblyParams& params,
AssemblyStreams<InputReadStreamT>& streams)
{
assert(checkpointExists(params));
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
if (params.verbose)
std::cerr << "Resuming from last checkpoint..." << std::endl;
/* load Bloom filter de Bruijn graph */
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading Bloom filter de Bruijn graph from `"
<< dbgPath << "'" << std::endl;
dbg.loadFilter(dbgPath);
/* load visited k-mers Bloom filter */
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading reading visited k-mers Bloom from `"
<< visitedPath << "'" << std::endl;
visitedKmerSet.loadFilter(visitedPath);
/* load index for next input read */
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading index of next input read from `"
<< countersPath << "'" << std::endl;
std::ifstream countersIn(countersPath.c_str());
assert_good(countersIn, countersPath);
countersIn >> counters;
assert_good(countersIn, countersPath);
/* advance to previous position in input reads */
if (params.verbose)
std::cerr << '\t' << "Advancing to read index "
<< counters.readsProcessed << " in input reads..." << std::endl;
FastaRecord rec;
for (size_t i = 0; i < counters.readsProcessed && streams.in; ++i)
streams.in >> rec;
assert (!streams.in.eof());
/* restore previously assembled contigs */
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Copying `" << fastaPath
<< "' to `" << fastaPathTmp << "'" << std::endl;
copyFile(fastaPath, fastaPathTmp);
if (params.verbose)
std::cerr << '\t' << "Outputting previously assembled contigs "
<< "from `" << fastaPath << "'" << std::endl;
std::ifstream prevContigs(fastaPath.c_str());
assert_good(prevContigs, fastaPath);
streams.out << prevContigs.rdbuf();
assert(streams.out);
}
/** Delete a file if it exists */
static inline void removeFileIfExists(const std::string& path)
{
if (std::ifstream(path.c_str()).good()) {
if (remove(path.c_str()) != 0) {
perror("Error removing file");
abort();
}
}
}
/** Remove all checkpoint-related files */
static inline void removeCheckpointData(const AssemblyParams& params)