Skip to content
Commits on Source (6)
version: 2
jobs:
build:
docker:
- image: ubuntu:xenial
steps:
- run: |
apt-get update -qq
apt-get install -qq autoconf automake clang g++ libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev make pandoc gdb valgrind
- checkout
- run: |
./autogen.sh
./configure CC=clang CXX=clang++ --with-mpi=/usr/lib/openmpi
- run: make -j
- run: make -j check
- run: make -j distcheck
# Configuration for probot-stale - https://github.com/probot/stale
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 21
# Number of days of inactivity before a stale Issue or Pull Request is closed
daysUntilClose: 7
# Issues or Pull Requests with these labels will never be considered stale
exemptLabels:
- gsoc-outreachy
- help wanted
- in progress
# Label to use when marking as stale
staleLabel: stale
# Comment to post when marking as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs.
language: cpp
compiler:
- gcc
- clang
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev
......
......@@ -56,8 +56,10 @@ static const char USAGE_MESSAGE[] =
" --adj output the graph in ADJ format [default]\n"
" --asqg output the graph in ASQG format\n"
" --dot output the graph in GraphViz format\n"
" --gfa output the graph in GFA1 format\n"
" --gfa1 output the graph in GFA1 format\n"
" --gfa2 output the graph in GFA2 format\n"
" --gv output the graph in GraphViz format\n"
" --gfa output the graph in GFA format\n"
" --sam output the graph in SAM format\n"
" --SS expect contigs to be oriented correctly\n"
" --no-SS no assumption about contig orientation\n"
......@@ -100,8 +102,10 @@ static const struct option longopts[] = {
{ "adj", no_argument, &opt::format, ADJ },
{ "asqg", no_argument, &opt::format, ASQG },
{ "dot", no_argument, &opt::format, DOT },
{ "gfa", no_argument, &opt::format, GFA1 },
{ "gfa1", no_argument, &opt::format, GFA1 },
{ "gfa2", no_argument, &opt::format, GFA2 },
{ "gv", no_argument, &opt::format, DOT },
{ "gfa", no_argument, &opt::format, GFA },
{ "sam", no_argument, &opt::format, SAM },
{ "SS", no_argument, &opt::ss, 1 },
{ "no-SS", no_argument, &opt::ss, 0 },
......
......@@ -237,7 +237,7 @@ void writeBubble(std::ostream& out, const BranchGroup& group, unsigned id)
<< currBranch.calculateBranchMultiplicity() << '\n'
<< contig.c_str() << '\n';
}
assert(out.good());
assert_good(out, opt::snpPath);
}
/** Collapse a bubble to a single path. */
......
......@@ -13,6 +13,7 @@
#include <boost/graph/graph_traits.hpp>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
......@@ -103,11 +104,21 @@ SequenceCollectionHash()
// architecture and 2 bits per element on a 32-bit architecture.
// The number of elements is rounded up to a power of two.
if (opt::rank >= 0) {
// Make room for 200 million k-mers. Approximately 58 million
// 96-mers fit into 2 GB of ram, which results in a hash load
// of 0.216, and approximately 116 million 32-mers, which
// results in a hash load of 0.432.
m_data.rehash(200000000);
// Initialize sparsehash size to 2^30 (~1 billion) empty buckets.
// Setting the initial sparsehash size to a large
// number avoids the prohibitively slow step of resizing
// the hash table and rehashing *every* element when the
// maximum load factor is exceeded.
//
// The initial memory footprint per CPU core is
// 2^30 * 2.67 / 8 ~= 0.358 GB. The value of 2^30 buckets
// was chosen to accomodate a k=144 32-thread assembly of human
// (NA24143) uncorrected Illumina reads with ~70X coverage and
// 20,317,980,431 distinct 144-mers, without requiring sparsehash
// resizing. For further details on the test dataset, see:
// "ABySS 2.0: Resource-efficient assembly of large genomes
// using a Bloom filter".
m_data.rehash((size_t)pow(2, 30));
m_data.min_load_factor(0.2);
} else {
// Allocate a big hash for a single processor.
......
#ifndef CONCURRENTBLOOMFILTER_H
#define CONCURRENTBLOOMFILTER
#define CONCURRENTBLOOMFILTER_H
#ifndef _OPENMP
# error ConcurrentBloomFilter class requires a compiler that supports OpenMP
......
......@@ -18,6 +18,9 @@
#include "Bloom/CascadingBloomFilter.h"
#include "Bloom/BloomFilterWindow.h"
#include "Bloom/CascadingBloomFilterWindow.h"
#include "BloomDBG/BloomIO.h"
#include "BloomDBG/HashAgnosticCascadingBloom.h"
#include "lib/bloomfilter/BloomFilter.hpp"
#include <cstdlib>
#include <getopt.h>
......@@ -64,7 +67,10 @@ static const char USAGE_MESSAGE[] =
" -b, --bloom-size=N size of bloom filter [500M]\n"
" -B, --buffer-size=N size of I/O buffer for each thread, in bytes [100000]\n"
" -j, --threads=N use N parallel threads [1]\n"
" -h, --hash-seed=N seed for hash function [0]\n"
" -h, --hash-seed=N seed for hash function (only works with\n"
" `-t konnector') [0]\n"
" -H, --num-hashes=N number of hash functions (only works with\n"
" `-t rolling-hash') [1]\n"
" -l, --levels=N build a cascading bloom filter with N levels\n"
" and output the last level\n"
" -L, --init-level='N=FILE' initialize level N of cascading bloom filter\n"
......@@ -77,6 +83,7 @@ static const char USAGE_MESSAGE[] =
" -n, --num-locks=N number of write locks on bloom filter [1000]\n"
" -q, --trim-quality=N trim bases from the ends of reads whose\n"
" quality is less than the threshold\n"
" -t, --bloom-type=STR 'konnector' or 'rolling-hash' [konnector]\n"
" --standard-quality zero quality is `!' (33)\n"
" default for FASTQ and SAM files\n"
" --illumina-quality zero quality is `@' (64)\n"
......@@ -102,6 +109,7 @@ static const char USAGE_MESSAGE[] =
"\n"
"Report bugs to <" PACKAGE_BUGREPORT ">.\n";;
enum BloomFilterType { BT_KONNECTOR, BT_ROLLING_HASH, BT_UNKNOWN };
enum OutputFormat { BED, FASTA, RAW };
namespace opt {
......@@ -118,6 +126,9 @@ namespace opt {
/** Seed for Bloom filter hash function. */
size_t hashSeed = 0;
/** Number of hash functions (only works with `-t rolling-hash') */
unsigned numHashes = 1;
/** The size of a k-mer. */
unsigned k;
......@@ -136,6 +147,9 @@ namespace opt {
*/
size_t numLocks = 1000;
/** The type of Bloom filter to build */
BloomFilterType bloomType = BT_KONNECTOR;
/** Index of bloom filter window.
("M" for -w option) */
unsigned windowIndex = 0;
......@@ -157,14 +171,16 @@ namespace opt {
OutputFormat format = FASTA;
}
static const char shortopts[] = "b:B:h:j:k:l:L:m:n:q:rvw:";
static const char shortopts[] = "b:B:h:H:j:k:l:L:m:n:q:rvt:w:";
enum { OPT_HELP = 1, OPT_VERSION, OPT_BED, OPT_FASTA, OPT_RAW };
static const struct option longopts[] = {
{ "bloom-size", required_argument, NULL, 'b' },
{ "bloom-type", required_argument, NULL, 't' },
{ "buffer-size", required_argument, NULL, 'B' },
{ "hash-seed", required_argument, NULL, 'h' },
{ "num-hashes", required_argument, NULL, 'H' },
{ "threads", required_argument, NULL, 'j' },
{ "kmer", required_argument, NULL, 'k' },
{ "levels", required_argument, NULL, 'l' },
......@@ -350,6 +366,139 @@ void printCascadingBloomStats(ostream& os, BF& bloom)
}
}
template <typename BF>
void printHashAgnosticCascadingBloomStats(ostream& os, BF& bloom)
{
for (unsigned i = 0; i < opt::levels; i++) {
os << "Stats for Bloom filter level " << i+1 << ":\n"
<< "\tBloom size (bits): "
<< bloom.getBloomFilter(i).getFilterSize() << "\n"
<< "\tBloom popcount (bits): "
<< bloom.getBloomFilter(i).getPop() << "\n"
<< "\tBloom filter FPR: " << setprecision(3)
<< 100 * bloom.getBloomFilter(i).getFPR() << "%\n";
}
}
/**
* Convert string argument from `-t' option to an equivalent
* BloomFilterType value.
*/
static inline BloomFilterType strToBloomType(const std::string& str)
{
if (str == "konnector")
return BT_KONNECTOR;
else if (str == "rolling-hash")
return BT_ROLLING_HASH;
else
return BT_UNKNOWN;
}
static inline string bloomTypeToStr(const BloomFilterType type)
{
assert(type != BT_UNKNOWN);
if (type == BT_KONNECTOR) {
return string("konnector");
} else {
assert(type == BT_ROLLING_HASH);
return string("rolling-hash");
}
}
/** Build a konnector-style Bloom filter. */
static inline void buildKonnectorBloom(size_t bits, string outputPath,
int argc, char** argv)
{
// if we are building a cascading bloom filter, reduce
// the size of each level so that the overall bloom filter
// fits within the memory limit (specified by -b)
bits /= opt::levels;
if (opt::windows == 0) {
if (opt::levels == 1) {
BloomFilter bloom(bits, opt::hashSeed);
#ifdef _OPENMP
ConcurrentBloomFilter<BloomFilter>
cbf(bloom, opt::numLocks, opt::hashSeed);
loadFilters(cbf, argc, argv);
#else
loadFilters(bloom, argc, argv);
#endif
printBloomStats(cerr, bloom);
writeBloom(bloom, outputPath);
}
else {
CascadingBloomFilter cascadingBloom(bits, opt::levels, opt::hashSeed);
initBloomFilterLevels(cascadingBloom);
#ifdef _OPENMP
ConcurrentBloomFilter<CascadingBloomFilter>
cbf(cascadingBloom, opt::numLocks, opt::hashSeed);
loadFilters(cbf, argc, argv);
#else
loadFilters(cascadingBloom, argc, argv);
#endif
printCascadingBloomStats(cerr, cascadingBloom);
writeBloom(cascadingBloom, outputPath);
}
} else {
size_t bitsPerWindow = bits / opt::windows;
size_t startBitPos = (opt::windowIndex - 1) * bitsPerWindow;
size_t endBitPos;
if (opt::windowIndex < opt::windows)
endBitPos = opt::windowIndex * bitsPerWindow - 1;
else
endBitPos = bits - 1;
if (opt::levels == 1) {
BloomFilterWindow bloom(bits, startBitPos,
endBitPos, opt::hashSeed);
loadFilters(bloom, argc, argv);
printBloomStats(cerr, bloom);
writeBloom(bloom, outputPath);
}
else {
CascadingBloomFilterWindow cascadingBloom(
bits, startBitPos, endBitPos, opt::levels,
opt::hashSeed);
initBloomFilterLevels(cascadingBloom);
loadFilters(cascadingBloom, argc, argv);
printCascadingBloomStats(cerr, cascadingBloom);
writeBloom(cascadingBloom, outputPath);
}
}
}
/** Build a rolling-hash based Bloom filter (used by `abyss-bloom-dbg`) */
static inline void buildRollingHashBloom(size_t bits, string outputPath,
int argc, char** argv)
{
/* BloomFilter class requires size to be a multiple of 64 */
size_t bloomLevelSize = BloomDBG::roundUpToMultiple(
bits / opt::levels, (size_t)64);
/* use cascading Bloom filter to remove error k-mers */
HashAgnosticCascadingBloom cascadingBloom(
bloomLevelSize, opt::numHashes, opt::levels, opt::k);
/* load reads into Bloom filter */
for (int i = optind; i < argc; ++i)
BloomDBG::loadFile(cascadingBloom, argv[i], opt::verbose);
if (opt::verbose)
printHashAgnosticCascadingBloomStats(cerr, cascadingBloom);
writeBloom(cascadingBloom, outputPath);
}
/**
* Build Bloom filter file of type 'konnector' or 'rolling-hash', as
* per `-t` option.
*/
int build(int argc, char** argv)
{
parseGlobalOpts(argc, argv);
......@@ -366,6 +515,8 @@ int build(int argc, char** argv)
arg >> opt::bufferSize; break;
case 'h':
arg >> opt::hashSeed; break;
case 'H':
arg >> opt::numHashes; break;
case 'j':
arg >> opt::threads; break;
case 'l':
......@@ -389,6 +540,13 @@ int build(int argc, char** argv)
arg >> opt::numLocks; break;
case 'q':
arg >> opt::qualityThreshold; break;
case 't':
{
std::string str;
arg >> str;
opt::bloomType = strToBloomType(str);
}
break;
case 'w':
arg >> opt::windowIndex;
arg >> expect("/");
......@@ -415,6 +573,18 @@ int build(int argc, char** argv)
dieWithUsageError();
}
if (opt::bloomType == BT_UNKNOWN) {
cerr << PROGRAM ": unrecognized argument to `-t' "
<< "(should be 'konnector' or 'rolling-hash')\n";
dieWithUsageError();
}
if (opt::bloomType == BT_KONNECTOR && opt::numHashes != 1) {
cerr << PROGRAM ": warning: -H option has no effect"
" when using `-t konnector'\n";
opt::numHashes = 1;
}
#if _OPENMP
if (opt::threads > 0)
omp_set_num_threads(opt::threads);
......@@ -423,12 +593,6 @@ int build(int argc, char** argv)
// bloom filter size in bits
size_t bits = opt::bloomSize * 8;
if (bits % opt::levels != 0) {
cerr << PROGRAM ": bloom filter size (-b) must be evenly divisible "
<< "by number of bloom filter levels (-l)\n";
dieWithUsageError();
}
if (opt::windows != 0 && bits / opt::levels % opt::windows != 0) {
cerr << PROGRAM ": (b / l) % w == 0 must be true, where "
<< "b is bloom filter size (-b), "
......@@ -442,68 +606,23 @@ int build(int argc, char** argv)
dieWithUsageError();
}
// if we are building a cascading bloom filter, reduce
// the size of each level so that the overall bloom filter
// fits within the memory limit (specified by -b)
bits /= opt::levels;
string outputPath(argv[optind]);
optind++;
if (opt::windows == 0) {
if (opt::levels == 1) {
BloomFilter bloom(bits, opt::hashSeed);
#ifdef _OPENMP
ConcurrentBloomFilter<BloomFilter>
cbf(bloom, opt::numLocks, opt::hashSeed);
loadFilters(cbf, argc, argv);
#else
loadFilters(bloom, argc, argv);
#endif
printBloomStats(cerr, bloom);
writeBloom(bloom, outputPath);
}
else {
CascadingBloomFilter cascadingBloom(bits, opt::levels, opt::hashSeed);
initBloomFilterLevels(cascadingBloom);
#ifdef _OPENMP
ConcurrentBloomFilter<CascadingBloomFilter>
cbf(cascadingBloom, opt::numLocks, opt::hashSeed);
loadFilters(cbf, argc, argv);
#else
loadFilters(cascadingBloom, argc, argv);
#endif
printCascadingBloomStats(cerr, cascadingBloom);
writeBloom(cascadingBloom, outputPath);
if (opt::verbose) {
cerr << "Building a Bloom filter of type '"
<< bloomTypeToStr(opt::bloomType) << "' with "
<< opt::levels << " level(s), "
<< opt::numHashes << " hash function(s), and a total size of "
<< opt::bloomSize << " bytes" << endl;
}
assert(opt::bloomType != BT_UNKNOWN);
if (opt::bloomType == BT_KONNECTOR) {
buildKonnectorBloom(bits, outputPath, argc, argv);
} else {
size_t bitsPerWindow = bits / opt::windows;
size_t startBitPos = (opt::windowIndex - 1) * bitsPerWindow;
size_t endBitPos;
if (opt::windowIndex < opt::windows)
endBitPos = opt::windowIndex * bitsPerWindow - 1;
else
endBitPos = bits - 1;
if (opt::levels == 1) {
BloomFilterWindow bloom(bits, startBitPos,
endBitPos, opt::hashSeed);
loadFilters(bloom, argc, argv);
printBloomStats(cerr, bloom);
writeBloom(bloom, outputPath);
}
else {
CascadingBloomFilterWindow cascadingBloom(
bits, startBitPos, endBitPos, opt::levels,
opt::hashSeed);
initBloomFilterLevels(cascadingBloom);
loadFilters(cascadingBloom, argc, argv);
printCascadingBloomStats(cerr, cascadingBloom);
writeBloom(cascadingBloom, outputPath);
}
assert(opt::bloomType == BT_ROLLING_HASH);
buildRollingHashBloom(bits, outputPath, argc, argv);
}
return 0;
......@@ -784,7 +903,7 @@ int memberOf(int argc, char ** argv){
i += pos;
continue;
}
if (bloom[Kmer(kmer)] || opt::inverse) {
if (bloom[Kmer(kmer)] != opt::inverse) {
if (opt::format == FASTA) {
cout << ">" << rec.id << ":seq:" << seqCount
<< ":kmer:" << i << "\n";
......
#ifndef _ASSEMBLY_COUNTERS_H_
#define _ASSEMBLY_COUNTERS_H_
#include "Common/IOUtil.h"
#include <iostream>
#include <cassert>
#include <string>
namespace BloomDBG {
/**
* Counters for tracking assembly statistics and producing
* progress messages.
*/
struct AssemblyCounters
{
/** reads consisting entirely of solid k-mers */
size_t solidReads;
size_t readsExtended;
size_t readsProcessed;
size_t basesAssembled;
size_t contigID;
AssemblyCounters() : solidReads(0), readsExtended(0), readsProcessed(0),
basesAssembled(0), contigID(0) {}
/** serialize counters as a TSV table */
friend std::ostream& operator<<(std::ostream& out,
const AssemblyCounters& o)
{
/* write headers */
out << "solid_reads"
<< '\t' << "extended_reads"
<< '\t' << "processed_reads"
<< '\t' << "bases_assembled"
<< '\t' << "next_contig_id"
<< '\n';
/* write data */
out << o.solidReads
<< '\t' << o.readsExtended
<< '\t' << o.readsProcessed
<< '\t' << o.basesAssembled
<< '\t' << o.contigID
<< '\n';
return out;
}
/** deserialize counters from a TSV table */
friend std::istream& operator>>(std::istream& in,
AssemblyCounters& o)
{
/* ignore header line */
std::string headers;
getline(in, headers);
/* read data */
in >> o.solidReads
>> expect("\t") >> o.readsExtended
>> expect("\t") >> o.readsProcessed
>> expect("\t") >> o.basesAssembled
>> expect("\t") >> o.contigID
>> expect("\n");
return in;
}
};
} /* end of BloomDBG namespace */
#endif
#ifndef _ASSEMBLY_PARAMS_H_
#define _ASSEMBLY_PARAMS_H_
#include <string>
#include <iostream>
#include <limits>
namespace BloomDBG {
/**
* Parameters controlling assembly.
*/
struct AssemblyParams
{
/** Bloom filter size (in bytes) */
size_t bloomSize;
/** Checkpoint frequency (reads processed per checkpoint) */
size_t readsPerCheckpoint;
/** Do not delete checkpoint files after a successful assembly */
bool keepCheckpoint;
/** Filename prefix for checkpoint files */
std::string checkpointPathPrefix;
/** minimum k-mer coverage threshold */
unsigned minCov;
/** WIG track containing 0/1 for sufficient k-mer cov */
std::string covTrackPath;
/** path for output GraphViz file */
std::string graphPath;
/** num Bloom filter hash functions */
unsigned numHashes;
/** input Bloom filter file (if empty, build Bloom filter from reads)*/
std::string bloomPath;
/** the number of parallel threads. */
unsigned threads;
/** the size of a k-mer. */
unsigned k;
/** the size of a single k-mer in a k-mer pair */
unsigned K;
/** reference genome */
std::string refPath;
/** Quadratic Residue (QR) seed length */
unsigned qrSeedLen;
/** spaced seed */
std::string spacedSeed;
/** maximum length of branches to trim */
unsigned trim;
/** verbose level for progress messages */
int verbose;
/** output contigs path (empty string indicates STDOUT) */
std::string outputPath;
/** output path for trace file (-T) option */
std::string tracePath;
/** Default constructor */
AssemblyParams() : bloomSize(0),
readsPerCheckpoint(std::numeric_limits<size_t>::max()),
keepCheckpoint(false), checkpointPathPrefix("bloom-dbg-checkpoint"),
minCov(2), graphPath(), numHashes(1), threads(1),
k(0), K(0), qrSeedLen(0), spacedSeed(),
trim(std::numeric_limits<unsigned>::max()),
verbose(0), outputPath(), tracePath() {}
/** Return true if all required members are initialized */
bool initialized() const {
return bloomSize > 0 && k > 0 &&
trim != std::numeric_limits<unsigned>::max();
}
/** Return true if checkpoint creation is enabled */
bool checkpointsEnabled() const {
return readsPerCheckpoint != std::numeric_limits<size_t>::max();
}
/** Reset all spaced seed params to their default values */
void resetSpacedSeedParams() {
spacedSeed.clear();
K = 0;
qrSeedLen = 0;
}
/** Report current parameter values (for logging) */
friend std::ostream& operator<<(std::ostream& out,
const AssemblyParams& o)
{
out << "Assembly parameters:" << std::endl
<< '\t' << "K-mer size (-k): " << o.k << std::endl
<< '\t' << "K-mer coverage threshold (--kc): " << o.minCov << std::endl
<< '\t' << "Max branch trim length (-t): " << o.trim << std::endl
<< '\t' << "Bloom size in bytes (-b): " << o.bloomSize << std::endl
<< '\t' << "Bloom hash functions (-H): " << o.numHashes << std::endl;
if (o.K > 0)
out << '\t' << "Spaced k-mer size (-K): " << o.K << std::endl;
if (o.qrSeedLen > 0)
out << '\t' << "Quadratic residue (QR) seed length (--qr-seed): "
<< o.qrSeedLen << std::endl;
return out;
}
};
} /* end of BloomDBG namespace */
#endif
#ifndef _ASSEMBLY_STREAMS_H_
#define _ASSEMBLY_STREAMS_H_
namespace BloomDBG {
/** Bundles together input and output streams used during assembly */
template <typename InputStreamT>
struct AssemblyStreams
{
/** input reads stream */
InputStreamT& in;
/** main FASTA output */
std::ostream& out;
/** duplicated FASTA output for checkpointing */
std::ostream& checkpointOut;
/** trace file output for debugging */
std::ostream& traceOut;
AssemblyStreams(InputStreamT& in, std::ostream& out,
std::ostream& checkpointOut, std::ostream& traceOut) :
in(in), out(out), checkpointOut(checkpointOut),
traceOut(traceOut) {}
};
}
#endif
#ifndef BLOOM_IO_H
#define BLOOM_IO_H 1
#include "BloomDBG/RollingHashIterator.h"
#include "BloomDBG/RollingHash.h"
#include "lib/bloomfilter/BloomFilter.hpp"
namespace BloomDBG {
/**
* Round up `num` to the nearest multiple of `base`.
*/
template <typename T>
inline static T roundUpToMultiple(T num, T base)
{
if (base == 0)
return num;
T remainder = num % base;
if (remainder == 0)
return num;
return num + base - remainder;
}
/**
* Load DNA sequence into Bloom filter using rolling hash.
*
* @param bloom target Bloom filter
* @param seq DNA sequence
*/
template <typename BF>
inline static void loadSeq(BF& bloom, const std::string& seq)
{
const unsigned k = bloom.getKmerSize();
const unsigned numHashes = bloom.getHashNum();
for (RollingHashIterator it(seq, numHashes, k);
it != RollingHashIterator::end(); ++it) {
bloom.insert(*it);
}
}
/**
* Load sequences contents of FASTA file into Bloom filter using
* rolling hash.
* @param bloom target Bloom filter
* @param path path to FASTA file
* @param verbose if true, print progress messages to STDERR
*/
template <typename BF>
inline static void loadFile(BF& bloom, const std::string& path,
bool verbose = false)
{
const size_t BUFFER_SIZE = 1000000;
const size_t LOAD_PROGRESS_STEP = 10000;
assert(!path.empty());
if (verbose)
std::cerr << "Reading `" << path << "'..." << std::endl;
FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
uint64_t readCount = 0;
#pragma omp parallel
for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
buffer.clear();
size_t bufferSize = 0;
bool good = true;
#pragma omp critical(in)
for (; good && bufferSize < BUFFER_SIZE;) {
std::string seq;
good = in >> seq;
if (good) {
buffer.push_back(seq);
bufferSize += seq.length();
}
}
if (buffer.size() == 0)
break;
for (size_t j = 0; j < buffer.size(); j++) {
loadSeq(bloom, buffer.at(j));
if (verbose)
#pragma omp critical(cerr)
{
readCount++;
if (readCount % LOAD_PROGRESS_STEP == 0)
std::cerr << "Loaded " << readCount
<< " reads into Bloom filter\n";
}
}
}
assert(in.eof());
if (verbose) {
std::cerr << "Loaded " << readCount << " reads from `"
<< path << "` into Bloom filter\n";
}
}
/** Load FASTQ/FASTA/SAM/BAM files from command line into a Bloom filter */
template <typename BloomFilterT>
static inline void loadBloomFilter(int argc, char** argv,
BloomFilterT& bloom, bool verbose=false)
{
/* load reads into Bloom filter */
for (int i = optind; i < argc; ++i) {
/*
* Debugging feature: If there is a ':'
* separating the list of input read files into
* two parts, use the first set of files
* to load the Bloom filter and the second
* set of files for the assembly (read extension).
*/
if (strcmp(argv[i],":") == 0) {
optind = i + 1;
break;
}
BloomDBG::loadFile(bloom, argv[i], verbose);
}
if (verbose)
cerr << "Bloom filter FPR: " << setprecision(3)
<< bloom.FPR() * 100 << "%" << endl;
}
} // end namespace 'BloomDBG'
#endif
#ifndef _CHECKPOINT_H_
#define _CHECKPOINT_H_
#include "BloomDBG/AssemblyCounters.h"
#include "BloomDBG/AssemblyParams.h"
#include "BloomDBG/AssemblyStreams.h"
#include "Common/IOUtil.h"
#include <cstdio>
#include <iostream>
#include <fstream>
namespace BloomDBG
{
const std::string CHECKPOINT_FASTA_EXT = ".contigs.fa";
const std::string CHECKPOINT_COUNTERS_EXT = ".counters.tsv";
const std::string CHECKPOINT_BLOOM_DBG_EXT = ".dbg.bloom";
const std::string CHECKPOINT_BLOOM_VISITED_EXT = ".visited.bloom";
const std::string CHECKPOINT_TMP_EXT = ".tmp";
/**
* Save the current assembly state to a set of checkpoint files,
* so that the assembly can be resumed from that point.
*
* @param dbg Bloom filter de Bruijn graph
* @param visitedKmerSet k-mers included in output contigs so far
* @param counters counters capturing assembly state
* (e.g. next input read index, next output contig index)
* @param params various assembly parameters (corresponding to
* command line opts)
*/
template <typename BloomDBGT, typename VisitedKmerSetT>
static inline void createCheckpoint(const BloomDBGT& dbg,
const VisitedKmerSetT& visitedKmerSet,
const AssemblyCounters& counters,
const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
if (params.verbose)
std::cerr << "Writing checkpoint data..." << std::endl;
/* write out Bloom filter de Bruijn graph */
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
std::string dbgPathTmp = dbgPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing Bloom filter de Bruijn graph to `"
<< dbgPathTmp << "'" << std::endl;
std::ofstream dbgOut;
dbgOut.open(dbgPathTmp.c_str());
assert_good(dbgOut, dbgPathTmp);
dbgOut << dbg;
assert_good(dbgOut, dbgPathTmp);
/* write out visited k-mers Bloom filter */
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
std::string visitedPathTmp = visitedPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing visited k-mers Bloom to `"
<< visitedPathTmp << "'" << std::endl;
std::ofstream visitedOut;
visitedOut.open(visitedPathTmp.c_str());
assert_good(visitedOut, visitedPathTmp);
visitedOut << visitedKmerSet;
assert_good(visitedOut, visitedPathTmp);
/* write out index of next input read */
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
std::string countersPathTmp = countersPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Writing assembly counters to `"
<< countersPathTmp << "'" << std::endl;
std::ofstream countersOut;
countersOut.open(countersPathTmp.c_str());
assert_good(countersOut, countersPathTmp);
countersOut << counters;
assert_good(countersOut, countersPathTmp);
/* copy/move new checkpoint files on top of old ones */
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Copying `" << fastaPathTmp
<< " to `" << fastaPath << "'" << std::endl;
copyFile(fastaPathTmp, fastaPath);
if (params.verbose)
std::cerr << '\t' << "Moving `" << dbgPathTmp
<< "' to `" << dbgPath << "'" << std::endl;
if (rename(dbgPathTmp.c_str(), dbgPath.c_str()) != 0) {
perror("Error renaming file");
abort();
}
if (params.verbose)
std::cerr << '\t' << "Moving `" << visitedPathTmp
<< "' to `" << visitedPath << "'" << std::endl;
if (rename(visitedPathTmp.c_str(), visitedPath.c_str()) != 0) {
perror("Error renaming file");
abort();
}
if (params.verbose)
std::cerr << '\t' << "Moving `" << countersPathTmp
<< "' to `" << countersPath << "'" << std::endl;
if (rename(countersPathTmp.c_str(), countersPath.c_str()) != 0) {
perror("Error renaming file");
abort();
}
}
/** Return true if checkpoint files exist and are readable */
static inline bool checkpointExists(const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
return std::ifstream(fastaPath.c_str()).good()
&& std::ifstream(dbgPath.c_str()).good()
&& std::ifstream(visitedPath.c_str()).good()
&& std::ifstream(countersPath.c_str()).good();
}
/**
* Restore assembly state from checkpoint files.
*
* @param dbg Bloom filter de Bruijn graph
* @param visitedKmerSet k-mers included in output contigs so far
* @param counters counters capturing assembly state
* (e.g. next input read index, next output contig index)
* @param params various assembly parameters (corresponding to
* @param out main output stream for assembled contigs
* command line opts)
*/
template <typename BloomDBGT, typename VisitedKmerSetT,
typename InputReadStreamT>
static inline void resumeFromCheckpoint(
BloomDBGT& dbg, VisitedKmerSetT& visitedKmerSet,
AssemblyCounters& counters, const AssemblyParams& params,
AssemblyStreams<InputReadStreamT>& streams)
{
assert(checkpointExists(params));
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
if (params.verbose)
std::cerr << "Resuming from last checkpoint..." << std::endl;
/* load Bloom filter de Bruijn graph */
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading Bloom filter de Bruijn graph from `"
<< dbgPath << "'" << std::endl;
dbg.loadFilter(dbgPath);
/* load visited k-mers Bloom filter */
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading reading visited k-mers Bloom from `"
<< visitedPath << "'" << std::endl;
visitedKmerSet.loadFilter(visitedPath);
/* load index for next input read */
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
if (params.verbose)
std::cerr << '\t' << "Reading index of next input read from `"
<< countersPath << "'" << std::endl;
std::ifstream countersIn(countersPath.c_str());
assert_good(countersIn, countersPath);
countersIn >> counters;
assert_good(countersIn, countersPath);
/* advance to previous position in input reads */
if (params.verbose)
std::cerr << '\t' << "Advancing to read index "
<< counters.readsProcessed << " in input reads..." << std::endl;
FastaRecord rec;
for (size_t i = 0; i < counters.readsProcessed && streams.in; ++i)
streams.in >> rec;
assert (!streams.in.eof());
/* restore previously assembled contigs */
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
if (params.verbose)
std::cerr << '\t' << "Copying `" << fastaPath
<< "' to `" << fastaPathTmp << "'" << std::endl;
copyFile(fastaPath, fastaPathTmp);
if (params.verbose)
std::cerr << '\t' << "Outputting previously assembled contigs "
<< "from `" << fastaPath << "'" << std::endl;
std::ifstream prevContigs(fastaPath.c_str());
assert_good(prevContigs, fastaPath);
streams.out << prevContigs.rdbuf();
assert(streams.out);
}
/** Delete a file if it exists */
static inline void removeFileIfExists(const std::string& path)
{
if (std::ifstream(path.c_str()).good()) {
if (remove(path.c_str()) != 0) {
perror("Error removing file");
abort();
}
}
}
/** Remove all checkpoint-related files */
static inline void removeCheckpointData(const AssemblyParams& params)
{
assert(params.checkpointsEnabled());
assert(!params.checkpointPathPrefix.empty());
std::string prefix = params.checkpointPathPrefix;
if (params.verbose)
std::cerr << "Removing checkpoint files..." << std::endl;
/* remove Bloom filter de Bruijn graph file(s) */
std::string dbgPath = prefix + CHECKPOINT_BLOOM_DBG_EXT;
std::string dbgPathTmp = dbgPath + CHECKPOINT_TMP_EXT;
removeFileIfExists(dbgPath);
removeFileIfExists(dbgPathTmp);
/* remove visited k-mers Bloom filter file(s) */
std::string visitedPath = prefix + CHECKPOINT_BLOOM_VISITED_EXT;
std::string visitedPathTmp = visitedPath + CHECKPOINT_TMP_EXT;
removeFileIfExists(visitedPath);
removeFileIfExists(visitedPathTmp);
/* remove assembly counters file(s) */
std::string countersPath = prefix + CHECKPOINT_COUNTERS_EXT;
std::string countersPathTmp = countersPath + CHECKPOINT_TMP_EXT;
removeFileIfExists(countersPath);
removeFileIfExists(countersPathTmp);
/* remove contigs FASTA file(s) */
std::string fastaPath = prefix + CHECKPOINT_FASTA_EXT;
std::string fastaPathTmp = fastaPath + CHECKPOINT_TMP_EXT;
removeFileIfExists(fastaPath);
removeFileIfExists(fastaPathTmp);
}
}
#endif
......@@ -42,17 +42,23 @@ class HashAgnosticCascadingBloom
{
m_data.reserve(levels);
for (unsigned i = 0; i < levels; i++)
m_data.push_back(new BloomFilter(size, hashes, k));
m_data.push_back(new BTL::BloomFilter(size, hashes, k));
}
/**
* Constructor to load a single-level BTL::BloomFilter from
* files. This is used to make BTL::BloomFilter support the
* same interface as HashAgnosticCascadingBloom.
*/
HashAgnosticCascadingBloom(const string& bloomPath)
{
loadFilter(bloomPath);
}
/** Destructor */
~HashAgnosticCascadingBloom()
{
typedef std::vector<BloomFilter*>::iterator Iterator;
for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
assert(*i != NULL);
delete *i;
}
clear();
}
/** Return k-mer size used by Bloom filter. */
......@@ -75,6 +81,12 @@ class HashAgnosticCascadingBloom
return m_data.back()->getPop();
}
/** Return number of levels in cascading Bloom filter */
unsigned levels() const
{
return m_data.size();
}
/** Return the estimated false positive rate */
double FPR() const
{
......@@ -126,20 +138,54 @@ class HashAgnosticCascadingBloom
}
/** Get the Bloom filter for a given level */
BloomFilter& getBloomFilter(unsigned level)
BTL::BloomFilter& getBloomFilter(unsigned level)
{
assert(m_data.at(level) != NULL);
return *m_data.at(level);
}
/** Operator for writing the Bloom filter to a stream */
friend std::ostream& operator<<(std::ostream& out,
const HashAgnosticCascadingBloom& o)
{
assert(o.m_data.size() > 0);
assert(o.m_data.back() != NULL);
/* o.m_data.back()->storeFilter(out); */
out << *o.m_data.back();
return out;
}
/** Load a Bloom filter from a file */
void loadFilter(const string& bloomPath)
{
clear();
BTL::BloomFilter* bloom = new BTL::BloomFilter(bloomPath);
m_k = bloom->getKmerSize();
m_hashes = bloom->getHashNum();
m_data.push_back(bloom);
}
private:
/** Free all allocated memory and reset parameters to defaults */
void clear()
{
m_k = 0;
m_hashes = 0;
typedef std::vector<BTL::BloomFilter*>::iterator Iterator;
for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
assert(*i != NULL);
delete *i;
}
m_data.clear();
}
/** k-mer length */
unsigned m_k;
/** number of hash functions */
unsigned m_hashes;
/** the array of Bloom filters */
std::vector<BloomFilter*> m_data;
std::vector<BTL::BloomFilter*> m_data;
};
#endif
......@@ -28,10 +28,12 @@ public:
LightweightKmer() {}
/** Constructor */
LightweightKmer(const char* kmer) : m_kmer(new char[Kmer::length()])
LightweightKmer(const char* kmer) : m_kmer(new char[Kmer::length() + 1])
{
const unsigned k = Kmer::length();
std::copy(kmer, kmer + k, m_kmer.get());
/* null-terminate k-mer string */
*(m_kmer.get() + k) = '\0';
}
/** Get pointer to raw char array for k-mer */
......
......@@ -10,14 +10,20 @@ abyss_bloom_dbg_LDADD = \
$(top_builddir)/DataLayer/libdatalayer.a \
$(top_builddir)/Common/libcommon.a
abyss_bloom_dbg_SOURCES = bloom-dbg.cc \
abyss_bloom_dbg_SOURCES = \
AssemblyCounters.h \
AssemblyParams.h \
AssemblyStreams.h \
bloom-dbg.cc \
bloom-dbg.h \
MaskedKmer.h \
SpacedSeed.h \
BloomIO.h \
Checkpoint.h \
HashAgnosticCascadingBloom.h \
LightweightKmer.h \
MaskedKmer.h \
RollingBloomDBG.h \
RollingHash.h \
RollingHashIterator.h \
SpacedSeed.h \
$(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \
$(top_srcdir)/lib/rolling-hash/rolling.h
#include "config.h"
#include "BloomDBG/bloom-dbg.h"
#include "BloomDBG/AssemblyCounters.h"
#include "BloomDBG/AssemblyParams.h"
#include "BloomDBG/Checkpoint.h"
#include "BloomDBG/HashAgnosticCascadingBloom.h"
#include "BloomDBG/MaskedKmer.h"
#include "BloomDBG/SpacedSeed.h"
......@@ -35,7 +38,7 @@ static const char VERSION_MESSAGE[] =
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " -b <bloom_size> -H <bloom_hashes> -k <kmer_size> \\\n"
" -G <genome_size> [options] <FASTQ> [FASTQ]... > assembly.fasta\n"
" [options] <FASTQ> [FASTQ]... > assembly.fasta\n"
"\n"
"Perform a de Bruijn graph assembly of the given FASTQ files.\n"
"\n"
......@@ -49,6 +52,7 @@ static const char USAGE_MESSAGE[] =
" -g --graph=FILE write de Bruijn graph to FILE (GraphViz)\n"
" --help display this help and exit\n"
" -H --num-hashes=N number of Bloom filter hash functions [1]\n"
" -i --input-bloom=FILE load Bloom filter from FILE\n"
" -j, --threads=N use N parallel threads [1]\n"
" --trim-masked trim masked bases from the ends of reads\n"
" --no-trim-masked do not trim masked bases from the ends\n"
......@@ -91,6 +95,16 @@ static const char USAGE_MESSAGE[] =
" -R, --ref=FILE specify a reference genome. FILE may be\n"
" FASTA, FASTQ, SAM, or BAM and may be gzipped."
"\n"
"Experimental Options:\n"
"\n"
" Note!: These options may not be supported in future versions.\n"
"\n"
" --checkpoint=N create a checkpoint every N reads [disabled=0]\n"
" --keep-checkpoint do not delete checkpoint files after assembly\n"
" completes successfully [disabled]\n"
" --checkpoint-prefix=STR filename prefix for checkpoint files\n"
" ['bloom-dbg-checkpoint']\n"
"\n"
"Example:\n"
"\n"
" Assemble a genome using a k-mer size of 50bp. Allocate a 1GB\n"
......@@ -105,9 +119,12 @@ static const char USAGE_MESSAGE[] =
/** Assembly params (stores command-line options) */
BloomDBG::AssemblyParams params;
static const char shortopts[] = "b:C:g:H:j:k:K:o:q:Q:R:s:t:T:v";
static const char shortopts[] = "b:C:g:H:i:j:k:K:o:q:Q:R:s:t:T:v";
enum { OPT_HELP = 1, OPT_VERSION, QR_SEED, MIN_KMER_COV };
enum {
OPT_HELP = 1, OPT_VERSION, QR_SEED, MIN_KMER_COV,
CHECKPOINT, KEEP_CHECKPOINT, CHECKPOINT_PREFIX
};
static const struct option longopts[] = {
{ "bloom-size", required_argument, NULL, 'b' },
......@@ -115,8 +132,12 @@ static const struct option longopts[] = {
{ "cov-track", required_argument, NULL, 'C' },
{ "chastity", no_argument, &opt::chastityFilter, 1 },
{ "no-chastity", no_argument, &opt::chastityFilter, 0 },
{ "checkpoint", required_argument, NULL, CHECKPOINT },
{ "keep-checkpoint", no_argument, NULL, KEEP_CHECKPOINT },
{ "checkpoint-prefix", required_argument, NULL, CHECKPOINT_PREFIX },
{ "graph", required_argument, NULL, 'g' },
{ "num-hashes", required_argument, NULL, 'H' },
{ "input-bloom", required_argument, NULL, 'i' },
{ "help", no_argument, NULL, OPT_HELP },
{ "threads", required_argument, NULL, 'j' },
{ "trim-masked", no_argument, &opt::trimMasked, 1 },
......@@ -139,6 +160,213 @@ static const struct option longopts[] = {
{ NULL, 0, NULL, 0 }
};
void printCascadingBloomStats(HashAgnosticCascadingBloom& bloom, ostream& os)
{
for (unsigned i = 0; i < bloom.levels(); i++) {
os << "Stats for Bloom filter level " << i+1 << ":\n"
<< "\tBloom size (bits): "
<< bloom.getBloomFilter(i).getFilterSize() << "\n"
<< "\tBloom popcount (bits): "
<< bloom.getBloomFilter(i).getPop() << "\n"
<< "\tBloom filter FPR: " << setprecision(3)
<< 100 * bloom.getBloomFilter(i).getFPR() << "%\n";
}
}
/** Create optional auxiliary output files */
template <typename BloomFilterT>
void writeAuxiliaryFiles(int argc, char** argv, const BloomFilterT& bloom,
const BloomDBG::AssemblyParams& params)
{
/* generate wiggle coverage track */
if (!params.covTrackPath.empty() && !params.refPath.empty())
BloomDBG::writeCovTrack(bloom, params);
/* generate de Bruijn graph in GraphViz format */
if (!params.graphPath.empty()) {
ofstream graphOut(params.graphPath.c_str());
assert_good(graphOut, params.graphPath);
BloomDBG::outputGraph(argc, argv, bloom, params, graphOut);
assert_good(graphOut, params.graphPath);
graphOut.close();
assert_good(graphOut, params.graphPath);
}
}
/** Initialize global variables for k-mer size and spaced seed pattern */
void initGlobals(const BloomDBG::AssemblyParams& params)
{
/* set global variable for k-mer length */
MaskedKmer::setLength(params.k);
if (params.verbose)
cerr << "Assembling with k-mer size " << params.k << endl;
/* set global variable for spaced seed */
if (params.K > 0)
MaskedKmer::setMask(SpacedSeed::kmerPair(params.k, params.K));
else if (params.qrSeedLen > 0)
MaskedKmer::setMask(SpacedSeed::qrSeedPair(params.k, params.qrSeedLen));
else
MaskedKmer::setMask(params.spacedSeed);
if (params.verbose && !MaskedKmer::mask().empty())
cerr << "Using spaced seed " << MaskedKmer::mask() << endl;
}
/**
* Resume assembly from previously saved checkpoint.
*/
void resumeAssemblyFromCheckpoint(int argc, char** argv,
BloomDBG::AssemblyParams& params, ostream& out)
{
assert(params.checkpointsEnabled() && checkpointExists(params));
assert(params.initialized());
initGlobals(params);
/* empty Bloom filter de Bruijn graph */
HashAgnosticCascadingBloom solidKmerSet;
/* empty visited k-mers Bloom filter */
BTL::BloomFilter visitedKmerSet;
/* counters for progress messages */
BloomDBG::AssemblyCounters counters;
/* setup input/output streams for the assembly */
/* input reads */
FastaConcat in(argv + optind, argv + argc, FastaReader::FOLD_CASE);
/* output stream for duplicate contigs FASTA output (for checkpoints) */
ofstream checkpointOut;
assert(!params.checkpointPathPrefix.empty());
string prefix = params.checkpointPathPrefix;
string checkpointPath = prefix + BloomDBG::CHECKPOINT_FASTA_EXT;
checkpointOut.open(checkpointPath.c_str(), std::ofstream::app);
assert_good(checkpointOut, checkpointPath);
/* stream for trace file output ('-T' option) */
ofstream traceOut;
if (!params.tracePath.empty()) {
traceOut.open(params.tracePath.c_str());
assert_good(traceOut, params.tracePath);
BloomDBG::SeqExtensionResult::printHeaders(traceOut);
assert_good(traceOut, params.tracePath);
}
/* bundle input/output streams for assembly */
BloomDBG::AssemblyStreams<FastaConcat>
streams(in, out, checkpointOut, traceOut);
/* restore state of Bloom filters, counters, and input/output streams */
BloomDBG::resumeFromCheckpoint(solidKmerSet, visitedKmerSet,
counters, params, streams);
/* resume the assembly */
BloomDBG::assemble(solidKmerSet, visitedKmerSet, counters, params,
streams);
}
/**
* Do the assembly after loading a pre-built Bloom filter
* from file (`-i` option). (The input Bloom filter file
* is constructed using `abyss-bloom build -t rolling-hash`.)
*/
void prebuiltBloomAssembly(int argc, char** argv,
BloomDBG::AssemblyParams& params, ostream& out)
{
/* load prebuilt Bloom filter from file */
assert(!params.bloomPath.empty());
if (params.verbose)
cerr << "Loading prebuilt Bloom filter from `" << params.bloomPath
<< "'" << endl;
/* load the Bloom filter from file */
HashAgnosticCascadingBloom bloom(params.bloomPath);
if (params.verbose)
cerr << "Bloom filter FPR: " << setprecision(3)
<< bloom.FPR() * 100 << "%" << endl;
printCascadingBloomStats(bloom, cerr);
/* override command line options with values from Bloom file */
params.k = bloom.getKmerSize();
params.numHashes = bloom.getHashNum();
params.bloomSize = bloom.size() + 7 / 8;
if (params.trim == std::numeric_limits<unsigned>::max())
params.trim = params.k;
assert(params.numHashes <= MAX_HASHES);
assert(params.initialized());
/* init global vars for k-mer size and spaced seed pattern */
initGlobals(params);
if (params.verbose)
cerr << params;
/* do assembly */
BloomDBG::assemble(argc - optind, argv + optind,
bloom, params, out);
/* write supplementary files (e.g. GraphViz) */
writeAuxiliaryFiles(argc - optind, argv + optind, bloom, params);
}
/**
* Load the reads into a cascading Bloom filter and do the assembly.
*/
void cascadingBloomAssembly(int argc, char** argv,
const BloomDBG::AssemblyParams& params, ostream& out)
{
/* init global vars for k-mer size and spaced seed pattern */
assert(params.initialized());
initGlobals(params);
if (params.verbose)
cerr << params;
/*
* Build/load cascading Bloom filetr.
*
* Note 1: We use (params.minCov + 1) here because we use an additional
* Bloom filter in BloomDBG::assemble() to track the set of
* assembled k-mers.
*
* Note 2: BloomFilter class requires size to be a multiple of 64.
*/
const size_t bitsPerByte = 8;
size_t bloomLevelSize = BloomDBG::roundUpToMultiple(
params.bloomSize * bitsPerByte / (params.minCov + 1), (size_t)64);
HashAgnosticCascadingBloom cascadingBloom(
bloomLevelSize, params.numHashes, params.minCov, params.k);
BloomDBG::loadBloomFilter(argc, argv, cascadingBloom, params.verbose);
if (params.verbose)
printCascadingBloomStats(cascadingBloom, cerr);
/* second pass through FASTA files for assembling */
BloomDBG::assemble(argc - optind, argv + optind,
cascadingBloom, params, out);
/* write supplementary files (e.g. GraphViz) */
writeAuxiliaryFiles(argc - optind, argv + optind, cascadingBloom, params);
}
/**
* Create a de novo genome assembly using a Bloom filter de
* Bruijn graph.
......@@ -161,6 +389,8 @@ int main(int argc, char** argv)
arg >> params.graphPath; break;
case 'H':
arg >> params.numHashes; break;
case 'i':
arg >> params.bloomPath; break;
case 'j':
arg >> params.threads; break;
case 'k':
......@@ -199,7 +429,17 @@ int main(int argc, char** argv)
params.resetSpacedSeedParams();
arg >> params.qrSeedLen;
break;
case CHECKPOINT:
arg >> params.readsPerCheckpoint;
break;
case KEEP_CHECKPOINT:
params.keepCheckpoint = true;
break;
case CHECKPOINT_PREFIX:
arg >> params.checkpointPathPrefix;
break;
}
if (optarg != NULL && (!arg.eof() || arg.fail())) {
cerr << PROGRAM ": invalid option: `-"
<< (char)c << optarg << "'\n";
......@@ -207,12 +447,12 @@ int main(int argc, char** argv)
}
}
if (params.bloomSize == 0) {
if (params.bloomPath.empty() && params.bloomSize == 0) {
cerr << PROGRAM ": missing mandatory option `-b'\n";
die = true;
}
if (params.k == 0) {
if (params.bloomPath.empty() && params.k == 0) {
cerr << PROGRAM ": missing mandatory option `-k'\n";
die = true;
}
......@@ -236,12 +476,12 @@ int main(int argc, char** argv)
}
if (!params.covTrackPath.empty() && params.refPath.empty()) {
cerr << PROGRAM ": you must specify a reference with `-r' "
cerr << PROGRAM ": you must specify a reference with `-R' "
"when using `-C'\n";
die = true;
}
if (params.trim == std::numeric_limits<unsigned>::max()) {
if (params.k > 0 && params.trim == std::numeric_limits<unsigned>::max()) {
params.trim = params.k;
}
......@@ -256,27 +496,11 @@ int main(int argc, char** argv)
exit(EXIT_FAILURE);
}
assert(params.initialized());
#if _OPENMP
if (params.threads > 0)
omp_set_num_threads(params.threads);
#endif
/* set global variable for k-mer length */
MaskedKmer::setLength(params.k);
/* set global variable for spaced seed */
if (params.K > 0)
MaskedKmer::setMask(SpacedSeed::kmerPair(params.k, params.K));
else if (params.qrSeedLen > 0)
MaskedKmer::setMask(SpacedSeed::qrSeedPair(params.k, params.qrSeedLen));
else
MaskedKmer::setMask(params.spacedSeed);
if (params.verbose && !MaskedKmer::mask().empty())
cerr << "Using spaced seed " << MaskedKmer::mask() << endl;
/* print contigs to STDOUT unless -o option was set */
ofstream outputFile;
if (!params.outputPath.empty()) {
......@@ -285,58 +509,13 @@ int main(int argc, char** argv)
}
ostream& out = params.outputPath.empty() ? cout : outputFile;
/* BloomFilter class requires size to be a multiple of 64 */
const size_t bitsPerByte = 8;
/*
* Note: it is (params.minCov + 1) here because we use an additional
* Bloom filter in BloomDBG::assemble() to track the set of
* assembled k-mers.
*/
size_t bloomLevelSize = BloomDBG::roundUpToMultiple(
params.bloomSize * bitsPerByte / (params.minCov + 1), (size_t)64);
/* use cascading Bloom filter to remove error k-mers */
HashAgnosticCascadingBloom cascadingBloom(
bloomLevelSize, params.numHashes, params.minCov, params.k);
/* load reads into Bloom filter */
for (int i = optind; i < argc; ++i) {
/*
* Debugging feature: If there is a ':'
* separating the list of input read files into
* two parts, use the first set of files
* to load the Bloom filter and the second
* set of files for the assembly (read extension).
*/
if (strcmp(argv[i],":") == 0) {
optind = i + 1;
break;
}
BloomDBG::loadFile(cascadingBloom, argv[i], params.verbose);
}
if (params.verbose)
cerr << "Bloom filter FPR: " << setprecision(3)
<< cascadingBloom.FPR() * 100 << "%" << endl;
if (!params.covTrackPath.empty()) {
assert(!params.refPath.empty());
BloomDBG::writeCovTrack(cascadingBloom, params);
}
/* second pass through FASTA files for assembling */
BloomDBG::assemble(argc - optind, argv + optind,
cascadingBloom, params, out);
/* generate de Bruijn graph in GraphViz format (optional) */
if (!params.graphPath.empty()) {
ofstream graphOut(params.graphPath.c_str());
assert_good(graphOut, params.graphPath);
BloomDBG::outputGraph(argc - optind, argv + optind,
cascadingBloom, params, graphOut);
assert_good(graphOut, params.graphPath);
graphOut.close();
assert_good(graphOut, params.graphPath);
}
/* load the Bloom filter and do the assembly */
if (params.checkpointsEnabled() && checkpointExists(params))
resumeAssemblyFromCheckpoint(argc, argv, params, out);
else if (!params.bloomPath.empty())
prebuiltBloomAssembly(argc, argv, params, out);
else
cascadingBloomAssembly(argc, argv, params, out);
/* cleanup */
if (!params.outputPath.empty())
......
#ifndef BLOOM_DBG_H
#define BLOOM_DBG_H 1
#include "BloomDBG/AssemblyParams.h"
#include "BloomDBG/AssemblyCounters.h"
#include "BloomDBG/RollingHashIterator.h"
#include "Common/Uncompress.h"
#include "Common/IOUtil.h"
#include "Common/Sequence.h"
#include "DataLayer/FastaReader.h"
#include "Graph/Path.h"
#include "Graph/ExtendPath.h"
#include "Graph/BreadthFirstSearch.h"
#include "BloomDBG/BloomIO.h"
#include "BloomDBG/Checkpoint.h"
#include "BloomDBG/MaskedKmer.h"
#include "BloomDBG/RollingHash.h"
#include "BloomDBG/RollingBloomDBG.h"
......@@ -33,162 +38,6 @@ namespace BloomDBG {
*/
typedef RollingBloomDBGVertex Vertex;
/**
* Parameters controlling assembly.
*/
struct AssemblyParams
{
/** Bloom filter size (in bits) */
size_t bloomSize;
/** minimum k-mer coverage threshold */
unsigned minCov;
/** WIG track containing 0/1 for sufficient k-mer cov */
std::string covTrackPath;
/** path for output GraphViz file */
string graphPath;
/** num Bloom filter hash functions */
unsigned numHashes;
/** the number of parallel threads. */
unsigned threads;
/** the size of a k-mer. */
unsigned k;
/** the size of a single k-mer in a k-mer pair */
unsigned K;
/** reference genome */
std::string refPath;
/** Quadratic Residue (QR) seed length */
unsigned qrSeedLen;
/** spaced seed */
string spacedSeed;
/** maximum length of branches to trim */
unsigned trim;
/** verbose level for progress messages */
int verbose;
/** output contigs path (empty string indicates STDOUT) */
std::string outputPath;
/** output path for trace file (-T) option */
std::string tracePath;
/** Default constructor */
AssemblyParams() : bloomSize(0), minCov(2), graphPath(),
numHashes(1), threads(1), k(0), K(0), qrSeedLen(0),
spacedSeed(), trim(std::numeric_limits<unsigned>::max()),
verbose(0), outputPath(), tracePath() {}
/** Return true if all required members are initialized */
bool initialized() const {
return bloomSize > 0 && k > 0 &&
trim != std::numeric_limits<unsigned>::max();
}
/** Reset all spaced seed params to their default values */
void resetSpacedSeedParams() {
spacedSeed.clear();
K = 0;
qrSeedLen = 0;
}
};
/**
* Round up `num` to the nearest multiple of `base`.
*/
template <typename T>
inline static T roundUpToMultiple(T num, T base)
{
if (base == 0)
return num;
T remainder = num % base;
if (remainder == 0)
return num;
return num + base - remainder;
}
/**
* Load DNA sequence into Bloom filter using rolling hash.
*
* @param bloom target Bloom filter
* @param seq DNA sequence
*/
template <typename BF>
inline static void loadSeq(BF& bloom, const std::string& seq)
{
const unsigned k = bloom.getKmerSize();
const unsigned numHashes = bloom.getHashNum();
for (RollingHashIterator it(seq, numHashes, k);
it != RollingHashIterator::end(); ++it) {
bloom.insert(*it);
}
}
/**
* Load sequences contents of FASTA file into Bloom filter using
* rolling hash.
* @param bloom target Bloom filter
* @param path path to FASTA file
* @param verbose if true, print progress messages to STDERR
*/
template <typename BF>
inline static void loadFile(BF& bloom, const std::string& path,
bool verbose = false)
{
const size_t BUFFER_SIZE = 1000000;
const size_t LOAD_PROGRESS_STEP = 10000;
assert(!path.empty());
if (verbose)
std::cerr << "Reading `" << path << "'..." << std::endl;
FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
uint64_t readCount = 0;
#pragma omp parallel
for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
buffer.clear();
size_t bufferSize = 0;
bool good = true;
#pragma omp critical(in)
for (; good && bufferSize < BUFFER_SIZE;) {
std::string seq;
good = in >> seq;
if (good) {
buffer.push_back(seq);
bufferSize += seq.length();
}
}
if (buffer.size() == 0)
break;
for (size_t j = 0; j < buffer.size(); j++) {
loadSeq(bloom, buffer.at(j));
if (verbose)
#pragma omp critical(cerr)
{
readCount++;
if (readCount % LOAD_PROGRESS_STEP == 0)
std::cerr << "Loaded " << readCount
<< " reads into Bloom filter\n";
}
}
}
assert(in.eof());
if (verbose) {
std::cerr << "Loaded " << readCount << " reads from `"
<< path << "` into Bloom filter\n";
}
}
/**
* Return true if all of the k-mers in `seq` are contained in `bloom`
* and false otherwise.
......@@ -448,22 +297,6 @@ namespace BloomDBG {
return result;
}
/**
* Counters for tracking assembly statistics and producing
* progress messages.
*/
struct AssemblyCounters
{
size_t readsExtended;
size_t readsProcessed;
size_t basesAssembled;
size_t contigID;
AssemblyCounters() : readsExtended(0), readsProcessed(0),
basesAssembled(0), contigID(0) {}
};
/** Print an intermediate progress message during assembly */
void printProgressMessage(AssemblyCounters counters)
{
......@@ -473,6 +306,10 @@ namespace BloomDBG {
<< " of " << counters.readsProcessed
<< " reads (" << std::setprecision(3) << (float)100
* counters.readsExtended / counters.readsProcessed
<< "%), saw " << counters.solidReads
<< " of " << counters.readsProcessed
<< " solid reads (" << std::setprecision(3) << (float)100
* counters.solidReads / counters.readsProcessed
<< "%), assembled " << counters.basesAssembled
<< " bp so far" << std::endl;
}
......@@ -769,19 +606,22 @@ namespace BloomDBG {
* (e.g. k-mer coverage threshold)
* @param counters counter variables used for generating assembly
* progress messages.
* @param out output stream for contigs
* @param traceOut output stream for trace file (-T option)
* @param streams input/output streams used during assembly
*/
template <typename GraphT, typename BloomT>
template <typename GraphT, typename BloomT, typename AssemblyStreamsT>
inline static void extendRead(const FastaRecord& read,
const GraphT& dbg, BloomT& assembledKmerSet,
const AssemblyParams& params, AssemblyCounters& counters,
std::ostream& out, std::ostream& traceOut)
AssemblyStreamsT& streams)
{
const unsigned k = params.k;
const unsigned numHashes = params.numHashes;
const unsigned minBranchLen = params.trim + 1;
std::ostream& out = streams.out;
std::ostream& checkpointOut = streams.checkpointOut;
std::ostream& traceOut = streams.traceOut;
if (params.verbose >= 2) {
#pragma omp critical(cerr)
std::cerr << "Extending read: " << read.id << std::endl;
......@@ -851,6 +691,11 @@ namespace BloomDBG {
/* add contig to output FASTA */
printContig(seq, counters.contigID, read.id, k, out);
/* add contig to checkpoint FASTA file */
if (params.checkpointsEnabled())
printContig(seq, counters.contigID, read.id, k,
checkpointOut);
/* update counters / trace results */
traceResult.redundantContig = false;
traceResult.contigID = counters.contigID;
......@@ -867,6 +712,104 @@ namespace BloomDBG {
}
} /* for each read segment */
if (params.verbose >= 2) {
#pragma omp critical(cerr)
std::cerr << "Finished extending read: " << read.id << std::endl;
}
}
/**
* Return true if the left end of the given sequence is a blunt end
* in the Bloom filterde Bruijn graph. PRECONDITION: `seq` does not contain
* any non-ACGT chars.
*/
template <typename GraphT>
inline static unsigned leftIsBluntEnd(const Sequence& seq, const GraphT& graph,
const AssemblyParams& params)
{
unsigned k = params.k;
unsigned numHashes = params.numHashes;
unsigned fpLookAhead = 5;
if (seq.length() < k)
return false;
Sequence firstKmerStr(seq, 0, k);
Path<Vertex> path = seqToPath(firstKmerStr, k, numHashes);
Vertex& firstKmer = path.front();
return !lookAhead(firstKmer, REVERSE, fpLookAhead, graph);
}
/**
* Return true if the left and/or right end of the sequence is a blunt
* end in the de Bruijn graph. Such reads likely have read errors
* and thus we should not try to extend these into contings.
* When testing for a blunt end, we account for the possibility of
* false positive extensions by using a small amount of look-ahead
* (as dictated by params.fpLookAhead).
*/
template <typename GraphT>
inline static bool hasBluntEnd(const Sequence& seq, const GraphT& graph,
const AssemblyParams& params)
{
if (leftIsBluntEnd(seq, graph, params))
return true;
Sequence rc = reverseComplement(seq);
if (leftIsBluntEnd(rc, graph, params))
return true;
return false;
}
/**
* Decide if a read should be extended and if so extend it into a contig.
*/
template <typename SolidKmerSetT, typename AssembledKmerSetT,
typename AssemblyStreamsT>
static inline void processRead(const FastaRecord& rec,
const SolidKmerSetT& solidKmerSet, AssembledKmerSetT& assembledKmerSet,
const AssemblyParams& params, AssemblyCounters& counters,
AssemblyStreamsT& streams)
{
/* Boost graph API for Bloom filter */
RollingBloomDBG<SolidKmerSetT> graph(solidKmerSet);
unsigned k = params.k;
const Sequence& seq = rec.seq;
/* we can't extend reads shorter than k */
if (seq.length() < k)
return;
/* skip reads with non-ACGT chars */
if (!allACGT(seq))
return;
/* don't extend reads that are tips */
if (hasBluntEnd(seq, graph, params))
return;
/* only extend "solid" reads */
if (!allKmersInBloom(seq, solidKmerSet))
return;
#pragma omp atomic
counters.solidReads++;
/* skip reads in previously assembled regions */
if (allKmersInBloom(seq, assembledKmerSet))
return;
/* extend the read into a contig */
extendRead(rec, graph, assembledKmerSet, params,
counters, streams);
#pragma omp atomic
counters.readsExtended++;
}
/**
......@@ -885,18 +828,30 @@ namespace BloomDBG {
* @param verbose set to true to print progress messages to
* STDERR
*/
template <typename BloomT>
inline static void assemble(int argc, char** argv, const BloomT& goodKmerSet,
const AssemblyParams& params, std::ostream& out)
template <typename SolidKmerSetT>
inline static void assemble(int argc, char** argv,
SolidKmerSetT& solidKmerSet, const AssemblyParams& params,
std::ostream& out)
{
assert(params.initialized());
/* k-mers in previously assembled contigs */
BTL::BloomFilter visitedKmerSet(solidKmerSet.size(),
solidKmerSet.getHashNum(), solidKmerSet.getKmerSize());
/* per-thread I/O buffer (size is in bases) */
const size_t SEQ_BUFFER_SIZE = 1000000;
/* counters for progress messages */
AssemblyCounters counters;
/* print progress message after processing this many reads */
const unsigned progressStep = 1000;
const unsigned k = goodKmerSet.getKmerSize();
/* input reads */
FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
/* duplicate FASTA output for checkpoints */
std::ofstream checkpointOut;
if (params.checkpointsEnabled()) {
assert(!params.checkpointPathPrefix.empty());
std::string path = params.checkpointPathPrefix
+ CHECKPOINT_FASTA_EXT + CHECKPOINT_TMP_EXT;
checkpointOut.open(path.c_str());
assert_good(checkpointOut, path);
}
/* trace file output ('-T' option) */
std::ofstream traceOut;
......@@ -907,33 +862,74 @@ namespace BloomDBG {
assert_good(traceOut, params.tracePath);
}
/* k-mers in previously assembled contigs */
BloomFilter assembledKmerSet(goodKmerSet.size(),
goodKmerSet.getHashNum(), goodKmerSet.getKmerSize());
/* counters for progress messages */
AssemblyCounters counters;
/* bundle output streams */
AssemblyStreams<FastaConcat> streams(in, out, checkpointOut, traceOut);
/* Boost graph API over Bloom filter */
RollingBloomDBG<BloomT> graph(goodKmerSet);
/* run the assembly */
assemble(solidKmerSet, visitedKmerSet, counters, params, streams);
}
/**
* Perform a Bloom-filter-based de Bruijn graph assembly.
* Contigs are generated by extending reads left/right within
* the de Bruijn graph, up to the next branching point or dead end.
* Short branches due to Bloom filter false positives are
* ignored.
*
* @param argc number of input FASTA files
* @param argv array of input FASTA filenames
* @param genomeSize approx genome size
* @param goodKmerSet Bloom filter containing k-mers that
* occur more than once in the input data
* @param visitedKmerSet Bloom filter containing k-mers that have
* already been included in contigs
* @param in input stream for sequencing reads (FASTA)
* @param out output stream for contigs (FASTA)
* @param verbose set to true to print progress messages to
* STDERR
*/
template <typename SolidKmerSetT, typename VisitedKmerSetT,
typename InputReadStreamT>
inline static void assemble(const SolidKmerSetT& goodKmerSet,
VisitedKmerSetT& assembledKmerSet, AssemblyCounters& counters,
const AssemblyParams& params,
AssemblyStreams<InputReadStreamT>& streams)
{
assert(params.initialized());
/* per-thread I/O buffer (size is in bases) */
const size_t SEQ_BUFFER_SIZE = 1000000;
/* controls frequency of progress messages */
const unsigned PROGRESS_STEP = 1000;
if (params.verbose)
std::cerr << "Trimming branches " << params.trim
<< " k-mers or shorter" << std::endl;
FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
#pragma omp parallel
for (std::vector<FastaRecord> buffer;;) {
InputReadStreamT& in = streams.in;
std::ostream& checkpointOut = streams.checkpointOut;
while (true)
{
size_t readsUntilCheckpoint = params.readsPerCheckpoint;
#pragma omp parallel
for (std::vector<FastaRecord> buffer;;)
{
/* read sequences in batches to reduce I/O contention */
buffer.clear();
size_t bufferSize;
bool good = true;
#pragma omp critical(in)
for (bufferSize = 0; bufferSize < SEQ_BUFFER_SIZE;) {
for (bufferSize = 0; bufferSize < SEQ_BUFFER_SIZE
&& readsUntilCheckpoint > 0;)
{
FastaRecord rec;
good = in >> rec;
if (!good)
break;
#pragma omp atomic
readsUntilCheckpoint--;
buffer.push_back(rec);
bufferSize += rec.seq.length();
}
......@@ -941,50 +937,40 @@ namespace BloomDBG {
break;
for (std::vector<FastaRecord>::iterator it = buffer.begin();
it != buffer.end(); ++it) {
const FastaRecord& rec = *it;
bool skip = false;
/* we can't extend reads shorter than k */
if (rec.seq.length() < k)
skip = true;
/* only extend error-free reads */
if (!skip && !allKmersInBloom(rec.seq, goodKmerSet))
skip = true;
/* skip reads in previously assembled regions */
if (!skip && allKmersInBloom(rec.seq, assembledKmerSet))
skip = true;
/* extend the read left and right within the DBG */
if (!skip) {
extendRead(rec, graph, assembledKmerSet, params,
counters, out, traceOut);
#pragma omp atomic
counters.readsExtended++;
}
it != buffer.end(); ++it)
{
processRead(*it, goodKmerSet, assembledKmerSet,
params, counters, streams);
#pragma omp atomic
counters.readsProcessed++;
if (params.verbose && counters.readsProcessed % progressStep == 0)
if (params.verbose && counters.readsProcessed % PROGRESS_STEP == 0)
printProgressMessage(counters);
}
} /* for each read */
} /* for each batch of reads (parallel) */
} /* for batch of reads between I/O operations */
if (readsUntilCheckpoint > 0) {
assert(in.eof());
if (!params.tracePath.empty()) {
traceOut.close();
assert_good(traceOut, params.tracePath);
break;
}
/* save assembly state to checkpoint files */
checkpointOut.flush();
createCheckpoint(goodKmerSet, assembledKmerSet, counters,
params);
} /* for each batch of reads between checkpoints */
assert(in.eof());
if (params.verbose) {
printProgressMessage(counters);
std::cerr << "Assembly complete" << std::endl;
}
if (params.checkpointsEnabled() && !params.keepCheckpoint)
removeCheckpointData(params);
}
/**
......
@article{simpson2009abyss,
title={ABySS: a parallel assembler for short read sequence data},
author={Simpson, Jared T and Wong, Kim and Jackman, Shaun D and Schein, Jacqueline E and Jones, Steven JM and Birol, Inan{\c{c}}},
journal={Genome research},
volume={19},
number={6},
pages={1117--1123},
year={2009},
publisher={Cold Spring Harbor Lab}
@article{Jackman_2017,
title={ABySS 2.0: resource-efficient assembly of large genomes using a Bloom filter},
volume={27},
ISSN={1549-5469},
url={http://doi.org/10.1101/gr.214346.116},
DOI={10.1101/gr.214346.116},
number={5},
journal={Genome Research},
publisher={Cold Spring Harbor Laboratory},
author={Jackman, Shaun D. and Vandervalk, Benjamin P. and Mohamadi, Hamid and Chu, Justin and Yeo, Sarah and Hammond, S. Austin and Jahesh, Golnaz and Khan, Hamza and Coombe, Lauren and Warren, Rene L. and et al.},
year={2017},
month={Feb},
pages={768–777}
}
Citation
================================================================================
# Citation
+ [doi:10.1101/gr.089532.108](http://dx.doi.org/10.1101/gr.089532.108)
+ [PubMed PMID: 19251739](http://www.ncbi.nlm.nih.gov/pubmed/19251739)
+ [Google Scholar](http://scholar.google.ca/scholar?q=doi%3A10.1101%2Fgr.089532.108)
+ [Genome Research](http://genome.cshlp.org/content/19/6/1117.short)
+ [PDF](http://genome.cshlp.org/content/19/6/1117.full.pdf)
Shaun D Jackman, Benjamin P Vandervalk, Hamid Mohamadi, Justin Chu, Sarah Yeo, S Austin Hammond, Golnaz Jahesh, Hamza Khan, Lauren Coombe, René L Warren, and Inanc Birol (2017).
**ABySS 2.0: Resource-efficient assembly of large genomes using a Bloom filter**.
*Genome research*, 27(5), 768-777.
[doi:10.1101/gr.214346.116](http://doi.org/10.1101/gr.214346.116)
BibTeX
------------------------------------------------------------
- [PDF](http://genome.cshlp.org/content/27/5/768.full.pdf)
- [Genome Research](http://genome.cshlp.org/content/27/5/768)
- [PubMed PMID: 28232478](https://www.ncbi.nlm.nih.gov/pubmed/28232478)
- [Google Scholar](http://scholar.google.ca/scholar?q=10.1101/gr.214346.116)
# BibTeX
```bibtex
@article{simpson2009abyss,
title={ABySS: a parallel assembler for short read sequence data},
author={Simpson, Jared T and Wong, Kim and Jackman, Shaun D and Schein, Jacqueline E and Jones, Steven JM and Birol, Inan{\c{c}}},
journal={Genome research},
volume={19},
number={6},
pages={1117--1123},
year={2009},
publisher={Cold Spring Harbor Lab}
@article{Jackman_2017,
title={ABySS 2.0: resource-efficient assembly of large genomes using a Bloom filter},
volume={27},
ISSN={1549-5469},
url={http://doi.org/10.1101/gr.214346.116},
DOI={10.1101/gr.214346.116},
number={5},
journal={Genome Research},
publisher={Cold Spring Harbor Laboratory},
author={Jackman, Shaun D. and Vandervalk, Benjamin P. and Mohamadi, Hamid and Chu, Justin and Yeo, Sarah and Hammond, S. Austin and Jahesh, Golnaz and Khan, Hamza and Coombe, Lauren and Warren, Rene L. and et al.},
year={2017},
month={Feb},
pages={768–777}
}
```
MLA
------------------------------------------------------------
# MLA
Simpson, Jared T., et al. "ABySS: a parallel assembler for short read sequence data." *Genome research* 19.6 (2009): 1117-1123.
Jackman, Shaun D., et al. "ABySS 2.0: resource-efficient assembly of large genomes using a Bloom filter." Genome research 27.5 (2017): 768-777.
APA
------------------------------------------------------------
# APA
Simpson, J. T., Wong, K., Jackman, S. D., Schein, J. E., Jones, S. J., & Birol, I. (2009). ABySS: a parallel assembler for short read sequence data. *Genome research*, 19(6), 1117-1123.
Jackman, S. D., Vandervalk, B. P., Mohamadi, H., Chu, J., Yeo, S., Hammond, S. A., ... & Birol, I. (2017). ABySS 2.0: resource-efficient assembly of large genomes using a Bloom filter. Genome research, 27(5), 768-777.
Chicago
------------------------------------------------------------
# Chicago
Simpson, Jared T., Kim Wong, Shaun D. Jackman, Jacqueline E. Schein, Steven JM Jones, and Inanç Birol. "ABySS: a parallel assembler for short read sequence data." *Genome research* 19, no. 6 (2009): 1117-1123.
Jackman, Shaun D., Benjamin P. Vandervalk, Hamid Mohamadi, Justin Chu, Sarah Yeo, S. Austin Hammond, Golnaz Jahesh et al. "ABySS 2.0: resource-efficient assembly of large genomes using a Bloom filter." Genome research 27, no. 5 (2017): 768-777.