Commit fcd96a6b authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 1.5.2

parent b391f1fb
*/Makefile.in
Makefile.in
aclocal.m4
autom4te.cache
config.h.in
configure
depcomp
install-sh
missing
test-driver
_*
*.swp
*.swo
*.swn
*~
tags
compile
*.orig
language: cpp
compiler:
- gcc
- clang
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libboost-dev libopenmpi-dev libsparsehash-dev
script: ./autogen.sh && ./configure --with-mpi=/usr/lib/openmpi && make
......@@ -58,6 +58,7 @@ static void assemble(const string& pathIn, const string& pathOut)
bind1st(ptr_fun(AssemblyAlgorithms::loadSequences), &g));
size_t numLoaded = g.size();
cout << "Loaded " << numLoaded << " k-mer\n";
g.setDeletedKey();
g.shrink();
if (g.empty()) {
cerr << "error: no usable sequence\n";
......
This diff is collapsed.
......@@ -31,10 +31,10 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Shaun Jackman.\n"
"\n"
"Copyright 2013 Canada's Michael Smith Genome Science Centre\n";
"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " [OPTION]... [FILE]...\n"
"Usage: " PROGRAM " -k<kmer> [OPTION]... [FILE]...\n"
"Find overlaps of [m,k) bases. Contigs may be read from FILE(s)\n"
"or standard input. Output is written to standard output.\n"
"Overlaps of exactly k-1 bases are found using a hash table.\n"
......@@ -47,6 +47,8 @@ static const char USAGE_MESSAGE[] =
" --adj output the results in adj format [default]\n"
" --dot output the results in dot format\n"
" --sam output the results in SAM format\n"
" --SS expect contigs to be oriented correctly\n"
" --no-SS no assumption about contig orientation\n"
" -v, --verbose display verbose output\n"
" --help display this help and exit\n"
" --version output version information and exit\n"
......@@ -57,6 +59,9 @@ namespace opt {
unsigned k; // used by GraphIO
int format; // used by GraphIO
/** Run a strand-specific RNA-Seq assembly. */
static int ss;
/** The minimum required amount of overlap. */
static unsigned minOverlap = 50;
}
......@@ -71,6 +76,8 @@ static const struct option longopts[] = {
{ "adj", no_argument, &opt::format, ADJ },
{ "dot", no_argument, &opt::format, DOT },
{ "sam", no_argument, &opt::format, SAM },
{ "SS", no_argument, &opt::ss, 1 },
{ "no-SS", no_argument, &opt::ss, 0 },
{ "verbose", no_argument, NULL, 'v' },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
......@@ -102,6 +109,8 @@ static void addOverlapsSA(Graph& g, const SuffixArray& sa,
pair<It, It> range = sa.equal_range(q);
for (It it = range.first; it != range.second; ++it) {
ContigNode u(it->second);
if (opt::ss && u.sense() != v.sense())
continue;
if (seen.insert(u).second) {
// Add the longest overlap between two vertices.
unsigned overlap = it->first.size();
......@@ -118,7 +127,6 @@ static void addOverlapsSA(Graph& g, const vector<Kmer>& prefixes)
typedef pair<string, ContigNode> Suffix;
typedef vector<Suffix> Suffixes;
Suffixes suffixes;
SuffixArray sa(opt::minOverlap);
typedef graph_traits<Graph>::vertex_descriptor V;
typedef graph_traits<Graph>::vertex_iterator Vit;
......@@ -132,8 +140,12 @@ static void addOverlapsSA(Graph& g, const vector<Kmer>& prefixes)
assert(uci < prefixes.size());
string suffix(reverseComplement(prefixes[uci]).str());
suffixes.push_back(Suffix(suffix, u));
sa.insert(suffixes.back());
}
SuffixArray sa(opt::minOverlap);
for (Suffixes::const_iterator it = suffixes.begin();
it != suffixes.end(); ++it)
sa.insert(*it);
sa.construct();
for (Suffixes::const_iterator it = suffixes.begin();
......@@ -144,7 +156,8 @@ static void addOverlapsSA(Graph& g, const vector<Kmer>& prefixes)
}
/** An index of suffixes of k-1 bp. */
typedef unordered_map<Kmer, vector<ContigNode>, hashKmer> SuffixMap;
typedef unordered_map<Kmer, vector<ContigNode>, hash<Kmer> >
SuffixMap;
/** Read contigs. Add contig properties to the graph. Add prefixes to
* the collection and add suffixes to their index.
......@@ -261,6 +274,8 @@ int main(int argc, char** argv)
itu = edges.begin(); itu != edges.end(); ++itu) {
V uc = get(vertex_complement, g, *itu);
V vc = get(vertex_complement, g, v);
if (opt::ss && uc.sense() != vc.sense())
continue;
add_edge(vc, uc, -(int)opt::k + 1, static_cast<DG&>(g));
}
}
......
......@@ -5,7 +5,6 @@ AdjList_CPPFLAGS = -I$(top_srcdir) \
-I$(top_srcdir)/DataLayer
AdjList_LDADD = \
$(top_builddir)/Graph/libgraph.a \
$(top_builddir)/DataLayer/libdatalayer.a \
$(top_builddir)/Common/libcommon.a
......
This diff is collapsed.
This diff is collapsed.
......@@ -22,7 +22,7 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Shaun Jackman.\n"
"\n"
"Copyright 2013 Canada's Michael Smith Genome Science Centre\n";
"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " [OPTION]... [FASTA]...\n"
......
......@@ -3,8 +3,10 @@
#include <cassert>
#include <cctype>
#include <ostream>
#include <utility>
#include <iostream>
#include <string>
#include <vector>
/** The result of a Needleman-Wunsch alignment. */
struct NWAlignment {
......@@ -39,4 +41,52 @@ unsigned alignGlobal(
const std::string& a, const std::string& b,
NWAlignment& align);
/** Align the specified pair of sequences.
* @return the number of matches and size of the consensus
*/
static inline std::pair<unsigned, unsigned> alignPair(
const std::string& seqa, const std::string& seqb, NWAlignment& align)
{
unsigned matches = alignGlobal(seqa, seqb, align);
return std::make_pair(matches, align.size());
}
/** Align the specified sequences.
* @return the number of matches and size of the consensus
*/
template <typename Seq>
static std::pair<unsigned, unsigned> alignMulti(
const std::vector<Seq>& seqs, NWAlignment& align)
{
Seq alignment = seqs[0];
unsigned matches = 0;
for (unsigned j = 0; j < seqs.size() - 1; j++) {
matches = std::min(matches, alignGlobal(alignment,
seqs[j+1], align));
alignment = align.match_align;
}
return std::make_pair(matches, alignment.size());
}
/** Align the specified sequences.
* @return the number of matches and size of the consensus
*/
template <typename Seq>
static std::pair<unsigned, unsigned> align(
const std::vector<Seq>& seqs, NWAlignment& aln)
{
assert(seqs.size() > 1);
if (seqs.size() == 2)
return alignPair(seqs[0], seqs[1], aln);
else
return alignMulti(seqs, aln);
}
template <typename Seq>
static std::pair<unsigned, unsigned> align(const std::vector<Seq>& seqs)
{
NWAlignment aln;
return align(seqs, aln);
}
#endif
......@@ -24,7 +24,7 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Anthony Raymond.\n"
"\n"
"Copyright 2013 Canada's Michael Smith Genome Science Centre\n";
"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " [OPTION]... READS1 READS2\n"
......@@ -72,6 +72,7 @@ static struct {
unsigned total_reads;
unsigned merged_reads;
unsigned unmerged_reads;
unsigned unchaste_reads;
unsigned no_alignment;
unsigned too_many_aligns;
unsigned low_matches;
......@@ -256,6 +257,8 @@ static void alignFiles(const char* reads1, const char* reads2)
}
}
r2 >> rec2;
stats.unchaste_reads = r1.unchaste();
stats.total_reads += r1.unchaste();
assert(r1.eof());
assert(r2.eof());
unmerged1.close();
......@@ -320,9 +323,10 @@ int main(int argc, char** argv)
alignFiles(reads1, reads2);
cerr << "Read merging stats: total=" << stats.total_reads
cerr << "Pair merging stats: total=" << stats.total_reads
<< " merged=" << stats.merged_reads
<< " unmerged=" << stats.unmerged_reads << '\n'
<< " unmerged=" << stats.unmerged_reads
<< " unchaste=" << stats.unchaste_reads << '\n'
<< "no_alignment=" << stats.no_alignment
<< " too_many_aligns=" << stats.too_many_aligns
<< " too_few_matches=" << stats.low_matches
......
......@@ -82,14 +82,18 @@ void loadSequences(ISequenceCollection* seqCollection, string inFile)
}
size_t count = 0, count_good = 0,
count_small = 0, count_nonACGT = 0;
FastaReader reader(inFile.c_str(), FastaReader::FOLD_CASE);
count_small = 0, count_nonACGT = 0,
count_reversed = 0;
int fastaFlags = opt::maskCov ? FastaReader::NO_FOLD_CASE :
FastaReader::FOLD_CASE;
FastaReader reader(inFile.c_str(), fastaFlags);
if (endsWith(inFile, ".jf") || endsWith(inFile, ".jfq")) {
// Load k-mer with coverage data.
count = loadKmer(*seqCollection, reader);
count_good = count;
} else
for (Sequence seq; reader >> seq;) {
for (FastaRecord rec; reader >> rec;) {
Sequence seq = rec.seq;
size_t len = seq.length();
if (opt::kmerSize > len) {
count_small++;
......@@ -115,11 +119,24 @@ void loadSequences(ISequenceCollection* seqCollection, string inFile)
bool good = seq.find_first_not_of("ACGT0123") == string::npos;
bool discarded = true;
if (opt::ss && rec.id.size() > 2
&& rec.id.substr(rec.id.size()-2) == "/1") {
seq = reverseComplement(seq);
count_reversed++;
}
for (unsigned i = 0; i < len - opt::kmerSize + 1; i++) {
Sequence kmer(seq, i, opt::kmerSize);
if (good || kmer.find_first_not_of("ACGT0123")
if (good || kmer.find_first_not_of("acgtACGT0123")
== string::npos) {
seqCollection->add(Kmer(kmer));
if (good || kmer.find_first_of("acgt") == string::npos)
seqCollection->add(Kmer(kmer));
else {
transform(kmer.begin(), kmer.end(), kmer.begin(),
::toupper);
seqCollection->add(Kmer(kmer), 0);
}
discarded = false;
}
}
......@@ -139,6 +156,9 @@ void loadSequences(ISequenceCollection* seqCollection, string inFile)
logger(1) << "Read " << count << " reads. ";
seqCollection->printLoad();
if (count_reversed > 0)
cerr << "`" << inFile << "': "
"reversed " << count_reversed << " reads\n";
if (count_small > 0)
cerr << "`" << inFile << "': "
"discarded " << count_small << " reads "
......@@ -230,7 +250,7 @@ size_t markAmbiguous(ISequenceCollection* g)
if (++progress % 1000000 == 0)
logger(1) << "Splitting: " << progress << '\n';
if (it->first.isPalindrome()) {
if (!opt::ss && it->first.isPalindrome()) {
countv += 2;
g->mark(it->first);
counte += markNeighbours(g, *it, SENSE);
......@@ -238,7 +258,7 @@ size_t markAmbiguous(ISequenceCollection* g)
for (extDirection sense = SENSE;
sense <= ANTISENSE; ++sense) {
if (it->second.getExtension(sense).isAmbiguous()
|| it->first.isPalindrome(sense)) {
|| (!opt::ss && it->first.isPalindrome(sense))) {
countv++;
g->mark(it->first, sense);
counte += markNeighbours(g, *it, sense);
......@@ -737,7 +757,7 @@ bool processLinearExtensionForBranch(BranchRecord& branch,
unsigned maxLength, bool addKmer)
{
/** Stop contig assembly at palindromes. */
const bool stopAtPalindromes = maxLength == UINT_MAX;
const bool stopAtPalindromes = !opt::ss && maxLength == UINT_MAX;
extDirection dir = branch.getDirection();
if (branch.isTooLong(maxLength)) {
......@@ -901,7 +921,8 @@ size_t assemble(SequenceCollectionHash* seqCollection,
currSeq, extRec, multiplicity, UINT_MAX);
}
if (currBranch.isCanonical()) {
if ((opt::ss && currBranch.getDirection() == SENSE)
|| (!opt::ss && currBranch.isCanonical())) {
size_t removed = assembleContig(seqCollection,
fileWriter, currBranch, contigID++);
assembledKmer += currBranch.size();
......
......@@ -10,10 +10,10 @@ int BranchRecord::calculateBranchMultiplicity() const
for (BranchData::const_iterator it = m_data.begin();
it != m_data.end(); ++it) {
int m = it->second.getMultiplicity();
assert(m > 0);
assert(m >= 0);
total += m;
}
assert(total > 0);
assert(total >= 0);
return total;
}
......
......@@ -7,11 +7,12 @@
#if HAVE_GOOGLE_SPARSE_HASH_MAP
# include <google/sparse_hash_map>
typedef google::sparse_hash_map<Kmer, KmerData,
hashKmer> SequenceDataHash;
typedef google::sparse_hash_map<Kmer, KmerData, hash<Kmer> >
SequenceDataHash;
#else
# include "UnorderedMap.h"
typedef unordered_map<Kmer, KmerData, hashKmer> SequenceDataHash;
typedef unordered_map<Kmer, KmerData, hash<Kmer> >
SequenceDataHash;
#endif
/** The interface of a map of Kmer to KmerData. */
......
This diff is collapsed.
......@@ -20,12 +20,12 @@ namespace opt {
static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Jared Simpson and Shaun Jackman.\n"
"Written by Jared Simpson, Shaun Jackman and Anthony Raymond.\n"
"\n"
"Copyright 2013 Canada's Michael Smith Genome Science Centre\n";
"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " [OPTION]... FILE...\n"
"Usage: " PROGRAM " -k<kmer> -o<output.fa> [OPTION]... FILE...\n"
"Assemble the input files, FILE, which may be in FASTA, FASTQ,\n"
"qseq, export, SAM or BAM format and compressed with gz, bz2 or xz.\n"
"\n"
......@@ -39,10 +39,13 @@ static const char USAGE_MESSAGE[] =
" reads\n"
" -q, --trim-quality=N trim bases from the ends of reads whose\n"
" quality is less than the threshold\n"
" -Q, --mask-quality=N mask all low quality bases as `N'\n"
" --standard-quality zero quality is `!' (33)\n"
" default for FASTQ and SAM files\n"
" --illumina-quality zero quality is `@' (64)\n"
" default for qseq and export files\n"
" --SS assemble in strand-specific mode\n"
" --no-SS do not assemble in strand-specific mode\n"
" -o, --out=FILE write the contigs to FILE\n"
" -k, --kmer=N k-mer size\n"
" -t, --trim-length=N maximum length of dangling edges to trim\n"
......@@ -56,12 +59,17 @@ static const char USAGE_MESSAGE[] =
" with coverage less than this threshold on\n"
" either strand\n"
" --coverage-hist=FILE write the k-mer coverage histogram to FILE\n"
" -g, --graph=FILE generate a graph in dot format\n"
" -m, --mask-cov do not include kmers containing masked bases in\n"
" coverage calculations [experimental]\n"
" -s, --snp=FILE record popped bubbles in FILE\n"
" -v, --verbose display verbose output\n"
" --help display this help and exit\n"
" --version output version information and exit\n"
"\n"
" ABYSS Options: (won't work with ABYSS-P)\n"
"\n"
" -g, --graph=FILE generate a graph in dot format\n"
"\n"
"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
/** k-mer length */
......@@ -88,6 +96,15 @@ float coverage = -1;
/** Pop bubbles shorter than N bp. */
int bubbleLen = -1;
/** Whether to run a strand-specific assembly. */
int ss = 0;
/**
* do not include kmers containing masked bases in
* coverage calculations (experimental)
*/
bool maskCov = false;
/** coverage histogram path */
string coverageHistPath;
......@@ -108,7 +125,7 @@ string snpPath;
/** input FASTA files */
vector<string> inFiles;
static const char shortopts[] = "b:c:e:E:g:k:o:q:s:t:v";
static const char shortopts[] = "b:c:e:E:g:k:mo:Q:q:s:t:v";
enum { OPT_HELP = 1, OPT_VERSION, COVERAGE_HIST };
......@@ -123,6 +140,8 @@ static const struct option longopts[] = {
{ "trim-quality", required_argument, NULL, 'q' },
{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
{ "SS", no_argument, &opt::ss, 1 },
{ "no-SS", no_argument, &opt::ss, 0 },
{ "coverage", required_argument, NULL, 'c' },
{ "coverage-hist", required_argument, NULL, COVERAGE_HIST },
{ "bubble-length", required_argument, NULL, 'b' },
......@@ -130,6 +149,7 @@ static const struct option longopts[] = {
{ "erode", required_argument, NULL, 'e' },
{ "erode-strand", required_argument, NULL, 'E' },
{ "no-erode", no_argument, (int*)&erode, 0 },
{ "mask-cov", no_argument, NULL, 'm' },
{ "graph", required_argument, NULL, 'g' },
{ "snp", required_argument, NULL, 's' },
{ "verbose", no_argument, NULL, 'v' },
......@@ -184,6 +204,9 @@ void parse(int argc, char* const* argv)
case COVERAGE_HIST:
getline(arg, coverageHistPath);
break;
case 'm':
maskCov = true;
break;
case 'o':
getline(arg, contigsPath);
break;
......@@ -202,6 +225,9 @@ void parse(int argc, char* const* argv)
case 'q':
arg >> opt::qualityThreshold;
break;
case 'Q':
arg >> opt::internalQThreshold;
break;
case 's':
getline(arg, snpPath);
break;
......@@ -240,6 +266,7 @@ void parse(int argc, char* const* argv)
}
assert(opt::qualityThreshold <= 40);
assert(opt::internalQThreshold <= 40);
if (opt::rank <= 0
&& opt::coverage >= 0 && opt::erode == (unsigned)-1)
......
......@@ -14,6 +14,8 @@ namespace opt {
extern unsigned trimLen;
extern float coverage;
extern unsigned bubbleLen;
extern unsigned ss;
extern bool maskCov;
extern std::string coverageHistPath;
extern std::string contigsPath;
extern std::string contigsTempPath;
......
......@@ -2,6 +2,7 @@
#include "SequenceCollection.h"
#include "Log.h"
#include "Common/Options.h"
#include "Assembly/Options.h"
#include "MemoryUtil.h"
#include "StringUtil.h" // for toSI
#include "Timer.h"
......@@ -35,24 +36,45 @@ SequenceCollectionHash::SequenceCollectionHash()
#endif
}
/** sparse_hash_set requires that set_deleted_key()
* is called before calling erase(). This key cannot
* be an existing kmer in m_data. This function sets
* the deleted key and should be called after all
* data has been loaded.
*/
void SequenceCollectionHash::setDeletedKey()
{
#if HAVE_GOOGLE_SPARSE_HASH_MAP
for (SequenceDataHash::iterator it = m_data.begin();
it != m_data.end(); it++) {
Kmer rc(reverseComplement(it->first));
bool isrc;
SequenceDataHash::iterator search = find(rc, isrc);
// If this is false, we should have a palindrome or we're
// doing a SS assembly.
if (isrc || search == m_data.end()) {
m_data.set_deleted_key(rc);
return;
}
}
logger(1) << "error: unable to set deleted key.\n";
exit(EXIT_FAILURE);
#else
return;
#endif
}
/** Add the specified k-mer to this collection. */
void SequenceCollectionHash::add(const Kmer& seq, unsigned coverage)
{
bool rc;
SequenceCollectionHash::iterator it = find(seq, rc);
if (it == m_data.end()) {
#if HAVE_GOOGLE_SPARSE_HASH_MAP
if (m_data.empty()) {
/* sparse_hash_set requires that set_deleted_key()
* is called before calling erase(). */
Kmer rc(reverseComplement(seq));
assert(rc != seq);
m_data.set_deleted_key(rc);
}
#endif
m_data.insert(make_pair(seq, KmerData(SENSE, coverage)));
} else
} else if (coverage > 0) {
assert(!rc || !opt::ss);
it->second.addMultiplicity(rc ? ANTISENSE : SENSE, coverage);
}
}
/** Clean up by erasing sequences flagged as deleted.
......@@ -89,11 +111,16 @@ bool SequenceCollectionHash::setBaseExtension(
SequenceCollectionHash::iterator it = find(kmer, rc);
if (it == m_data.end())
return false;
bool palindrome = kmer.isPalindrome();
if (!rc || palindrome)
if (opt::ss) {
assert(!rc);
it->second.setBaseExtension(dir, base);
if (rc || palindrome)
it->second.setBaseExtension(!dir, complementBaseCode(base));
} else {
bool palindrome = kmer.isPalindrome();
if (!rc || palindrome)
it->second.setBaseExtension(dir, base);
if (rc || palindrome)
it->second.setBaseExtension(!dir, complementBaseCode(base));
}
return true;
}
......@@ -104,11 +131,16 @@ void SequenceCollectionHash::removeExtension(const Kmer& kmer,
bool rc;
SequenceCollectionHash::iterator it = find(kmer, rc);
assert(it != m_data.end());
bool palindrome = kmer.isPalindrome();
if (!rc || palindrome)
if (opt::ss) {
assert(!rc);
it->second.removeExtension(dir, ext);
if (rc || palindrome)
it->second.removeExtension(!dir, ~ext);
} else {
bool palindrome = kmer.isPalindrome();
if (!rc || palindrome)
it->second.removeExtension(dir, ext);
if (rc || palindrome)
it->second.removeExtension(!dir, ~ext);
}
notify(*it);
}
......@@ -144,7 +176,7 @@ SequenceCollectionHash::iterator SequenceCollectionHash::find(
const Kmer& key, bool& rc)
{
SequenceCollectionHash::iterator it = find(key);
if (it != m_data.end()) {
if (opt::ss || it != m_data.end()) {
rc = false;
return it;
} else {
......@@ -160,7 +192,7 @@ SequenceCollectionHash::const_iterator SequenceCollectionHash::find(
const Kmer& key, bool& rc) const
{
SequenceCollectionHash::const_iterator it = find(key);
if (it != m_data.end()) {
if (opt::ss || it != m_data.end()) {
rc = false;
return it;
} else {
......@@ -190,6 +222,7 @@ bool SequenceCollectionHash::getSeqData(const Kmer& key,
{
bool rc;
SequenceCollectionHash::const_iterator it = find(key, rc);
assert(!rc || !opt::ss);
if (it == m_data.end())
return false;
const KmerData data = it->second;
......
......@@ -101,6 +101,7 @@ class SequenceCollectionHash : public ISequenceCollection
void store(const char* path);
bool isAdjacencyLoaded() const { return m_adjacencyLoaded; }
void setColourSpace(bool flag);
void setDeletedKey();
private:
iterator find(const Kmer& key) { return m_data.find(key); }
......
/*
* Bloom.h
*
* This class was created when we removed the virtual class
* BloomFilterBase.
*
* It contains a mix of components from BloomFilterBase as well as
* additional functions and properties shared between Bloom Filter
* classes.
*