Commit ca88ee12 authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 1.9.0

parent fcd96a6b
......@@ -3,6 +3,10 @@ compiler:
- gcc
- clang
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq libboost-dev libopenmpi-dev libsparsehash-dev
script: ./autogen.sh && ./configure --with-mpi=/usr/lib/openmpi && make
- sudo apt-get update -qq
- sudo apt-get install -qq libboost-dev libgtest-dev libopenmpi-dev libsparsehash-dev
script:
- ./autogen.sh
- ./configure --with-mpi=/usr/lib/openmpi
- make
- make check
bin_PROGRAMS = ABYSS
ABYSS_CPPFLAGS = -I$(top_srcdir) \
-I$(top_srcdir)/Assembly \
-I$(top_srcdir)/Common \
-I$(top_srcdir)/DataLayer
ABYSS_CPPFLAGS = -I$(top_srcdir)
ABYSS_LDADD = \
$(top_builddir)/DataBase/libdb.a \
$(SQLITE_LIBS) \
$(top_builddir)/Assembly/libassembly.a \
$(top_builddir)/DataLayer/libdatalayer.a \
$(top_builddir)/Common/libcommon.a
ABYSS_SOURCES = Abyss.cpp
ABYSS_SOURCES = abyss.cc
#include "Assembly/Options.h"
#include "AssemblyAlgorithms.h"
#include "DotWriter.h"
#include "FastaWriter.h"
#include "Histogram.h"
#include "ISequenceCollection.h"
#include "SequenceCollection.h"
#include "Timer.h"
#include "Uncompress.h"
#if PAIRED_DBG
# include "PairedDBG/SequenceCollection.h"
# include "PairedDBG/PairedDBGAlgorithms.h"
#else
# include "Assembly/SequenceCollection.h"
#endif
#include "Assembly/AssemblyAlgorithms.h"
#include "Assembly/DotWriter.h"
#include <algorithm>
#include <cstdio> // for setvbuf
#include <fstream>
#include <iostream>
#include <sstream>
#include "DataBase/DB.h"
using namespace std;
DB db;
static void removeLowCoverageContigs(SequenceCollectionHash& g)
{
AssemblyAlgorithms::markAmbiguous(&g);
......@@ -54,9 +56,12 @@ static void assemble(const string& pathIn, const string& pathOut)
if (!pathIn.empty())
AssemblyAlgorithms::loadSequences(&g, pathIn.c_str());
for_each(opt::inFiles.begin(), opt::inFiles.end(),
bind1st(ptr_fun(AssemblyAlgorithms::loadSequences), &g));
for_each(opt::inFiles.begin(), opt::inFiles.end(), bind1st(
ptr_fun(AssemblyAlgorithms::loadSequences<SequenceCollectionHash>),
&g));
size_t numLoaded = g.size();
if (!opt::db.empty())
addToDb(db, "loadedKmer", numLoaded);
cout << "Loaded " << numLoaded << " k-mer\n";
g.setDeletedKey();
g.shrink();
......@@ -71,6 +76,10 @@ static void assemble(const string& pathIn, const string& pathOut)
cout << "Generating adjacency" << endl;
AssemblyAlgorithms::generateAdjacency(&g);
#if PAIRED_DBG
removePairedDBGInconsistentEdges(g);
#endif
erode:
if (opt::erode > 0) {
cout << "Eroding tips" << endl;
......@@ -95,7 +104,6 @@ erode:
write_graph(opt::graphPath, g);
AssemblyAlgorithms::markAmbiguous(&g);
FastaWriter writer(pathOut.c_str());
unsigned nContigs = AssemblyAlgorithms::assemble(&g, &writer);
if (nContigs == 0) {
......@@ -118,6 +126,9 @@ int main(int argc, char* const* argv)
// Set stdout to be line buffered.
setvbuf(stdout, NULL, _IOLBF, 0);
#if PAIRED_DBG
opt::singleKmerSize = -1;
#endif
opt::parse(argc, argv);
bool krange = opt::kMin != opt::kMax;
......@@ -125,11 +136,29 @@ int main(int argc, char* const* argv)
cout << "Assembling k=" << opt::kMin << "-" << opt::kMax
<< ":" << opt::kStep << endl;
if (!opt::db.empty()) {
init(db,
opt::getUvalue(),
opt::getVvalue(),
"ABYSS",
opt::getCommand(),
opt::getMetaValue());
addToDb(db, "SS", opt::ss);
addToDb(db, "k", opt::kmerSize);
addToDb(db, "singleK", opt::singleKmerSize);
addToDb(db, "numProc", 1);
}
for (unsigned k = opt::kMin; k <= opt::kMax; k += opt::kStep) {
if (krange)
cout << "Assembling k=" << k << endl;
opt::kmerSize = k;
Kmer::setLength(k);
#if PAIRED_DBG
Kmer::setLength(opt::singleKmerSize);
KmerPair::setLength(opt::kmerSize);
#else
Kmer::setLength(opt::kmerSize);
#endif
if (k > opt::kMin) {
// Reset the assembly options to defaults.
......@@ -149,5 +178,8 @@ int main(int argc, char* const* argv)
k1 << opt::contigsPath.c_str();
assemble(k0.str(), k1.str());
}
if (!opt::db.empty())
addToDb(db, AssemblyAlgorithms::tempStatMap);
return 0;
}
This diff is collapsed.
......@@ -4,9 +4,22 @@ AdjList_CPPFLAGS = -I$(top_srcdir) \
-I$(top_srcdir)/Common \
-I$(top_srcdir)/DataLayer
if PAIRED_DBG
AdjList_CPPFLAGS += -DPAIRED_DBG
endif
AdjList_LDADD = \
$(top_builddir)/DataLayer/libdatalayer.a \
$(top_builddir)/Common/libcommon.a
if PAIRED_DBG
AdjList_LDADD += \
$(top_builddir)/PairedDBG/libpaireddbg.a
endif
AdjList_LDADD += \
$(top_builddir)/DataBase/libdb.a \
$(SQLITE_LIBS)
AdjList_SOURCES = \
AdjList.cpp
......@@ -289,10 +289,10 @@ int main(int argc, char** argv)
case '2': arg >> opt::max_len_2; break;
case 'v': opt::verbose++; break;
case OPT_HELP:
cerr << USAGE_MESSAGE;
cout << USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case OPT_VERSION:
cerr << VERSION_MESSAGE;
cout << VERSION_MESSAGE;
exit(EXIT_SUCCESS);
}
if (optarg != NULL && !arg.eof()) {
......
#ifndef ASSEMBLY_ADJACENCYALGORITHM_H
#define ASSEMBLY_ADJACENCYALGORITHM_H 1
namespace AssemblyAlgorithms {
/** Generate the adjacency information for each sequence in the
* collection. */
template <typename Graph>
size_t generateAdjacency(Graph* seqCollection)
{
typedef typename graph_traits<Graph>::vertex_descriptor V;
typedef typename Graph::Symbol Symbol;
typedef typename Graph::SymbolSet SymbolSet;
Timer timer("GenerateAdjacency");
size_t count = 0;
size_t numBasesSet = 0;
for (typename Graph::iterator iter = seqCollection->begin();
iter != seqCollection->end(); ++iter) {
if (iter->second.deleted())
continue;
if (++count % 1000000 == 0)
logger(1) << "Finding adjacent k-mer: " << count << '\n';
for (extDirection dir = SENSE; dir <= ANTISENSE; ++dir) {
V testSeq(iter->first);
Symbol adjBase = testSeq.shift(dir);
for (unsigned i = 0; i < SymbolSet::NUM; ++i) {
testSeq.setLastBase(dir, Symbol(i));
if (seqCollection->setBaseExtension(
testSeq, !dir, adjBase))
numBasesSet++;
}
}
seqCollection->pumpNetwork();
}
if (numBasesSet > 0) {
logger(0) << "Added " << numBasesSet << " edges.\n";
if (!opt::db.empty())
addToDb("EdgesGenerated", numBasesSet);
}
return numBasesSet;
}
} // namespace AssemblyAlgorithms
#endif
#ifndef ASSEMBLY_ASSEMBLEALGORITHM_H
#define ASSEMBLY_ASSEMBLEALGORITHM_H 1
#include "DataLayer/FastaWriter.h"
#include <iostream>
#include <climits>
namespace AssemblyAlgorithms {
/** Assemble a contig.
* @return the number of k-mer below the coverage threshold
*/
template <typename Graph>
size_t assembleContig(
Graph* seqCollection, FastaWriter* writer,
BranchRecord& branch, unsigned id)
{
assert(!branch.isActive());
assert(branch.getState() == BS_NOEXT
|| branch.getState() == BS_AMBI_SAME
|| branch.getState() == BS_AMBI_OPP);
// Assemble the contig.
Sequence contig(branch);
size_t kmerCount = branch.calculateBranchMultiplicity();
if (writer != NULL)
writer->WriteSequence(contig, id, kmerCount);
// Remove low-coverage contigs.
float coverage = (float)kmerCount / branch.size();
if (opt::coverage > 0 && coverage < opt::coverage) {
for (BranchRecord::iterator it = branch.begin();
it != branch.end(); ++it)
seqCollection->remove(it->first);
return branch.size();
}
return 0;
}
/** Assemble contigs.
* @return the number of contigs assembled
*/
static inline
size_t assemble(SequenceCollectionHash* seqCollection,
FastaWriter* fileWriter = NULL)
{
typedef SequenceCollectionHash Graph;
typedef graph_traits<Graph>::vertex_descriptor V;
typedef Graph::SymbolSetPair SymbolSetPair;
Timer timer("Assemble");
size_t kmerCount = 0;
unsigned contigID = 0;
size_t assembledKmer = 0;
size_t lowCoverageKmer = 0;
size_t lowCoverageContigs = 0;
for (Graph::iterator iter = seqCollection->begin();
iter != seqCollection->end(); ++iter) {
if (iter->second.deleted())
continue;
kmerCount++;
extDirection dir;
SeqContiguity status = checkSeqContiguity(*iter, dir, true);
if (status == SC_CONTIGUOUS)
continue;
else if(status == SC_ISLAND)
{
BranchRecord currBranch(SENSE);
currBranch.push_back(*iter);
currBranch.terminate(BS_NOEXT);
size_t removed = assembleContig(seqCollection,
fileWriter, currBranch, contigID++);
assembledKmer += currBranch.size();
if (removed > 0) {
lowCoverageContigs++;
lowCoverageKmer += removed;
}
continue;
}
assert(status == SC_ENDPOINT);
BranchRecord currBranch(dir);
currBranch.push_back(*iter);
V currSeq = iter->first;
extendBranch(currBranch, currSeq,
iter->second.getExtension(dir));
assert(currBranch.isActive());
while(currBranch.isActive())
{
SymbolSetPair extRec;
int multiplicity = -1;
bool success = seqCollection->getSeqData(
currSeq, extRec, multiplicity);
assert(success);
(void)success;
processLinearExtensionForBranch(currBranch,
currSeq, extRec, multiplicity, UINT_MAX);
}
if ((opt::ss && currBranch.getDirection() == SENSE)
|| (!opt::ss && currBranch.isCanonical())) {
size_t removed = assembleContig(seqCollection,
fileWriter, currBranch, contigID++);
assembledKmer += currBranch.size();
if (removed > 0) {
lowCoverageContigs++;
lowCoverageKmer += removed;
}
}
seqCollection->pumpNetwork();
}
if (opt::coverage > 0) {
std::cout << "Found " << assembledKmer << " k-mer in " << contigID
<< " contigs before removing low-coverage contigs.\n"
"Removed " << lowCoverageKmer << " k-mer in "
<< lowCoverageContigs << " low-coverage contigs.\n";
tempCounter[3] += lowCoverageContigs;
tempCounter[4] += lowCoverageKmer;
} else {
assert(assembledKmer <= kmerCount);
size_t circularKmer = kmerCount - assembledKmer;
if (circularKmer > 0)
std::cout << "Left " << circularKmer
<< " unassembled k-mer in circular contigs.\n";
std::cout << "Assembled " << assembledKmer << " k-mer in "
<< contigID << " contigs.\n";
if (!opt::db.empty()) {
addToDb("finalAmbgVertices", tempCounter[5]);
//addToDb("finalAmbgEdges", tempCounter[6]);
tempCounter.assign(16,0);
addToDb("assembledKmerNum", assembledKmer);
addToDb("assembledCntg", contigID);
}
}
return contigID;
}
} // namespace AssemblyAlgorithms
#endif
#include "config.h"
#include "Common/InsOrderedMap.h"
#include <string>
#include <vector>
namespace AssemblyAlgorithms
{
/** The number of k-mer that have been eroded. */
size_t g_numEroded;
std::vector<size_t> tempCounter(16,0);
InsOrderedMap<std::string,int> tempStatMap;
void addToDb(const std::string& key, const int& value)
{
tempStatMap.push_back(key, value);
}
};
This diff is collapsed.
#ifndef ASSEMBLYALGORITHMS_H
#define ASSEMBLYALGORITHMS_H 1
#include "BranchGroup.h"
#include "BranchRecord.h"
#include "FastaWriter.h"
#include "SequenceCollection.h"
#include <ostream>
#ifndef ASSEMBLY_ASSEMBLYALGORITHMS_H
#define ASSEMBLY_ASSEMBLYALGORITHMS_H 1
#include "Assembly/BranchGroup.h"
#include "Assembly/Options.h"
#include "Common/Log.h"
#include "Common/Timer.h"
#include <vector>
#include <string>
#include "Common/InsOrderedMap.h"
class Histogram;
......@@ -19,88 +20,127 @@ enum SeqContiguity
};
/** De Bruijn graph assembly algorithms. */
namespace AssemblyAlgorithms
{
// Read a sequence file and load them into the collection
void loadSequences(ISequenceCollection* seqCollection,
std::string inFile);
namespace AssemblyAlgorithms {
extern std::vector<size_t> tempCounter;
extern InsOrderedMap<std::string,int> tempStatMap;
extern void addToDb(const std::string&, const int&);
static inline
bool extendBranch(BranchRecord& branch,
graph_traits<SequenceCollectionHash>::vertex_descriptor& kmer,
SequenceCollectionHash::SymbolSet ext);
static inline bool
processLinearExtensionForBranch(BranchRecord& branch,
graph_traits<SequenceCollectionHash>::vertex_descriptor& currSeq,
SequenceCollectionHash::SymbolSetPair extensions,
int multiplicity,
unsigned maxLength, bool addKmer = true);
/** Generate the adjacency information for all the sequences in the
* collection. This is required before any other algorithm can run.
static inline void
initiateBranchGroup(BranchGroup& group,
const graph_traits<SequenceCollectionHash>::vertex_descriptor& seq,
const SequenceCollectionHash::SymbolSet& extension);
template <typename Graph>
void removeSequenceAndExtensions(Graph* seqCollection,
const typename Graph::value_type& seq);
/** Return the kmer which are adjacent to this kmer. */
template <typename V, typename SymbolSet>
void generateSequencesFromExtension(
const V& currSeq,
extDirection dir,
SymbolSet extension,
std::vector<V>& outseqs)
{
typedef typename SymbolSet::Symbol Symbol;
std::vector<V> extensions;
V extSeq(currSeq);
extSeq.shift(dir);
// Check for the existance of the 4 possible extensions
for (unsigned i = 0; i < SymbolSet::NUM; ++i) {
// Does this sequence have an extension?
Symbol x(i);
if (extension.checkBase(x)) {
extSeq.setLastBase(dir, x);
outseqs.push_back(extSeq);
}
}
}
/** Return the adjacency of this sequence.
* @param considerMarks when true, treat a marked vertex as having
* no edges
*/
void generateAdjacency(ISequenceCollection* seqCollection);
Histogram coverageHistogram(const ISequenceCollection& c);
void setCoverageParameters(const Histogram& h);
/* Erosion. Remove k-mer from the ends of blunt contigs. */
size_t erodeEnds(ISequenceCollection* seqCollection);
size_t erode(ISequenceCollection* c,
const ISequenceCollection::value_type& seq);
size_t getNumEroded();
size_t removeMarked(ISequenceCollection* pSC);
// Check whether a sequence can be trimmed
static inline
SeqContiguity checkSeqContiguity(
const ISequenceCollection::value_type& seq,
extDirection& outDir, bool considerMarks = false);
// process a terminated branch for trimming
bool processTerminatedBranchTrim(
ISequenceCollection* seqCollection, BranchRecord& branch);
bool extendBranch(BranchRecord& branch, Kmer& kmer, SeqExt ext);
// Process the extensions of the current sequence for trimming
bool processLinearExtensionForBranch(BranchRecord& branch,
Kmer& currSeq, ExtensionRecord extensions, int multiplicity,
unsigned maxLength, bool addKmer = true);
/** Populate the branch group with the initial extensions to this
* sequence. */
void initiateBranchGroup(BranchGroup& group, const Kmer& seq,
const SeqExt& extension);
// process an a branch group extension
bool processBranchGroupExtension(BranchGroup& group,
size_t branchIndex, const Kmer& seq,
ExtensionRecord extensions, int multiplicity,
unsigned maxLength);
void openBubbleFile(std::ofstream& out);
void writeBubble(std::ostream& out, const BranchGroup& group,
unsigned id);
void collapseJoinedBranches(
ISequenceCollection* seqCollection, BranchGroup& group);
/* Split the remaining ambiguous nodes to allow for a non-redundant
* assembly. Remove extensions to/from ambiguous sequences to avoid
* generating redundant/wrong contigs.
const SequenceCollectionHash::value_type& seq,
extDirection& outDir, bool considerMarks = false)
{
assert(!seq.second.deleted());
bool child = seq.second.hasExtension(SENSE)
&& !(considerMarks && seq.second.marked(SENSE));
bool parent = seq.second.hasExtension(ANTISENSE)
&& !(considerMarks && seq.second.marked(ANTISENSE));
if(!child && !parent)
{
//this sequence is completely isolated
return SC_ISLAND;
}
else if(!child)
{
outDir = ANTISENSE;
return SC_ENDPOINT;
}
else if(!parent)
{
outDir = SENSE;
return SC_ENDPOINT;
}
else
{
// sequence is contiguous
return SC_CONTIGUOUS;
}
}
/** Remove all marked k-mer.
* @return the number of removed k-mer
*/
size_t markAmbiguous(ISequenceCollection* seqCollection);
size_t splitAmbiguous(ISequenceCollection* seqCollection);
size_t assembleContig(ISequenceCollection* seqCollection,
FastaWriter* writer, BranchRecord& branch, unsigned id);
void removeSequenceAndExtensions(ISequenceCollection* seqCollection,
const ISequenceCollection::value_type& seq);
void removeExtensionsToSequence(ISequenceCollection* seqCollection,
const ISequenceCollection::value_type& seq, extDirection dir);
void generateSequencesFromExtension(const Kmer& currSeq,
extDirection dir, SeqExt extension,
std::vector<Kmer>& outseqs);
/* Non-distributed graph algorithms. */
void performTrim(SequenceCollectionHash* seqCollection);
size_t popBubbles(SequenceCollectionHash* pSC, std::ostream& out);
size_t assemble(SequenceCollectionHash* seqCollection,
FastaWriter* fileWriter = NULL);
};
template <typename Graph>
size_t removeMarked(Graph* pSC)
{
typedef typename Graph::iterator iterator;
Timer timer(__func__);
size_t count = 0;
for (iterator it = pSC->begin(); it != pSC->end(); ++it) {
if (it->second.deleted())
continue;
if (it->second.marked()) {
removeSequenceAndExtensions(pSC, *it);
count++;
}
pSC->pumpNetwork();
}
if (count > 0)
logger(1) << "Removed " << count << " marked k-mer.\n";
return count;
}
} // namespace AssemblyAlgorithms
#include "AdjacencyAlgorithm.h"
#include "AssembleAlgorithm.h"
#include "BubbleAlgorithm.h"
#include "CoverageAlgorithm.h"
#include "ErodeAlgorithm.h"
#include "LoadAlgorithm.h"
#include "SplitAlgorithm.h"
#include "TrimAlgorithm.h"
#endif
#include "BranchGroup.h"
#include "Algorithms.h"
#include <algorithm>
#include <functional>
using namespace std;
// Check the stop conditions for the bubble growth
BranchGroupStatus BranchGroup::updateStatus(unsigned maxLength)
{
assert(m_branches.size() <= m_maxNumBranches);