Skip to content
Commits on Source (7)
2017-12-19 Ben Vandervalk <benv@bcgsc.ca>
2018-04-13 Ben Vandervalk <benv@bcgsc.ca>
* Release version 2.1.0
* Adds support for misassembly correction and scaffolding
using linked reads, using Tigmint and ARCS. (Tigmint and
ARCS must be installed separately.)
* Support simultaneous optimization of `s` (min sequence length)
and `n` (min supporting read pairs / Chromium barcodes)
during scaffolding
abyss-longseqdist:
* Fix hang on input SAM containing no alignments with MAPQ > 0
abyss-pe:
* New `lr` parameter. Provide linked reads (i.e. 10x Genomics
Chromium reads) via this parameter to perform misassembly
correction and scaffolding using Chromium barcode information.
Requires Tigmint and ARCS tools to be installed in addition
to ABySS.
* Fix bug where `j` (threads) was not being correctly passed to
to `bgzip`/`pigz`
* Fix bug where `zsh` time/memory profiling was not being used,
even when `zsh` was available
abyss-scaffold:
* Simultaneous optimization of `n` and `s` using line search
or grid search [default]
SimpleGraph:
* add options `-s` and `-n` to filter paired-end paths by
seed length and edge weight, respectively
2018-03-14 Ben Vandervalk <benv@bcgsc.ca>
* Release version 2.0.3
* Many compiler fixes for GCC >= 6, Boost >= 1.64
......
......@@ -24,13 +24,32 @@ static inline void setNextContigName(cstring s)
g_nextContigName = 0;
}
/**
* Set the next contig name returned by createContigName
* to one plus the current largest numeric contig name.
*/
static inline void setNextContigName()
{
if (g_contigNames.empty()) {
g_nextContigName = 0;
return;
}
unsigned maxContigName = 0;
for (unsigned i = 0; i < g_contigNames.size(); ++i) {
cstring s = g_contigNames.getName(i);
std::istringstream iss((std::string)s);
unsigned contigName;
if (iss >> contigName && iss.eof() && contigName > maxContigName)
maxContigName = contigName;
}
g_nextContigName = 1 + maxContigName;
}
/** Return the next unique contig name. */
static inline std::string createContigName()
{
if (g_nextContigName == 0) {
assert(!g_contigNames.empty());
setNextContigName(g_contigNames.back());
}
if (g_nextContigName == 0)
setNextContigName();
std::ostringstream ss;
ss << g_nextContigName++;
return ss.str();
......
......@@ -315,22 +315,20 @@ namespace std {
inline void swap(Histogram&, Histogram&) { assert(false); }
}
/** Print assembly contiguity statistics. */
static inline std::ostream& printContiguityStats(
std::ostream& out, const Histogram& h0,
unsigned minSize, bool printHeader = true,
/** Print assembly contiguity statistics header. */
static inline std::ostream& printContiguityStatsHeader(
std::ostream& out,
unsigned minSize,
const std::string& sep = "\t",
const long long unsigned expSize = 0)
{
Histogram h = h0.trimLow(minSize);
if (printHeader) {
out << "n" << sep
<< "n:" << minSize << sep
<< "L50" << sep;
if (expSize > 0)
out << "LG50" << sep
<< "NG50" << sep;
out << "min" << sep
return out << "min" << sep
<< "N80" << sep
<< "N50" << sep
<< "N20" << sep
......@@ -339,6 +337,17 @@ static inline std::ostream& printContiguityStats(
<< "sum" << sep
<< "name" << '\n';
}
/** Print assembly contiguity statistics. */
static inline std::ostream& printContiguityStats(
std::ostream& out, const Histogram& h0,
unsigned minSize, bool printHeader = true,
const std::string& sep = "\t",
const long long unsigned expSize = 0)
{
Histogram h = h0.trimLow(minSize);
if (printHeader)
printContiguityStatsHeader(out, minSize, sep, expSize);
unsigned n50 = h.n50();
out << toEng(h0.size()) << sep
<< toEng(h.size()) << sep
......
......@@ -51,9 +51,12 @@ static inline std::istream& operator>>(std::istream& in, expect o)
if (!in || c != *p) {
std::cerr << "error: Expected `" << p
<< "' and saw ";
if (in)
if (in) {
std::cerr << '`' << c << "'\n";
else if (in.eof())
std::string s;
if (getline(in, s) && !s.empty())
std::cerr << "near: " << c << s << '\n';
} else if (in.eof())
std::cerr << "end-of-file\n";
else
std::cerr << "I/O error\n";
......
......@@ -275,6 +275,8 @@ struct SAMRecord : SAMAlignment {
{
flag &= ~(FPROPER_PAIR | FMUNMAP | FMREVERSE);
flag |= FPAIRED;
if (rname == o.rname && isReverse() != o.isReverse())
flag |= FPROPER_PAIR;
if (o.isUnmapped())
flag |= FMUNMAP;
if (o.isReverse())
......@@ -308,7 +310,7 @@ struct SAMRecord : SAMAlignment {
friend std::ostream& operator <<(std::ostream& out,
const SAMRecord& o)
{
return out << o.qname
out << o.qname
<< '\t' << o.flag
<< '\t' << o.rname
<< '\t' << (1 + o.pos)
......@@ -316,13 +318,16 @@ struct SAMRecord : SAMAlignment {
<< '\t' << o.cigar
<< '\t' << (o.mrnm == o.rname ? "=" : o.mrnm)
<< '\t' << (1 + o.mpos)
<< '\t' << o.isize
<< '\t' << o.isize;
#if SAM_SEQ_QUAL
<< '\t' << o.seq
out << '\t' << o.seq
<< '\t' << o.qual;
if (!o.tags.empty())
out << '\t' << o.tags;
#else
<< "\t*\t*";
out << "\t*\t*";
#endif
return out;
}
friend std::istream& operator >>(std::istream& in, SAMRecord& o)
......
......@@ -58,6 +58,8 @@ static const char USAGE_MESSAGE[] =
"\n"
" -k, --kmer=KMER_SIZE k-mer size\n"
" -s, --seed-length=L minimum length of a seed contig [0]\n"
" -G, --genome-size=N expected genome size. Used to calculate NG50\n"
" and associated stats [disabled]\n"
" -o, --out=FILE write result to FILE\n"
" --no-greedy use the non-greedy algorithm [default]\n"
" --greedy use the greedy algorithm\n"
......@@ -86,16 +88,20 @@ namespace opt {
/** Use a greedy algorithm. */
static int greedy;
/** Genome size. Used to calculate NG50. */
static long long unsigned genomeSize;
/** Write the path overlap graph to this file. */
static string graphPath;
}
static const char shortopts[] = "g:j:k:o:s:v";
static const char shortopts[] = "G:g:j:k:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
//enum { OPT_HELP = 1, OPT_VERSION };
static const struct option longopts[] = {
{ "genome-size", required_argument, NULL, 'G' },
{ "graph", no_argument, NULL, 'g' },
{ "greedy", no_argument, &opt::greedy, true },
{ "no-greedy", no_argument, &opt::greedy, false },
......@@ -135,6 +141,52 @@ static ContigPath align(const Lengths& lengths,
static bool gDebugPrint;
/**
* Build a histogram of the lengths of the assembled paths and unused contigs.
* Note: This function does not account for the ammount of overlap between contigs.
*/
static Histogram
buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths)
{
Histogram h;
// Compute the lengths of the paths
// Mark the vertices that are used in paths
vector<bool> used(lengths.size());
for (ContigPaths::const_iterator pathIt = paths.begin();
pathIt != paths.end(); ++pathIt) {
const ContigPath& path = *pathIt;
size_t totalLength = 0;
for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
if (it->ambiguous())
continue;
unsigned id = it->id();
assert(id < lengths.size());
totalLength += lengths[id];
used[id] = true;
}
h.insert(totalLength);
}
// Add the contigs that were not used in paths.
for (unsigned i = 0; i < lengths.size(); ++i) {
if (!used[i])
h.insert(lengths[i]);
}
return h;
}
/** Report assembly metrics. */
static void
reportAssemblyMetrics(const Lengths& lengths, const ContigPaths& paths)
{
Histogram h = buildAssembledLengthHistogram(lengths, paths);
const unsigned STATS_MIN_LENGTH = opt::seedLen;
printContiguityStats(cerr, h, STATS_MIN_LENGTH, true, "\t", opt::genomeSize)
<< '\t' << opt::out << '\n';
}
/** Return all contigs that are tandem repeats, identified as those
* contigs that appear more than once in a single path.
*/
......@@ -699,7 +751,7 @@ static void outputPathGraph(PathGraph& pathGraph)
}
/** Sort and output the specified paths. */
static void outputSortedPaths(const ContigPathMap& paths)
static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths)
{
// Sort the paths.
vector<ContigPath> sortedPaths(paths.size());
......@@ -715,6 +767,9 @@ static void outputSortedPaths(const ContigPathMap& paths)
it != sortedPaths.end(); ++it)
out << createContigName() << '\t' << *it << '\n';
assert_good(out, opt::out);
// Report assembly metrics.
reportAssemblyMetrics(lengths, sortedPaths);
}
/** Assemble the path overlap graph. */
......@@ -753,7 +808,7 @@ static void assemblePathGraph(const Lengths& lengths,
cout << "Removing redundant contigs\n";
removeSubsumedPaths(lengths, paths);
outputSortedPaths(paths);
outputSortedPaths(lengths, paths);
}
/** Read a set of paths from the specified file. */
......@@ -891,6 +946,12 @@ int main(int argc, char** argv)
istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case '?': die = true; break;
case 'G': {
double x;
arg >> x;
opt::genomeSize = x;
break;
}
case 'g': arg >> opt::graphPath; break;
case 'j': arg >> opt::threads; break;
case 'k': arg >> opt::k; break;
......@@ -1051,7 +1112,7 @@ int main(int argc, char** argv)
}
originalPathMap.clear();
outputSortedPaths(resultsPathMap);
outputSortedPaths(lengths, resultsPathMap);
return 0;
}
......
......@@ -279,7 +279,7 @@ parseArgs = do
where
help = putStr (usageInfo usage options) >> exitSuccess
tryHelp = "Try 'abyss-samtobreak --help' for more information."
version = "abyss-samtobreak (ABySS) 2.0.3\n"
version = "abyss-samtobreak (ABySS) 2.1.0\n"
usage = "Usage: samtobreak [OPTION]... [FILE]...\n\
\Calculate contig and scaffold contiguity and correctness metrics.\n"
......
......@@ -18,6 +18,7 @@ Contents
* [Assembling a paired-end library](#assembling-a-paired-end-library)
* [Assembling multiple libraries](#assembling-multiple-libraries)
* [Scaffolding](#scaffolding)
* [Scaffolding with linked reads](#scaffolding-with-linked-reads)
* [Rescaffolding with long sequences](#rescaffolding-with-long-sequences)
* [Assembling using a Bloom filter de Bruijn graph](#assembling-using-a-bloom-filter-de-bruijn-graph)
* [Assembling using a paired de Bruijn graph](#assembling-using-a-paired-de-bruijn-graph)
......@@ -37,20 +38,29 @@ Contents
Quick Start
===========
## Install ABySS on Debian or Ubuntu
## Install ABySS on Linux
Run the command
Install [Linuxbrew](http://linuxbrew.sh/), and run the command
sudo apt-get install abyss
brew install brewsci/bio/abyss
## Install ABySS on macOS
Install [Homebrew](https://brew.sh/), and run the command
brew install brewsci/bio/abyss
## Install ABySS on Windows
Install [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/) and [Linuxbrew](http://linuxbrew.sh/), and run the command
or download and install the
[Debian package](http://www.bcgsc.ca/platform/bioinfo/software/abyss).
brew install brewsci/bio/abyss
## Install ABySS on Mac OS X
## Install ABySS on Debian or Ubuntu
Install [Homebrew](http://brew.sh/), and run the commands
Run the command
brew install homebrew/science/abyss
sudo apt-get install abyss
## Assemble a small synthetic data set
......@@ -66,19 +76,37 @@ Install [Homebrew](http://brew.sh/), and run the commands
Dependencies
============
Dependencies may be installed using the package manager [Homebrew](https://homebrew.sh) on macOS and [Linxubrew](http://linuxbrew.sh) on Linux and Windows, using Windows Subsystem for Linux.
ABySS requires a C++ compiler that supports
[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
ABySS requires the following libraries:
* [Boost](http://www.boost.org/)
* [Open MPI](http://www.open-mpi.org)
* [sparsehash](https://code.google.com/p/sparsehash/)
* [SQLite](http://www.sqlite.org/)
ABySS requires a C++ compiler that supports
[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
brew install boost open-mpi google-sparsehash
ABySS will receive an error when compiling with Boost 1.51.0 or 1.52.0
since they contain a bug. Later versions of Boost compile without error.
## Dependencies for linked reads
- [ARCS](https://github.com/bcgsc/arcs) to scaffold
- [Tigmint](https://github.com/bcgsc/tigmint) to correct assembly errors
brew install brewsci/bio/arcs brewsci/bio/links
## Optional dependencies
- [pigz](https://zlib.net/pigz/) for parallel gzip
- [samtools](https://samtools.github.io) for reading BAM files
- [zsh](https://sourceforge.net/projects/zsh/) for reporting time and memory usage
brew install pigz samtools zsh
Compiling ABySS from GitHub
===========================
......@@ -142,8 +170,7 @@ usage, although it will build without. sparsehash should be found in
./configure CPPFLAGS=-I/usr/local/include
If SQLite is installed in non-default directories, its location can be
specified to `configure`:
If the optional dependency SQLite is installed in non-default directories, its location can be specified to `configure`:
./configure --with-sqlite=/opt/sqlite3
......@@ -169,7 +196,7 @@ To assemble paired reads in two files named `reads1.fa` and
`reads2.fa` into contigs in a file named `ecoli-contigs.fa`, run the
command:
abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa'
abyss-pe name=ecoli k=96 in='reads1.fa reads2.fa'
The parameter `in` specifies the input files to read, which may be in
FASTA, FASTQ, qseq, export, SRA, SAM or BAM format and compressed with
......@@ -208,7 +235,7 @@ fragment libraries and single-end reads. Note that the names of the libraries
The command line to assemble this example data set is:
abyss-pe k=64 name=ecoli lib='pea peb' \
abyss-pe k=96 name=ecoli lib='pea peb' \
pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
se='se1.fa se2.fa'
......@@ -228,13 +255,29 @@ Here's an example of assembling a data set with two paired-end
libraries and two mate-pair libraries. Note that the names of the libraries
(`pea`, `peb`, `mpa`, `mpb`) are arbitrary.
abyss-pe k=64 name=ecoli lib='pea peb' mp='mpc mpd' \
abyss-pe k=96 name=ecoli lib='pea peb' mp='mpc mpd' \
pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
mpc='mpc_1.fa mpc_2.fa' mpd='mpd_1.fa mpd_2.fa'
The mate-pair libraries are used only for scaffolding and do not
contribute towards the consensus sequence.
Scaffolding with linked reads
================================================================================
ABySS can scaffold using linked reads from 10x Genomics Chromium. The barcodes must first be extracted from the read sequences and added to the `BX:Z` tag of the FASTQ header, typically using the `longranger basic` command of [Long Ranger](https://support.10xgenomics.com/genome-exome/software/overview/welcome) or [EMA preproc](https://github.com/arshajii/ema#readme). The linked reads are used to correct assembly errors, which requires that [Tigmint](https://github.com/bcgsc/tigmint). The linked reads are also used for scaffolding, which requires [ARCS](https://github.com/bcgsc/arcs). See [Dependencies](#dependencies) for installation instructions.
ABySS can combine paired-end, mate-pair, and linked-read libraries. The `pe` and `lr` libraries will be used to build the de Bruijn graph. The `mp` libraries will be used for paired-end/mate-pair scaffolding. The `lr` libraries will be used for misassembly correction using Tigmint and scaffolding using ARCS.
abyss-pe k=96 name=hsapiens \
pe='pea' pea='lra.fastq.gz' \
mp='mpa' mpa='lra.fastq.gz' \
lr='lra' lra='lra.fastq.gz'
ABySS performs better with a mixture of paired-end, mate-pair, and linked reads, but it is possible to assemble only linked reads using ABySS, though this mode of operation is experimental.
abyss-pe k=96 name=hsapiens lr='lra' lra='lra.fastq.gz'
Rescaffolding with long sequences
=================================
......@@ -247,7 +290,7 @@ Similar to scaffolding, the names of the datasets can be specified with
the `long` parameter. These scaffolds will be stored in the file
`${name}-long-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly. Note that the names of the libraries are arbitrary.
abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
abyss-pe k=96 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa' \
longa='longa.fa'
......@@ -266,7 +309,7 @@ example, the following will run a E. coli assembly with an overall memory budget
of 100 megabytes, 3 hash functions, a minimum k-mer count threshold of 3, with
verbose logging enabled:
abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
abyss-pe name=ecoli k=96 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
At the current time, the user must calculate suitable values for `B` and `H` on
their own, and finding the best value for `kc` may require experimentation
......@@ -291,7 +334,7 @@ To assemble using paired de Bruijn graph mode, specify both individual
k-mer size (`K`) and k-mer pair span (`k`). For example, to assemble E.
coli with a individual k-mer size of 16 and a k-mer pair span of 64:
abyss-pe name=ecoli K=16 k=64 in='reads1.fa reads2.fa'
abyss-pe name=ecoli K=16 k=96 in='reads1.fa reads2.fa'
In this example, the size of the intervening gap between k-mer pairs is
32 bp (64 - 2\*16). Note that the `k` parameter takes on a new meaning
......@@ -308,7 +351,7 @@ respect to the original transcripts that were sequenced. In order to
run ABySS in strand-specific mode, the `SS` parameter must be used as
in the following example:
abyss-pe name=SS-RNA k=64 in='reads1.fa reads2.fa' SS=--SS
abyss-pe name=SS-RNA k=96 in='reads1.fa reads2.fa' SS=--SS
The expected orientation for the read sequences with respect to the
original RNA is RF. i.e. the first read in a read pair is always in
......@@ -406,6 +449,8 @@ Parameters of the driver script, `abyss-pe`
* `t`: maximum length of blunt contigs to trim [`k`]
* `v`: use `v=-v` for verbose logging, `v=-vv` for extra verbose
* `x`: spaced seed (Bloom filter assembly only)
* `lr_s`: minimum contig size required for building scaffolds with linked reads (bp) [`S`]
* `lr_n`: minimum number of barcodes required for building scaffolds with linked reads [`10`]
Please see the
[abyss-pe](http://manpages.ubuntu.com/abyss-pe.1.html)
......@@ -414,7 +459,7 @@ manual page for more information on assembly parameters.
Environment variables
=====================
`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=20`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=96`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
abyss-pe env [options]
......
......@@ -120,8 +120,7 @@ static void readAlignments(istream& in, Graph& g)
SAMRecord rec, prev;
vector<Alignment> recs;
int i = 0;
while (prev.isUnmapped() || prev.mapq == 0)
in >> prev;
while (in >> prev && (prev.isUnmapped() || prev.mapq == 0));
recs.push_back(prev);
while (in >> rec) {
if (rec.isUnmapped() || rec.mapq == 0)
......
......@@ -6,6 +6,7 @@
#include "IOUtil.h"
#include "Iterator.h"
#include "Uncompress.h"
#include "Common/UnorderedMap.h"
#include "Graph/Assemble.h"
#include "Graph/ContigGraph.h"
#include "Graph/ContigGraphAlgorithms.h"
......@@ -39,7 +40,7 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Shaun Jackman.\n"
"\n"
"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
......@@ -54,9 +55,14 @@ static const char USAGE_MESSAGE[] =
" Options:\n"
"\n"
" -n, --npairs=N minimum number of pairs [0]\n"
" -s, --seed-length=N minimum contig length [200]\n"
" or -s N0-N1 Find the value of s in [N0,N1]\n"
" or -n A-B:S Find the value of n in [A,B] with step size S\n"
" that maximizes the scaffold N50.\n"
" Default value for the step size is 1, if unspecified.\n"
" -s, --seed-length=N minimum contig length [1000]\n"
" or -s A-B Find the value of s in [A,B]\n"
" that maximizes the scaffold N50.\n"
" --grid optimize using a grid search [default]\n"
" --line optimize using a line search\n"
" -k, --kmer=N length of a k-mer\n"
" -G, --genome-size=N expected genome size. Used to calculate NG50\n"
" and associated stats [disabled]\n"
......@@ -84,12 +90,17 @@ namespace opt {
unsigned k; // used by ContigProperties
/** Optimization search strategy. */
static int searchStrategy;
/** Minimum number of pairs. */
static unsigned minNumPairs;
static unsigned minEdgeWeight;
static unsigned minEdgeWeightEnd;
static unsigned minEdgeWeightStep;
/** Minimum contig length. */
static unsigned minContigLength = 200;
static unsigned minContigLengthEnd;
static unsigned minContigLength = 1000;
static unsigned minContigLengthEnd = 1000;
/** Genome size. Used to calculate NG50. */
static long long unsigned genomeSize;
......@@ -124,7 +135,9 @@ static const char shortopts[] = "G:g:k:n:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
//enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP };
/** Optimization search strategy. */
enum { GRID_SEARCH, LINE_SEARCH };
static const struct option longopts[] = {
{ "graph", no_argument, NULL, 'g' },
......@@ -133,6 +146,8 @@ static const struct option longopts[] = {
{ "min-gap", required_argument, NULL, OPT_MIN_GAP },
{ "max-gap", required_argument, NULL, OPT_MAX_GAP },
{ "npairs", required_argument, NULL, 'n' },
{ "grid", no_argument, &opt::searchStrategy, GRID_SEARCH },
{ "line", no_argument, &opt::searchStrategy, LINE_SEARCH },
{ "out", required_argument, NULL, 'o' },
{ "seed-length", required_argument, NULL, 's' },
{ "complex", no_argument, &opt::comp_trans, 1 },
......@@ -171,16 +186,17 @@ struct InvalidEdge {
/** Return whether the specified edges has sufficient support. */
struct PoorSupport {
PoorSupport(Graph& g) : m_g(g) { }
PoorSupport(Graph& g, unsigned minEdgeWeight) : m_g(g), m_minEdgeWeight(minEdgeWeight) { }
bool operator()(graph_traits<Graph>::edge_descriptor e) const
{
return m_g[e].numPairs < opt::minNumPairs;
return m_g[e].numPairs < m_minEdgeWeight;
}
const Graph& m_g;
unsigned m_minEdgeWeight;
};
/** Remove short vertices and unsupported edges from the graph. */
static void filterGraph(Graph& g, unsigned minContigLength)
static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
{
typedef graph_traits<Graph> GTraits;
typedef GTraits::vertex_descriptor V;
......@@ -203,7 +219,7 @@ static void filterGraph(Graph& g, unsigned minContigLength)
// Remove poorly-supported edges.
unsigned numBefore = num_edges(g);
remove_edge_if(PoorSupport(g), static_cast<DG&>(g));
remove_edge_if(PoorSupport(g, minEdgeWeight), static_cast<DG&>(g));
unsigned numRemovedE = numBefore - num_edges(g);
if (opt::verbose > 0)
cerr << "Removed " << numRemovedE << " edges.\n";
......@@ -527,6 +543,8 @@ static ContigPath addDistEst(const Graph& g0, const Graph& g1,
assert(!v.ambiguous());
pair<E, bool> e0 = edge(u, v, g0);
pair<E, bool> e1 = edge(u, v, g1);
if (!e0.second && !e1.second)
std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " << get(vertex_name, g0, v) << '\n';
assert(e0.second || e1.second);
const EP& ep = e0.second ? g0[e0.first] : g1[e1.first];
if (!isOverlap(ep)) {
......@@ -591,7 +609,10 @@ unsigned addLength(const Graph& g, It first, It last)
/** A container of contig paths. */
typedef vector<ContigPath> ContigPaths;
/** Build the scaffold length histogram. */
/**
* Build the scaffold length histogram.
* @param g The graph g is destroyed.
*/
static Histogram buildScaffoldLengthHistogram(
Graph& g, const ContigPaths& paths)
{
......@@ -647,17 +668,45 @@ static void addCntgStatsToDb(
}
}
/** Parameters of scaffolding. */
struct ScaffoldParam {
ScaffoldParam(unsigned n, unsigned s) : n(n), s(s) { }
bool operator==(const ScaffoldParam& o) const { return n == o.n && s == o.s; }
unsigned n;
unsigned s;
};
NAMESPACE_STD_HASH_BEGIN
template <> struct hash<ScaffoldParam> {
size_t operator()(const ScaffoldParam& param) const
{
return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
}
};
NAMESPACE_STD_HASH_END
/** Result of scaffolding. */
struct ScaffoldResult : ScaffoldParam{
ScaffoldResult() : ScaffoldParam(0, 0), n50(0) { }
ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics)
: ScaffoldParam(n, s), n50(n50), metrics(metrics) { }
unsigned n50;
std::string metrics;
};
/** Build scaffold paths.
* @param output write the results
* @return the scaffold N50
*/
unsigned scaffold(const Graph& g0, unsigned minContigLength,
ScaffoldResult
scaffold(const Graph& g0,
unsigned minEdgeWeight, unsigned minContigLength,
bool output)
{
Graph g(g0);
// Filter the graph.
filterGraph(g, minContigLength);
filterGraph(g, minEdgeWeight, minContigLength);
if (opt::verbose > 0)
printGraphStats(cerr, g);
......@@ -737,19 +786,7 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
addToDb(db, "scaffolds_assembled", paths.size());
}
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
if (!output) {
static bool printHeader = true;
Histogram h = buildScaffoldLengthHistogram(g, paths);
printContiguityStats(cerr, h, STATS_MIN_LENGTH,
printHeader, "\t", opt::genomeSize)
<< "\ts=" << minContigLength << '\n';
if (opt::verbose == 0)
printHeader = false;
addCntgStatsToDb(h, STATS_MIN_LENGTH);
return h.trimLow(STATS_MIN_LENGTH).n50();
}
if (output) {
// Output the paths.
ofstream fout(opt::out.c_str());
ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
......@@ -768,14 +805,179 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
write_dot(out, g);
assert_good(out, opt::graphPath);
}
}
// Print assembly contiguity statistics.
Histogram h = buildScaffoldLengthHistogram(g, paths);
printContiguityStats(cerr, h, STATS_MIN_LENGTH,
true, "\t", opt::genomeSize)
<< "\ts=" << minContigLength << '\n';
addCntgStatsToDb(h, STATS_MIN_LENGTH);
return h.trimLow(STATS_MIN_LENGTH).n50();
// Compute contiguity metrics.
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
std::ostringstream ss;
Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths);
printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH,
false, "\t", opt::genomeSize)
<< "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
std::string metrics = ss.str();
addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH);
return ScaffoldResult(minEdgeWeight, minContigLength,
scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
metrics);
}
/** Memoize the optimization results so far. */
typedef unordered_map<ScaffoldParam, ScaffoldResult> ScaffoldMemo;
/** Build scaffold paths, memoized. */
ScaffoldResult
scaffold_memoized(const Graph& g, unsigned n, unsigned s, ScaffoldMemo& memo)
{
ScaffoldParam param(n, s);
ScaffoldMemo::const_iterator it = memo.find(param);
if (it != memo.end()) {
// Clear the metrics string, so that this result is not listed
// multiple times in the final table of metrics.
ScaffoldResult result(it->second);
result.metrics.clear();
return result;
}
if (opt::verbose > 0)
std::cerr << "\nScaffolding with n=" << n << " s=" << s << "\n\n";
ScaffoldResult result = scaffold(g, n, s, false);
memo[param] = result;
// Print assembly metrics.
if (opt::verbose > 0) {
std::cerr << '\n';
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
}
std::cerr << result.metrics;
if (opt::verbose > 0)
cerr << '\n';
return result;
}
/** Find the value of n that maximizes the scaffold N50. */
static ScaffoldResult
optimize_n(const Graph& g,
std::pair<unsigned, unsigned> minEdgeWeight,
unsigned minContigLength,
ScaffoldMemo& memo)
{
std::string metrics_table;
unsigned bestn = 0, bestN50 = 0;
for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
ScaffoldResult result = scaffold_memoized(g, n, minContigLength, memo);
metrics_table += result.metrics;
if (result.n50 > bestN50) {
bestN50 = result.n50;
bestn = n;
}
}
return ScaffoldResult(bestn, minContigLength, bestN50, metrics_table);
}
/** Find the value of s that maximizes the scaffold N50. */
static ScaffoldResult
optimize_s(const Graph& g,
unsigned minEdgeWeight,
std::pair<unsigned, unsigned> minContigLength,
ScaffoldMemo& memo)
{
std::string metrics_table;
unsigned bests = 0, bestN50 = 0;
const double STEP = cbrt(10); // Three steps per decade.
unsigned ilast = (unsigned)round(
log(minContigLength.second) / log(STEP));
for (unsigned i = (unsigned)round(
log(minContigLength.first) / log(STEP));
i <= ilast; ++i) {
unsigned s = (unsigned)pow(STEP, (int)i);
// Round to 1 figure.
double nearestDecade = pow(10, floor(log10(s)));
s = unsigned(round(s / nearestDecade) * nearestDecade);
ScaffoldResult result = scaffold_memoized(g, minEdgeWeight, s, memo);
metrics_table += result.metrics;
if (result.n50 > bestN50) {
bestN50 = result.n50;
bests = s;
}
}
return ScaffoldResult(minEdgeWeight, bests, bestN50, metrics_table);
}
/** Find the values of n and s that maximizes the scaffold N50. */
static ScaffoldResult
optimize_grid_search(const Graph& g,
std::pair<unsigned, unsigned> minEdgeWeight,
std::pair<unsigned, unsigned> minContigLength)
{
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
if (opt::verbose == 0)
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
ScaffoldMemo memo;
std::string metrics_table;
ScaffoldResult best(0, 0, 0, "");
for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
ScaffoldResult result = optimize_s(g, n, minContigLength, memo);
metrics_table += result.metrics;
if (result.n50 > best.n50)
best = result;
}
best.metrics = metrics_table;
return best;
}
/** Find the values of n and s that maximizes the scaffold N50. */
static ScaffoldResult
optimize_line_search(const Graph& g,
std::pair<unsigned, unsigned> minEdgeWeight,
std::pair<unsigned, unsigned> minContigLength)
{
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
if (opt::verbose == 0)
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
ScaffoldMemo memo;
std::string metrics_table;
ScaffoldResult best(
(minEdgeWeight.first + minEdgeWeight.second) / 2,
minContigLength.second, 0, "");
// An upper limit on the number of iterations.
const unsigned MAX_ITERATIONS = 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
for (unsigned i = 0; i < MAX_ITERATIONS; ++i) {
// Optimize s.
if (opt::verbose > 0) {
std::cerr << "\nOptimizing s for n=" << best.n << "\n\n";
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
}
unsigned previous_s = best.s;
best = optimize_s(g, best.n, minContigLength, memo);
metrics_table += best.metrics;
if (best.s == previous_s)
break;
// Optimize n.
if (opt::verbose > 0) {
std::cerr << "\nOptimizing n for s=" << best.s << "\n\n";
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
}
unsigned previous_n = best.n;
best = optimize_n(g, minEdgeWeight, best.s, memo);
metrics_table += best.metrics;
if (best.n == previous_n)
break;
}
std::cerr << "\nLine search converged in " << memo.size() << " iterations.\n";
best.metrics = metrics_table;
return best;
}
/** Run abyss-scaffold. */
......@@ -806,7 +1008,16 @@ int main(int argc, char** argv)
arg >> opt::graphPath;
break;
case 'n':
arg >> opt::minNumPairs;
arg >> opt::minEdgeWeight;
if (arg.peek() == '-') {
arg >> expect("-") >> opt::minEdgeWeightEnd;
assert(opt::minEdgeWeight <= opt::minEdgeWeightEnd);
} else
opt::minEdgeWeightEnd = opt::minEdgeWeight;
if (arg.peek() == ':')
arg >> expect(":") >> opt::minEdgeWeightStep;
else
opt::minEdgeWeightStep = 1;
break;
case 'o':
arg >> opt::out;
......@@ -818,7 +1029,8 @@ int main(int argc, char** argv)
arg >> expect("-") >> opt::minContigLengthEnd;
assert(opt::minContigLength
<= opt::minContigLengthEnd);
}
} else
opt::minContigLengthEnd = opt::minContigLength;
break;
case 'v':
opt::verbose++;
......@@ -908,37 +1120,51 @@ int main(int argc, char** argv)
if (!opt::db.empty())
addToDb(db, "Edges_invalid", numRemoved);
if (opt::minContigLengthEnd == 0) {
scaffold(g, opt::minContigLength, true);
return 0;
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
if (opt::minEdgeWeight == opt::minEdgeWeightEnd
&& opt::minContigLength == opt::minContigLengthEnd) {
ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true);
// Print assembly contiguity statistics.
if (opt::verbose > 0)
std::cerr << '\n';
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
std::cerr << result.metrics;
} else {
ScaffoldResult best(0, 0, 0, "");
switch (opt::searchStrategy) {
case GRID_SEARCH:
best = optimize_grid_search(g,
std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
break;
case LINE_SEARCH:
best = optimize_line_search(g,
std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
break;
default:
abort();
break;
}
// Find the value of s that maximizes the scaffold N50.
unsigned bests = 0, bestN50 = 0;
const double STEP = cbrt(10); // Three steps per decade.
unsigned ilast = (unsigned)round(
log(opt::minContigLengthEnd) / log(STEP));
for (unsigned i = (unsigned)round(
log(opt::minContigLength) / log(STEP));
i <= ilast; ++i) {
unsigned s = (unsigned)pow(STEP, (int)i);
// Round to 1 figure.
double nearestDecade = pow(10, floor(log10(s)));
s = unsigned(round(s / nearestDecade) * nearestDecade);
unsigned n50 = scaffold(g, s, false);
if (opt::verbose > 0)
cerr << '\n';
if (n50 > bestN50) {
bestN50 = n50;
bests = s;
}
std::cerr << "\nScaffolding with n=" << best.n << " s=" << best.s << "\n\n";
ScaffoldResult result = scaffold(g, best.n, best.s, true);
// Print the table of all the parameter values attempted and their metrics.
if (opt::verbose > 0) {
std::cerr << '\n';
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
std::cerr << best.metrics;
}
bestN50 = scaffold(g, bests, true);
cerr << "Best scaffold N50 is " << bestN50
<< " at s=" << bests << ".\n";
std::cerr << '\n' << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s << ".\n";
// Print assembly contiguity statistics.
std::cerr << '\n';
printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
std::cerr << result.metrics;
}
return 0;
}
......@@ -30,7 +30,7 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Jared Simpson and Shaun Jackman.\n"
"\n"
"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " -k<kmer> -o<out.path> [OPTION]... ADJ DIST\n"
......@@ -44,6 +44,8 @@ static const char USAGE_MESSAGE[] =
" Options:\n"
"\n"
" -k, --kmer=KMER_SIZE k-mer size\n"
" -n, --npairs=N minimum number of pairs [0]\n"
" -s, --seed-length=N minimum seed contig length [0]\n"
" -d, --dist-error=N acceptable error of a distance estimate\n"
" default is 6 bp\n"
" --max-cost=COST maximum computational cost\n"
......@@ -73,6 +75,12 @@ namespace opt {
static int verbose;
static string out;
/** Minimum number of pairs. */
static unsigned minEdgeWeight;
/** Minimum seed length. */
static unsigned minSeedLength;
/** The acceptable error of a distance estimate. */
unsigned distanceError = 6;
......@@ -80,7 +88,7 @@ namespace opt {
int format = DIST; // used by Estimate
}
static const char shortopts[] = "d:j:k:o:v";
static const char shortopts[] = "d:j:k:n:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_MAX_COST,
OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
......@@ -88,6 +96,8 @@ enum { OPT_HELP = 1, OPT_VERSION, OPT_MAX_COST,
static const struct option longopts[] = {
{ "kmer", required_argument, NULL, 'k' },
{ "npairs", required_argument, NULL, 'n' },
{ "seed-length", required_argument, NULL, 's' },
{ "dist-error", required_argument, NULL, 'd' },
{ "max-cost", required_argument, NULL, OPT_MAX_COST },
{ "out", required_argument, NULL, 'o' },
......@@ -125,7 +135,9 @@ int main(int argc, char** argv)
case 'j': arg >> opt::threads; break;
case 'k': arg >> opt::k; break;
case OPT_MAX_COST: arg >> opt::maxCost; break;
case 'n': arg >> opt::minEdgeWeight; break;
case 'o': arg >> opt::out; break;
case 's': arg >> opt::minSeedLength; break;
case 'v': opt::verbose++; break;
case OPT_HELP:
cout << USAGE_MESSAGE;
......@@ -248,6 +260,9 @@ static unsigned g_minNumPairs = UINT_MAX;
static unsigned g_minNumPairsUsed = UINT_MAX;
static struct {
unsigned seedTooShort;
unsigned noEdges;
unsigned edgesRemoved;
unsigned totalAttempted;
unsigned uniqueEnd;
unsigned noPossiblePaths;
......@@ -631,6 +646,16 @@ static void handleEstimate(const Graph& g,
pthread_mutex_unlock(&coutMutex);
}
/** Return whether the specified edge has sufficient support. */
struct PoorSupport {
PoorSupport(unsigned minEdgeWeight) : m_minEdgeWeight(minEdgeWeight) { }
bool operator()(const Estimates::value_type& estimate) const
{
return estimate.second.numPairs < m_minEdgeWeight;
}
const unsigned m_minEdgeWeight;
};
struct WorkerArg {
istream* in;
ostream* out;
......@@ -642,6 +667,9 @@ struct WorkerArg {
static void* worker(void* pArg)
{
WorkerArg& arg = *static_cast<WorkerArg*>(pArg);
unsigned countSeedTooShort = 0;
unsigned countNoEdges = 0;
unsigned countEdgesRemoved = 0;
for (;;) {
/** Lock the input stream. */
static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER;
......@@ -651,6 +679,26 @@ static void* worker(void* pArg)
pthread_mutex_unlock(&inMutex);
if (!good)
break;
if ((*arg.graph)[ContigNode(er.refID, false)].length < opt::minSeedLength) {
++countSeedTooShort;
continue;
}
// Remove edges with insufficient support.
for (unsigned i = 0; i < 2; ++i) {
Estimates& estimates = er.estimates[i];
if (estimates.empty())
continue;
unsigned sizeBefore = estimates.size();
estimates.erase(
remove_if(estimates.begin(), estimates.end(), PoorSupport(opt::minEdgeWeight)),
estimates.end());
unsigned sizeAfter = estimates.size();
unsigned edgesRemoved = sizeBefore - sizeAfter;
countEdgesRemoved += edgesRemoved;
if (sizeAfter == 0)
++countNoEdges;
}
// Flip the anterior distance estimates.
for (Estimates::iterator it = er.estimates[1].begin();
......@@ -673,6 +721,14 @@ static void* worker(void* pArg)
pthread_mutex_unlock(&outMutex);
}
}
static pthread_mutex_t statsMutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&statsMutex);
stats.seedTooShort += countSeedTooShort;
stats.noEdges += countNoEdges;
stats.edgesRemoved += countEdgesRemoved;
pthread_mutex_unlock(&statsMutex);
return NULL;
}
......@@ -707,6 +763,9 @@ static void generatePathsThroughEstimates(const Graph& g,
cout << '\n';
cout <<
"Seed too short: " << stats.seedTooShort << "\n"
"Seeds with no edges: " << stats.noEdges << "\n"
"Edges removed: " << stats.edgesRemoved << "\n"
"Total paths attempted: " << stats.totalAttempted << "\n"
"Unique path: " << stats.uniqueEnd << "\n"
"No possible paths: " << stats.noPossiblePaths << "\n"
......
......@@ -3,14 +3,22 @@
# Written by Shaun Jackman <sjackman@bcgsc.ca> and
# Anthony Raymond <traymond@bcgsc.ca>.
SHELL=bash -e -o pipefail
ifneq ($(shell command -v zsh),)
# Set pipefail to ensure that all commands of a pipe succeed.
SHELL=zsh -e -o pipefail
# Report run time and memory usage with zsh.
export REPORTTIME=1
export TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J
endif
# Record run time and memory usage in a file using GNU time.
ifdef time
ifneq ($(shell command -v gtime),)
gtime=command gtime -v -o $@.time
else
SHELL=bash -e -o pipefail
gtime=command time -v -o $@.time
endif
endif
# Define this environment variable on Mac OS X to read
......@@ -93,10 +101,10 @@ endif
# Use pigz or bgzip for parallel compression if available.
ifneq ($(shell command -v pigz),)
gzip=pigz -p$t
gzip=pigz -p$j
else
ifneq ($(shell command -v bgzip),)
gzip=bgzip -@$t
gzip=bgzip -@$j
else
gzip=gzip
endif
......@@ -123,6 +131,13 @@ MARKDOWN=pandoc
map=$(foreach a,$(2),$(call $(1),$(a)))
deref=$($1)
ifdef lr
ifndef lib
lib:=$(pe) $(lr)
endif
endif
ifdef lib
in?=$(call map, deref, $(lib))
else
......@@ -131,6 +146,7 @@ lib?=$(name)
$(lib)?=$(in)
endif
endif
pe?=$(lib)
mp?=$(pe)
......@@ -141,6 +157,9 @@ endif
ifdef se
override se:=$(strip $(se))
endif
ifdef lr
override lr_reads=$(strip $(call map, deref, $(lr)))
endif
# Graph file format
graph?=dot
......@@ -287,13 +306,16 @@ $(foreach i,$(pe),$(eval $i_n?=$n))
override deopt=$v $(dbopt) -j$j -k$k $(DISTANCEEST_OPTIONS) -l$($*_l) -s$($*_s) -n$($*_n) $($*_de)
# SimpleGraph parameters
sgopt += $(dbopt)
sgopt += $(dbopt) -s$s -n$n
ifdef d
sgopt += -d$d
endif
# MergePaths parameters
mpopt += $v $(dbopt) -j$j -k$k
mpopt += $v $(dbopt) -j$j -k$k -s$s
ifdef G
mpopt += -G$G
endif
# PathOverlap parameters
poopt += $v $(dbopt) -k$k
......@@ -346,9 +368,9 @@ ifndef k
error::
@>&2 echo 'abyss-pe: missing parameter `k`'
endif
ifeq ($(lib)$(in)$(se),)
ifeq ($(lib)$(in)$(se)$(lr),)
error::
@>&2 echo 'abyss-pe: missing parameter `lib`, `in` or `se`'
@>&2 echo 'abyss-pe: missing parameter `lib`, `in`, `se`, or `lr`'
endif
default:
......@@ -371,7 +393,7 @@ help:
@echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users@bcgsc.ca.'
version:
@echo "abyss-pe (ABySS) 2.0.3"
@echo "abyss-pe (ABySS) 2.1.0"
@echo "Written by Shaun Jackman and Anthony Raymond."
@echo
@echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
......@@ -689,9 +711,106 @@ endif
%-8.$g: %-7.$g %-7.path
PathOverlap --overlap $(poopt) --$g $^ >$@
# Scaffold using linked reads
ifdef lr
# Tigmint
# Options for mapping the reads to the draft assembly.
lr_l=$l
override lrmapopt=$v -j$j -l$(lr_l) $(LR_MAP_OPTIONS)
# Options for abyss-scaffold
lr_s=1000-100000
lr_n=5-20
# Minimum AS/Read length ratio
tigmint_as=0.65
# Maximum number of mismatches
tigmint_nm=5
# Minimum mapping quality threshold
tigmint_mapq=0
# Maximum distance between reads to be considered the same molecule
tigmint_d=50000
# Minimum number of spanning molecules
tigmint_n=10
# Size of the window that must be spanned by moecules
tigmint_w=1000
# Align paired-end reads to the draft genome, sort by BX tag,
# and create molecule extents BED.
%.lr.bed: %.fa
$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
| samtools sort -@$j -tBX -l0 -T$$(mktemp -u -t $@.XXXXXX) \
| tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) - \
| sort -k1,1 -k2,2n -k3,3n >$@
# Align paired-end reads to the draft genome and sort by BX tag.
%.lr.sortbx.bam: %.fa
$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
| samtools sort -@$j -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
# Filter the BAM file, create molecule extents BED.
%.lr.bed: %.lr.sortbx.bam
$(gtime) tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) $< \
| sort -k1,1 -k2,2n -k3,3n >$@
# Cut sequences at assembly errors.
%.tigmint.fa: %.lr.bed %.fa %.fa.fai
$(gtime) tigmint-cut -p$j -n$(tigmint_n) -w$(tigmint_w) -o $@ $*.fa $<
# ARCS
arcs_c=2
arcs_d=0
arcs_e=30000
arcs_l=0
arcs_m=4-20000
arcs_r=0.05
arcs_s=98
arcs_z=500
# Align reads and create a graph of linked contigs using ARCS.
%.arcs.dist.gv: %.fa
$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
| abyss-fixmate-ssq --all --qname $v -l$(lr_l) $(FIXMATE_OPTIONS) \
| arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
-g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
# Align paired-end reads to the draft genome and do not sort.
%.lr.sortn.sam.gz: %.fa
$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
| abyss-fixmate --all --qname $v -l$(lr_l) $(FIXMATE_OPTIONS) \
| $(gzip) >$@
# Create a graph of linked contigs using ARCS.
%.arcs.dist.gv: %.lr.sortn.sam.gz
gunzip -c $< \
| arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
-g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
# Scaffold using ARCS and abyss-scaffold.
%.arcs.path: %.arcs.dist.gv
abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
# Create the FASTA file of ARCS scaffolds.
%.arcs.fa: %.fa %.arcs.path
MergeContigs $(mcopt) -o $@ $^
%-scaffolds.fa: %-8.tigmint.arcs.fa
ln -sf $< $@
else
%-scaffolds.fa: %-8.fa
ln -sf $< $@
endif
%-scaffolds.$g: %-8.$g
ln -sf $< $@
......@@ -854,4 +973,4 @@ env:
@echo '[override], if variable was defined with an override directive in (this) makefile.'
@$(foreach var,$(varList),\
echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]" | grep -v "undefined";)
echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]";)
AC_PREREQ(2.62)
AC_INIT(ABySS, 2.0.3, abyss-users@bcgsc.ca, abyss,
AC_INIT(ABySS, 2.1.0, abyss-users@bcgsc.ca, abyss,
http://www.bcgsc.ca/platform/bioinfo/software/abyss)
m4_include(m4/m4_ax_pthread.m4)
AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)
......
abyss (2.1.0-1) unstable; urgency=medium
* New upstream version
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
-- Andreas Tille <tille@debian.org> Fri, 20 Apr 2018 10:04:32 +0200
abyss (2.0.3-1) unstable; urgency=medium
[ Steffen Moeller ]
......
......@@ -11,9 +11,9 @@ Build-Depends: debhelper (>= 11~),
libtext-multimarkdown-perl,
libgtest-dev,
libsqlite3-dev
Standards-Version: 4.1.3
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/abyss.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/abyss.git
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/abyss
Vcs-Git: https://salsa.debian.org/med-team/abyss.git
Homepage: http://www.bcgsc.ca/platform/bioinfo/software/abyss
Package: abyss
......
......@@ -9,6 +9,6 @@ Description: Add internal PATH to find abyss tools
+PATH:=/usr/lib/abyss:${PATH}
+
SHELL=bash -e -o pipefail
ifneq ($(shell command -v zsh),)
# Set pipefail to ensure that all commands of a pipe succeed.
SHELL=zsh -e -o pipefail
.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.0.3" "User Commands"
.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.0" "User Commands"
.SH NAME
ABYSS \- assemble short reads into contigs
.SH SYNOPSIS
......
.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.0.3" "User Commands"
.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.0" "User Commands"
.SH NAME
abyss-pe - assemble reads into contigs
.SH SYNOPSIS
......
.TH abyss-tofastq "1" "2015-May" "ABySS 2.0.3" "User Commands"
.TH abyss-tofastq "1" "2015-May" "ABySS 2.1.0" "User Commands"
.SH NAME
abyss-tofastq \- convert various file formats to FASTQ format
.br
......