Skip to content
Commits on Source (4)
......@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
cmake_policy(SET CMP0048 NEW)
cmake_policy(SET CMP0005 NEW)
project(bowtie2 LANGUAGES CXX VERSION "2.3.4.2")
project(bowtie2 LANGUAGES CXX VERSION "2.3.4.3")
enable_testing()
......
......@@ -19,6 +19,14 @@ Please report any issues to the Bowtie 2 Github page or using the Sourceforge bu
Version Release History
=======================
Version 2.3.4.3 - Sep 17, 2018
* Fixed an issue causing `bowtie2-build` and `bowtie2-inspect`
to output incomplete help text.
* Fixed an issue causing `bowtie2-inspect` to crash.
* Fixed an issue preventing `bowtie2` from processing paired and/or
unpaired FASTQ reads together with interleaved FASTQ reads.
Version 2.3.4.2 - Aug 7, 2018
* Fixed issue causing `bowtie2` to fail in `--fast-local` mode.
......
......@@ -29,15 +29,15 @@ import subprocess
from collections import deque
def main():
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(add_help = False)
parser.add_argument('--large-index', action='store_true')
parser.add_argument('--verbose', action='store_true')
group = parser.add_mutually_exclusive_group()
group.add_argument('--debug', action='store_true')
group.add_argument('--sanitized', action='store_true')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--large-index', action='store_true')
logging.basicConfig(level=logging.ERROR,
format='%(levelname)s: %(message)s'
)
......@@ -51,8 +51,12 @@ def main():
build_bin_spec = os.path.join(ex_path,build_bin_s)
script_options, argv = parser.parse_known_args()
print_help = False
argv = deque(argv)
if '-h' in argv or '--help' in argv:
print_help = True
if script_options.verbose:
logging.getLogger().setLevel(logging.INFO)
......
......@@ -22,6 +22,7 @@
import os
import imp
import sys
import inspect
import logging
import argparse
......@@ -29,7 +30,7 @@ import subprocess
from collections import deque
def main():
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(add_help = False)
group = parser.add_mutually_exclusive_group()
group.add_argument('--debug', action='store_true')
......
......@@ -142,7 +142,10 @@ static void printUsage(ostream& out) {
<< " <reference_in>)" << endl;
if(wrapper == "basic-0") {
out << " --large-index force generated index to be 'large', even if ref" << endl
<< " has fewer than 4 billion nucleotides" << endl;
<< " has fewer than 4 billion nucleotides" << endl
<< " --debug use the debug binary; slower, assertions enabled" << endl
<< " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl
<< " --verbose log the issued command" << endl;
}
out << " -a/--noauto disable automatic -p/--bmax/--dcv memory-fitting" << endl
<< " -p/--packed use packed strings internally; slower, less memory" << endl
......
......@@ -76,14 +76,16 @@ static void printUsage(ostream& out) {
<< "Options:" << endl;
if(wrapper == "basic-0") {
out << " --large-index force inspection of the 'large' index, even if a" << endl
<< " 'small' one is present." << endl;
<< " 'small' one is present." << endl
<< " --debug use the debug binary; slower, assertions enabled" << endl
<< " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl
<< " --verbose log the issued command" << endl;
}
out << " -a/--across <int> Number of characters across in FASTA output (default: 60)" << endl
<< " -n/--names Print reference sequence names only" << endl
<< " -s/--summary Print summary incl. ref names, lengths, index properties" << endl
<< " -v/--verbose Verbose output (for debugging)" << endl
<< " -h/--help print detailed description of tool and its options" << endl
<< " --help print this usage message" << endl
;
if(wrapper.empty()) {
cerr << endl
......
......@@ -84,6 +84,7 @@ static bool startVerbose; // be talkative at startup
int gQuiet; // print nothing but the alignments
static int sanityCheck; // enable expensive sanity checks
static int format; // default read format is FASTQ
static bool interleaved; // reads are interleaved
static string origString; // reference text, or filename(s)
static int seed; // srandom() seed
static int timing; // whether to report basic timing data
......@@ -278,6 +279,7 @@ static void resetOptions() {
gQuiet = false;
sanityCheck = 0; // enable expensive sanity checks
format = FASTQ; // default read format is FASTQ
interleaved = false; // reads are not interleaved by default
origString = ""; // reference text, or filename(s)
seed = 0; // srandom() seed
timing = 0; // whether to report basic timing data
......@@ -999,7 +1001,12 @@ static void parseOption(int next_option, const char *arg) {
case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break;
case ARG_INTERLEAVED_FASTQ: tokenize(arg, ",", mates12); format = INTERLEAVED; break;
case ARG_INTERLEAVED_FASTQ: {
tokenize(arg, ",", mates12);
format = FASTQ;
interleaved = true;
break;
}
case 'b': {
format = BAM;
saw_bam = true;
......@@ -1735,7 +1742,7 @@ createPatsrcFactory(
int tid)
{
PatternSourcePerThreadFactory *patsrcFact;
patsrcFact = new PatternSourcePerThreadFactory(patcomp, pp);
patsrcFact = new PatternSourcePerThreadFactory(patcomp, pp, tid);
assert(patsrcFact != NULL);
return patsrcFact;
}
......@@ -4782,6 +4789,7 @@ static void driver(
}
PatternParams pp(
format, // file format
interleaved, // some or all of the reads are interleaved
fileParallel, // true -> wrap files with separate PairedPatternSources
seed, // pseudo-random seed
readsPerBatch, // # reads in a light parsing batch
......
bowtie2 (2.3.4.3-1) unstable; urgency=medium
[ Steffen Möller ]
* Updated OMICtools ref in d/u/metadata
[ Alexandre Mestiashvili ]
* New upstream version 2.3.4.3:
- Fixed an issue causing `bowtie2-build` and `bowtie2-inspect`
to output incomplete help text.
- Fixed an issue causing `bowtie2-inspect` to crash.
- Fixed an issue preventing `bowtie2` from processing paired and/or
unpaired FASTQ reads together with interleaved FASTQ reads.
* Bump Policy to 4.2.1
-- Alexandre Mestiashvili <mestia@debian.org> Tue, 25 Sep 2018 07:58:26 +0000
bowtie2 (2.3.4.2-1) unstable; urgency=medium
* New upstream version 2.3.4.2
......
......@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 11),
libtest-deep-perl,
libclone-perl,
zlib1g-dev
Standards-Version: 4.2.0
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/bowtie2
Vcs-Git: https://salsa.debian.org/med-team/bowtie2.git
Homepage: http://bowtie-bio.sourceforge.net/bowtie2
......
......@@ -152,7 +152,7 @@ Fedora, CentOS
<pre><code>yum check-update</code></pre>
</td>
<td>
<p>yum search tbb</p>
<pre><code>yum search tbb</code></pre>
</td>
<td>
<pre><code>yum install tbb-devel.x86_64</code></pre>
......
<h2>Version 2.3.4.2 - August 07, 1987</h2>
<h2>Version 2.3.4.3 - September 17, 2018</h2>
<ul>
<li>Fixed issue causing <code>bowtie2</code> to fail in <code><a href="bowtie2-options-fast-local">--fast-local</a></code> mode.</li>
<li>Fixed issue causing <code><a href="bowtie2-options-soft-clipped-unmapped-tlen>--soft-clipped-unmapped-tlen</a></code> to be a positional argument.</li>
<li>New option <code><a href="bowtie2-options-trim-to">--trim-to</a> N</code> causes <code>bowtie2</code> to trim reads longer than <code>N</code> bases to exactly <code>N</code> bases. Can trim from either 3&#39; or 5&#39; end, e.g. <code><a href="bowtie2-options-trim-to">--trim-to</a> 5:30</code> trims reads to 30 bases, truncating at the 5&#39; end.</li>
<li>Updated <a href="#building-from-source">&quot;Building from source&quot;</a> manual section with additional instructions on installing TBB.</li>
<li>Fixed an issue causing <code>bowtie2-build</code> and <code>bowtie2-inspect</code> to output incomplete help text.</li>
<li>Fixed an issue causing <code>bowtie2-align</code> to crash.</li>
<li>Fixed an issue preventing <code>bowtie2</code> from processing paired and/or unpaired FASTQ reads together with interleaved FASTQ reads.</li>
</ul>
<h2>Version 2.3.4.2 - August 07, 2018</h2>
<ul>
<li>Fixed issue causing <code>bowtie2</code> to fail in <code><a href="manual.shtml#bowtie2-options-fast-local">--fast-local</a></code> mode.</li>
<li>Fixed issue causing <code><a href="manual.shtml#bowtie2-options-soft-clipped-unmapped-tlen">--soft-clipped-unmapped-tlen</a></code> to be a positional argument.</li>
<li>New option <code><a href="manual.shtml#bowtie2-options-trim-to">--trim-to</a> N</code> causes <code>bowtie2</code> to trim reads longer than <code>N</code> bases to exactly <code>N</code> bases. Can trim from either 3&#39; or 5&#39; end, e.g. <code><a href="manual.shtml#bowtie2-options-trim-to">--trim-to</a> 5:30</code> trims reads to 30 bases, truncating at the 5&#39; end.</li>
<li>Updated <a href="manual.shtml#building-from-source">&quot;Building from source&quot;</a> manual section with additional instructions on installing TBB.</li>
<li>Several other updates to manual, including new mentions of <a href="http://bioconda.github.io">Bioconda</a> and <a href="https://biocontainers.pro">Biocontainers</a>.</li>
<li>Fixed an issue preventing <code>bowtie2</code> from processing more than one pattern source when running single threaded.</li>
<li>Fixed an issue causing <code>bowtie2</code> and <code>bowtie2-inspect</code> to crash if the index contains a gap-only segment.</li>
<li>Added <i>experimental</i> BAM input mode <code>-b</code>. Works only with unpaired input reads and BAM files that are sorted by read name (<code>samtools sort -n</code>). BAM input mode also supports the following options:<li>
<ul>
<li><code>--preserve-sam-tags</code>: Preserve any optional fields present in BAM record</li>
<li><code>--align-paired-reads</code>: Paired-end mode for BAM files</li>
</ul>
&nbsp&nbsp&nbsp&nbsp<code>--preserve-sam-tags</code>: Preserve any optional fields present in BAM record</li>
&nbsp&nbsp&nbsp&nbsp<code>--align-paired-reads</code>: Paired-end mode for BAM files</li>
<li>Add <i>experimental</i> CMake support</li>
</ul>
......
......@@ -18,10 +18,10 @@
</tr>
<tr>
<td>
<a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.4.2">Bowtie2 2.3.4.2</a>
<a href="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.3.4.3">Bowtie2 2.3.4.3</a>
</td>
<td align="right">
08/07/18&nbsp;
09/17/18&nbsp;
</td>
</tr>
<tr>
......@@ -164,6 +164,7 @@
<h2>Related publications</h2>
<div class="box">
<ul>
<li style="font-size: x-small; line-height: 130%">Langmead B, Wilks C., Antonescu V., Charles R. <a href="https://doi.org/10.1093/bioinformatics/bty648"><b>Scaling read aligners to hundreds of threads on general-purpose processors</b></a>. <i><a href="https://academic.oup.com/bioinformatics">Bioinformatics</a></i>. bty648.<br/><br/></li>
<li style="font-size: x-small; line-height: 130%">Langmead B, Salzberg S. <a href="http://www.nature.com/nmeth/journal/v9/n4/full/nmeth.1923.html"><b>Fast gapped-read alignment with Bowtie 2</b></a>. <i><a href="http://www.nature.com/nmeth">Nature Methods</a></i>. 2012, 9:357-359.<br/><br/></li>
<li style="font-size: x-small; line-height: 130%">Langmead B, Trapnell C, Pop M, Salzberg SL. <a href="http://genomebiology.com/2009/10/3/R25"><b>Ultrafast and memory-efficient alignment of short DNA sequences to the human genome</b></a>. <i><a href="http://genomebiology.com">Genome Biology</a></i> <b>10</b>:R25.<br/><br/></li>
</ul>
......
# aliases for converting sample read files
# `source conversion_utilities.sh` to add them to your environment
alias fastq_to_fasta="sed 'N;x;N;N;x;s/@/>/"
alias paired_to_tab5="paste <(sed 'N;x;N;g;N;s/\n/ /g' reads_1.fq) <(sed -n 'n;h;n;g;N;s/\n/ /g;p' reads_2.fq) > reads_12.tab5"
alias paired_to_tab6="paste <(sed 'N;x;N;g;N;s/\n/ /g' reads_1.fq) <(sed 'N;x;N;g;N;s/\n/ /g' reads_2.fq) > reads_12.tab6"
alias paired_to_interleaved="paste -d'\n' <(sed 'N;N;N;s/\n/ /g' reads_1.fq) <(sed 'N;N;N;s/\n/ /g' reads_2.fq) | tr '\t' '\n' > reads_12.fq"
......@@ -30,7 +30,6 @@ enum file_format {
FASTA = 1,
FASTA_CONT,
FASTQ,
INTERLEAVED,
BAM,
TAB_MATE5,
TAB_MATE6,
......@@ -44,7 +43,6 @@ static const std::string file_format_names[] = {
"FASTA",
"FASTA sampling",
"FASTQ",
"Interleaved FASTQ",
"BAM",
"Tabbed mated",
"Raw",
......
......@@ -100,9 +100,8 @@ PatternSource* PatternSource::patsrcFromStrings(
case FASTA: return new FastaPatternSource(qs, p);
case FASTA_CONT: return new FastaContinuousPatternSource(qs, p);
case RAW: return new RawPatternSource(qs, p);
case FASTQ: return new FastqPatternSource(qs, p);
case FASTQ: return new FastqPatternSource(qs, p, p.interleaved);
case BAM: return new BAMPatternSource(qs, p);
case INTERLEAVED: return new FastqPatternSource(qs, p, true /* interleaved */);
case TAB_MATE5: return new TabbedPatternSource(qs, p, false);
case TAB_MATE6: return new TabbedPatternSource(qs, p, true);
case CMDLINE: return new VectorPatternSource(qs, p);
......@@ -232,14 +231,13 @@ pair<bool, int> DualPatternComposer::nextBatch(PerThreadReadBuf& pt) {
pt,
true, // batch A (or pairs)
true); // grab lock below
bool done = res.first;
if(!done && res.second == 0) {
if(res.second == 0 && cur < srca_->size() - 1) {
ThreadSafe ts(mutex_m);
if(cur + 1 > cur_) cur_++;
cur = cur_; // Move on to next PatternSource
continue; // on to next pair of PatternSources
}
return make_pair(done, res.second);
return make_pair(res.first && cur == srca_->size() - 1, res.second);
} else {
pair<bool, int> resa, resb;
// Lock to ensure that this thread gets parallel reads
......@@ -295,12 +293,11 @@ PatternComposer* PatternComposer::setupPatternComposer(
const EList<string>& q, // qualities associated with singles
const EList<string>& q1, // qualities associated with m1
const EList<string>& q2, // qualities associated with m2
const PatternParams& p, // read-in parameters
PatternParams& p, // read-in parameters
bool verbose) // be talkative?
{
EList<PatternSource*>* a = new EList<PatternSource*>();
EList<PatternSource*>* b = new EList<PatternSource*>();
EList<PatternSource*>* ab = new EList<PatternSource*>();
// Create list of pattern sources for paired reads appearing
// interleaved in a single file
for(size_t i = 0; i < m12.size(); i++) {
......@@ -312,7 +309,9 @@ PatternComposer* PatternComposer::setupPatternComposer(
tmp.push_back(m12[i]);
assert_eq(1, tmp.size());
}
ab->push_back(PatternSource::patsrcFromStrings(p, *qs));
a->push_back(PatternSource::patsrcFromStrings(p, *qs));
b->push_back(NULL);
p.interleaved = false;
if(!p.fileParallel) {
break;
}
......@@ -376,16 +375,7 @@ PatternComposer* PatternComposer::setupPatternComposer(
}
PatternComposer *patsrc = NULL;
if(m12.size() > 0) {
patsrc = new SoloPatternComposer(ab, p);
for(size_t i = 0; i < a->size(); i++) delete (*a)[i];
for(size_t i = 0; i < b->size(); i++) delete (*b)[i];
delete a; delete b;
} else {
patsrc = new DualPatternComposer(a, b, p);
for(size_t i = 0; i < ab->size(); i++) delete (*ab)[i];
delete ab;
}
patsrc = new DualPatternComposer(a, b, p);
return patsrc;
}
......@@ -471,9 +461,15 @@ void CFilePatternSource::open() {
}
else {
const char* filename = infiles_[filecur_].c_str();
compressed_ = is_gzipped_file(filename);
if (compressed_) {
const char* ext = filename + infiles_[filecur_].length();
// Only temporary until we have a separate BGZFPatternSource
while (ext != filename && *--ext != '.') ;
bool bam_file = ext != filename && strncmp(ext, ".bam", 4) == 0;
if (!bam_file && is_gzipped_file(filename)) {
zfp_ = gzopen(filename, "rb");
compressed_ = true;
} else {
fp_ = fopen(filename, "rb");
}
......@@ -1146,6 +1142,12 @@ const int BAMPatternSource::offset[] = {
32, //read_name
};
const uint8_t BAMPatternSource::EOF_MARKER[] = {
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00, 0x03, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
bool BAMPatternSource::parse_bam_header() {
char magic[4];
......@@ -1203,58 +1205,281 @@ bool BAMPatternSource::parse_bam_header() {
return true;
}
std::pair<bool, int> BAMPatternSource::nextBatchFromFile(PerThreadReadBuf& pt,
bool batch_a, unsigned readi) {
if (first_) {
first_ = false;
if (!parse_bam_header()) {
std::cerr << "Unable to parse BAM file" << std::endl;
return make_pair(true, 0);
uint16_t BAMPatternSource::nextBGZFBlockFromFile(BGZF& b) {
fread(&b.hdr, sizeof(b.hdr), 1, fp_);
if (feof_unlocked(fp_) || ferror_unlocked(fp_)) {
return 0;
}
uint8_t *extra = new uint8_t[b.hdr.xlen];
fread(extra, b.hdr.xlen, 1, fp_);
if (ferror_unlocked(fp_)) {
return 0;
}
if (memcmp(EOF_MARKER, &b.hdr, sizeof(b.hdr)) == 0
&& memcmp(EOF_MARKER + sizeof(b.hdr), extra, 6 /* sizeof BAM subfield */) == 0)
{
delete[] extra;
fclose(fp_);
return 0;
}
uint16_t bsize = 0;
for (uint16_t i = 0; i < b.hdr.xlen;) {
if (extra[0] == 66 && extra[1] == 67) {
bsize = *((uint16_t *)(extra + 4));
bsize -= (b.hdr.xlen + 19);
break;
}
i = i + 2;
uint16_t sub_field_len = *((uint16_t *)(extra + 2));
i = i + 2 + sub_field_len;
}
delete[] extra;
if (bsize == 0) {
return 0;
}
fread(b.cdata, bsize, 1, fp_);
if (ferror_unlocked(fp_)) {
return 0;
}
fread(&b.ftr, sizeof b.ftr, 1, fp_);
if (ferror_unlocked(fp_)) {
return 0;
}
return bsize;
}
std::pair<bool, int> BAMPatternSource::nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock) {
uint16_t cdata_len;
unsigned nread = 0;
bool done = false;
pt.setReadId(readCnt_);
do {
if (bam_batch_indexes_[pt.tid_] >= bam_batches_[pt.tid_].size()) {
BGZF& block = blocks_[pt.tid_];
std::vector<uint8_t>& batch = bam_batches_[pt.tid_];
if (lock) {
ThreadSafe ts(mutex);
if (first_) {
nextBGZFBlockFromFile(block);
first_ = false;
}
cdata_len = nextBGZFBlockFromFile(block);
} else {
cdata_len = nextBGZFBlockFromFile(block);
}
if (cdata_len == 0) {
ThreadSafe ts(orphan_mates_mutex_);
get_orphaned_pairs(pt.bufa_, pt.bufb_, pt.max_buf_, nread);
return make_pair(nread == 0, nread);
}
bam_batch_indexes_[pt.tid_] = 0;
batch.resize(block.ftr.isize);
int ret_code = decompress_bgzf_block(&batch[0], block.ftr.isize, block.cdata, cdata_len);
if (ret_code != Z_OK) {
return make_pair(true, 0);
}
uLong crc = crc32(0L, Z_NULL, 0);
crc = crc32(crc, &batch[0], batch.size());
assert(crc == block.ftr.crc32);
}
std::pair<bool, int> ret = get_alignments(pt, batch_a, nread);
done = ret.first;
} while (!done && nread < pt.max_buf_);
readCnt_ += 1;
return make_pair(done, nread);
}
std::pair<bool, int> BAMPatternSource::get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi) {
size_t& i = bam_batch_indexes_[pt.tid_];
bool done = false;
bool read1 = true;
while (readi < pt.max_buf_) {
int r;
if (i >= bam_batches_[pt.tid_].size()) {
return make_pair(false, readi);
}
uint16_t flag;
int32_t block_size;
EList<Read>& readbuf = pp_.align_paired_reads && !read1 ? pt.bufb_ : pt.bufa_;
if ((r = zread(&block_size, sizeof(block_size))) != sizeof(block_size)) {
return make_pair(true, readi);
}
if (readbuf[readi].readOrigBuf.length() < block_size) {
readbuf[readi].readOrigBuf.resize(block_size);
}
if (zread(readbuf[readi].readOrigBuf.wbuf(), block_size) != block_size) {
done = true;
break;
memcpy(&block_size, &bam_batches_[pt.tid_][0] + i, sizeof(block_size));
if (block_size <= 0) {
return make_pair(done, readi);
}
memcpy(&flag, readbuf[readi].readOrigBuf.buf() + offset[BAMField::flag], sizeof(flag));
i += sizeof(block_size);
memcpy(&flag, &bam_batches_[pt.tid_][0] + i + offset[BAMField::flag], sizeof(flag));
if (!pp_.align_paired_reads && ((flag & 0x40) != 0 || (flag & 0x80) != 0)) {
readbuf[readi].readOrigBuf.clear();
i += block_size;
continue;
}
if (pp_.align_paired_reads && ((flag & 0x40) == 0 && (flag & 0x80) == 0)) {
readbuf[readi].readOrigBuf.clear();
i += block_size;
continue;
}
if (pp_.align_paired_reads && read1 && (flag & 0x40) == 0) {
std::cerr << "Paired reads are out of order" << std::endl;
return make_pair(true, readi == 0 ? readi : readi-1);
if (pp_.align_paired_reads && (((flag & 0x40) != 0
&& i + block_size == bam_batches_[pt.tid_].size())
|| ((flag & 0x80) != 0 && i == sizeof(block_size))))
{
ThreadSafe ts(orphan_mates_mutex_);
store_orphan_mate(&bam_batches_[pt.tid_][0] + i, block_size);
i += block_size;
get_orphaned_pairs(pt.bufa_, pt.bufb_, pt.max_buf_, readi);
} else {
readbuf[readi].readOrigBuf.resize(block_size);
memcpy(readbuf[readi].readOrigBuf.wbuf(), &bam_batches_[pt.tid_][0] + i, block_size);
i += block_size;
read1 = !read1;
readi = (pp_.align_paired_reads
&& pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1;
}
if (pp_.align_paired_reads && !read1 && (flag & 0x80) == 0) {
std::cerr << "Paired reads are out of order" << std::endl;
return make_pair(true, readi == 0 ? readi : readi-1);
}
return make_pair(done, readi);
}
void BAMPatternSource::store_orphan_mate(const uint8_t* r, size_t read_len) {
uint8_t flag;
memcpy(&flag, r + offset[BAMField::flag], sizeof(flag));
std::vector<orphan_mate_t>&
orphan_mates = (flag & 0x40) != 0 ? orphan_mate1s : orphan_mate2s;
size_t i;
for (i = 0; i < orphan_mates.size() && !orphan_mates[i].empty(); ++i) ;
if (i == orphan_mates.size()) {
orphan_mates.resize(orphan_mates.size() * 2);
}
orphan_mate_t& mate = orphan_mates[i];
if (mate.data == NULL || mate.cap < read_len) {
mate.data = new uint8_t[read_len];
mate.cap = read_len;
}
if (mate.cap < read_len) {
mate.data = new uint8_t[read_len];
mate.cap = read_len;
}
mate.size = read_len;
memcpy(mate.data, r, read_len);
}
int BAMPatternSource::compare_read_names(const void* m1, const void* m2) {
const orphan_mate_t* mate1 = (const orphan_mate_t*)m1;
const orphan_mate_t* mate2 = (const orphan_mate_t*)m2;
const char* r1 = (const char *)(mate1->data + offset[BAMField::read_name]);
const char* r2 = (const char *)(mate2->data + offset[BAMField::read_name]);
return strcmp(r1, r2);
}
bool BAMPatternSource::compare_read_names2(const orphan_mate_t& m1, const orphan_mate_t& m2) {
if (m1.empty()) {
return false;
}
if (m2.empty()) {
return true;
}
const char* r1 = (const char *)(m1.data + offset[BAMField::read_name]);
const char* r2 = (const char *)(m2.data + offset[BAMField::read_name]);
return strcmp(r1, r2);
}
void BAMPatternSource::get_orphaned_pairs(EList<Read>& buf_a, EList<Read>& buf_b, const size_t max_buf, unsigned& readi) {
std::sort(orphan_mate1s.begin(), orphan_mate1s.end(), &BAMPatternSource::compare_read_names2);
std::sort(orphan_mate2s.begin(), orphan_mate2s.end(), &BAMPatternSource::compare_read_names2);
size_t lim1, lim2;
for (lim1 = 0; lim1 < orphan_mate1s.size() && !orphan_mate1s[lim1].empty(); lim1++) ;
for (lim2 = 0; lim2 < orphan_mate2s.size() && !orphan_mate2s[lim2].empty(); lim2++) ;
if (lim2 == 0) {
return;
}
for (size_t i = 0; i < lim1 && readi < max_buf; ++i) {
orphan_mate_t* mate1 = &orphan_mate1s[i];
orphan_mate_t* mate2 = (orphan_mate_t*)bsearch(mate1, &orphan_mate2s[0], lim2,
sizeof(orphan_mate2s[0]), compare_read_names);
if (mate2 != NULL) {
Read& ra = buf_a[readi];
Read& rb = buf_b[readi];
ra.readOrigBuf.resize(mate1->size);
rb.readOrigBuf.resize(mate2->size);
memcpy(ra.readOrigBuf.wbuf(), mate1->data, mate1->size);
memcpy(rb.readOrigBuf.wbuf(), mate2->data, mate2->size);
mate1->reset();
mate2->reset();
readi++;
}
}
}
read1 = !read1;
readi = (pp_.align_paired_reads
&& pt.bufb_[readi].readOrigBuf.length() == 0) ? readi : readi + 1;
int BAMPatternSource::decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len) {
z_stream strm;
strm.zalloc = Z_NULL;
strm.zfree = Z_NULL;
strm.opaque = Z_NULL;
strm.avail_in = src_len;
strm.next_in = src;
strm.avail_out = dst_len;
strm.next_out = dst;
int ret = inflateInit2(&strm, -8);
if (ret != Z_OK) {
return ret;
}
return make_pair(done, readi);
ret = inflate(&strm, Z_FINISH);
if (ret != Z_STREAM_END) {
return ret;
}
return inflateEnd(&strm);
}
bool BAMPatternSource::parse(Read& ra, Read& rb, TReadId rdid) const {
......
......@@ -26,6 +26,7 @@
#include <cassert>
#include <string>
#include <ctype.h>
#include <vector>
#include "alphabet.h"
#include "assert_helpers.h"
#include "random_source.h"
......@@ -54,6 +55,7 @@ struct PatternParams {
PatternParams(
int format_,
bool interleaved_,
bool fileParallel_,
uint32_t seed_,
size_t max_buf_,
......@@ -71,6 +73,7 @@ struct PatternParams {
bool preserve_sam_tags_,
bool align_paired_reads_) :
format(format_),
interleaved(interleaved_),
fileParallel(fileParallel_),
seed(seed_),
max_buf(max_buf_),
......@@ -89,6 +92,7 @@ struct PatternParams {
align_paired_reads(align_paired_reads_) { }
int format; // file format
bool interleaved; // some or all of the FASTQ reads are interleaved
bool fileParallel; // true -> wrap files with separate PatternComposers
uint32_t seed; // pseudo-random seed
size_t max_buf; // number of reads to buffer in one read
......@@ -112,11 +116,12 @@ struct PatternParams {
*/
struct PerThreadReadBuf {
PerThreadReadBuf(size_t max_buf) :
PerThreadReadBuf(size_t max_buf, int tid) :
max_buf_(max_buf),
bufa_(max_buf),
bufb_(max_buf),
rdid_()
rdid_(),
tid_(tid)
{
bufa_.resize(max_buf);
bufb_.resize(max_buf);
......@@ -185,6 +190,7 @@ struct PerThreadReadBuf {
EList<Read> bufb_; // Read buffer for mate bs
size_t cur_buf_; // Read buffer currently active
TReadId rdid_; // index of read at offset 0 of bufa_/bufb_
int tid_;
};
extern void wrongQualityFormat(const BTString& read_name);
......@@ -682,7 +688,7 @@ public:
FastqPatternSource(
const EList<std::string>& infiles,
const PatternParams& p, bool interleaved = false) :
const PatternParams& p, bool interleaved) :
CFilePatternSource(infiles, p),
first_(true),
interleaved_(interleaved) { }
......@@ -719,6 +725,63 @@ protected:
};
class BAMPatternSource : public CFilePatternSource {
struct hdr_t {
uint8_t id1;
uint8_t id2;
uint8_t cm;
uint8_t flg;
uint32_t mtime;
uint8_t xfl;
uint8_t os;
uint16_t xlen;
};
struct ftr_t {
uint32_t crc32;
uint32_t isize;
};
struct BGZF {
hdr_t hdr;
uint8_t cdata[1 << 16];
ftr_t ftr;
};
struct orphan_mate_t {
orphan_mate_t() :
data(NULL),
size(0),
cap(0) {}
void reset() {
size = 0;
}
bool empty() const {
return size == 0;
}
uint8_t* data;
uint16_t size;
uint16_t cap;
};
struct BAMField {
enum aln_rec_field_name {
refID,
pos,
l_read_name,
mapq,
bin,
n_cigar_op,
flag,
l_seq,
next_refID,
next_pos,
tlen,
read_name,
};
};
public:
......@@ -726,7 +789,19 @@ public:
const EList<std::string>& infiles,
const PatternParams& p) :
CFilePatternSource(infiles, p),
first_(true) {}
first_(true),
blocks_(p.nthreads),
bam_batches_(p.nthreads),
bam_batch_indexes_(p.nthreads),
orphan_mate1s(p.nthreads * 2),
orphan_mate2s(p.nthreads * 2),
orphan_mates_mutex_(),
pp_(p) {
// uncompressed size of BGZF block is limited to 2**16 bytes
for (size_t i = 0; i < bam_batches_.size(); ++i) {
bam_batches_[i].reserve(1 << 16);
}
}
virtual void reset() {
first_ = true;
......@@ -738,14 +813,27 @@ public:
*/
virtual bool parse(Read& ra, Read& rb, TReadId rdid) const;
~BAMPatternSource() {
// only temporary until c++11
for (size_t i = 0; i < orphan_mate1s.size(); i++) {
if (orphan_mate1s[i].data != NULL) {
delete[] orphan_mate1s[i].data;
}
}
for (size_t i = 0; i < orphan_mate2s.size(); i++) {
if (orphan_mate2s[i].data != NULL) {
delete[] orphan_mate2s[i].data;
}
}
}
protected:
/**
* Light-parse a batch into the given buffer.
*/
virtual std::pair<bool, int> nextBatchFromFile(PerThreadReadBuf& pt, bool batch_a,
unsigned readi);
virtual std::pair<bool, int> nextBatch(PerThreadReadBuf& pt, bool batch_a, bool lock = true);
uint16_t nextBGZFBlockFromFile(BGZF& block);
/**
* Reset state to be ready for the next file.
......@@ -760,24 +848,36 @@ private:
bool parse_bam_header();
struct BAMField {
enum aln_rec_field_name {
refID,
pos,
l_read_name,
mapq,
bin,
n_cigar_op,
flag,
l_seq,
next_refID,
next_pos,
tlen,
read_name,
};
};
virtual std::pair<bool, int> nextBatchFromFile(PerThreadReadBuf&, bool, unsigned) {
return make_pair(true, 0);
}
int decompress_bgzf_block(uint8_t *dst, size_t dst_len, uint8_t *src, size_t src_len);
std::pair<bool, int> get_alignments(PerThreadReadBuf& pt, bool batch_a, unsigned& readi);
void store_orphan_mate(const uint8_t* read, size_t read_len);
void get_orphaned_pairs(EList<Read>& buf_a, EList<Read>& buf_b, const size_t max_buf, unsigned& readi);
size_t get_matching_read(const uint8_t* rec);
static int compare_read_names(const void* m1, const void* m2);
static bool compare_read_names2(const orphan_mate_t& m1, const orphan_mate_t& m2);
static const int offset[];
static const uint8_t EOF_MARKER[];
std::vector<BGZF> blocks_;
std::vector<std::vector<uint8_t> > bam_batches_;
std::vector<size_t> bam_batch_indexes_;
std::vector<orphan_mate_t> orphan_mate1s;
std::vector<orphan_mate_t> orphan_mate2s;
MUTEX_T orphan_mates_mutex_;
PatternParams pp_;
};
/**
......@@ -861,7 +961,7 @@ public:
const EList<std::string>& q, // qualities associated with singles
const EList<std::string>& q1, // qualities associated with m1
const EList<std::string>& q2, // qualities associated with m2
const PatternParams& p, // read-in params
PatternParams& p, // read-in params
bool verbose); // be talkative?
protected:
......@@ -1017,9 +1117,9 @@ public:
PatternSourcePerThread(
PatternComposer& composer,
const PatternParams& pp) :
const PatternParams& pp, int tid) :
composer_(composer),
buf_(pp.max_buf),
buf_(pp.max_buf, tid),
pp_(pp),
last_batch_(false),
last_batch_size_(0) { }
......@@ -1107,15 +1207,16 @@ class PatternSourcePerThreadFactory {
public:
PatternSourcePerThreadFactory(
PatternComposer& composer,
const PatternParams& pp) :
const PatternParams& pp, int tid) :
composer_(composer),
pp_(pp) { }
pp_(pp),
tid_(tid) { }
/**
* Create a new heap-allocated PatternSourcePerThreads.
*/
virtual PatternSourcePerThread* create() const {
return new PatternSourcePerThread(composer_, pp_);
return new PatternSourcePerThread(composer_, pp_, tid_);
}
/**
......@@ -1125,7 +1226,7 @@ public:
virtual EList<PatternSourcePerThread*>* create(uint32_t n) const {
EList<PatternSourcePerThread*>* v = new EList<PatternSourcePerThread*>;
for(size_t i = 0; i < n; i++) {
v->push_back(new PatternSourcePerThread(composer_, pp_));
v->push_back(new PatternSourcePerThread(composer_, pp_, tid_));
assert(v->back() != NULL);
}
return v;
......@@ -1135,6 +1236,7 @@ private:
/// Container for obtaining paired reads from PatternSources
PatternComposer& composer_;
const PatternParams& pp_;
int tid_;
};
#endif /*PAT_H_*/