Skip to content
Commits on Source (5)
.vscode
build
googletest-release*
*~
.\" Man page generated from reStructuredText.
.
.TH "ADAPTERREMOVAL" "1" "Jan 22, 2019" "2.2.3" "AdapterRemoval"
.TH "ADAPTERREMOVAL" "1" "Jun 23, 2019" "2.3.0" "AdapterRemoval"
.SH NAME
AdapterRemoval \- Fast short-read adapter trimming and processing
.
......@@ -254,6 +254,11 @@ Set the threshold for trimming low quality bases using \fB\-\-trimqualities\fP a
.UNINDENT
.INDENT 0.0
.TP
.B \-\-preserve5p
If set, bases at the 5p will not be trimmed by \fB\-\-trimns\fP, \fB\-\-trimqualities\fP, and \fB\-\-trimwindows\fP\&. Collapsed reads will not be quality trimmed when this option is enabled.
.UNINDENT
.INDENT 0.0
.TP
.B \-\-minlength length
Reads shorter than this length are discarded following trimming. Defaults to 15.
.UNINDENT
......
### Version 2.3.1 - 2019-06-23
* Added --preserve5p option. This option prevents AdapterRemoval from trimming
the 5p of reads when the --trimqualities, --trimns, and --trimwindows options
are used. Neither end of collapsed reads are trimmed when this option is used.
* Fixed Ns being miscounted as As when constructing consensus adapter sequences
using --identify-adapters.
### Version 2.3.0 - 2019-03-12
* Fixed --collapse producing slightly different result on 32 bit and 64 bit
architectures. Courtesy of Andreas Tille.
* Added support for output files without a basename; to create such output
files, use an empty basename (--basename "") or a basename ending with a
slash (--basename path/).
* Added support for managing file handles to allow AdapterRemoval to run
when the the number of output files exceeds the number of file handles, e.g.
when demultiplexing large numbers of samples.
* Reworked demultiplexing to improve performance for many paired barcodes.
### Version 2.2.4 - 2019-02-10
* Fixed bug in --trim5p N which would AdapterRemoval to abort if N was greater
......
......@@ -65,7 +65,9 @@ PROG := AdapterRemoval
LIBNAME := libadapterremoval
LIBOBJS := $(BDIR)/adapterset.o \
$(BDIR)/alignment.o \
$(BDIR)/alignment_tables.o \
$(BDIR)/argparse.o \
$(BDIR)/barcode_table.o \
$(BDIR)/debug.o \
$(BDIR)/demultiplex.o \
$(BDIR)/fastq.o \
......@@ -76,6 +78,7 @@ LIBOBJS := $(BDIR)/adapterset.o \
$(BDIR)/main_adapter_id.o \
$(BDIR)/main_adapter_rm.o \
$(BDIR)/main_demultiplex.o \
$(BDIR)/managed_writer.o \
$(BDIR)/scheduler.o \
$(BDIR)/strutils.o \
$(BDIR)/threads.o \
......@@ -152,9 +155,12 @@ TEST_DIR := build/tests
TEST_OBJS := $(TEST_DIR)/main_test.o \
$(TEST_DIR)/debug.o \
$(TEST_DIR)/alignment.o \
$(TEST_DIR)/alignment_tables.o \
$(TEST_DIR)/alignment_test.o \
$(TEST_DIR)/argparse.o \
$(TEST_DIR)/argparse_test.o \
$(TEST_DIR)/barcodes_test.o \
$(TEST_DIR)/barcode_table.o \
$(TEST_DIR)/fastq.o \
$(TEST_DIR)/fastq_test.o \
$(TEST_DIR)/fastq_enc.o \
......
......@@ -34,7 +34,7 @@ AdapterRemoval was originally published in Lindgreen 2012:
If you have Conda [installed on your system](https://conda.io/miniconda.html):
```
conda install -c maxibor adapterremoval2
conda install -c bioconda adapterremoval
```
### Manual installation
......
adapterremoval (2.2.4-2) UNRELEASED; urgency=medium
adapterremoval (2.3.1-1) unstable; urgency=medium
* Team upload.
[ Andreas Tille ]
* d/copyright: Add upstream contact
-- Andreas Tille <tille@debian.org> Tue, 05 Mar 2019 11:05:51 +0100
[ Steffen Moeller ]
* debhelper-compat 12
* helped by Andreas' routine-update script
-- Steffen Moeller <moeller@debian.org> Tue, 05 Mar 2019 11:05:51 +0100
adapterremoval (2.2.4-1) unstable; urgency=medium
......
......@@ -4,12 +4,12 @@ Uploaders: Andreas Tille <tille@debian.org>,
Kevin Murray <kdmfoss@gmail.com>
Section: science
Priority: optional
Build-Depends: debhelper (>= 12~),
Build-Depends: debhelper-compat (= 12),
zlib1g-dev,
libbz2-dev,
libgtest-dev,
python-markdown
Standards-Version: 4.3.0
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/adapterremoval
Vcs-Git: https://salsa.debian.org/med-team/adapterremoval.git
Homepage: https://github.com/MikkelSchubert/adapterremoval
......
......@@ -54,9 +54,9 @@ author = u'Mikkel Schubert; Stinus Lindgreen'
# built documents.
#
# The short X.Y version.
version = u'2.2.3'
version = u'2.3.0'
# The full version, including alpha/beta/rc tags.
release = u'2.2.3'
release = u'2.3.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......
......@@ -199,6 +199,10 @@ FASTQ trimming options
Set the threshold for trimming low quality bases using ``--trimqualities`` and ``--trimwindows``. Default is 2.
.. option:: --preserve5p
If set, bases at the 5p will not be trimmed by ``--trimns``, ``--trimqualities``, and ``--trimwindows``. Collapsed reads will not be quality trimmed when this option is enabled.
.. option:: --minlength length
Reads shorter than this length are discarded following trimming. Defaults to 15.
......
......@@ -31,6 +31,7 @@
#include <cstring>
#include "alignment.hpp"
#include "alignment_tables.hpp"
#include "debug.hpp"
#include "fastq.hpp"
......@@ -165,9 +166,9 @@ alignment_info pairwise_align_sequences(const alignment_info& best_alignment,
struct phred_scores
{
phred_scores()
: identical_nts(std::numeric_limits<char>::min())
, different_nts(std::numeric_limits<char>::min())
explicit phred_scores(size_t index)
: identical_nts(IDENTICAL_NTS[index])
, different_nts(DIFFERENT_NTS[index])
{
}
......@@ -178,63 +179,17 @@ struct phred_scores
};
/**
* Calculates the phred scores to be assigned to a consensus base based on two
* bases, depending on the Phred scores assigned two these two bases. A phred
* score is calculated for both the case where the two bases are identical, and
* the case where they differ.
*
* The returned vector is inded by (phred1 * MAX_PHRED_SCORE) + phred2, where
* phred1 is assumed to be >= phred2. This is because we always select the base
* with the higher Phred score.
*/
std::vector<phred_scores> calculate_phred_score()
{
std::vector<double> Perror(MAX_PHRED_SCORE + 1, 0.0);
std::vector<double> Ptrue(MAX_PHRED_SCORE + 1, 0.0);
for (int i = 0; i <= MAX_PHRED_SCORE; ++i) {
const double p_err = std::min<double>(0.75, std::pow(10, static_cast<double>(i) / -10.0));
Perror.at(i) = std::log(p_err / 3.0);
Ptrue.at(i) = std::log(1.0 - p_err);
}
std::vector<phred_scores> new_scores((MAX_PHRED_SCORE + 1) * (MAX_PHRED_SCORE + 1));
for (int i = 0; i <= MAX_PHRED_SCORE; ++i) {
for (int j = 0; j <= i; ++j) {
const int index = (i * MAX_PHRED_SCORE) + j;
phred_scores& scores = new_scores.at(index);
{ // When two nucleotides are identical
const double ptrue = Ptrue.at(i) + Ptrue.at(j);
const double perror = Perror.at(i) + Perror.at(j);
const double normconstant = 1.0 + 3.0 * std::exp(perror - ptrue);
scores.identical_nts = fastq::p_to_phred_33(1.0 - 1.0 / normconstant);
}
{ // When two nucleotides differ
const double ptrue = Ptrue.at(i) + Perror.at(j);
const double perror_one = Perror.at(i) + Ptrue.at(j);
const double perror_both = Perror.at(i) + Perror.at(j);
const double normconstant = 1.0 + 2.0 * std::exp(perror_both - ptrue) + std::exp(perror_one - ptrue);
scores.different_nts = fastq::p_to_phred_33(1.0 - 1.0 / normconstant);
}
}
}
return new_scores;
}
const phred_scores& get_updated_phred_scores(char qual_1, char qual_2)
phred_scores get_updated_phred_scores(char qual_1, char qual_2)
{
AR_DEBUG_ASSERT(qual_1 >= qual_2);
// Cache of pre-calculated Phred scores for consensus bases; see above.
static const std::vector<phred_scores> updated_phred_scores = calculate_phred_score();
const size_t phred_1 = static_cast<size_t>(qual_1 - PHRED_OFFSET_33);
const size_t phred_2 = static_cast<size_t>(qual_2 - PHRED_OFFSET_33);
const size_t index = (phred_1 * (MAX_PHRED_SCORE + 1)) + phred_2;
AR_DEBUG_ASSERT(index < PHRED_TABLE_SIZE);
const size_t index = (static_cast<size_t>(qual_1 - PHRED_OFFSET_33) * MAX_PHRED_SCORE) + static_cast<size_t>(qual_2 - PHRED_OFFSET_33);
return updated_phred_scores.at(index);
return phred_scores(index);
}
......
This diff is collapsed.
/*************************************************************************\
* AdapterRemoval - cleaning next-generation sequencing reads *
* Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk *
* Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com *
* *
* If you use the program, please cite the paper: *
* S. Lindgreen (2012): AdapterRemoval: Easy Cleaning of Next Generation *
* Sequencing Reads, BMC Research Notes, 5:337 *
* http://www.biomedcentral.com/1756-0500/5/337/ *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
#ifndef ALIGNMENT_TABLES_H
#define ALIGNMENT_TABLES_H
#include <sys/types.h>
namespace ar
{
const size_t PHRED_TABLE_SIZE = 8836;
/**
* Table of Phred scores to assigned for identical positions during collapsing.
*
* Position is calculated as phred_1 * (MAX_PHRED_SCORE + 1) + phred_2,
* assuming that phred_1 >= phred_2.
*/
extern const char IDENTICAL_NTS[PHRED_TABLE_SIZE];
/**
* Table of Phred scores to assigned for mismatching positions during collapsing.
*
* Position is calculated as phred_1 * (MAX_PHRED_SCORE + 1) + phred_2,
* assuming that phred_1 >= phred_2.
*/
extern const char DIFFERENT_NTS[PHRED_TABLE_SIZE];
} // namespace ar
#endif
/*************************************************************************\
* AdapterRemoval - cleaning next-generation sequencing reads *
* *
* Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk *
* Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com *
* *
* If you use the program, please cite the paper: *
* S. Lindgreen (2012): AdapterRemoval: Easy Cleaning of Next Generation *
* Sequencing Reads, BMC Research Notes, 5:337 *
* http://www.biomedcentral.com/1756-0500/5/337/ *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
#include <algorithm>
#include <iostream>
#include "barcode_table.hpp"
#include "commontypes.hpp"
#include "debug.hpp"
#include "fastq_io.hpp"
#include "strutils.hpp"
#include "userconfig.hpp"
namespace ar
{
typedef std::pair<std::string, size_t> barcode_pair;
typedef std::vector<barcode_pair> barcode_vec;
struct next_subsequence {
explicit next_subsequence(const char* seq_,
const size_t max_local_mismatches_)
: seq(seq_)
, max_local_mismatches(max_local_mismatches_)
{
}
const char* seq;
const size_t max_local_mismatches;
};
///////////////////////////////////////////////////////////////////////////////
// barcode_error
barcode_error::barcode_error(const std::string& message)
: std::exception()
, m_message(message)
{
}
barcode_error::barcode_error(const barcode_error& error)
: std::exception()
, m_message(error.m_message)
{
}
barcode_error::~barcode_error() noexcept
{
}
const char* barcode_error::what() const noexcept
{
return m_message.c_str();
}
///////////////////////////////////////////////////////////////////////////////
demultiplexer_node::demultiplexer_node()
: children()
, value(barcode_table::no_match)
{
children.fill(barcode_table::no_match);
}
barcode_table::candidate::candidate(int barcode_, size_t mismatches_)
: barcode(barcode_)
, mismatches(mismatches_)
{
}
///////////////////////////////////////////////////////////////////////////////
/**
* Returns a lexicographically sorted list of merged barcodes, each paired with
* the 0-based index of corresponding barcode in the source vector.
*/
barcode_vec sort_barcodes(const fastq_pair_vec& barcodes)
{
AR_DEBUG_ASSERT(!barcodes.empty());
barcode_vec sorted_barcodes;
const size_t max_key_1_len = barcodes.front().first.length();
const size_t max_key_2_len = barcodes.front().second.length();
for (auto it = barcodes.begin(); it != barcodes.end(); ++it) {
if (it->first.length() != max_key_1_len) {
throw barcode_error("mate 1 barcodes do not have the same length");
} else if (it->second.length() != max_key_2_len) {
throw barcode_error("mate 2 barcodes do not have the same length");
}
std::string barcode;
barcode.reserve(max_key_1_len + max_key_2_len);
barcode.append(it->first.sequence());
barcode.append(it->second.sequence());
sorted_barcodes.push_back(barcode_pair(barcode, it - barcodes.begin()));
}
std::sort(sorted_barcodes.begin(), sorted_barcodes.end());
return sorted_barcodes;
}
/** Adds a nucleotide sequence with a given ID to a quad-tree. */
void add_sequence_to_tree(demux_node_vec& tree, const std::string& sequence,
const size_t barcode_id)
{
size_t node_idx = 0;
bool added_last_node = false;
for (auto nuc : sequence) {
auto& node = tree.at(node_idx);
// Indicate when PE barcodes can be unambigiously identified from SE
// reads
node.value = (node.value == barcode_table::no_match)
? barcode_id
: barcode_table::ambigious;
const auto nuc_idx = ACGT_TO_IDX(nuc);
auto child = node.children[nuc_idx];
added_last_node = (child == barcode_table::no_match);
if (added_last_node) {
// New nodes are added to the end of the list; as barcodes are
// added in lexicographic order, this helps ensure that a set of
// similar barcodes will be placed in mostly contiguous runs
// of the vector representation.
child = node.children[nuc_idx] = tree.size();
tree.push_back(demultiplexer_node());
}
node_idx = child;
}
if (!added_last_node) {
throw barcode_error(std::string("duplicate barcode (pair): ") +
sequence);
}
tree.at(node_idx).value = barcode_id;
}
/**
* Builds a sparse quad tree using the first sequence in a set of unique
* barcodes pairs; duplicate pairs will negatively impact the identification of
* these, since all hits will be considered ambiguous.
*/
demux_node_vec build_demux_tree(const fastq_pair_vec& barcodes)
{
// Step 1: Construct list of merged, sorted barcodes barcodes;
// this allows construction of the sparse tree in one pass.
const barcode_vec sorted_barcodes = sort_barcodes(barcodes);
// Step 2: Create empty tree containing just the root node; creating
// the root here simplifies the 'add_sequence_to_tree' function.
demux_node_vec tree;
tree.push_back(demultiplexer_node());
// Step 3: Add each barcode to the tree, in sorted order
for (auto& pair : sorted_barcodes) {
add_sequence_to_tree(tree, pair.first, pair.second);
}
return tree;
}
///////////////////////////////////////////////////////////////////////////////
const int barcode_table::no_match;
const int barcode_table::ambigious;
barcode_table::barcode_table(const fastq_pair_vec& barcodes, size_t mismatches,
size_t mm_r1, size_t mm_r2)
: m_nodes()
, m_max_mismatches()
, m_max_mismatches_r1()
, m_max_mismatches_r2()
, m_barcode_1_len()
, m_barcode_2_len()
{
m_max_mismatches = std::min<size_t>(mismatches, mm_r1 + mm_r2);
m_max_mismatches_r1 = std::min<size_t>(m_max_mismatches, mm_r1);
m_max_mismatches_r2 = std::min<size_t>(m_max_mismatches, mm_r2);
if (!barcodes.empty()) {
m_nodes = build_demux_tree(barcodes);
m_barcode_1_len = barcodes.front().first.length();
m_barcode_2_len = barcodes.front().second.length();
}
}
int barcode_table::identify(const fastq& read_r1) const
{
if (read_r1.length() < m_barcode_1_len) {
return barcode_table::no_match;
}
const std::string barcode = read_r1.sequence().substr(0, m_barcode_1_len);
auto match = lookup(barcode.c_str(), 0, 0, nullptr);
if (match.barcode == no_match && m_max_mismatches) {
match = lookup_with_mm(barcode.c_str(), 0, m_max_mismatches,
m_max_mismatches_r1, nullptr);
}
return match.barcode;
}
int barcode_table::identify(const fastq& read_r1, const fastq& read_r2) const
{
if (read_r1.length() < m_barcode_1_len ||
read_r2.length() < m_barcode_2_len) {
return no_match;
}
const auto barcode_1 = read_r1.sequence().substr(0, m_barcode_1_len);
const auto barcode_2 = read_r2.sequence().substr(0, m_barcode_2_len);
const auto combined_barcode = barcode_1 + barcode_2;
auto match = lookup(combined_barcode.c_str(), 0, 0, nullptr);
if (match.barcode == no_match && m_max_mismatches) {
const next_subsequence next(barcode_2.c_str(), m_max_mismatches_r2);
if (m_max_mismatches_r1) {
match = lookup_with_mm(barcode_1.c_str(), 0, m_max_mismatches,
m_max_mismatches_r1, &next);
} else {
match = lookup(barcode_1.c_str(), 0, m_max_mismatches, &next);
}
}
return match.barcode;
}
barcode_table::candidate
barcode_table::lookup(const char* seq, int parent,
const size_t max_global_mismatches,
const next_subsequence* next) const
{
for (; *seq && parent != no_match; ++seq) {
if (*seq == 'N') {
return candidate(no_match);
}
const auto encoded_nuc = ACGT_TO_IDX(*seq);
const auto& node = m_nodes.at(parent);
parent = node.children.at(encoded_nuc);
}
if (parent == no_match) {
return candidate(no_match);
} else if (next) {
const auto max_local_mismatches =
std::min(max_global_mismatches, next->max_local_mismatches);
if (max_local_mismatches) {
return lookup_with_mm(next->seq, parent, max_global_mismatches,
max_local_mismatches, nullptr);
} else {
return lookup(next->seq, parent, max_global_mismatches, nullptr);
}
} else {
const auto& node = m_nodes.at(parent);
return candidate(node.value, m_max_mismatches - max_global_mismatches);
}
}
barcode_table::candidate barcode_table::lookup_with_mm(
const char* seq, int parent, const size_t max_global_mismatches,
const size_t max_local_mismatches, const next_subsequence* next) const
{
const demultiplexer_node& node = m_nodes.at(parent);
const auto nucleotide = *seq;
if (nucleotide) {
candidate best_candidate;
for (size_t encoded_i = 0; encoded_i < 4; ++encoded_i) {
const auto child = node.children.at(encoded_i);
if (child != -1) {
candidate current_candidate;
if (nucleotide == IDX_TO_ACGT(encoded_i)) {
current_candidate =
lookup_with_mm(seq + 1, child, max_global_mismatches,
max_local_mismatches, next);
} else if (max_local_mismatches == 1) {
current_candidate =
lookup(seq + 1, child, max_global_mismatches - 1, next);
} else if (max_local_mismatches) {
current_candidate = lookup_with_mm(
seq + 1, child, max_global_mismatches - 1,
max_local_mismatches - 1, next);
}
if (current_candidate.mismatches < best_candidate.mismatches) {
best_candidate = current_candidate;
} else if (current_candidate.mismatches ==
best_candidate.mismatches) {
if (current_candidate.barcode != no_match) {
best_candidate.barcode = ambigious;
}
}
}
}
return best_candidate;
} else if (next) {
const size_t next_max_local_mismatches =
std::min(max_global_mismatches, next->max_local_mismatches);
if (next_max_local_mismatches) {
return lookup_with_mm(next->seq, parent, max_global_mismatches,
next_max_local_mismatches, nullptr);
} else {
return lookup(next->seq, parent, max_global_mismatches, nullptr);
}
} else {
return candidate(node.value, m_max_mismatches - max_global_mismatches);
}
}
} // namespace ar
/*************************************************************************\
* AdapterRemoval - cleaning next-generation sequencing reads *
* *
* Copyright (C) 2011 by Stinus Lindgreen - stinus@binf.ku.dk *
* Copyright (C) 2014 by Mikkel Schubert - mikkelsch@gmail.com *
* *
* If you use the program, please cite the paper: *
* S. Lindgreen (2012): AdapterRemoval: Easy Cleaning of Next Generation *
* Sequencing Reads, BMC Research Notes, 5:337 *
* http://www.biomedcentral.com/1756-0500/5/337/ *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
#ifndef BARCODE_TABLE_H
#define BARCODE_TABLE_H
#include <array>
#include "fastq.hpp"
#include "fastq_io.hpp"
#include "scheduler.hpp"
#include "statistics.hpp"
namespace ar
{
class userconfig;
struct next_subsequence;
/** Exception raised for FASTQ parsing and validation errors. */
class barcode_error : public std::exception
{
public:
barcode_error(const std::string& message);
barcode_error(const barcode_error& error);
virtual ~barcode_error() noexcept;
/** Returns error message; string is owned by exception. */
virtual const char* what() const noexcept;
private:
//! Error message associated with exception.
std::string m_message;
};
/**
* Struct representing node in quad-tree; children are referenced using the
* corresponding indice in the vector representing the tree; -1 is used to
* represent unassigned children.
*/
struct demultiplexer_node {
demultiplexer_node();
std::array<int, 4> children;
int value;
};
typedef std::vector<demultiplexer_node> demux_node_vec;
/**
*
*/
class barcode_table
{
public:
barcode_table(const fastq_pair_vec& barcodes, size_t max_mm,
size_t max_mm_r1, size_t max_mm_r2);
int identify(const fastq& read_r1) const;
int identify(const fastq& read_r1, const fastq& read_r2) const;
static const int no_match = -1;
static const int ambigious = -2;
protected:
struct candidate {
explicit candidate(int barcode = barcode_table::no_match,
size_t mismatches = -1);
int barcode;
size_t mismatches;
};
candidate lookup(const char* seq, int parent,
const size_t max_global_mismatches,
const next_subsequence* next) const;
candidate lookup_with_mm(const char* seq, int parent,
const size_t max_global_mismatches,
const size_t max_local_mismatches,
const next_subsequence* next) const;
demux_node_vec m_nodes;
size_t m_max_mismatches;
size_t m_max_mismatches_r1;
size_t m_max_mismatches_r2;
size_t m_barcode_1_len;
size_t m_barcode_2_len;
};
} // namespace ar
#endif
......@@ -35,194 +35,12 @@
namespace ar
{
typedef std::pair<std::string, size_t> barcode_pair;
typedef std::vector<barcode_pair> barcode_vec;
typedef std::vector<unsigned> int_vec;
typedef demux_node_vec::iterator node_vec_iter;
///////////////////////////////////////////////////////////////////////////////
/**
* Struct representing node in quad-tree; children are referenced using the
* corresponding indice in the vector representing the tree; -1 is used to
* represent unassigned children.
*/
struct demultiplexer_node
{
public:
demultiplexer_node()
: children()
, barcodes()
{
children[0] = -1;
children[1] = -1;
children[2] = -1;
children[3] = -1;
}
bool has_children() const
{
for (size_t nt_idx = 0; nt_idx < 4; ++nt_idx) {
if (children[nt_idx] != -1) {
return true;
}
}
return false;
}
int children[4];
int_vec barcodes;
};
///////////////////////////////////////////////////////////////////////////////
/**
* Returns a lexicographically sorted list of mate 1 barcodes, each paired with
* the 0-based index of corresponding barcode in the source vector.
*/
barcode_vec sort_barcodes(const fastq_pair_vec& barcodes)
{
barcode_vec sorted_barcodes;
const size_t max_key_1_len = barcodes.front().first.length();
const size_t max_key_2_len = barcodes.front().second.length();
for (auto it = barcodes.begin(); it != barcodes.end(); ++it) {
AR_DEBUG_ASSERT(it->first.length() == max_key_1_len);
AR_DEBUG_ASSERT(it->second.length() == max_key_2_len);
sorted_barcodes.push_back(barcode_pair(it->first.sequence(),
it - barcodes.begin()));
}
std::sort(sorted_barcodes.begin(), sorted_barcodes.end());
return sorted_barcodes;
}
/** Adds a nucleotide sequence with a given ID to a quad-tree. */
void add_sequence_to_tree(demux_node_vec& tree,
const std::string& sequence,
const size_t barcode_id)
{
size_t node_idx = 0;
for (auto nuc: sequence) {
auto& node = tree.at(node_idx);
const auto nuc_idx = ACGT_TO_IDX(nuc);
auto child = node.children[nuc_idx];
if (child == -1) {
// New nodes are added to the end of the list; as barcodes are
// added in lexicographic order, this helps ensure that a set of
// dissimilar barcodes will be placed in mostly contiguous runs
// of the vector representation.
child = node.children[nuc_idx] = tree.size();
tree.push_back(demultiplexer_node());
}
node_idx = child;
}
tree.at(node_idx).barcodes.push_back(barcode_id);
}
/**
* Builds a sparse quad tree using the first sequence in a set of unique
* barcodes pairs; duplicate pairs will negatively impact the identification of
* these, since all hits will be considered ambiguous.
*/
demux_node_vec build_demux_tree(const fastq_pair_vec& barcodes)
{
// Step 1: Construct list of barcodes sorted by the mate 1 barcode;
// this allows construction of the sparse tree in one pass.
const barcode_vec sorted_barcodes = sort_barcodes(barcodes);
// Step 2: Create empty tree containing just the root node; creating
// the root here simplifies the 'add_sequence_to_tree' function.
demux_node_vec tree;
tree.push_back(demultiplexer_node());
// Step 3: Add each barcode to the tree, in sorted order
for (auto& pair: sorted_barcodes) {
add_sequence_to_tree(tree, pair.first, pair.second);
}
return tree;
}
///////////////////////////////////////////////////////////////////////////////
typedef std::vector<std::pair<int, size_t> > candidate_vec;
void rec_lookup_sequence_no_mm(candidate_vec& candidates,
const demux_node_vec& tree,
const std::string& seq,
size_t seq_pos = 0,
int parent = 0,
size_t mismatches = 0)
{
const demultiplexer_node& node = tree.at(parent);
for (const auto& barcode : node.barcodes) {
candidates.emplace_back(barcode, mismatches);
}
if (seq_pos < seq.length()) {
const size_t nt_idx = ACGT_TO_IDX(seq.at(seq_pos));
if (node.children[nt_idx] != -1) {
rec_lookup_sequence_no_mm(candidates, tree, seq, seq_pos + 1,
node.children[nt_idx], mismatches);
}
}
}
void rec_lookup_sequence(candidate_vec& candidates,
const demux_node_vec& tree,
const std::string& seq,
size_t max_mismatches,
size_t seq_pos = 0,
int parent = 0,
size_t mismatches = 0)
{
const demultiplexer_node& node = tree.at(parent);
for (const auto& barcode : node.barcodes) {
candidates.emplace_back(barcode, mismatches);
}
if (seq_pos < seq.length()) {
const size_t current_nt = ACGT_TO_IDX(seq.at(seq_pos));
for (size_t nt_idx = 0; nt_idx < 4; ++nt_idx) {
if (node.children[nt_idx] != -1) {
if (nt_idx == current_nt) {
rec_lookup_sequence(candidates, tree, seq, max_mismatches, seq_pos + 1, node.children[nt_idx], mismatches);
} else if (mismatches + 1 < max_mismatches) {
rec_lookup_sequence(candidates, tree, seq, max_mismatches, seq_pos + 1, node.children[nt_idx], mismatches + 1);
} else if (mismatches + 1 == max_mismatches) {
rec_lookup_sequence_no_mm(candidates, tree, seq, seq_pos + 1, node.children[nt_idx], mismatches + 1);
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
demultiplex_reads::demultiplex_reads(const userconfig* config)
: analytical_step(analytical_step::ordering::ordered)
, m_barcodes(config->adapters.get_barcodes())
, m_tree(build_demux_tree(m_barcodes))
, m_max_mismatches(config->barcode_mm)
, m_max_mismatches_r1(std::min<size_t>(config->barcode_mm, config->barcode_mm_r1))
, m_max_mismatches_r2(std::min<size_t>(config->barcode_mm, config->barcode_mm_r2))
, m_barcode_table(m_barcodes, config->barcode_mm, config->barcode_mm_r1, config->barcode_mm_r2)
, m_config(config)
, m_cache()
, m_unidentified_1(new fastq_output_chunk())
......@@ -247,84 +65,6 @@ demultiplex_reads::~demultiplex_reads()
}
size_t count_mismatches(const std::string& barcode,
const std::string& sequence,
const size_t max_mismatches)
{
std::string::const_iterator b_iter = barcode.begin();
std::string::const_iterator r2_iter = sequence.begin();
size_t mismatches = 0;
size_t n_check = std::min(barcode.length(), sequence.length());
if (n_check < barcode.length()) {
// Missing bases are considered mismatching
mismatches += barcode.length() - n_check;
}
while (n_check-- && mismatches <= max_mismatches) {
if (*b_iter++ != *r2_iter++) {
++mismatches;
}
}
return mismatches;
}
/**
* Returns the best matching barcode (pair) for sequences read_r1 and read_r2
*
*/
int demultiplex_reads::select_barcode(const fastq& read_r1, const fastq& read_r2) const
{
candidate_vec candidates;
if (m_max_mismatches_r1) {
rec_lookup_sequence(candidates, m_tree, read_r1.sequence(), m_max_mismatches_r1);
} else {
rec_lookup_sequence_no_mm(candidates, m_tree, read_r1.sequence());
}
int best_barcode = -1;
size_t min_mismatches = m_max_mismatches + 1;
for (auto& candidate : candidates) {
if (m_config->paired_ended_mode) {
const std::string& barcode = m_barcodes.at(candidate.first).second.sequence();
const size_t max_mismatches_r2 = std::min(m_max_mismatches - candidate.second,
m_max_mismatches_r2);
const size_t mismatches = count_mismatches(barcode,
read_r2.sequence(),
max_mismatches_r2);
if (mismatches > max_mismatches_r2) {
continue;
}
candidate.second += mismatches;
}
if (candidate.second < min_mismatches) {
best_barcode = candidate.first;
min_mismatches = candidate.second;
} else if (candidate.second == min_mismatches) {
// Ambiguous results; multiple best matches
best_barcode = -1;
}
}
if (best_barcode >= 0) {
return best_barcode;
} else if (min_mismatches == m_max_mismatches + 1) {
// No viable candidates
return -1;
} else {
// Ambiguous results
return -2;
}
}
chunk_vec demultiplex_reads::flush_cache(bool eof)
{
chunk_vec output;
......@@ -375,9 +115,8 @@ chunk_vec demultiplex_se_reads::process(analytical_chunk* chunk)
AR_DEBUG_LOCK(m_lock);
read_chunk_ptr read_chunk(dynamic_cast<fastq_read_chunk*>(chunk));
const fastq empty_read;
for (const auto& read : read_chunk->reads_1) {
const int best_barcode = select_barcode(read, empty_read);
const int best_barcode = m_barcode_table.identify(read);
if (best_barcode < 0) {
m_unidentified_1->add(*m_config->quality_output_fmt, read);
......@@ -417,7 +156,7 @@ chunk_vec demultiplex_pe_reads::process(analytical_chunk* chunk)
fastq_vec::iterator it_1 = read_chunk->reads_1.begin();
fastq_vec::iterator it_2 = read_chunk->reads_2.begin();
for (; it_1 != read_chunk->reads_1.end(); ++it_1, ++it_2) {
const int best_barcode = select_barcode(*it_1, *it_2);
const int best_barcode = m_barcode_table.identify(*it_1, *it_2);
if (best_barcode < 0) {
m_unidentified_1->add(*m_config->quality_output_fmt, *it_1);
......
......@@ -25,20 +25,16 @@
#ifndef DEMULTIPLEX_H
#define DEMULTIPLEX_H
#include "barcode_table.hpp"
#include "fastq.hpp"
#include "fastq_io.hpp"
#include "scheduler.hpp"
#include "statistics.hpp"
#include "fastq_io.hpp"
namespace ar
{
class userconfig;
struct demultiplexer_node;
typedef std::vector<demultiplexer_node> demux_node_vec;
/**
* Baseclass for demultiplexing of reads; responsible for building the quad-tree
......@@ -63,22 +59,10 @@ public:
demultiplex_reads& operator=(const demultiplex_reads&) = delete;
protected:
/**
* Returns the id of the best matching barcode(s), or -1 if no matches were
* found or if no single best match was found.
*/
int select_barcode(const fastq& read_r1, const fastq& read_r2) const;
//! List of barcode (pairs) supplied by caller
const fastq_pair_vec& m_barcodes;
//! Quad-tree representing all mate 1 adapters; for search with n mismatches
const demux_node_vec m_tree;
//! Maximum number of mismatches allowed between the mate 1 and mate 2 read
const size_t m_max_mismatches;
//! Maximum number of mismatches allowed for the mate 1 read
const size_t m_max_mismatches_r1;
//! Maximum number of mismatches allowed for the mate 2 read
const size_t m_max_mismatches_r2;
const barcode_table m_barcode_table;
//! Pointer to user settings used for output format for unidentified reads
const userconfig* m_config;
......
......@@ -139,7 +139,9 @@ size_t fastq::count_ns() const
}
fastq::ntrimmed fastq::trim_trailing_bases(const bool trim_ns, char low_quality)
fastq::ntrimmed fastq::trim_trailing_bases(const bool trim_ns,
char low_quality,
const bool preserve5p)
{
low_quality += PHRED_OFFSET_33;
auto is_quality_base = [&] (size_t i) {
......@@ -156,7 +158,7 @@ fastq::ntrimmed fastq::trim_trailing_bases(const bool trim_ns, char low_quality)
}
size_t left_inclusive = 0;
for (size_t i = 0; i < right_exclusive; ++i) {
for (size_t i = 0; !preserve5p && i < right_exclusive; ++i) {
if (is_quality_base(i)) {
left_inclusive = i;
break;
......@@ -188,7 +190,8 @@ size_t calculate_winlen(const size_t read_length, const double window_size)
fastq::ntrimmed fastq::trim_windowed_bases(const bool trim_ns,
char low_quality,
const double window_size)
const double window_size,
const bool preserve5p)
{
AR_DEBUG_ASSERT(window_size >= 0.0);
if (m_sequence.empty()) {
......@@ -235,6 +238,8 @@ fastq::ntrimmed fastq::trim_windowed_bases(const bool trim_ns,
if (left_inclusive == std::string::npos) {
// No starting window found. Trim all bases starting from start.
return trim_sequence_and_qualities(length(), length());
} else if (preserve5p) {
left_inclusive = 0;
}
AR_DEBUG_ASSERT(right_exclusive != std::string::npos);
......
......@@ -111,10 +111,12 @@ public:
*
* @param trim_ns If true, ambiguous bases ('N') are trimmed.
* @param low_quality Trim bases with a quality score at or below this value.
* @param preserve5p Only trim from the 3p end if true.
* @return A pair containing the number of 5' and 3' bases trimmed.
*/
ntrimmed trim_trailing_bases(const bool trim_ns = true,
char low_quality = -1);
char low_quality = -1,
const bool preserve5p = false);
/**
* Trims low-quality bases using a sliding window approach.
......@@ -122,11 +124,13 @@ public:
* @param trim_ns If true, ambiguous bases ('N') are trimmed.
* @param low_quality Trim bases with a quality score at or below this value.
* @param winlen The length of the sliding window.
* @param preserve5p Only trim from the 3p end if true.
* @return A pair containing the number of 5' and 3' bases trimmed.
*/
ntrimmed trim_windowed_bases(const bool trim_ns = true,
char low_quality = -1,
const double window_size = 0.1);
const double window_size = 0.1,
const bool preserve5p = false);
/**
......@@ -231,6 +235,15 @@ inline size_t ACGT_TO_IDX(char nt)
}
/**
* Inverse of ACGT_TO_IDX. Only values in the range 0 to 3 are allowed.
*/
inline char IDX_TO_ACGT(size_t idx)
{
return "ACTG"[idx];
}
///////////////////////////////////////////////////////////////////////////////
......
......@@ -428,6 +428,7 @@ chunk_vec bzip2_fastq::process(analytical_chunk* chunk)
std::pair<size_t, unsigned char*> output_buffer;
try {
input_buffer = build_input_buffer(file_chunk->reads);
file_chunk->reads.clear();
m_stream.avail_in = input_buffer.first;
m_stream.next_in = reinterpret_cast<char*>(input_buffer.second);
......@@ -651,16 +652,10 @@ static bool s_finalized = false;
write_fastq::write_fastq(const std::string& filename)
: analytical_step(analytical_step::ordering::ordered, true)
, m_output(filename.c_str(), std::ofstream::out | std::ofstream::binary)
, m_output(filename)
, m_eof(false)
, m_lock()
{
if (!m_output.is_open()) {
std::string message = std::string("Failed to open file '") + filename + "': ";
throw std::ofstream::failure(message + std::strerror(errno));
}
m_output.exceptions(std::ofstream::failbit | std::ofstream::badbit);
}
......@@ -668,7 +663,6 @@ chunk_vec write_fastq::process(analytical_chunk* chunk)
{
AR_DEBUG_LOCK(m_lock);
output_chunk_ptr file_chunk(dynamic_cast<fastq_output_chunk*>(chunk));
const string_vec& lines = file_chunk->reads;
if (m_eof) {
throw thread_error("write_fastq::process: received data after EOF");
......@@ -676,20 +670,11 @@ chunk_vec write_fastq::process(analytical_chunk* chunk)
m_eof = file_chunk->eof;
if (file_chunk->buffers.empty()) {
for (const auto& line : lines) {
m_output.write(line.data(), line.size());
}
m_output.write_strings(file_chunk->reads, m_eof);
} else {
buffer_vec& buffers = file_chunk->buffers;
for (const auto& buffer : buffers) {
if (buffer.first) {
m_output.write(reinterpret_cast<const char*>(buffer.second), buffer.first);
}
}
}
AR_DEBUG_ASSERT(file_chunk->reads.empty());
if (m_eof) {
m_output.flush();
m_output.write_buffers(file_chunk->buffers, m_eof);
}
std::lock_guard<std::mutex> lock(s_timer_lock);
......