Skip to content

Commits on Source 4

......@@ -112,6 +112,7 @@ add_executable(diamond src/run/main.cpp
src/search/stage0.cpp
src/util/memory/memory_pool.cpp
src/data/seed_array.cpp
src/output/paf_format.cpp
)
if(EXTRA)
......
......@@ -16,7 +16,7 @@ Keep posted about new developments by following me on Twitter.
:target: https://gitter.im/diamond-aligner/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. image:: https://anaconda.org/bioconda/diamond/badges/downloads.svg
:target: https://anaconda.org/bioconda/diamond
.. image:: https://img.shields.io/badge/Google%20Scholar-523-blue.svg
.. image:: https://img.shields.io/badge/Google%20Scholar-546-blue.svg
:target: https://scholar.google.de/citations?user=kjPIF1cAAAAJ
Quick start guide
......@@ -25,7 +25,7 @@ Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond
Installing the software on your system may be done by downloading it in binary format for immediate use::
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.21/diamond-linux64.tar.gz
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.22/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable).
......
......@@ -75,4 +75,5 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/search/stage0.cpp \
src/util/memory/memory_pool.cpp \
src/data/seed_array.cpp \
src/output/paf_format.cpp \
-lz -lpthread -o diamond
diamond-aligner (0.9.22+dfsg-1) unstable; urgency=medium
* New upstream version
-- Andreas Tille <tille@debian.org> Tue, 22 May 2018 15:54:27 +0200
diamond-aligner (0.9.21+dfsg-1) unstable; urgency=medium
* New upstream version
......
[0.9.22]
- Added output field full_sseq to tabular output format.
- Database sequences that exceed the maximum accession length will no longer cause an error.
- Added support for PAF output format.
- Optimized performance of database taxonomy filtering.
[0.9.21]
- Fixed compiler errors on some systems.
......
......@@ -195,8 +195,9 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Me
|| score_matrix.bitscore(targets[i].filter_score) < config.min_bit_score)
break;
const unsigned subject_len = (unsigned)ref_seqs::get()[targets[i].subject_id].length();
const char *ref_title = ref_ids::get()[targets[i].subject_id].c_str();
const size_t subject_id = targets[i].subject_id;
const unsigned subject_len = (unsigned)ref_seqs::get()[subject_id].length();
const char *ref_title = ref_ids::get()[subject_id].c_str();
targets[i].apply_filters(source_query_len, subject_len, query_title, ref_title);
if (targets[i].hsps.size() == 0)
continue;
......@@ -219,7 +220,7 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Me
if (blocked_processing) {
if (n_hsp == 0)
seek_pos = IntermediateRecord::write_query_intro(buffer, query_id);
IntermediateRecord::write(buffer, *j, query_id, targets[i].subject_id);
IntermediateRecord::write(buffer, *j, query_id, subject_id);
}
else {
if (n_hsp == 0) {
......@@ -229,18 +230,19 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Me
f->print_query_intro(query_id, query_title, source_query_len, buffer, false);
}
if (*f == Output_format::daa)
write_daa_record(buffer, *j, query_id, targets[i].subject_id);
write_daa_record(buffer, *j, query_id, subject_id);
else
f->print_match(Hsp_context(*j,
query_id,
translated_query,
query_title,
targets[i].subject_id,
ReferenceDictionary::get().block_to_database_id(targets[i].subject_id),
subject_id,
ReferenceDictionary::get().block_to_database_id(subject_id),
ref_title,
subject_len,
n_target_seq,
hit_hsps), metadata, buffer);
hit_hsps,
ref_seqs::get()[subject_id]), metadata, buffer);
}
++n_hsp;
......
......@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
const char* Const::version_string = "0.9.21";
const char* Const::version_string = "0.9.22";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
......
......@@ -88,6 +88,7 @@ Config::Config(int argc, const char **argv)
\tsend means End of alignment in subject\n\
\tqseq means Aligned part of query sequence\n\
\tsseq means Aligned part of subject sequence\n\
\tfull_sseq means Full subject sequence\n\
\tevalue means Expect value\n\
\tbitscore means Bit score\n\
\tscore means Raw score\n\
......
......@@ -159,6 +159,7 @@ struct Config
bool sort_join;
bool simple_freq;
double freq_treshold;
bool use_lazy_dict;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
......
......@@ -23,7 +23,7 @@ struct Const
{
enum {
build_version = 122,
build_version = 123,
seedp_bits = 10,
seedp = 1<<seedp_bits,
max_seed_weight = 32,
......
......@@ -224,7 +224,8 @@ struct Hsp_context
const char *subject_name,
unsigned subject_len,
unsigned hit_num,
unsigned hsp_num
unsigned hsp_num,
const sequence &subject_seq
) :
query(query),
query_name(query_name),
......@@ -235,6 +236,7 @@ struct Hsp_context
subject_len(subject_len),
hit_num(hit_num),
hsp_num(hsp_num),
subject_seq(subject_seq),
hsp_(hsp)
{}
struct Iterator : public Hsp::Iterator
......@@ -356,6 +358,7 @@ struct Hsp_context
const TranslatedSequence query;
const char *query_name, *subject_name;
const unsigned query_id, subject_id, orig_subject_id, subject_len, hit_num, hsp_num;
const sequence subject_seq;
private:
Hsp &hsp_;
};
......
......@@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "value.h"
#include "../util/util.h"
#include "sequence.h"
#include "translate.h"
using std::string;
using std::vector;
......@@ -37,6 +38,7 @@ struct Reduction
memset(map_, 0, sizeof(map_));
memset(map8_, 0, sizeof(map8_));
map_[(long)value_traits.mask_char] = value_traits.mask_char;
map_[(long)Translator::STOP] = value_traits.mask_char;
const vector<string> tokens(tokenize(definition_string, " "));
size_ = (unsigned)tokens.size();
bit_size_ = (uint64_t)ceil(log(size_) / log(2));
......
......@@ -27,6 +27,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "reduction.h"
#include "../util/util.h"
#include "config.h"
#include "translate.h"
/*struct All_partitions {};
struct Filter_partition {};
......@@ -132,7 +133,8 @@ struct Shape
#endif
for (unsigned i = 0; i < weight_; ++i) {
Letter l = seq[positions_[i]];
if (l == value_traits.mask_char || l == sequence::DELIMITER)
if (l == value_traits.mask_char || l == sequence::DELIMITER || l == Translator::STOP)
//if (l == value_traits.mask_char || l == sequence::DELIMITER)
return false;
unsigned r = Reduction::reduction(l);
#ifdef FREQUENCY_MASKING
......@@ -153,7 +155,8 @@ struct Shape
const uint64_t b = Reduction::reduction.bit_size();
for (unsigned i = 0; i < weight_; ++i) {
Letter l = seq[positions_[i]];
if (l == value_traits.mask_char || l == sequence::DELIMITER)
if (l == value_traits.mask_char || l == sequence::DELIMITER || l == Translator::STOP)
//if (l == value_traits.mask_char || l == sequence::DELIMITER)
return false;
unsigned r = Reduction::reduction(l);
s <<= b;
......
......@@ -21,15 +21,27 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "taxon_list.h"
#include "taxonomy_nodes.h"
#include "taxonomy_filter.h"
struct Metadata
{
Metadata():
taxon_list(NULL),
taxon_nodes(NULL)
taxon_nodes(NULL),
taxon_filter(NULL)
{}
void free()
{
delete taxon_list;
delete taxon_nodes;
delete taxon_filter;
taxon_list = NULL;
taxon_nodes = NULL;
taxon_filter = NULL;
}
TaxonList *taxon_list;
TaxonomyNodes *taxon_nodes;
TaxonomyFilter *taxon_filter;
};
#endif
\ No newline at end of file
......@@ -16,10 +16,13 @@ You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <utility>
#include "reference.h"
#include "ref_dictionary.h"
#include "../util/util.h"
using std::pair;
ReferenceDictionary ReferenceDictionary::instance_;
string* get_allseqids(const char *s)
......@@ -34,6 +37,15 @@ string* get_allseqids(const char *s)
return r;
}
void ReferenceDictionary::clear()
{
data_.clear();
len_.clear();
database_id_.clear();
name_.clear();
next_ = 0;
}
void ReferenceDictionary::init(unsigned ref_count, const vector<unsigned> &block_to_database_id)
{
const unsigned block = current_ref_block;
......@@ -44,7 +56,7 @@ void ReferenceDictionary::init(unsigned ref_count, const vector<unsigned> &block
block_to_database_id_ = &block_to_database_id;
}
uint32_t ReferenceDictionary::get(unsigned block, unsigned block_id)
uint32_t ReferenceDictionary::get(unsigned block, size_t block_id)
{
uint32_t n = data_[block][block_id];
if (n != std::numeric_limits<uint32_t>::max())
......@@ -72,6 +84,28 @@ uint32_t ReferenceDictionary::get(unsigned block, unsigned block_id)
return n;
}
void ReferenceDictionary::build_lazy_dict(DatabaseFile &db_file)
{
vector<bool> filter(db_file.ref_header.sequences);
vector<pair<unsigned, unsigned> > m;
const size_t dict_size = database_id_.size();
m.reserve(dict_size);
unsigned n = 0;
for (vector<uint32_t>::const_iterator i = database_id_.begin(); i < database_id_.end(); ++i) {
filter[*i] = true;
m.push_back(std::make_pair(*i, n++));
}
db_file.rewind();
vector<unsigned> block_to_database_id;
db_file.load_seqs(block_to_database_id, std::numeric_limits<size_t>::max(), false, false, &filter);
std::sort(m.begin(), m.end());
dict_to_lazy_dict_id_.clear();
dict_to_lazy_dict_id_.resize(dict_size);
n = 0;
for (vector<pair<unsigned, unsigned> >::const_iterator i = m.begin(); i < m.end(); ++i)
dict_to_lazy_dict_id_[i->second] = n++;
}
/*void ReferenceDictionary::init_rev_map()
{
rev_map_.resize(next_);
......
......@@ -26,6 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/tinythread.h"
#include "../util/ptr_vector.h"
#include "../util/io/output_file.h"
#include "reference.h"
using std::vector;
using std::string;
......@@ -39,7 +40,9 @@ struct ReferenceDictionary
void init(unsigned ref_count, const vector<unsigned> &block_to_database_id);
uint32_t get(unsigned block, unsigned i);
uint32_t get(unsigned block, size_t i);
void build_lazy_dict(DatabaseFile &db_file);
void clear();
unsigned length(uint32_t i) const
{
......@@ -51,6 +54,11 @@ struct ReferenceDictionary
return name_[i].c_str();
}
sequence seq(size_t i) const
{
return ref_seqs::get()[dict_to_lazy_dict_id_[i]];
}
//void init_rev_map();
unsigned database_id(unsigned dict_id) const
......@@ -58,7 +66,7 @@ struct ReferenceDictionary
return database_id_[dict_id];
}
unsigned block_to_database_id(unsigned block_id) const
unsigned block_to_database_id(size_t block_id) const
{
return (*block_to_database_id_)[block_id];
}
......@@ -90,6 +98,7 @@ private:
PtrVector<string> name_;
//vector<uint32_t> rev_map_;
uint32_t next_;
vector<uint32_t> dict_to_lazy_dict_id_;
const vector<unsigned> *block_to_database_id_;
friend void finish_daa(OutputFile&, const DatabaseFile&);
......
......@@ -219,20 +219,17 @@ void make_db()
message_stream << "Total time = " << total.getElapsedTimeInSec() << "s" << endl;
}
bool DatabaseFile::load_seqs(const Metadata &metadata, vector<unsigned> &block_to_database_id)
bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, bool load_ids, const vector<bool> *filter)
{
task_timer timer("Loading reference sequences");
const size_t max_letters = (size_t)(config.chunk_size*1e9);
seek(pos_array_offset);
size_t database_id = (pos_array_offset - ref_header.pos_array_offset) / sizeof(Pos_record);
size_t letters = 0, seqs = 0, id_letters = 0, seqs_processed = 0;
vector<uint64_t> filtered_pos;
const set<unsigned> taxon_filter_list(parse_csv(config.taxonlist));
const bool filtered = !taxon_filter_list.empty();
block_to_database_id.clear();
ref_seqs::data_ = new Sequence_set;
ref_ids::data_ = new String_set<0>;
if(load_ids) ref_ids::data_ = new String_set<0>;
Pos_record r;
read(&r, 1);
......@@ -242,15 +239,15 @@ bool DatabaseFile::load_seqs(const Metadata &metadata, vector<unsigned> &block_t
while (r.seq_len > 0 && letters < max_letters) {
Pos_record r_next;
read(&r_next, 1);
if (!filtered || metadata.taxon_nodes->contained((*metadata.taxon_list)[database_id], taxon_filter_list)) {
if (!filter || (*filter)[database_id]) {
letters += r.seq_len;
ref_seqs::data_->reserve(r.seq_len);
const size_t id_len = r_next.pos - r.pos - r.seq_len - 3;
id_letters += id_len;
ref_ids::data_->reserve(id_len);
if (load_ids) ref_ids::data_->reserve(id_len);
++seqs;
block_to_database_id.push_back((unsigned)database_id);
if (filtered) filtered_pos.push_back(last ? 0 : r.pos);
if (filter) filtered_pos.push_back(last ? 0 : r.pos);
last = true;
}
else
......@@ -263,23 +260,28 @@ bool DatabaseFile::load_seqs(const Metadata &metadata, vector<unsigned> &block_t
if (seqs == 0) {
delete ref_seqs::data_;
delete ref_ids::data_;
ref_seqs::data_ = NULL;
if (load_ids) delete ref_ids::data_;
ref_ids::data_ = NULL;
return false;
}
ref_seqs::data_->finish_reserve();
ref_ids::data_->finish_reserve();
if(load_ids) ref_ids::data_->finish_reserve();
seek(start_offset);
size_t masked = 0;
size_t masked_letters = 0;
for (size_t n = 0; n < seqs; ++n) {
if (filtered && filtered_pos[n]) seek(filtered_pos[n]);
if (filter && filtered_pos[n]) seek(filtered_pos[n]);
read(ref_seqs::data_->ptr(n) - 1, ref_seqs::data_->length(n) + 2);
*(ref_seqs::data_->ptr(n) - 1) = sequence::DELIMITER;
*(ref_seqs::data_->ptr(n) + ref_seqs::data_->length(n)) = sequence::DELIMITER;
if (load_ids)
read(ref_ids::data_->ptr(n), ref_ids::data_->length(n) + 1);
if (config.masking == 1)
Masking::get().bit_to_hard_mask(ref_seqs::data_->ptr(n), ref_seqs::data_->length(n), masked);
else
if (!seek_forward('\0')) throw std::runtime_error("Unexpected end of file.");
if (config.masking == 1 && masked)
Masking::get().bit_to_hard_mask(ref_seqs::data_->ptr(n), ref_seqs::data_->length(n), masked_letters);
else
Masking::get().remove_bit_mask(ref_seqs::data_->ptr(n), ref_seqs::data_->length(n));
if (!config.sfilt.empty() && strstr(ref_ids::get()[n].c_str(), config.sfilt.c_str()) == 0)
......@@ -287,7 +289,7 @@ bool DatabaseFile::load_seqs(const Metadata &metadata, vector<unsigned> &block_t
}
timer.finish();
ref_seqs::get().print_stats();
log_stream << "Masked letters = " << masked << endl;
log_stream << "Masked letters = " << masked_letters << endl;
blocked_processing = seqs_processed < ref_header.sequences;
return true;
......
......@@ -73,7 +73,7 @@ struct DatabaseFile : public InputFile
DatabaseFile();
static void read_header(InputFile &stream, ReferenceHeader &header);
void rewind();
bool load_seqs(const Metadata &metadata, vector<unsigned> &block_to_database_id);
bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, bool load_ids = true, const vector<bool> *filter = NULL);
void get_seq();
void read_seq(string &id, vector<char> &seq);
bool has_taxon_id_lists();
......
......@@ -36,8 +36,13 @@ void TaxonList::build(OutputFile &db, FileBackedBuffer &accessions, size_t seqs)
size_t mapped = 0, mappings = 0;
for (size_t i = 0; i < seqs; ++i) {
accessions >> a;
for (vector<string>::const_iterator j = a.begin(); j < a.end(); ++j)
for (vector<string>::const_iterator j = a.begin(); j < a.end(); ++j) {
try {
t.insert(taxonomy.get(Taxonomy::Accession(j->c_str())));
}
catch (AccessionLengthError &) {
}
}
t.erase(0);
db << t;
mappings += t.size();
......
......@@ -29,6 +29,13 @@ using std::string;
string get_accession(const string &t);
struct AccessionLengthError : public std::runtime_error
{
AccessionLengthError() :
std::runtime_error("Accession exceeds maximum length.")
{ }
};
struct Taxonomy
{
enum { max_accesion_len = 14 };
......@@ -37,7 +44,7 @@ struct Taxonomy
Accession(const char *s)
{
if (strlen(s) > max_accesion_len)
throw std::runtime_error("Accession exceeds maximum length.");
throw AccessionLengthError();
strncpy(this->s, s, max_accesion_len);
}
Accession(const string &s)
......@@ -46,7 +53,7 @@ struct Taxonomy
get_accession(t);
if (t.length() > max_accesion_len) {
//this->s[0] = 0;
throw std::runtime_error("Accession exceeds maximum length.");
throw AccessionLengthError();
}
else
strncpy(this->s, t.c_str(), max_accesion_len);
......