Skip to content
Commits on Source (6)
......@@ -54,3 +54,4 @@ Makefile
diamond
src/extra/benchmark.cpp
src/extra/test.cpp
src/extra/seed_stat.cpp
\ No newline at end of file
......@@ -109,6 +109,9 @@ add_executable(diamond src/run/main.cpp
src/data/taxon_list.cpp
src/data/taxonomy_nodes.cpp
src/util/algo/MurmurHash3.cpp
src/search/stage0.cpp
src/util/memory/memory_pool.cpp
src/data/seed_array.cpp
)
if(EXTRA)
......
FROM alpine:latest as build-diamond
RUN apk add --no-cache gcc g++ cmake zlib-dev ninja
WORKDIR /opt/diamond
ADD . .
WORKDIR build
RUN cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_BUILD_MARCH=x86-64 ..
RUN ninja && ninja install
FROM alpine:latest
LABEL maintainer="Benjamin Buchfink <buchfink@gmail.com>"
RUN apk add --no-cache libstdc++ zlib
COPY --from=build-diamond /usr/local/bin/diamond /usr/local/bin/diamond
CMD ["diamond"]
\ No newline at end of file
......@@ -7,7 +7,7 @@ DIAMOND is a sequence aligner for protein and translated DNA searches, designed
- Low resource requirements and suitable for running on standard desktops or laptops.
- Various output formats, including BLAST pairwise, tabular and XML, as well as taxonomic classification.
Follow me on Twitter to get notified of updates.
Keep posted about new developments by following me on Twitter.
.. image:: https://image.ibb.co/gAmVKR/twitter1.png
:target: https://twitter.com/bbuchfink
......@@ -16,7 +16,7 @@ Follow me on Twitter to get notified of updates.
:target: https://gitter.im/diamond-aligner/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
.. image:: https://anaconda.org/bioconda/diamond/badges/downloads.svg
:target: https://anaconda.org/bioconda/diamond
.. image:: https://img.shields.io/badge/Google%20Scholar-491-blue.svg
.. image:: https://img.shields.io/badge/Google%20Scholar-523-blue.svg
:target: https://scholar.google.de/citations?user=kjPIF1cAAAAJ
Quick start guide
......@@ -25,7 +25,7 @@ Please read the `manual <https://github.com/bbuchfink/diamond/raw/master/diamond
Installing the software on your system may be done by downloading it in binary format for immediate use::
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.19/diamond-linux64.tar.gz
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.21/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted ``diamond`` binary file should be moved to a directory contained in your executable search path (PATH environment variable).
......
theme: jekyll-theme-cayman
\ No newline at end of file
......@@ -72,4 +72,7 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/data/taxon_list.cpp \
src/data/taxonomy_nodes.cpp \
src/util/algo/MurmurHash3.cpp \
src/search/stage0.cpp \
src/util/memory/memory_pool.cpp \
src/data/seed_array.cpp \
-lz -lpthread -o diamond
diamond-aligner (0.9.21+dfsg-1) unstable; urgency=medium
* New upstream version
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
-- Andreas Tille <tille@debian.org> Thu, 26 Apr 2018 15:10:07 +0200
diamond-aligner (0.9.19+dfsg-1) unstable; urgency=medium
* New upstream version
......
......@@ -6,9 +6,9 @@ Priority: optional
Build-Depends: debhelper (>= 11~),
cmake,
zlib1g-dev
Standards-Version: 4.1.3
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/diamond-aligner.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/diamond-aligner.git
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/diamond-aligner
Vcs-Git: https://salsa.debian.org/med-team/diamond-aligner.git
Homepage: https://github.com/bbuchfink/diamond
Package: diamond-aligner
......
[0.9.21]
- Fixed compiler errors on some systems.
[0.9.20]
- Added Bioconda installation instructions to manual.
- Added official docker release: https://hub.docker.com/r/buchfink/diamond/
- Fixed a bug that could cause corrupted output if compression was activated.
- Fixed an issue that could cause high memory usage by automatic use of the query-indexed algorithm.
[0.9.19]
- Fixed a bug in the --un function to write unaligned reads.
- Fixed an issue in the LCA algorithm that could cause an assignment to a higher node.
......@@ -47,7 +56,7 @@
- Added support for using the staxids output field in diamond view.
[0.9.8]
- Fixed a compiler errror.
- Fixed a compiler error.
[0.9.7]
- Fixed compiler errors.
......
......@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
const char* Const::version_string = "0.9.19";
const char* Const::version_string = "0.9.21";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
......
......@@ -61,7 +61,9 @@ Config::Config(int argc, const char **argv)
.add_command("test-extra", "")
.add_command("test-io", "")
.add_command("db-annot-stats", "")
.add_command("read-sim", "");
.add_command("read-sim", "")
.add_command("info", "")
.add_command("seed-stat", "");
Options_group general("General options");
general.add()
......@@ -234,7 +236,16 @@ Config::Config(int argc, const char **argv)
("rank-factor", 0, "", rank_factor, -1.0)
("transcript-len-estimate", 0, "", transcript_len_estimate, 1.0)
("no-traceback", 0, "", disable_traceback)
("family-counts", 0, "", family_counts_file);
("family-counts", 0, "", family_counts_file)
("radix-cluster-buffered", 0, "", radix_cluster_buffered)
("join-split-size", 0, "", join_split_size, 100000u)
("join-split-key-len", 0, "", join_split_key_len, 17u)
("radix-bits", 0, "", radix_bits, 8u)
("join-ht-factor", 0, "", join_ht_factor, 1.3)
("hash-join", 0, "", hash_join)
("sort-join", 0, "", sort_join)
("simple-freq", 0, "", simple_freq)
("freq-treshold", 0, "", freq_treshold);
parser.add(general).add(makedb).add(aligner).add(advanced).add(view_options).add(getseq_options).add(hidden_options);
parser.store(argc, argv, command);
......
......@@ -150,10 +150,19 @@ struct Config
double transcript_len_estimate;
string family_counts_file;
string taxonlist;
bool radix_cluster_buffered;
unsigned join_split_size;
unsigned join_split_key_len;
unsigned radix_bits;
double join_ht_factor;
bool hash_join;
bool sort_join;
bool simple_freq;
double freq_treshold;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18, dbinfo=19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23
match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta=18, dbinfo=19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25
};
unsigned command;
......
......@@ -23,7 +23,7 @@ struct Const
{
enum {
build_version = 120,
build_version = 122,
seedp_bits = 10,
seedp = 1<<seedp_bits,
max_seed_weight = 32,
......
......@@ -64,6 +64,11 @@ struct Reduction
return map_[(long)a];
}
unsigned operator()(size_t a) const
{
return map_[a];
}
const char* map8() const
{
return map8_;
......
......@@ -25,7 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
struct Seed_iterator
{
Seed_iterator(vector<char> &seq, const shape &sh):
Seed_iterator(vector<char> &seq, const Shape &sh):
ptr_ (seq.data()),
end_ (ptr_ + seq.size() - sh.length_ + 1)
{}
......@@ -33,7 +33,7 @@ struct Seed_iterator
{
return ptr_ < end_;
}
bool get(uint64_t &seed, const shape &sh)
bool get(uint64_t &seed, const Shape &sh)
{
return sh.set_seed_reduced(seed, ptr_++);
}
......@@ -44,7 +44,7 @@ private:
template<uint64_t _b>
struct Hashed_seed_iterator
{
Hashed_seed_iterator(const sequence &seq, const shape &sh):
Hashed_seed_iterator(const sequence &seq, const Shape &sh):
ptr_(seq.data()),
end_(ptr_ + seq.length()),
last_(0)
......
......@@ -54,7 +54,7 @@ struct Letter_trail
}
Letter_trail(const Reduction &reduction)
{
for (int i = 0; i < 20; ++i)
for (size_t i = 0; i < 20; ++i)
bucket[i] = reduction(i);
}
int operator()(char l) const
......@@ -84,10 +84,10 @@ struct Letter_trail
#define OPT_W 7
typedef Letter_trail Trail[OPT_W];
struct shape
struct Shape
{
shape():
Shape():
length_ (0),
weight_ (0),
d_ (0),
......@@ -96,7 +96,7 @@ struct shape
id_ (0)
{ memset(positions_, 0, sizeof(uint32_t)*Const::max_seed_weight); }
shape(const char *code, unsigned id):
Shape(const char *code, unsigned id):
weight_ (0),
mask_ (0),
rev_mask_ (0),
......@@ -186,7 +186,7 @@ struct shape
return true;
}
friend std::ostream& operator<<(std::ostream&s, const shape &sh)
friend std::ostream& operator<<(std::ostream&s, const Shape &sh)
{
for (unsigned i = 0; i < sh.length_; ++i)
s << ((sh.mask_ & (1 << i)) ? '1' : '0');
......
......@@ -41,18 +41,18 @@ public:
unsigned maxShapes = count == 0 ? Const::max_shapes : count;
for (unsigned i = 0; i < maxShapes; ++i)
if (shape_codes[mode_][i])
shapes_[n_++] = shape(shape_codes[mode_][i], i);
shapes_[n_++] = Shape(shape_codes[mode_][i], i);
}
else {
for (unsigned i = 0; i < (count == 0 ? shape_mask.size() : std::min((unsigned)shape_mask.size(), count)); ++i)
shapes_[n_++] = shape(shape_mask[i].c_str(), i);
shapes_[n_++] = Shape(shape_mask[i].c_str(), i);
}
}
unsigned count() const
{ return n_; }
const shape& operator[](size_t i) const
const Shape& operator[](size_t i) const
{ return shapes_[i]; }
unsigned mode() const
......@@ -67,7 +67,7 @@ public:
private:
shape shapes_[Const::max_shapes];
Shape shapes_[Const::max_shapes];
unsigned n_, mode_;
};
......
......@@ -24,6 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <string.h>
#include "../util/tinythread.h"
#include "../util/log_stream.h"
#include "../util/memory/memory_pool.h"
typedef uint64_t stat_type;
......@@ -33,7 +34,7 @@ struct Statistics
enum value {
SEED_HITS, TENTATIVE_MATCHES0, TENTATIVE_MATCHES1, TENTATIVE_MATCHES2, TENTATIVE_MATCHES3, TENTATIVE_MATCHES4, TENTATIVE_MATCHESX, MATCHES, ALIGNED, GAPPED, DUPLICATES,
GAPPED_HITS, QUERY_SEEDS, QUERY_SEEDS_HIT, REF_SEEDS, REF_SEEDS_HIT, QUERY_SIZE, REF_SIZE, OUT_HITS, OUT_MATCHES, COLLISION_LOOKUPS, QCOV, BIAS_ERRORS, SCORE_TOTAL, ALIGNED_QLEN, PAIRWISE, HIGH_SIM,
TEMP_SPACE, SECONDARY_HITS, ERASED_HITS, SQUARED_ERROR, CELLS, OUTRANKED_HITS, TARGET_HITS0, TARGET_HITS1, TARGET_HITS2, TIME_GREEDY_EXT, COUNT
TEMP_SPACE, SECONDARY_HITS, ERASED_HITS, SQUARED_ERROR, CELLS, OUTRANKED_HITS, TARGET_HITS0, TARGET_HITS1, TARGET_HITS2, TIME_GREEDY_EXT, LOW_COMPLEXITY_SEEDS, COUNT
};
Statistics()
......@@ -63,6 +64,7 @@ struct Statistics
{
//log_stream << "Used ref size = " << data_[REF_SIZE] << endl;
//log_stream << "Traceback errors = " << data_[BIAS_ERRORS] << endl;
log_stream << "Low complexity seeds = " << data_[LOW_COMPLEXITY_SEEDS] << endl;
log_stream << "Hits (filter stage 0) = " << data_[SEED_HITS] << endl;
log_stream << "Hits (filter stage 1) = " << data_[TENTATIVE_MATCHES1] << " (" << data_[TENTATIVE_MATCHES1]*100.0/ data_[SEED_HITS] << " %)" << endl;
log_stream << "Hits (filter stage 2) = " << data_[TENTATIVE_MATCHES2] << " (" << data_[TENTATIVE_MATCHES2] * 100.0 / data_[TENTATIVE_MATCHES1] << " %)" << endl;
......@@ -86,6 +88,7 @@ struct Statistics
log_stream << "MSE = " << (double)data_[SQUARED_ERROR] / (double)data_[OUT_HITS] << endl;
//log_stream << "Cells = " << data_[CELLS] << endl;
verbose_stream << "Temporary disk space used: " << (double)data_[TEMP_SPACE] / (1 << 30) << " GB" << endl;
log_stream << "Memory pool maximum allocation size: " << (double)MemoryPool::global().max_alloc_size() / (1 << 30) << " GB" << endl;
log_stream << "Outranked hits = " << data_[OUTRANKED_HITS] << " (" << data_[OUTRANKED_HITS]*100.0/ data_[PAIRWISE] << "%)" << endl;
message_stream << "Reported " << data_[PAIRWISE] << " pairwise alignments, " << data_[MATCHES] << " HSPs." << endl;
message_stream << data_[ALIGNED] << " queries aligned." << endl;
......
......@@ -19,6 +19,13 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <numeric>
#include "frequent_seeds.h"
#include "sorted_list.h"
#include "queries.h"
// #define MASK_FREQUENT
#ifdef MASK_FREQUENT
#include "../data/reference.h"
#endif
const double Frequent_seeds::hash_table_factor = 1.3;
Frequent_seeds frequent_seeds;
......@@ -47,7 +54,7 @@ void Frequent_seeds::compute_sd(Atomic<unsigned> *seedp, const sorted_list *ref_
struct Frequent_seeds::Build_context
{
Build_context(const sorted_list &ref_idx, const sorted_list &query_idx, const seedp_range &range, unsigned sid, unsigned ref_max_n, unsigned query_max_n, vector<unsigned> &counts) :
Build_context(const sorted_list &ref_idx, const sorted_list &query_idx, const SeedPartitionRange &range, unsigned sid, unsigned ref_max_n, unsigned query_max_n, vector<unsigned> &counts) :
ref_idx(ref_idx),
query_idx(query_idx),
range(range),
......@@ -69,6 +76,13 @@ struct Frequent_seeds::Build_context
merge_it.i.get(0)->value = 0;
n += (unsigned)merge_it.i.n;
buf.push_back(merge_it.i.key());
#ifdef MASK_FREQUENT
for (unsigned i = 0; i < merge_it.i.n; ++i) {
char *p = ref_seqs::get_nc().data(merge_it.i.get(i)->value);
for (unsigned j = 0; j < shapes[0].weight_; ++j)
p[shapes[0].positions_[j]] |= 128;
}
#endif
}
++merge_it;
}
......@@ -84,12 +98,12 @@ struct Frequent_seeds::Build_context
}
const sorted_list &ref_idx;
const sorted_list &query_idx;
const seedp_range range;
const SeedPartitionRange range;
const unsigned sid, ref_max_n, query_max_n;
vector<unsigned> &counts;
};
void Frequent_seeds::build(unsigned sid, const seedp_range &range, sorted_list &ref_idx, const sorted_list &query_idx)
void Frequent_seeds::build(unsigned sid, const SeedPartitionRange &range, sorted_list &ref_idx, const sorted_list &query_idx)
{
vector<Sd> ref_sds(range.size()), query_sds(range.size());
Atomic<unsigned> seedp (range.begin());
......@@ -107,3 +121,86 @@ void Frequent_seeds::build(unsigned sid, const seedp_range &range, sorted_list &
launch_scheduled_thread_pool(build_context, Const::seedp, config.threads_);
log_stream << "Masked positions = " << std::accumulate(counts.begin(), counts.end(), 0) << std::endl;
}
void Frequent_seeds::compute_sd2(Atomic<unsigned> *seedp, vector<JoinResult<SeedArray::Entry> >::iterator seed_hits, vector<Sd> *ref_out, vector<Sd> *query_out)
{
unsigned p;
while ((p = (*seedp)++) < current_range.end()) {
Sd ref_sd, query_sd;
for (JoinResult<SeedArray::Entry>::Iterator it = seed_hits[p - current_range.begin()].begin(); it.good(); ++it) {
query_sd.add((double)it.r.count());
ref_sd.add((double)it.s.count());
}
(*ref_out)[p - current_range.begin()] = ref_sd;
(*query_out)[p - current_range.begin()] = query_sd;
}
}
struct Frequent_seeds::Build_context2
{
Build_context2(vector<JoinResult<SeedArray::Entry> >::iterator seed_hits, const SeedPartitionRange &range, unsigned sid, unsigned ref_max_n, unsigned query_max_n, vector<unsigned> &counts) :
seed_hits(seed_hits),
range(range),
sid(sid),
ref_max_n(ref_max_n),
query_max_n(query_max_n),
counts(counts)
{ }
void operator()(unsigned thread_id, unsigned seedp)
{
if (!range.contains(seedp))
return;
vector<uint32_t> buf;
size_t n = 0;
for (JoinResult<SeedArray::Entry>::Iterator it = seed_hits[seedp - range.begin()].begin(); it.good(); ++it) {
if (it.s.count() > ref_max_n || it.r.count() > query_max_n) {
it.s[0] = 0;
n += (unsigned)it.s.count();
Packed_seed s;
shapes[sid].set_seed(s, query_seqs::get().data(it.r[0]));
buf.push_back(seed_partition_offset(s));
#ifdef MASK_FREQUENT
for (unsigned i = 0; i < merge_it.i.n; ++i) {
char *p = ref_seqs::get_nc().data(merge_it.i.get(i)->value);
for (unsigned j = 0; j < shapes[0].weight_; ++j)
p[shapes[0].positions_[j]] |= 128;
}
#endif
}
}
const size_t ht_size = std::max((size_t)(buf.size() * hash_table_factor), buf.size() + 1);
PHash_set<void, murmur_hash> hash_set(ht_size);
for (vector<uint32_t>::const_iterator i = buf.begin(); i != buf.end(); ++i)
hash_set.insert(*i);
frequent_seeds.tables_[sid][seedp] = hash_set;
counts[seedp] = (unsigned)n;
}
const vector<JoinResult<SeedArray::Entry> >::iterator seed_hits;
const SeedPartitionRange range;
const unsigned sid, ref_max_n, query_max_n;
vector<unsigned> &counts;
};
void Frequent_seeds::build(unsigned sid, const SeedPartitionRange &range, vector<JoinResult<SeedArray::Entry> >::iterator seed_hits)
{
vector<Sd> ref_sds(range.size()), query_sds(range.size());
Atomic<unsigned> seedp(range.begin());
Thread_pool threads;
for (unsigned i = 0; i < config.threads_; ++i)
threads.push_back(launch_thread(compute_sd2, &seedp, seed_hits, &ref_sds, &query_sds));
threads.join_all();
Sd ref_sd(ref_sds), query_sd(query_sds);
const unsigned ref_max_n = (unsigned)(ref_sd.mean() + config.freq_sd*ref_sd.sd()), query_max_n = (unsigned)(query_sd.mean() + config.freq_sd*query_sd.sd());
log_stream << "Seed frequency mean (reference) = " << ref_sd.mean() << ", SD = " << ref_sd.sd() << endl;
log_stream << "Seed frequency mean (query) = " << query_sd.mean() << ", SD = " << query_sd.sd() << endl;
vector<unsigned> counts(Const::seedp);
Build_context2 build_context(seed_hits, range, sid, ref_max_n, query_max_n, counts);
launch_scheduled_thread_pool(build_context, Const::seedp, config.threads_);
log_stream << "Masked positions = " << std::accumulate(counts.begin(), counts.end(), 0) << std::endl;
}
\ No newline at end of file
......@@ -22,11 +22,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../basic/const.h"
#include "../util/hash_table.h"
#include "sorted_list.h"
#include "seed_array.h"
#include "../util/algo/join_result.h"
struct Frequent_seeds
{
void build(unsigned sid, const seedp_range &range, sorted_list &ref_idx, const sorted_list &query_idx);
void build(unsigned sid, const SeedPartitionRange &range, sorted_list &ref_idx, const sorted_list &query_idx);
void build(unsigned sid, const SeedPartitionRange &range, vector<JoinResult<SeedArray::Entry> >::iterator seed_hits);
bool get(const Letter *pos, unsigned sid) const
{
......@@ -42,8 +45,10 @@ private:
static const double hash_table_factor;
struct Build_context;
struct Build_context2;
static void compute_sd(Atomic<unsigned> *seedp, const sorted_list *ref_idx, const sorted_list *query_idx, vector<Sd> *ref_out, vector<Sd> *query_out);
static void compute_sd2(Atomic<unsigned> *seedp, vector<JoinResult<SeedArray::Entry> >::iterator seed_hits, vector<Sd> *ref_out, vector<Sd> *query_out);
PHash_set<void,murmur_hash> tables_[Const::max_shapes][Const::seedp];
......