Skip to content
Commits on Source (4)
......@@ -3,6 +3,18 @@ project (DIAMOND)
option(BUILD_STATIC "BUILD_STATIC" OFF)
option(EXTRA "EXTRA" OFF)
option(STATIC_LIBGCC "STATIC_LIBGCC" OFF)
option(STATIC_LIBSTDC++ "STATIC_LIBSTDC++" OFF)
option(SSSE3 "SSSE3" OFF)
option(POPCNT "POPCNT" OFF)
IF(STATIC_LIBSTDC++)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++")
endif()
IF(STATIC_LIBGCC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libgcc")
endif()
if(BUILD_STATIC)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
......@@ -12,7 +24,17 @@ endif()
set(CMAKE_CXX_STANDARD 11)
if(CMAKE_BUILD_MARCH)
if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++11")
endif()
if(SSSE3)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mssse3")
if(POPCNT)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt")
endif()
elseif(CMAKE_BUILD_MARCH)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${CMAKE_BUILD_MARCH}")
else()
include(CheckCXXCompilerFlag)
......@@ -123,6 +145,7 @@ add_executable(diamond src/run/main.cpp
src/run/cluster.cpp
src/util/algo/greedy_vortex_cover.cpp
src/util/algo/greedy_vortex_cover_weighted.cpp
src/util/sequence/sequence.cpp
)
if(EXTRA)
......
......@@ -32,7 +32,7 @@ quick example for setting up and using the program on Linux.
Installing the software on your system may be done by downloading it in
binary format for immediate use:
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.23/diamond-linux64.tar.gz
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted `diamond` binary file should be moved to a directory
......
......@@ -80,4 +80,5 @@ g++ -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/run/cluster.cpp \
src/util/algo/greedy_vortex_cover.cpp \
src/util/algo/greedy_vortex_cover_weighted.cpp \
src/util/sequence/sequence.cpp \
-lz -lpthread -o diamond
diamond-aligner (0.9.24+dfsg-1) unstable; urgency=medium
* New upstream version
-- Andreas Tille <tille@debian.org> Sat, 29 Dec 2018 21:10:22 +0100
diamond-aligner (0.9.23+dfsg-2) unstable; urgency=medium
* Restrict architecture to amd64
......
[0.9.24]
- Fixed a compiler error on macOS.
- Added --header option to output header for tabular output format.
- The quality string output in tabular format (qqual field) is clipped to the aligned part of the query.
- Print '*' as quality string if quality values are not available in tabular output format.
- Added field 'full_qqual' to print unclipped query quality values to the tabular format.
- Added field 'full_qseq' to print unclipped query sequence to the tabular format.
- Added support for using the hyphen character '-' to denote the standard input for input file parameters.
- Status messages are written to stderr.
- Fixed a bug that could incorrectly report queries as unaligned in the output of the --un option.
- Added option '--al' to write aligned queries to file.
- Added options '--alfmt' and '--unfmt' to set the format of the aligned/unaligned query file (supported values: fasta, fastq).
[0.9.23]
- Fixed an issue that could cause too high memory usage.
- Added output field 'qqual' to print query FASTQ quality values to the tabular format.
......
......@@ -105,8 +105,11 @@ void align_worker(size_t thread_id, const Parameters *params, const Metadata *me
if (*output_format != Output_format::null) {
buf = new TextBuffer;
const bool aligned = mapper->generate_output(*buf, stat, *metadata);
if (aligned && (!config.unaligned.empty() || !config.aligned_file.empty()))
if (aligned && (!config.unaligned.empty() || !config.aligned_file.empty())) {
query_aligned_mtx.lock();
query_aligned[hits.query] = true;
query_aligned_mtx.unlock();
}
}
delete mapper;
OutputSink::get().push(hits.query, buf);
......
......@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
const char* Const::version_string = "0.9.23";
const char* Const::version_string = "0.9.24";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
......
......@@ -70,7 +70,8 @@ Config::Config(int argc, const char **argv)
.add_command("seed-stat", "")
.add_command("smith-waterman", "")
.add_command("protein-snps", "")
.add_command("cluster", "");
.add_command("cluster", "")
.add_command("translate", "");
Options_group general("General options");
general.add()
......@@ -94,8 +95,9 @@ Config::Config(int argc, const char **argv)
\tsstart means Start of alignment in subject\n\
\tsend means End of alignment in subject\n\
\tqseq means Aligned part of query sequence\n\
\tfull_qseq means Query sequence\n\
\tsseq means Aligned part of subject sequence\n\
\tfull_sseq means Full subject sequence\n\
\tfull_sseq means Subject sequence\n\
\tevalue means Expect value\n\
\tbitscore means Bit score\n\
\tscore means Raw score\n\
......@@ -118,7 +120,8 @@ Config::Config(int argc, const char **argv)
\n\tDefault: qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore", output_format)
("verbose", 'v', "verbose console output", verbose)
("log", 0, "enable debug log", debug_log)
("quiet", 0, "disable console output", quiet);
("quiet", 0, "disable console output", quiet)
("header", 0, "Write header lines to blast tabular format.", output_header);
Options_group makedb("Makedb options");
makedb.add()
......@@ -130,6 +133,8 @@ Config::Config(int argc, const char **argv)
("strand", 0, "query strands to search (both/minus/plus)", query_strands, string("both"))
("un", 0, "file for unaligned queries", unaligned)
("al", 0, "file or aligned queries", aligned_file)
("unfmt", 0, "format of unaligned query file (fasta/fastq)", unfmt, string("fasta"))
("alfmt", 0, "format of aligned query file (fasta/fastq)", alfmt, string("fasta"))
("unal", 0, "report unaligned queries (0=no, 1=yes)", report_unaligned, -1)
("max-target-seqs", 'k', "maximum number of target sequences to report alignments for", max_alignments, uint64_t(25))
("top", 0, "report alignments within this percentage range of top alignment score (overrides --max-target-seqs)", toppercent, 100.0)
......@@ -342,7 +347,7 @@ Config::Config(int argc, const char **argv)
;
}
invocation = join(' ', vector<string>(&argv[0], &argv[argc]));
invocation = join(" ", vector<string>(&argv[0], &argv[argc]));
log_stream << invocation << endl;
if (!no_auto_append) {
......@@ -423,6 +428,11 @@ Config::Config(int argc, const char **argv)
if (query_strands != "both" && query_strands != "minus" && query_strands != "plus")
throw std::runtime_error("Invalid value for parameter --strand");
if (unfmt == "fastq" || alfmt == "fastq")
store_query_quality = true;
if (!aligned_file.empty())
log_stream << "Aligned file format: " << alfmt << endl;
/*log_stream << "sizeof(hit)=" << sizeof(hit) << " sizeof(packed_uint40_t)=" << sizeof(packed_uint40_t)
<< " sizeof(sorted_list::entry)=" << sizeof(sorted_list::entry) << endl;*/
......
......@@ -168,11 +168,14 @@ struct Config
unsigned swipe_chunk_size;
unsigned query_parallel_limit;
bool long_reads;
bool output_header;
string alfmt;
string unfmt;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta = 18, dbinfo = 19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25,
smith_waterman = 26, protein_snps = 27, cluster = 28
smith_waterman = 26, protein_snps = 27, cluster = 28, translate = 29
};
unsigned command;
......
......@@ -23,7 +23,7 @@ struct Const
{
enum {
build_version = 124,
build_version = 125,
seedp_bits = 10,
seedp = 1<<seedp_bits,
max_seed_weight = 32,
......
......@@ -86,6 +86,9 @@ struct sequence
}
bool empty() const
{ return len_ == 0; }
bool present() const {
return len_ > 0;
}
const char* c_str() const
{ return reinterpret_cast<const char*>(data_); }
size_t print(char *ptr, unsigned begin, unsigned len) const
......
......@@ -17,6 +17,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include "queries.h"
#include "../util/sequence/sequence.h"
#include "../basic/config.h"
using namespace std;
unsigned current_query_chunk;
Sequence_set* query_source_seqs::data_ = 0;
......@@ -24,6 +28,7 @@ Sequence_set* query_seqs::data_ = 0;
String_set<0>* query_ids::data_ = 0;
Partitioned_histogram query_hst;
vector<bool> query_aligned;
tthread::mutex query_aligned_mtx;
Seed_set *query_seeds = 0;
Hashed_seed_set *query_seeds_hashed = 0;
String_set<0> *query_qual = nullptr;
......@@ -35,11 +40,12 @@ void write_unaligned(OutputFile *file)
TextBuffer buf;
for (size_t i = 0; i < n; ++i) {
if (!query_aligned[i]) {
buf << '>' << query_ids::get()[i].c_str() << '\n';
(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i]).print(buf, input_value_traits);
buf << '\n';
file->write(buf.get_begin(), buf.size());
buf.clear();
Util::Sequence::format(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i],
query_ids::get()[i].c_str(),
query_qual ? (*query_qual)[i].c_str() : nullptr,
*file,
config.unfmt,
input_value_traits);
}
}
}
......@@ -50,11 +56,12 @@ void write_aligned(OutputFile *file)
TextBuffer buf;
for (size_t i = 0; i < n; ++i) {
if (query_aligned[i]) {
buf << '>' << query_ids::get()[i].c_str() << '\n';
(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i]).print(buf, input_value_traits);
buf << '\n';
file->write(buf.get_begin(), buf.size());
buf.clear();
Util::Sequence::format(align_mode.query_translated ? query_source_seqs::get()[i] : query_seqs::get()[i],
query_ids::get()[i].c_str(),
query_qual ? (*query_qual)[i].c_str() : nullptr,
*file,
config.alfmt,
input_value_traits);
}
}
}
\ No newline at end of file
......@@ -25,6 +25,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../basic/statistics.h"
#include "sequence_set.h"
#include "seed_set.h"
#include "../util/tinythread.h"
extern Partitioned_histogram query_hst;
extern unsigned current_query_chunk;
......@@ -50,6 +51,7 @@ struct query_ids
static String_set<0> *data_;
};
extern tthread::mutex query_aligned_mtx;
extern vector<bool> query_aligned;
extern String_set<0> *query_qual;
......
......@@ -401,12 +401,19 @@ void db_info()
db_file.close();
}
DatabaseFile* DatabaseFile::auto_create_from_fasta() {
InputFile db_file(config.database);
bool DatabaseFile::is_diamond_db(const string &file_name) {
if (file_name == "-")
return false;
InputFile db_file(file_name);
uint64_t magic_number;
if (db_file.read(&magic_number, 1) != 1 || magic_number != ReferenceHeader::MAGIC_NUMBER) {
message_stream << "Database file is not a DIAMOND database, treating as FASTA." << endl;
bool r = db_file.read(&magic_number, 1) == 1 && magic_number == ReferenceHeader::MAGIC_NUMBER;
db_file.close();
return r;
}
DatabaseFile* DatabaseFile::auto_create_from_fasta() {
if (!is_diamond_db(config.database)) {
message_stream << "Database file is not a DIAMOND database, treating as FASTA." << endl;
config.input_ref_file = config.database;
TempFile *db;
make_db(&db);
......@@ -414,8 +421,6 @@ DatabaseFile* DatabaseFile::auto_create_from_fasta() {
delete db;
return r;
}
else {
db_file.close();
else
return new DatabaseFile(config.database);
}
\ No newline at end of file
}
\ No newline at end of file
......@@ -22,6 +22,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include <vector>
#include <string>
#include <string.h>
#include <stdint.h>
#include "../util/io/serializer.h"
#include "../util/io/input_file.h"
#include "../data/seed_histogram.h"
......@@ -43,8 +44,8 @@ struct ReferenceHeader
uint64_t magic_number;
uint32_t build, db_version;
uint64_t sequences, letters, pos_array_offset;
static const uint64_t MAGIC_NUMBER = 0x24af8a415ee186dllu;
enum { current_db_version = 2 };
static constexpr uint64_t MAGIC_NUMBER = 0x24af8a415ee186dllu;
};
struct ReferenceHeader2
......@@ -76,6 +77,7 @@ struct DatabaseFile : public InputFile
DatabaseFile(TempFile &tmp_file);
static void read_header(InputFile &stream, ReferenceHeader &header);
static DatabaseFile* auto_create_from_fasta();
static bool is_diamond_db(const string &file_name);
void rewind();
bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids = true, const vector<bool> *filter = NULL);
void get_seq();
......
......@@ -325,7 +325,10 @@ struct score_vector<int16_t>
template<typename _t>
struct ScoreTraits
{};
{
typedef void Score;
enum { CHANNELS = 0 };
};
template<>
struct ScoreTraits<int32_t>
......
......@@ -16,12 +16,16 @@ You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <sstream>
#include <set>
#include <numeric>
#include "../basic/match.h"
#include "output_format.h"
#include "../data/taxonomy.h"
#include "../data/queries.h"
using namespace std;
const char* Blast_tab_format::field_str[] = {
"qseqid", // 0 means Query Seq - id
"qgi", // 1 means Query GI
......@@ -75,7 +79,67 @@ const char* Blast_tab_format::field_str[] = {
"qqual", // 49
"qnum", // 50
"snum", // 51
"scovhsp" // 52
"scovhsp", // 52
"full_qqual", // 53
"full_qseq" // 54
};
const char* Blast_tab_format::field_desc[] = {
"Query ID", // 0 means Query Seq - id
"qgi", // 1 means Query GI
"qacc", // 2 means Query accesion
"qaccver", // 3 means Query accesion.version
"Query length", // 4 means Query sequence length
"Subject ID", // 5 means Subject Seq - id
"Subject IDs", // 6 means All subject Seq - id(s), separated by a ';'
"sgi", // 7 means Subject GI
"sallgi", // 8 means All subject GIs
"sacc", // 9 means Subject accession
"saccver", // 10 means Subject accession.version
"sallacc", // 11 means All subject accessions
"Subject length", // 12 means Subject sequence length
"Start of alignment in query", // 13 means Start of alignment in query
"End of alignment in query", // 14 means End of alignment in query
"Start of alignment in subject", // 15 means Start of alignment in subject
"End of alignment in subject", // 16 means End of alignment in subject
"Aligned part of query sequence", // 17 means Aligned part of query sequence
"Aligned part of subject sequence", // 18 means Aligned part of subject sequence
"Expected value", // 19 means Expect value
"Bit score", // 20 means Bit score
"Raw score", // 21 means Raw score
"Alignment length", // 22 means Alignment length
"Percentage of identical matches", // 23 means Percentage of identical matches
"Number of identical matches", // 24 means Number of identical matches
"Number of mismatches", // 25 means Number of mismatches
"Number of positive-scoring matches", // 26 means Number of positive - scoring matches
"Number of gap openings", // 27 means Number of gap openings
"Total number of gaps", // 28 means Total number of gaps
"Percentage of positive-scoring matches", // 29 means Percentage of positive - scoring matches
"frames", // 30 means Query and subject frames separated by a '/'
"Query frame", // 31 means Query frame
"sframe", // 32 means Subject frame
"Blast traceback operations", // 33 means Blast traceback operations(BTOP)
"Subject Taxonomy IDs", // 34 means unique Subject Taxonomy ID(s), separated by a ';' (in numerical order)
"sscinames", // 35 means unique Subject Scientific Name(s), separated by a ';'
"scomnames", // 36 means unique Subject Common Name(s), separated by a ';'
"sblastnames", // 37 means unique Subject Blast Name(s), separated by a ';' (in alphabetical order)
"sskingdoms", // 38 means unique Subject Super Kingdom(s), separated by a ';' (in alphabetical order)
"Subject title", // 39 means Subject Title
"Subject titles", // 40 means All Subject Title(s), separated by a '<>'
"sstrand", // 41 means Subject Strand
"qcovs", // 42 means Query Coverage Per Subject
"Query coverage per HSP", // 43 means Query Coverage Per HSP
"qcovus", // 44 means Query Coverage Per Unique Subject(blastn only)
"Query title", // 45 means Query title
"swdiff", // 46
"time", // 47
"Subject sequence", // 48
"Aligned part of query quality values", // 49
"qnum", // 50
"snum", // 51
"scovhsp", // 52
"Query quality values", // 53
"Query sequence" // 54
};
Blast_tab_format::Blast_tab_format() :
......@@ -98,7 +162,7 @@ Blast_tab_format::Blast_tab_format() :
config.salltitles = true;
if (j == 48)
config.use_lazy_dict = true;
if (j == 49)
if (j == 49 || j == 53)
config.store_query_quality = true;
}
}
......@@ -250,7 +314,7 @@ void Blast_tab_format::print_match(const Hsp_context& r, const Metadata &metadat
out << r.subject_seq;
break;
case 49:
out << (*query_qual)[r.query_id].c_str();
out << (query_qual && (*query_qual)[r.query_id].present() ? (*query_qual)[r.query_id].subseq(r.query_source_range().begin_, r.query_source_range().end_).c_str() : "*");
break;
case 50:
out << query_block_to_database_id[r.query_id];
......@@ -261,6 +325,12 @@ void Blast_tab_format::print_match(const Hsp_context& r, const Metadata &metadat
case 52:
out << (double)r.subject_range().length() * 100.0 / r.subject_len;
break;
case 53:
out << (query_qual && (*query_qual)[r.query_id].present() ? (*query_qual)[r.query_id].c_str() : "*");
break;
case 54:
r.query.source().print(out, input_value_traits);
break;
default:
throw std::runtime_error(string("Invalid output field: ") + field_str[*i]);
}
......@@ -289,6 +359,7 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
case 39:
case 40:
case 48:
case 49:
out << '*';
break;
case 12:
......@@ -317,8 +388,11 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
case 45:
out << query_name;
break;
case 49:
out << (*query_qual)[query_num].c_str();
case 53:
out << (query_qual && (*query_qual)[query_num].present() ? (*query_qual)[query_num].c_str() : "*");
break;
case 54:
(align_mode.query_translated ? query_source_seqs::get()[query_num] : query_seqs::get()[query_num]).print(out, input_value_traits);
break;
default:
throw std::runtime_error(string("Invalid output field: ") + field_str[*i]);
......@@ -329,3 +403,14 @@ void Blast_tab_format::print_query_intro(size_t query_num, const char *query_nam
out << '\n';
}
}
void Blast_tab_format::print_header(Consumer &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const {
if (!config.output_header)
return;
stringstream ss;
ss << "# DIAMOND v" << Const::version_string << ". http://github.com/bbuchfink/diamond" << endl;
ss << "# Invocation: " << config.invocation << endl;
ss << "# Fields: " << join(", ", apply(fields, [](unsigned i) -> string { return string(field_desc[i]); })) << endl;
const string s(ss.str());
f.consume(s.data(), s.length());
}
\ No newline at end of file
......@@ -87,13 +87,14 @@ struct DAA_format : public Output_format
struct Blast_tab_format : public Output_format
{
static const char* field_str[];
static const char* field_str[], *field_desc[];
Blast_tab_format();
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const;
virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out);
virtual void print_header(Consumer &f, int mode, const char *matrix, int gap_open, int gap_extend, double evalue, const char *first_query_name, unsigned first_query_len) const override;
virtual void print_query_intro(size_t query_num, const char *query_name, unsigned query_len, TextBuffer &out, bool unaligned) const override;
virtual void print_match(const Hsp_context& r, const Metadata &metadata, TextBuffer &out) override;
virtual ~Blast_tab_format()
{ }
virtual Output_format* clone() const
virtual Output_format* clone() const override
{
return new Blast_tab_format(*this);
}
......
......@@ -30,6 +30,7 @@ struct TargetCulling
virtual int cull(const vector<IntermediateRecord> &target_hsp) const = 0;
virtual void add(const Target &t) = 0;
virtual void add(const vector<IntermediateRecord> &target_hsp) = 0;
virtual ~TargetCulling() = default;
enum { FINISHED = 0, NEXT = 1, INCLUDE = 2};
static TargetCulling* get();
};
......@@ -70,6 +71,7 @@ struct GlobalCulling : public TargetCulling
top_score_ = target_hsp[0].score;
++n_;
}
virtual ~GlobalCulling() = default;
private:
size_t n_;
int top_score_;
......@@ -119,6 +121,7 @@ struct RangeCulling : public TargetCulling
for (std::vector<IntermediateRecord>::const_iterator i = target_hsp.begin(); i != target_hsp.end(); ++i)
p_.insert(i->absolute_query_range(), i->score);
}
virtual ~RangeCulling() = default;
private:
IntervalPartition p_;
};
......
......@@ -42,11 +42,11 @@ void info();
void seed_stat();
void pairwise();
void protein_snps();
void translate();
int main(int ac, const char* av[])
{
try {
check_simd();
config = Config(ac, av);
......@@ -126,6 +126,9 @@ int main(int ac, const char* av[])
case Config::cluster:
Workflow::Cluster::run();
break;
case Config::translate:
translate();
break;
#ifdef EXTRA
case Config::test_extra:
test_main();
......