Skip to content
Commits on Source (3)
......@@ -149,6 +149,7 @@ add_executable(diamond src/run/main.cpp
src/tools/tools.cpp
src/util/system/getRSS.cpp
src/util/math/sparse_matrix.cpp
src/lib/tantan/LambdaCalculator.cc
)
if(EXTRA)
......
......@@ -21,11 +21,6 @@ Keep posted about new developments by following me on
![image](https://anaconda.org/bioconda/diamond/badges/platforms.svg)
[![image](https://anaconda.org/bioconda/diamond/badges/downloads.svg)](https://anaconda.org/bioconda/diamond)
Support
=======
The preferred support channel is the [Diamond community website](http://www.diamondsearch.org/). It provides a platform for users to exchange their experiences and get support directly from the developer. You may also use the GitHub issue tracker or send inquiries by email.
Quick start guide
=================
......@@ -37,7 +32,7 @@ quick example for setting up and using the program on Linux.
Installing the software on your system may be done by downloading it in
binary format for immediate use:
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.25/diamond-linux64.tar.gz
wget http://github.com/bbuchfink/diamond/releases/download/v0.9.26/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
The extracted `diamond` binary file should be moved to a directory
......@@ -78,6 +73,11 @@ The output file here is specified with the `–o` option and named
- The default e-value cutoff of DIAMOND is 0.001 while that of
BLAST is 10, so by default the program will search a lot more
stringently than BLAST and not report weak hits.
Support
=======
The preferred support channel is the [Diamond community website](http://www.diamondsearch.org/). It provides a platform for users to exchange their experiences and get support directly from the developer. You may also use the GitHub issue tracker or send inquiries by email.
About
=====
......
......@@ -81,4 +81,5 @@ g++ -std=gnu++11 -DNDEBUG -O3 -Wno-deprecated-declarations $1 $2 $3 \
src/tools/tools.cpp \
src/util/system/getRSS.cpp \
src/util/math/sparse_matrix.cpp \
src/lib/tantan/LambdaCalculator.cc \
-lz -lpthread -o diamond
diamond-aligner (0.9.26+dfsg-1) UNRELEASED; urgency=medium
* Team upload.
* New upstream version
-- Steffen Moeller <moeller@debian.org> Fri, 20 Sep 2019 00:20:39 +0200
diamond-aligner (0.9.25+dfsg-1) unstable; urgency=medium
* New upstream version
......
[0.9.26]
- Fixed a bug that could cause undefined behaviour when using a database file of format version < 2.
- Fixed a compiler error when compiled as generic C++.
- Program will no longer terminate with an error if unlink system call fails.
- Added option `--tantan-minMaskProb` to set minimum repeat probability for tantan masking and changed the default value to 0.9.
- Added option `--tantan-maxRepeatOffset` to set maximum tandem repeat period to consider and changed the default value to 15.
- Added option `--tantan-ungapped` to use tantan in ungapped mode and changed the default to gapped mode.
- Changed score matrix lambda calculation for tantan masking.
- Reference masking is recomputed during alignment runs.
[0.9.25]
- Added support for the `sscinames` output field to print subject scientific names to the tabular output format.
- Fixed a compiler error for GCC 8.2.
......
......@@ -165,7 +165,6 @@ void build_ranking_worker(PtrVector<::Target>::iterator begin, PtrVector<::Targe
void Pipeline::run(Statistics &stat)
{
//cout << "Query=" << query_ids::get()[this->query_id].c_str() << endl;
task_timer timer("Init banded swipe pipeline", target_parallel ? 3 : UINT_MAX);
Config::set_option(config.padding, 32);
if (n_targets() == 0)
......
......@@ -24,7 +24,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "sequence.h"
#include "masking.h"
const char* Const::version_string = "0.9.25";
const char* Const::version_string = "0.9.26";
const char* Const::program_name = "diamond";
const char* Const::id_delimiters = " \a\b\f\n\r\t\v\1";
......
......@@ -197,7 +197,10 @@ Config::Config(int argc, const char **argv)
("dbsize", 0, "effective database size (in letters)", db_size)
("no-auto-append", 0, "disable auto appending of DAA and DMND file extensions", no_auto_append)
("xml-blord-format", 0, "Use gnl|BL_ORD_ID| style format in XML output", xml_blord_format)
("stop-match-score", 0, "Set the match score of stop codons against each other.", stop_match_score, 1);
("stop-match-score", 0, "Set the match score of stop codons against each other.", stop_match_score, 1)
("tantan-minMaskProb", 0, "minimum repeat probability for masking (0.9)", tantan_minMaskProb, 0.9)
("tantan-maxRepeatOffset", 0, "maximum tandem repeat period to consider (50)", tantan_maxRepeatOffset, 15)
("tantan-ungapped", 0, "use tantan masking in ungapped mode", tantan_ungapped);
Options_group view_options("View options");
view_options.add()
......@@ -274,8 +277,6 @@ Config::Config(int argc, const char **argv)
("query-parallel-limit", 0, "", query_parallel_limit, 1000000u)
("hard-masked", 0, "", hardmasked)
("cbs-window", 0, "", cbs_window, 40)
("tantan-r", 0, "", tantan_r, 0.005)
("tantan-s", 0, "", tantan_s, 0.5)
("no-unlink", 0, "", no_unlink)
("no-dict", 0, "", no_dict);
......
......@@ -175,10 +175,12 @@ struct Config
bool hardmasked;
int cbs_window;
double tantan_r;
double tantan_s;
double tantan_minMaskProb;
bool no_unlink;
bool no_dict;
int stop_match_score;
int tantan_maxRepeatOffset;
bool tantan_ungapped;
enum {
makedb = 0, blastp = 1, blastx = 2, view = 3, help = 4, version = 5, getseq = 6, benchmark = 7, random_seqs = 8, compare = 9, sort = 10, roc = 11, db_stat = 12, model_sim = 13,
......
......@@ -23,7 +23,7 @@ struct Const
{
enum {
build_version = 126,
build_version = 127,
seedp_bits = 10,
seedp = 1<<seedp_bits,
max_seed_weight = 32,
......
......@@ -17,8 +17,10 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
****/
#include <math.h>
#include <algorithm>
#include "masking.h"
#include "../lib/tantan/tantan.hh"
#include "../lib/tantan/LambdaCalculator.hh"
using namespace std;
......@@ -27,12 +29,21 @@ const uint8_t Masking::bit_mask = 128;
Masking::Masking(const Score_matrix &score_matrix)
{
const double lambda = score_matrix.lambda(); // 0.324032
const unsigned n = value_traits.alphabet_size;
int int_matrix[20][20], *int_matrix_ptr[20];
std::copy(int_matrix, int_matrix + 20, int_matrix_ptr);
for (int i = 0; i < 20; ++i)
for (int j = 0; j < 20; ++j)
int_matrix[i][j] = score_matrix((char)i, (char)j);
cbrc::LambdaCalculator lc;
lc.calculate(int_matrix_ptr, 20);
const double lambda = lc.lambda(); // 0.324032
for (unsigned i = 0; i < size; ++i) {
mask_table_x_[i] = value_traits.mask_char;
mask_table_bit_[i] = (uint8_t)i | bit_mask;
for (unsigned j = 0; j < size; ++j)
if (i < value_traits.alphabet_size && j < value_traits.alphabet_size)
if (i < n && j < n)
likelihoodRatioMatrix_[i][j] = exp(lambda * score_matrix(i, j));
}
std::copy(likelihoodRatioMatrix_, likelihoodRatioMatrix_ + size, probMatrixPointers_);
......@@ -44,22 +55,22 @@ Masking::Masking(const Score_matrix &score_matrix)
void Masking::operator()(Letter *seq, size_t len) const
{
tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), 50,
tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), config.tantan_maxRepeatOffset,
(tantan::const_double_ptr*)probMatrixPointers_,
config.tantan_r, 0.05,
0.005, 0.05,
0.9,
0, 0,
config.tantan_s, (const tantan::uchar*)mask_table_x_);
config.tantan_ungapped ? 0.0 : firstGapProb_, config.tantan_ungapped ? 0.0 : otherGapProb_,
config.tantan_minMaskProb, (const tantan::uchar*)mask_table_x_);
}
void Masking::mask_bit(Letter *seq, size_t len) const
{
tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), 50,
tantan::maskSequences((tantan::uchar*)seq, (tantan::uchar*)(seq + len), config.tantan_maxRepeatOffset,
(tantan::const_double_ptr*)probMatrixPointers_,
config.tantan_r, 0.05,
0.005, 0.05,
0.9,
0, 0,
config.tantan_s, (const tantan::uchar*)mask_table_bit_);
config.tantan_ungapped ? 0.0 : firstGapProb_, config.tantan_ungapped ? 0.0 : otherGapProb_,
config.tantan_minMaskProb, (const tantan::uchar*)mask_table_bit_);
}
void Masking::bit_to_hard_mask(Letter *seq, size_t len, size_t &n) const
......@@ -88,7 +99,7 @@ void mask_worker(Atomic<size_t> *next, Sequence_set *seqs, const Masking *maskin
masking->mask_bit(seqs->ptr(i), seqs->length(i));
}
void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
{
vector<thread> threads;
Atomic<size_t> next(0);
......@@ -96,4 +107,8 @@ void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask)
threads.emplace_back(mask_worker, &next, &seqs, &masking, hard_mask);
for (auto &t : threads)
t.join();
size_t n = 0;
for (size_t i = 0; i < seqs.get_length(); ++i)
n += std::count(seqs[i].data(), seqs[i].end(), value_traits.mask_char);
return n;
}
\ No newline at end of file
......@@ -41,4 +41,4 @@ private:
char mask_table_x_[size], mask_table_bit_[size];
};
void mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
\ No newline at end of file
size_t mask_seqs(Sequence_set &seqs, const Masking &masking, bool hard_mask = true);
\ No newline at end of file
......@@ -99,7 +99,7 @@ void ReferenceDictionary::build_lazy_dict(DatabaseFile &db_file)
}
db_file.rewind();
vector<unsigned> block_to_database_id;
db_file.load_seqs(block_to_database_id, std::numeric_limits<size_t>::max(), false, &ref_seqs::data_, &ref_ids::data_, false, &filter);
db_file.load_seqs(block_to_database_id, std::numeric_limits<size_t>::max(), &ref_seqs::data_, &ref_ids::data_, false, &filter);
std::sort(m.begin(), m.end());
dict_to_lazy_dict_id_.clear();
dict_to_lazy_dict_id_.resize(dict_size);
......
......@@ -79,13 +79,13 @@ struct Pos_record
void DatabaseFile::init()
{
read_header(*this, ref_header);
*this >> header2;
if (ref_header.build < min_build_required || ref_header.db_version < MIN_DB_VERSION)
throw std::runtime_error("Database was built with an older version of Diamond and is incompatible.");
if (ref_header.db_version > ReferenceHeader::current_db_version)
throw std::runtime_error("Database was built with a newer version of Diamond and is incompatible.");
if (ref_header.sequences == 0)
throw std::runtime_error("Incomplete database file. Database building did not complete successfully.");
*this >> header2;
pos_array_offset = ref_header.pos_array_offset;
}
......@@ -268,7 +268,7 @@ void DatabaseFile::seek_direct() {
seek(sizeof(ReferenceHeader) + sizeof(ReferenceHeader2) + 8);
}
bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids, const vector<bool> *filter)
bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids, const vector<bool> *filter)
{
task_timer timer("Loading reference sequences");
seek(pos_array_offset);
......@@ -318,7 +318,6 @@ bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_
(*dst_seq)->finish_reserve();
if(load_ids) (*dst_id)->finish_reserve();
seek(start_offset);
size_t masked_letters = 0;
for (size_t n = 0; n < seqs; ++n) {
if (filter && filtered_pos[n]) seek(filtered_pos[n]);
......@@ -329,16 +328,12 @@ bool DatabaseFile::load_seqs(vector<unsigned> &block_to_database_id, size_t max_
read((*dst_id)->ptr(n), (*dst_id)->length(n) + 1);
else
if (!seek_forward('\0')) throw std::runtime_error("Unexpected end of file.");
if (config.masking == 1 && masked)
Masking::get().bit_to_hard_mask((*dst_seq)->ptr(n), (*dst_seq)->length(n), masked_letters);
else
Masking::get().remove_bit_mask((*dst_seq)->ptr(n), (*dst_seq)->length(n));
Masking::get().remove_bit_mask((*dst_seq)->ptr(n), (*dst_seq)->length(n));
if (!config.sfilt.empty() && strstr((**dst_id)[n].c_str(), config.sfilt.c_str()) == 0)
memset((*dst_seq)->ptr(n), value_traits.mask_char, (*dst_seq)->length(n));
}
timer.finish();
(*dst_seq)->print_stats();
log_stream << "Masked letters = " << masked_letters << endl;
blocked_processing = seqs_processed < ref_header.sequences;
return true;
......
......@@ -80,7 +80,7 @@ struct DatabaseFile : public InputFile
static DatabaseFile* auto_create_from_fasta();
static bool is_diamond_db(const string &file_name);
void rewind();
bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, bool masked, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids = true, const vector<bool> *filter = NULL);
bool load_seqs(vector<unsigned> &block_to_database_id, size_t max_letters, Sequence_set **dst_seq, String_set<0> **dst_id, bool load_ids = true, const vector<bool> *filter = NULL);
void get_seq();
void read_seq(string &id, vector<char> &seq);
bool has_taxon_id_lists();
......
......@@ -93,7 +93,7 @@ struct BuildCallback
};
template<typename _filter>
SeedArray::SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> seq_partition, const _filter *filter)
SeedArray::SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> &seq_partition, const _filter *filter)
{
task_timer timer("Allocating memory for seed array", 3);
for (size_t i = range.begin(); i < range.end(); ++i) {
......@@ -108,6 +108,6 @@ SeedArray::SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogr
seqs.enum_seeds(cb, seq_partition, shape, shape + 1, filter);
}
template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>, const No_filter *);
template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>, const Seed_set *);
template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>, const Hashed_seed_set *);
\ No newline at end of file
template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>&, const No_filter *);
template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>&, const Seed_set *);
template SeedArray::SeedArray(const Sequence_set &, size_t, const shape_histogram &, const SeedPartitionRange &, const vector<size_t>&, const Hashed_seed_set *);
\ No newline at end of file
......@@ -45,7 +45,7 @@ struct SeedArray
} PACKED_ATTRIBUTE;
template<typename _filter>
SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> seq_partition, const _filter *filter);
SeedArray(const Sequence_set &seqs, size_t shape, const shape_histogram &hst, const SeedPartitionRange &range, const vector<size_t> &seq_partition, const _filter *filter);
Entry* begin(unsigned i)
{
......
......@@ -167,7 +167,7 @@ const Fixed_score_buffer<_score>& needleman_wunsch(sequence query, sequence subj
return mtx.score_buffer();
}
template const Fixed_score_buffer<int>& needleman_wunsch(sequence query, sequence subject, int &max_score, const Local&, const int&);
template const Fixed_score_buffer<int>& needleman_wunsch<int, Local>(sequence query, sequence subject, int &max_score, const Local&, const int&);
int needleman_wunsch(sequence query, sequence subject, int qbegin, int qend, int sbegin, int send, unsigned node, unsigned edge, Diag_graph &diags, bool log)
{
......
......@@ -233,6 +233,26 @@ void banded_3frame_swipe(const TranslatedSequence &query, Strand strand, vector<
}
}
template<typename _sv>
void banded_3frame_swipe_targets(vector<DpTarget>::iterator begin,
vector<DpTarget>::iterator end,
bool score_only,
const TranslatedSequence &query,
Strand strand,
DpStat &stat,
bool parallel,
bool overflow_only)
{
for (vector<DpTarget>::iterator i = begin; i < end; i += ScoreTraits<_sv>::CHANNELS) {
if (!overflow_only || i->overflow) {
if (score_only || config.disable_traceback)
banded_3frame_swipe<_sv, ScoreOnly>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(ScoreTraits<_sv>::CHANNELS), end - i), stat, parallel);
else
banded_3frame_swipe<_sv, Traceback>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(ScoreTraits<_sv>::CHANNELS), end - i), stat, parallel);
}
}
}
void banded_3frame_swipe_worker(vector<DpTarget>::iterator begin,
vector<DpTarget>::iterator end,
Atomic<size_t> *next,
......@@ -242,15 +262,12 @@ void banded_3frame_swipe_worker(vector<DpTarget>::iterator begin,
{
DpStat stat;
size_t pos;
while (begin + (pos = next->post_add(config.swipe_chunk_size)) < end) {
const auto e = min(begin + pos + config.swipe_chunk_size, end);
for (vector<DpTarget>::iterator i = begin + pos; i < e; i += 8) {
if (score_only || config.disable_traceback)
banded_3frame_swipe<score_vector<int16_t>, ScoreOnly>(*query, strand, i, min(i + 8, end), stat, true);
else
banded_3frame_swipe<score_vector<int16_t>, Traceback>(*query, strand, i, min(i + 8, end), stat, true);
}
}
while (begin + (pos = next->post_add(config.swipe_chunk_size)) < end)
#ifdef __SSE2__
banded_3frame_swipe_targets<score_vector<int16_t>>(begin + pos, min(begin + pos + config.swipe_chunk_size, end), score_only, *query, strand, stat, true, false);
#else
banded_3frame_swipe_targets<int32_t>(begin + pos, min(begin + pos + config.swipe_chunk_size, end), score_only, *query, strand, stat, true, false);
#endif
}
void banded_3frame_swipe(const TranslatedSequence &query, Strand strand, vector<DpTarget>::iterator target_begin, vector<DpTarget>::iterator target_end, DpStat &stat, bool score_only, bool parallel)
......@@ -278,26 +295,11 @@ void banded_3frame_swipe(const TranslatedSequence &query, Strand strand, vector<
delete i->tmp;
}
}
else {
for (vector<DpTarget>::iterator i = target_begin; i < target_end; i += 8)
if (score_only || config.disable_traceback)
banded_3frame_swipe<score_vector<int16_t>, ScoreOnly>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(8), target_end - i), stat, false);
else
banded_3frame_swipe<score_vector<int16_t>, Traceback>(query, strand, i, i + std::min(vector<DpTarget>::iterator::difference_type(8), target_end - i), stat, false);
}
else
banded_3frame_swipe_targets<score_vector<int16_t>>(target_begin, target_end, score_only, query, strand, stat, false, false);
for (vector<DpTarget>::iterator i = target_begin; i < target_end; ++i)
if (i->overflow) {
if (score_only || config.disable_traceback)
banded_3frame_swipe<int32_t, ScoreOnly>(query, strand, i, i + 1, stat, false);
else
banded_3frame_swipe<int32_t, Traceback>(query, strand, i, i + 1, stat, false);
}
banded_3frame_swipe_targets<int32_t>(target_begin, target_end, score_only, query, strand, stat, false, true);
#else
for (vector<DpTarget>::iterator i = target_begin; i < target_end; ++i)
if (score_only || config.disable_traceback)
banded_3frame_swipe<int32_t, ScoreOnly>(query, strand, i, i + 1, stat, false);
else
banded_3frame_swipe<int32_t, Traceback>(query, strand, i, i + 1, stat, false);
banded_3frame_swipe_targets<int32_t>(target_begin, target_end, score_only, query, strand, stat, false, false);
#endif
}
\ No newline at end of file
// Copyright 2015 Yutaro Konta
#include "LambdaCalculator.hh"
#include <vector>
#include <cassert>
#include <cstdio> // sprintf
#include <cstdlib>
#include <cfloat>
#include <cmath>
//#include <iostream>
using namespace std;
static double roundToFewDigits(double x)
{
// This rounding fixes some inaccuracies, e.g. for DNA with a simple
// match/mismatch matrix it's likely to make all the probabilities
// exactly 0.25, as they should be.
char buffer[32];
sprintf(buffer, "%g", x);
return atof(buffer);
}
static double** makematrix(int m, int n, double val)
{
double** mat = new double* [m];
for (int i=0; i<m; i++)
{
mat[i] = new double [n];
for (int j=0; j<n; j++)
mat[i][j] = val;
}
return mat;
}
static void deletematrix(double** a, int m)
{
for (int i=0; i<m; i++)
delete[] a[i];
delete[] a;
}
static double summatrix(double** a, int m, int n)
{
double s = 0;
for (int i=0; i<m; i++)
for (int j=0; j<n; j++)
s += a[i][j];
return s;
}
static int max_index(double** a, int n, int i)
{
double max = -DBL_MAX;
int maxindex = -1;
for (int j=i; j<n; j++)
{
if (fabs(a[j][i]) > max)
{
max = fabs(a[j][i]);
maxindex = j;
}
}
return maxindex;
}
static void swap_matrix(double** a, int i, int j)
{
double* v = a[i];
a[i] = a[j];
a[j] = v;
}
static void swap_vector(int* a, int i, int j)
{
int v = a[i];
a[i] = a[j];
a[j] = v;
}
static bool lu_pivoting(double** a, int* idx, int n)
{
int v;
for (int i=0; i<n; i++)
idx[i] = i;
for (int i=0; i<n; i++)
{
v = max_index(a, n, i);
assert(v >= 0);
if (fabs(a[v][i]) < 1e-10)
{
return false; // singular matrix!
}
swap_matrix(a, i, v);
swap_vector(idx, i, v);
a[i][i] = 1.0/a[i][i];
for (int j=i+1; j<n; j++)
{
a[j][i] = a[j][i] * a[i][i];
for (int k=i+1; k<n; k++)
a[j][k] = a[j][k] - a[j][i] * a[i][k];
}
}
return true;
}
static void solvep(double **a, double *x, double *b, int n)
{
double *y = new double [n];
for (int i=0; i<n; i++)
{
y[i] = b[i];
for (int j=0; j<i; j++)
y[i] -= a[i][j] * y[j];
}
for (int i=n-1; i>=0; i--)
{
x[i] = y[i];
for (int j=i+1; j<n; j++)
x[i] -= a[i][j] * x[j];
x[i] *= a[i][i]; // needed because a[i][i] is inverted
}
delete[] y;
}
static void transpose(double **a, int n)
{
double v;
for (int i=0; i<n; i++)
{
for (int j=0; j<i; j++)
{
v = a[i][j];
a[i][j] = a[j][i];
a[j][i] = v;
}
}
}
static bool invert(double **a, double **inv, int n)
{
int* idx = new int [n];
double **e = makematrix(n,n,0);
if(!lu_pivoting(a, idx, n))
return false;
for (int i=0; i<n; i++)
e[idx[i]][i] = 1; // transposed
delete[] idx;
for (int i=0; i<n; i++)
solvep(a, inv[i], e[i], n);
deletematrix(e, n);
transpose(inv, n); // transpose inv to make the inverted matrix of a
return true;
}
static bool calculate_inv_sum(double **matrix, int alpha_size, double tau, double* inv_sum)
{
double **m = makematrix(alpha_size, alpha_size, 0);
double **y = makematrix(alpha_size, alpha_size, 0);
for (int i=0; i<alpha_size; i++)
for (int j=0; j<alpha_size; j++)
m[i][j] = exp(tau * matrix[i][j]);
if(!invert(m, y, alpha_size))
return false;
*inv_sum = summatrix(y, alpha_size, alpha_size);
deletematrix(m, alpha_size);
deletematrix(y, alpha_size);
return true;
}
namespace cbrc{
void LambdaCalculator::setBad(){
lambda_ = -1;
letterProbs1_.clear();
letterProbs2_.clear();
}
bool LambdaCalculator::find_ub(double **matrix, int alpha_size, double *ub)
{
double r_max_min = DBL_MAX;
double c_max_min = DBL_MAX;
double r_max;
double c_max;
double r_min;
double c_min;
int l_r = 0;
int l_c = 0;
for (int i=0; i<alpha_size; i++)
{
r_max = -DBL_MAX;
r_min = DBL_MAX;
for (int j=0; j<alpha_size; j++)
{
if (matrix[i][j] > r_max)
r_max = matrix[i][j];
if (matrix[i][j] < r_min)
r_min = matrix[i][j];
}
if (r_max == 0 && r_min == 0)
l_r++;
else if (r_max <= 0 || r_min >= 0)
return false;
else if (r_max < r_max_min)
r_max_min = r_max;
}
for (int j=0; j<alpha_size; j++)
{
c_max = -DBL_MAX;
c_min = DBL_MAX;
for (int i=0; i<alpha_size; i++)
{
if (matrix[i][j] > c_max)
c_max = matrix[i][j];
if (matrix[i][j] < c_min)
c_min = matrix[i][j];
}
if (c_max == 0 && c_min == 0)
l_c++;
else if (c_max <= 0 || c_min >= 0)
return false;
else if (c_max < c_max_min)
c_max_min = c_max;
}
if (l_r == alpha_size) return false;
// the multiplication by 1.1 is sometimes necessary, presumably to
// prevent the upper bound from being too tight:
if (r_max_min > c_max_min)
*ub = 1.1 * log(1.0 * (alpha_size - l_r))/r_max_min;
else
*ub = 1.1 * log(1.0 * (alpha_size - l_c))/c_max_min;
return true;
}
bool LambdaCalculator::binary_search(double** matrix, int alpha_size, double lb, double ub, vector<double>& letprob1, vector<double>& letprob2, double* lambda, int maxiter)
{
double l=0;
double r=0;
double l_sum=0;
double r_sum=0;
int iter=0;
while (iter < maxiter && (l>=r || (l_sum < 1.0 && r_sum < 1.0) || (l_sum > 1.0 && r_sum > 1.0)))
{
l = lb + (ub - lb)*rand()/RAND_MAX;
r = lb + (ub - lb)*rand()/RAND_MAX;
if (!calculate_inv_sum(matrix, alpha_size, l, &l_sum) || !calculate_inv_sum(matrix, alpha_size, r, &r_sum))
{
l = 0;
r = 0;
}
iter++;
}
if (iter >= maxiter)
return false;
while (l_sum != 1.0 && r_sum != 1.0 && (l+r)/2.0 != l && (l+r)/2.0 != r)
{
double mid = (l + r)/2.0;
double mid_sum;
if (!calculate_inv_sum(matrix, alpha_size, mid, &mid_sum))
return false;
if (fabs(mid_sum) >= DBL_MAX)
return false;
if ((l_sum < 1.0 && mid_sum >= 1.0) || (l_sum > 1.0 && mid_sum <= 1.0))
{
r = mid;
r_sum = mid_sum;
}
else if ((r_sum < 1.0 && mid_sum >= 1.0) || (r_sum > 1.0 && mid_sum <= 1.0))
{
l = mid;
l_sum = mid_sum;
}
else
return false;
}
if (fabs(l_sum - 1.0) < fabs(r_sum - 1.0))
{
if (check_lambda(matrix, l, alpha_size, letprob1, letprob2))
{
*lambda = l;
return true;
}
return false;
}
if (check_lambda(matrix, r, alpha_size, letprob1, letprob2))
{
*lambda = r;
return true;
}
return false;
}
double LambdaCalculator::calculate_lambda(double** matrix, int alpha_size, vector<double>& letprob1, vector<double>& letprob2, int maxiter, int max_boundary_search_iter, double lb_ratio)
{
double ub;
if (!find_ub(matrix, alpha_size, &ub))
return -1;
double lb = ub*lb_ratio;
double lambda = -1;
int iter = 0;
bool flag = false;
while (!flag && iter < maxiter)
{
flag = binary_search(matrix, alpha_size, lb, ub, letprob1, letprob2, &lambda, max_boundary_search_iter);
iter++;
}
return lambda;
}
bool LambdaCalculator::check_lambda(double** matrix, double lambda, int alpha_size, vector<double>& letprob1, vector<double>& letprob2)
{
double **m = makematrix(alpha_size, alpha_size, 0);
double **y = makematrix(alpha_size, alpha_size, 0);
for (int i=0; i<alpha_size; i++)
for (int j=0; j<alpha_size; j++)
m[i][j] = exp(lambda * matrix[i][j]);
invert(m, y, alpha_size);
for (int i=0; i<alpha_size; i++)
{
double p = 0;
for (int j=0;j<alpha_size; j++)
p += y[i][j];
if (p < 0 || p > 1)
{
letprob2.clear();
return false;
}
letprob2.push_back(roundToFewDigits(p));
}
for (int j=0; j<alpha_size; j++)
{
double q = 0;
for (int i=0; i<alpha_size; i++)
q += y[i][j];
if (q < 0 || q > 1)
{
letprob2.clear();
letprob1.clear();
return false;
}
letprob1.push_back(roundToFewDigits(q));
}
deletematrix(m, alpha_size);
deletematrix(y, alpha_size);
return true;
}
void LambdaCalculator::calculate(const const_int_ptr *matrix, int alphSize) {
assert(alphSize >= 0);
setBad();
int maxiter = 1000;
int max_boundary_search_iter = 100;
double lb_ratio = 1e-6;
double** mat = makematrix(alphSize, alphSize, 0);
for (int i=0; i<alphSize; i++)
for (int j=0; j<alphSize; j++)
mat[i][j] = matrix[i][j];
lambda_ = calculate_lambda(mat, alphSize, letterProbs1_, letterProbs2_, maxiter, max_boundary_search_iter, lb_ratio);
deletematrix(mat, alphSize);
}
}