Skip to content
Commits on Source (6)
2018-12-10 Martin C. Frith <Martin C. Frith>
* README.txt, src/mcf_tantan_options.cc, src/mcf_tantan_options.hh,
src/tantan_app.cc, test/tantan_test.out, test/tantan_test.sh:
Add match score, mismatch cost options
[2383404c795a] [tip]
* src/tantan.cc:
Refactor
[d61af119db30]
* src/tantan.cc:
Refactor
[81bdfe23217d]
* src/CA_code/lambda_calculator.c, src/CA_code/lambda_calculator.h,
src/CA_code/lubksb.c, src/CA_code/ludcmp.c, src/CA_code/nrutil.c,
src/CA_code/nrutil.h, src/LambdaCalculator.cc,
src/LambdaCalculator.hh, src/Makefile,
src/mcf_score_matrix_probs.cc, src/mcf_score_matrix_probs.hh,
src/tantan_app.cc:
Use Konta-san's code for matrix lambda
[fccc8e5e9c1b]
* src/tantan.cc, src/tantan_app.cc, test/tantan_test.sh:
Refactor
[f9a9da99553d]
2012-10-16 Martin C. Frith <Martin C. Frith>
* README.txt:
Expanded the documentation: installation & FAQ
[946a951b1a06] [tip]
[946a951b1a06]
* src/CA_code/nrutil.c:
Just fixed a compiler warning
......
......@@ -465,6 +465,12 @@ repeats, so it's easy to lift the masking after determining homology.</p>
<kbd><span class="option">-d</span></kbd></td>
<td>probability decay per period (period-(i+1) / period-i)</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-i</span></kbd></td>
<td>match score</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-j</span></kbd></td>
<td>mismatch cost</td></tr>
<tr><td class="option-group">
<kbd><span class="option">-a</span></kbd></td>
<td>gap existence cost</td></tr>
<tr><td class="option-group">
......
......@@ -141,6 +141,8 @@ Options
-e probability of a repeat ending per position
-w maximum tandem repeat period to consider
-d probability decay per period (period-(i+1) / period-i)
-i match score
-j mismatch cost
-a gap existence cost
-b gap extension cost
-s minimum repeat probability for masking
......
tantan (18-1) unstable; urgency=medium
* Team upload.
* New upstream version
* Standards-Version: 4.2.1
-- Andreas Tille <tille@debian.org> Mon, 17 Dec 2018 13:58:58 +0100
tantan (13-5) unstable; urgency=medium
* Team upload.
......
......@@ -4,7 +4,7 @@ Uploaders: Sascha Steinbiss <sascha@steinbiss.name>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~)
Standards-Version: 4.1.4
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/tantan
Vcs-Git: https://salsa.debian.org/med-team/tantan.git
Homepage: http://www.cbrc.jp/tantan/
......
......@@ -3,33 +3,25 @@ Description: add buildflags
Author: Sascha Steinbiss <sascha@steinbiss.name>
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,17 +1,21 @@
@@ -1,11 +1,14 @@
-CXXFLAGS = -O3 -Wall -W -Wcast-qual -Wswitch-enum -Wundef \
+#CXXFLAGS += -O3 -g -Wall -W -Wcast-qual -Wswitch-enum -Wundef \
-Wcast-align -Wold-style-cast
--Wcast-align -Wold-style-cast
+#CXXFLAGS = -O3 -Wall -W -Wcast-qual -Wswitch-enum -Wundef \
+#-Wcast-align -Wold-style-cast
# -Wconversion
-CFLAGS = -Wall
+#CFLAGS += -Wall
all: tantan
COBJ = CA_code/lambda_calculator.o
-tantan: *.cc *.hh version.hh Makefile
- $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ *.cc
+CCSRCS = $(sort $(wildcard *.cc))
+CCHDRS = $(sort $(wildcard *.hh))
+CACODESRCS = $(sort $(wildcard CA_code/*.c))
+CACODEHDRS = $(sort $(wildcard CA_code/*.h))
all: tantan
-tantan: *.cc *.hh version.hh Makefile $(COBJ)
- $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ *.cc $(COBJ)
+tantan: $(CCSRCS) $(CCHDRS) version.hh Makefile $(COBJ)
+ $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(CCSRCS) $(COBJ)
-$(COBJ): CA_code/*.c CA_code/*.h Makefile
+$(COBJ): $(CACODESRCS) $(CACODEHDRS) Makefile
$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ CA_code/lambda_calculator.c
+
+tantan: $(CCSRCS) $(CCHDRS) version.hh Makefile
+ $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(CCSRCS)
clean:
rm -f tantan
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,5 @@
......
// Copyright 2008 Michiaki Hamada
// Adapted from public domain code by Yi-Kuo Yu, NCBI
/**
* See lambda_calculator.h
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "nrutil.h"
int Alphsize;
#include "nrutil.c"
#include "ludcmp.c"
#include "lubksb.c"
#include "lambda_calculator.h"
#define Epsilon 1.0e-36
#define E_bound 1.0e-12
#define Infty 1000000.0
#define min(A,B) ((A) > (B) ? (B) : (A) )
#define max(A,B) ((A) > (B) ? (A) : (B) )
#define bool int
#define true 1
#define false 0
double Lambda_UB; //lambda upper bound
double r_max_m, c_max_m; //min of each row's (column's) max
void makematrix( const double** mat_b, double **a, double lambda);
typedef struct Lambda {
double min;
double max;
int flag; // 1 means there is a range, -1 means no solution possible.
} Lambda;
typedef struct Sum {
double value;
int flag; // 1 means no negative bg_freq, -1 means there is negative bg_freq
} Sum;
Lambda Find_JP(const double** mat_b, double la_min, double la_max, double **JP, double *p_in, double *q_in);
Sum Check_root(const double** mat_b, double **a, double lambda, double *p, double *q);
double Check_det(const double** mat_b, double **a, double lambda);
Sum Nail_lambda(const double** mat_b, int flag_sign, double lambda_min, double lambda_max, double *p, double *q, double *la_add);
double Nail_det(const double** mat_b, int flag_sign, double lambda_min, double lambda_max);
bool Check_range( const double** mat_b );
double *Locate_det_zero( const double** mat_b, int * ); //pointer to root list are returned with how many of them by int
double calculate_lambda ( const double** mat_b, int alpha_size,
double* p, double* q )
{
double **JP/*, *q, *p*/;
int k;
double *root_location;
int N_root;
Lambda Lambda_local;
Alphsize = alpha_size;
if( ! Check_range( mat_b ) ) return -1.0;
root_location = Locate_det_zero(mat_b, &N_root);
if( root_location == NULL && N_root > 0 ) return -1.0;
//q=dvector(1,Alphsize);
//p=dvector(1,Alphsize);
JP = dmatrix(1,Alphsize,1,Alphsize);
if (N_root == 0){
Lambda_local = Find_JP(mat_b, 0, Lambda_UB, JP, p, q);
if (1 == Lambda_local.flag) { // sensible solution found
// Remember to find the right place to free the vectors
//free_dvector(p, 1,Alphsize);
//free_dvector(q, 1,Alphsize);
free( root_location );
free_dmatrix( JP, 1, Alphsize, 1, Alphsize );
return (Lambda_local.min + Lambda_local.max) / 2.0;
}
else if (-1 == Lambda_local.flag) {
//printf("matrix pass first screening but no sensible solution found. :-( \n");
}
}
else if (N_root > 0) {
//printf("N_root = %d for this matirx \n", N_root);
//for (i=0;i<N_root;i++) printf("N_root[%d] = %lf\n",i,root_location[i]);
for (k=0; k<= N_root; k++){
if (k==0) {Lambda_local.min =0; Lambda_local.max = root_location[0];}
else if (k== N_root) {Lambda_local.min = root_location[N_root-1]; Lambda_local.max = Lambda_UB+Epsilon;}
else {Lambda_local.min = root_location[k-1]; Lambda_local.max = root_location[k]; }
Lambda_local = Find_JP(mat_b, Lambda_local.min, Lambda_local.max, JP, p, q);
if (1 == Lambda_local.flag) { // sensible solution found
//free_dvector(p, 1,Alphsize);
//free_dvector(q, 1,Alphsize);
free( root_location );
free_dmatrix( JP, 1, Alphsize, 1, Alphsize );
return (Lambda_local.min + Lambda_local.max) / 2.0;
}
else if (-1 == Lambda_local.flag) {
//printf("matrix pass first screening but still no sensible solution found. :-( \n");
}
}
}
// Remember to find the right place to free the vectors
//free_dvector(p, 1,Alphsize);
//free_dvector(q, 1,Alphsize);
free( root_location );
free_dmatrix( JP, 1, Alphsize, 1, Alphsize );
return -1.0;
}
bool Check_range( const double** mat_b )
{
int i,j;
int pos_flag_r, neg_flag_r;
int pos_flag_c, neg_flag_c;
double r_max, c_max; //max of each row (or column)
int L_r=0, L_c=0; // number of zero-score rows (columns)
// First make sure each row and column have both pos and neg entries
r_max_m = c_max_m = 100000000000.0;
for (i=1;i<=Alphsize; i++) {
r_max = 0; c_max = 0;
pos_flag_r = -1; neg_flag_r = -1; pos_flag_c = -1; neg_flag_c = -1;
for (j = 1; j<= Alphsize; j++){
if (mat_b[i][j] > 0) {
if (mat_b[i][j]> r_max) r_max = mat_b[i][j];
pos_flag_r = 1;
}
else if (mat_b[i][j] < 0) neg_flag_r = 1;
if (mat_b[j][i] > 0) {
if (mat_b[j][i] > c_max) c_max = mat_b[j][i];
pos_flag_c = 1;
}
else if (mat_b[j][i] < 0) neg_flag_c = 1;
}
if ((pos_flag_r == -1)||(neg_flag_r == -1)||(pos_flag_c == -1)||(neg_flag_c == -1)) {
if ((pos_flag_r == -1)&&(neg_flag_r == -1)){
printf ("only zero score at row %d\n", i);
L_r++;
}
else if ((pos_flag_c == -1)&& (neg_flag_c == -1)){
printf ("only zero score at column %d\n", i);
L_c++;
}
else{
//printf("all positive or all negative at row or column %d\n", i);
//printf("therefore invalid matrix. exit now. \n");
return false;
//exit(1);
}
}
if ((r_max < r_max_m)&&(r_max > 0)) r_max_m = r_max;
if ((c_max < c_max_m)&&(c_max > 0)) c_max_m = c_max;
}
// Find the upper bound for lambda
if (r_max_m > c_max_m) {
Lambda_UB = 1.1*log(1.0*Alphsize-L_r)/r_max_m ;
}
else {
Lambda_UB = 1.1*log(1.0*Alphsize-L_c)/c_max_m;
}
//printf("the upper bound for lambda is %lf\n", Lambda_UB);
return true;
}
double Check_det(const double** mat_b, double **a, double lambda)
{
double d;
int i, /*j,*/ *indx;
indx = ivector(1,Alphsize);
makematrix(mat_b, a,lambda);
ludcmp(a,Alphsize,indx,&d);
for(i=1;i<=Alphsize;i++) d *= a[i][i];
free_ivector(indx,1,Alphsize);
return d; //returning the determinant
}
Sum Check_root(const double** mat_b, double **a, double lambda, double *p, double *q)
{
double **y, /* *col,*/ d;
//double sum = 0.0;
int i,j;//, *indx;
Sum Sum_here;
y=dmatrix(1,Alphsize,1,Alphsize);
//indx = ivector(1,Alphsize);
int indx[Alphsize+1];
//col = dvector(1,Alphsize);
double col[Alphsize+1];
makematrix(mat_b, a,lambda);
ludcmp(a,Alphsize,indx,&d);
Sum_here.value = 0.0;
for(i=1;i<=Alphsize;i++) q[i]=0.0;
for(j=1;j<=Alphsize;j++){
for (i=1;i<=Alphsize;i++) col[i]=0.0;
col[j]=1.0;
lubksb(a,Alphsize,indx,col);
p[j]=0.0;
for (i=1;i<=Alphsize;i++) {
y[i][j]=col[i]; Sum_here.value += y[i][j];
p[j]+=y[i][j]; q[i]+=y[i][j];
}
}
Sum_here.flag = 1;
for (i=1;i<Alphsize; i++) {
if ((p[i] < 0) || (q[i] < 0)) {
Sum_here.flag = -1;
//printf("problematic freq. p[%d] = %.4f q[%d]=%.4f\n",i,p[i],i,q[i]);
}
}
free_dmatrix(y,1,Alphsize,1,Alphsize);
return Sum_here;
}
double * Locate_det_zero(const double** mat_b, int *N_root_add)
{
double **a/*, *q, *p */; // a is the exponentiated matrix of socres, p and q are bg_freqs
int i/*,j,k*/;
int N; // number of points for first round
int flag_sign;
double lambda/*, l_tmp, sum, sum_min, sum_max */;
double lambda_root, dlambda /*, dsum=0.5 */;
//double *l_here, *s_here;
double root[5000];
double *root_temp;
//double error=0.000000000001;
int zero_monitor = 0; // record number of zeros found in the range
//int flag;
a=dmatrix(1,Alphsize,1,Alphsize);
//Sum_local = (Sum *)malloc(sizeof(Sum));
//Lambda_local = (Lambda *)malloc(sizeof(Lambda));
N = 2 + max(400, ((int) (Lambda_UB-0)/0.005));
//printf("N = %d in Locate_det_zero\n", N);
dlambda = (Lambda_UB)/(N*1.0);
//l_here = (double *)malloc((N+1)*sizeof(double));
//s_here = (double *)malloc((N+1)*sizeof(double));
double l_here[N+1];
double s_here[N+1];
for (i=0; i< N ; i++){
lambda = (i+1)*dlambda;
s_here[i] = Check_det(mat_b, a,lambda);
l_here[i] = lambda;
}
if (s_here[0] < 0.0) flag_sign = -1;
if (s_here[0] > 0.0 ) flag_sign = 1;
if (fabs(s_here[0])/exp(l_here[0]*(r_max_m+c_max_m)/2.0) <= Epsilon) {
root[zero_monitor++] = l_here[0];
flag_sign = 0;
}
for (i=1;i<N; i++){
if ((flag_sign != 0)&& (fabs(s_here[i])>Epsilon)) {
if (s_here[i-1]*s_here[i] < 0){
//printf("occurring at regular places\n");
lambda_root = Nail_det(mat_b, flag_sign, l_here[i-1], l_here[i]);
root[zero_monitor++] = lambda_root;
flag_sign = -flag_sign; // the flag switch sign after one sol found
//printf("a (regular) root of det found at %12.10f, i= %d\n", lambda_root,i);
}
}
else {
if (s_here[i] < 0.0) flag_sign = -1;
if (s_here[i] > 0.0 ) flag_sign = 1;
if (fabs(s_here[i])/exp(l_here[i]*(r_max_m+c_max_m)/2.0) <= Epsilon) {
root[zero_monitor++] = l_here[i];
}
}
}
//printf("total number of solution found in range is %d\n", i_monitor);
root_temp = (double *)malloc(zero_monitor*sizeof(double));
*N_root_add = zero_monitor;
if (zero_monitor > 0) {
if (zero_monitor >= N/4) {
//printf("It is likely that uniform zero determinant is occurring.\n");
//printf("number of small det points = %d out of %d, exit now....\n",zero_monitor, N);
free( root_temp );
return NULL;
//exit(1);
}
for (i = 0; i < zero_monitor; i++){
root_temp[i] = root[i];
//printf("root_location[%d] = %lf\n",i,root_temp[i]);
}
}
free_dmatrix( a, 1, Alphsize, 1, Alphsize );
return root_temp;
}
Lambda Find_JP(const double** mat_b, double la_min, double la_max, double **JP, double *p_in, double *q_in)
{
double **a, *q, *p; // a is the exponentiated matrix of socres, p and q are bg_freqs
int i,j/*,k*/;
int N; // number of points for first round
double lambda/*, l_tmp, sum, sum_min, sum_max*/;
double lambda_max, lambda_min, lambda_final, dlambda/*, dsum=0.5*/;
//double *l_here, *s_here;
//double error=0.000000000000001;
//int validity_flag; // 1 means valid, -1 means not valid.
int flag_sign; // 1 means small lambda sum > 1, -1 means otherwise
int flag_done = -1; // 1 means find sensible solution, -1 means sensible not found
int i_monitor = 0; // record number of solution found in the range, including nonsense ones
int j_monitor;
Lambda Lambda_local;
//Sum *Sum_local;
Sum Sum_local;
lambda_min = la_min;
lambda_max = la_max;
q = q_in;
p = p_in;
a=dmatrix(1,Alphsize,1,Alphsize);
//Sum_local = (Sum *)malloc(sizeof(Sum));
//Lambda_local = (Lambda *)malloc(sizeof(Lambda));
N = 2 + max(400, ((int) (lambda_max-lambda_min)/0.005));
//printf("N = %d in Find_JP\n", N);
dlambda = (lambda_max-lambda_min)/(N*1.0);
//l_here = (double *)malloc((N+1)*sizeof(double));
//s_here = (double *)malloc((N+1)*sizeof(double));
double l_here[N+1];
double s_here[N+1];
//printf("lambda_min enter = %12.10e, lambda_max = %12.10f\n", lambda_min, lambda_max);
for (i=0; i< N-1 ; i++){
lambda = lambda_min + (i+1)*dlambda;
makematrix(mat_b, a,lambda);
Sum_local = Check_root(mat_b, a,lambda,p,q);
l_here[i] = lambda;
s_here[i] = Sum_local.value - 1.0;
//printf("scan %d th time in Find_JP\n",i );
}
//printf("finish first time scanining in Find_JP\n");
if (s_here[0] < 0.0) flag_sign = -1;
else if (s_here[0] > 0.0 ) flag_sign = 1;
else if (s_here[0] == 0.0) { //needs refined definition on flag_sign
printf("enter the exact hit, rarely occurs other than when lambda = 0 \n");
j_monitor = 1;
flag_sign = 0;
while ((flag_sign == 0) &&(j_monitor < N)){
Sum_local = Check_root(mat_b, a,l_here[0]+ j_monitor*dlambda/N,p,q);
if (Sum_local.value > 1.0) {
flag_sign = 1;
}
else if (Sum_local.value <1.0){
flag_sign = -1;
}
j_monitor++;
}
}
for (i=1;i<N; i++){ // should be N-1 ???
if (flag_sign == 0) {printf("flag_sign = 0 \n"); exit(1);}
if (s_here[i-1]*s_here[i] < 0){
lambda_min = l_here[i-1];
lambda_max = l_here[i];
Sum_local = Nail_lambda(mat_b, flag_sign, lambda_min, lambda_max, p,q, &lambda_final);
if (Sum_local.flag == 1) {
i =N; flag_done = 1; Lambda_local.flag = 1;
Lambda_local.min = lambda_final, Lambda_local.max = lambda_final;
}
flag_sign = -flag_sign; // the flag switch sign after one sol found
i_monitor++ ;
}
}
if (flag_done == 1){
// Write correct JP to the matrix
makematrix(mat_b, a,lambda_final);
for (i=1;i<=Alphsize;i++){
for (j=1;j<=Alphsize;j++){
JP[i][j] = a[i][j]*p[i]*q[j];
}
}
free_dmatrix(a,1,Alphsize,1,Alphsize);
return Lambda_local;
}
else if (flag_done == -1){
//printf("no sensible solution in the plausible x range: (%lf,%lf)\n", la_min, la_max);
Lambda_local.flag = -1; Lambda_local.min = 0; Lambda_local.max = Infty;
return Lambda_local;
}
// never come here
return Lambda_local;
}
Sum Nail_lambda(const double** mat_b, int flag_sign, double lambda_min, double lambda_max, double *p, double *q, double *lam_add)
{
double **a;
double lambda;
//Sum *Sum_local;
Sum Sum_local;
a = dmatrix(1,Alphsize,1,Alphsize);
//Sum_local = (Sum *)malloc(sizeof(Sum));
lambda = (lambda_min+lambda_max)/2.0;
Sum_local = Check_root(mat_b, a, lambda, p, q);
while (fabs(Sum_local.value-1.0)>E_bound){
if (flag_sign*(Sum_local.value-1.0)<0) lambda_max = lambda;
else if (flag_sign*(Sum_local.value-1.0)> 0) lambda_min = lambda;
// Added by MCF to avoid infinite loop:
if (lambda == (lambda_min+lambda_max)/2.0){
Sum_local.flag = -1;
break;
}
lambda = (lambda_min+lambda_max)/2.0;
Sum_local = Check_root(mat_b, a, lambda, p, q);
}
free_dmatrix(a,1,Alphsize,1,Alphsize);
*lam_add = lambda;
return Sum_local;
}
double Nail_det(const double** mat_b, int flag_sign, double lambda_min, double lambda_max)
{
double **a;
double lambda;
double value;
a = dmatrix(1,Alphsize,1,Alphsize);
lambda = (lambda_min+lambda_max)/2.0;
value = Check_det(mat_b, a, lambda);
while ((fabs(value)>E_bound)&&(lambda > 0)){
if (flag_sign*(value)<0) lambda_max = lambda;
else if (flag_sign*(value)> 0) lambda_min = lambda;
lambda = (lambda_min+lambda_max)/2.0;
value = Check_det(mat_b, a, lambda);
}
free_dmatrix(a,1,Alphsize,1,Alphsize);
return lambda;
}
void makematrix(const double** mat_b, double **a, double lambda)
{
int i,j;
for (i=1;i<=Alphsize;i++)
for(j=1;j<=Alphsize;j++){
*(*(a+i)+j)=exp(lambda*mat_b[i][j]);
}
}
// Copyright 2008 Michiaki Hamada
#ifndef __H_INCLUDE_LAMBDA_CALCULATOR_HH
#define __H_INCLUDE_LAMBDA_CALCULATOR_HH
// These pointers are 1-based!
double calculate_lambda( const double** mat_b, int alpha_size,
double* p, double* q );
#endif // __H_INCLUDE_LAMBDA_CALCULATOR_HH
// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
void lubksb(a,n,indx,b)
double **a,b[];
int n,*indx;
{
int i,ii=0,ip,j;
double sum;
for (i=1;i<=n;i++) {
ip=indx[i];
sum=b[ip];
b[ip]=b[i];
if (ii)
for (j=ii;j<=i-1;j++) sum -= a[i][j]*b[j];
else if (sum) ii=i;
b[i]=sum;
}
for (i=n;i>=1;i--) {
sum=b[i];
for (j=i+1;j<=n;j++) sum -= a[i][j]*b[j];
b[i]=sum/a[i][i];
}
}
// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
#include <math.h>
#define TINY 1.0e-20;
void ludcmp(a,n,indx,d)
int n,*indx;
double **a,*d;
{
int i,imax,j,k;
double big,dum,sum,temp;
double *vv,*dvector();
void nrerror(),free_dvector();
vv=dvector(1,n);
*d=1.0;
for (i=1;i<=n;i++) {
big=0.0;
for (j=1;j<=n;j++)
if ((temp=fabs(a[i][j])) > big) big=temp;
if (big == 0.0) nrerror("Singular matrix in routine LUDCMP");
vv[i]=1.0/big;
}
for (j=1;j<=n;j++) {
for (i=1;i<j;i++) {
sum=a[i][j];
for (k=1;k<i;k++) sum -= a[i][k]*a[k][j];
a[i][j]=sum;
}
big=0.0;
for (i=j;i<=n;i++) {
sum=a[i][j];
for (k=1;k<j;k++)
sum -= a[i][k]*a[k][j];
a[i][j]=sum;
if ( (dum=vv[i]*fabs(sum)) >= big) {
big=dum;
imax=i;
}
}
if (j != imax) {
for (k=1;k<=n;k++) {
dum=a[imax][k];
a[imax][k]=a[j][k];
a[j][k]=dum;
}
*d = -(*d);
vv[imax]=vv[j];
}
indx[j]=imax;
if (a[j][j] == 0.0) a[j][j]=TINY;
if (j != n) {
dum=1.0/(a[j][j]);
for (i=j+1;i<=n;i++) a[i][j] *= dum;
}
}
free_dvector(vv,1,n);
}
#undef TINY
// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
//#include <malloc.h>
#include <stdlib.h>
#include <stdio.h>
void nrerror(error_text)
char error_text[];
{
void exit();
fprintf(stderr,"Numerical Recipes run-time error...\n");
fprintf(stderr,"%s\n",error_text);
fprintf(stderr,"...now exiting to system...\n");
exit(1);
}
float *vector(nl,nh)
int nl,nh;
{
float *v;
v=(float *)malloc((unsigned) (nh-nl+1)*sizeof(float));
if (!v) nrerror("allocation failure in vector()");
return v-nl;
}
int *ivector(nl,nh)
int nl,nh;
{
int *v;
v=(int *)malloc((unsigned) (nh-nl+1)*sizeof(int));
if (!v) nrerror("allocation failure in ivector()");
return v-nl;
}
double *dvector(nl,nh)
int nl,nh;
{
double *v;
v=(double *)malloc((unsigned) (nh-nl+1)*sizeof(double));
if (!v) nrerror("allocation failure in dvector()");
return v-nl;
}
float **matrix(nrl,nrh,ncl,nch)
int nrl,nrh,ncl,nch;
{
int i;
float **m;
m=(float **) malloc((unsigned) (nrh-nrl+1)*sizeof(float *));
if (!m) nrerror("allocation failure 1 in matrix()");
m -= nrl;
for(i=nrl;i<=nrh;i++) {
m[i]=(float *) malloc((unsigned) (nch-ncl+1)*sizeof(float));
if (!m[i]) nrerror("allocation failure 2 in matrix()");
m[i] -= ncl;
}
return m;
}
double **dmatrix(nrl,nrh,ncl,nch)
int nrl,nrh,ncl,nch;
{
int i;
double **m;
m=(double **) malloc((unsigned) (nrh-nrl+1)*sizeof(double *));
if (!m) nrerror("allocation failure 1 in dmatrix()");
m -= nrl;
for(i=nrl;i<=nrh;i++) {
m[i]=(double *) malloc((unsigned) (nch-ncl+1)*sizeof(double));
if (!m[i]) nrerror("allocation failure 2 in dmatrix()");
m[i] -= ncl;
}
return m;
}
double ***darray3(n1l,n1h,n2l,n2h,n3l,n3h)
int n1l,n1h,n2l,n2h,n3l,n3h;
{
int i;
double ***m;
m=(double ***) malloc((unsigned) (n1h-n1l+1)*sizeof(double **));
if (!m) nrerror("allocation failure 1 in darray3()");
m -= n1l;
for(i=n1l;i<=n1h;i++)
{
m[i]=dmatrix(n2l,n2h,n3l,n3h);
}
return m;
}
double ****darray4(n1l,n1h,n2l,n2h,n3l,n3h,n4l,n4h)
int n1l,n1h,n2l,n2h,n3l,n3h,n4l,n4h;
{
int i;
double ****m;
m=(double ****) malloc((unsigned) (n1h-n1l+1)*sizeof(double ***));
if (!m) nrerror("allocation failure 1 in darray4()");
m -= n1l;
for(i=n1l;i<=n1h;i++)
{
m[i]=darray3(n2l,n2h,n3l,n3h,n4l,n4h);
}
return m;
}
int **imatrix(nrl,nrh,ncl,nch)
int nrl,nrh,ncl,nch;
{
int i,**m;
m=(int **)malloc((unsigned) (nrh-nrl+1)*sizeof(int *));
if (!m) nrerror("allocation failure 1 in imatrix()");
m -= nrl;
for(i=nrl;i<=nrh;i++) {
m[i]=(int *)malloc((unsigned) (nch-ncl+1)*sizeof(int));
if (!m[i]) nrerror("allocation failure 2 in imatrix()");
m[i] -= ncl;
}
return m;
}
float **submatrix(a,oldrl,oldrh,oldcl,oldch,newrl,newcl)
float **a;
int oldrl,oldrh,oldcl,oldch,newrl,newcl;
{
int i,j;
float **m;
m=(float **) malloc((unsigned) (oldrh-oldrl+1)*sizeof(float *));
if (!m) nrerror("allocation failure in submatrix()");
m -= newrl;
for(i=oldrl,j=newrl;i<=oldrh;i++,j++) m[j]=a[i]+oldcl-newcl;
return m;
}
void free_vector(v,nl,nh)
float *v;
int nl,nh;
{
free((char*) (v+nl));
}
void free_ivector(v,nl,nh)
int *v,nl,nh;
{
free((char*) (v+nl));
}
void free_dvector(v,nl,nh)
double *v;
int nl,nh;
{
free((char*) (v+nl));
}
void free_matrix(m,nrl,nrh,ncl,nch)
float **m;
int nrl,nrh,ncl,nch;
{
int i;
for(i=nrh;i>=nrl;i--) free((char*) (m[i]+ncl));
free((char*) (m+nrl));
}
void free_dmatrix(m,nrl,nrh,ncl,nch)
double **m;
int nrl,nrh,ncl,nch;
{
int i;
for(i=nrh;i>=nrl;i--) free((char*) (m[i]+ncl));
free((char*) (m+nrl));
}
void free_imatrix(m,nrl,nrh,ncl,nch)
int **m;
int nrl,nrh,ncl,nch;
{
int i;
for(i=nrh;i>=nrl;i--) free((char*) (m[i]+ncl));
free((char*) (m+nrl));
}
void free_darray3(m,n1l,n1h,n2l,n2h,n3l,n3h)
double ***m;
int n1l,n1h,n2l,n2h,n3l,n3h;
{
int i;
for(i=n1h;i>=n1l;i--) free_dmatrix(m[i],n2l,n2h,n3l,n3h);
free((char*) (m+n1l));
}
void free_submatrix(b,nrl,nrh,ncl,nch)
float **b;
int nrl,nrh,ncl,nch;
{
free((char*) (b+nrl));
}
float **convert_matrix(a,nrl,nrh,ncl,nch)
float *a;
int nrl,nrh,ncl,nch;
{
int i,j,nrow,ncol;
float **m;
nrow=nrh-nrl+1;
ncol=nch-ncl+1;
m = (float **) malloc((unsigned) (nrow)*sizeof(float*));
if (!m) nrerror("allocation failure in convert_matrix()");
m -= nrl;
for(i=0,j=nrl;i<=nrow-1;i++,j++) m[j]=a+ncol*i-ncl;
return m;
}
void free_convert_matrix(b,nrl,nrh,ncl,nch)
float **b;
int nrl,nrh,ncl,nch;
{
free((char*) (b+nrl));
}
// Public domain code from Yi-Kuo Yu & Stephen Altschul, NCBI
float *vector();
float **matrix();
float **convert_matrix();
double *dvector();
double **dmatrix();
int *ivector();
int **imatrix();
float **submatrix();
void free_vector();
void free_dvector();
void free_ivector();
void free_matrix();
void free_dmatrix();
void free_imatrix();
void free_submatrix();
void free_convert_matrix();
void nrerror();
// Copyright 2015 Yutaro Konta
#include "LambdaCalculator.hh"
#include <vector>
#include <cassert>
#include <cstdio> // sprintf
#include <cstdlib>
#include <cfloat>
#include <cmath>
//#include <iostream>
using namespace std;
static double roundToFewDigits(double x)
{
// This rounding fixes some inaccuracies, e.g. for DNA with a simple
// match/mismatch matrix it's likely to make all the probabilities
// exactly 0.25, as they should be.
char buffer[32];
sprintf(buffer, "%g", x);
return atof(buffer);
}
static double** makematrix(int m, int n, double val)
{
double** mat = new double* [m];
for (int i=0; i<m; i++)
{
mat[i] = new double [n];
for (int j=0; j<n; j++)
mat[i][j] = val;
}
return mat;
}
static void deletematrix(double** a, int m)
{
for (int i=0; i<m; i++)
delete[] a[i];
delete[] a;
}
static double summatrix(double** a, int m, int n)
{
double s = 0;
for (int i=0; i<m; i++)
for (int j=0; j<n; j++)
s += a[i][j];
return s;
}
static int max_index(double** a, int n, int i)
{
double max = -DBL_MAX;
int maxindex = -1;
for (int j=i; j<n; j++)
{
if (fabs(a[j][i]) > max)
{
max = fabs(a[j][i]);
maxindex = j;
}
}
return maxindex;
}
static void swap_matrix(double** a, int i, int j)
{
double* v = a[i];
a[i] = a[j];
a[j] = v;
}
static void swap_vector(int* a, int i, int j)
{
int v = a[i];
a[i] = a[j];
a[j] = v;
}
static bool lu_pivoting(double** a, int* idx, int n)
{
int v;
for (int i=0; i<n; i++)
idx[i] = i;
for (int i=0; i<n; i++)
{
v = max_index(a, n, i);
assert(v >= 0);
if (fabs(a[v][i]) < 1e-10)
{
return false; // singular matrix!
}
swap_matrix(a, i, v);
swap_vector(idx, i, v);
a[i][i] = 1.0/a[i][i];
for (int j=i+1; j<n; j++)
{
a[j][i] = a[j][i] * a[i][i];
for (int k=i+1; k<n; k++)
a[j][k] = a[j][k] - a[j][i] * a[i][k];
}
}
return true;
}
static void solvep(double **a, double *x, double *b, int n)
{
double *y = new double [n];
for (int i=0; i<n; i++)
{
y[i] = b[i];
for (int j=0; j<i; j++)
y[i] -= a[i][j] * y[j];
}
for (int i=n-1; i>=0; i--)
{
x[i] = y[i];
for (int j=i+1; j<n; j++)
x[i] -= a[i][j] * x[j];
x[i] *= a[i][i]; // needed because a[i][i] is inverted
}
delete[] y;
}
static void transpose(double **a, int n)
{
double v;
for (int i=0; i<n; i++)
{
for (int j=0; j<i; j++)
{
v = a[i][j];
a[i][j] = a[j][i];
a[j][i] = v;
}
}
}
static bool invert(double **a, double **inv, int n)
{
int* idx = new int [n];
double **e = makematrix(n,n,0);
if(!lu_pivoting(a, idx, n))
return false;
for (int i=0; i<n; i++)
e[idx[i]][i] = 1; // transposed
delete[] idx;
for (int i=0; i<n; i++)
solvep(a, inv[i], e[i], n);
deletematrix(e, n);
transpose(inv, n); // transpose inv to make the inverted matrix of a
return true;
}
static bool calculate_inv_sum(double **matrix, int alpha_size, double tau, double* inv_sum)
{
double **m = makematrix(alpha_size, alpha_size, 0);
double **y = makematrix(alpha_size, alpha_size, 0);
for (int i=0; i<alpha_size; i++)
for (int j=0; j<alpha_size; j++)
m[i][j] = exp(tau * matrix[i][j]);
if(!invert(m, y, alpha_size))
return false;
*inv_sum = summatrix(y, alpha_size, alpha_size);
deletematrix(m, alpha_size);
deletematrix(y, alpha_size);
return true;
}
namespace cbrc{
void LambdaCalculator::setBad(){
lambda_ = -1;
letterProbs1_.clear();
letterProbs2_.clear();
}
bool LambdaCalculator::find_ub(double **matrix, int alpha_size, double *ub)
{
double r_max_min = DBL_MAX;
double c_max_min = DBL_MAX;
double r_max;
double c_max;
double r_min;
double c_min;
int l_r = 0;
int l_c = 0;
for (int i=0; i<alpha_size; i++)
{
r_max = -DBL_MAX;
r_min = DBL_MAX;
for (int j=0; j<alpha_size; j++)
{
if (matrix[i][j] > r_max)
r_max = matrix[i][j];
if (matrix[i][j] < r_min)
r_min = matrix[i][j];
}
if (r_max == 0 && r_min == 0)
l_r++;
else if (r_max <= 0 || r_min >= 0)
return false;
else if (r_max < r_max_min)
r_max_min = r_max;
}
for (int j=0; j<alpha_size; j++)
{
c_max = -DBL_MAX;
c_min = DBL_MAX;
for (int i=0; i<alpha_size; i++)
{
if (matrix[i][j] > c_max)
c_max = matrix[i][j];
if (matrix[i][j] < c_min)
c_min = matrix[i][j];
}
if (c_max == 0 && c_min == 0)
l_c++;
else if (c_max <= 0 || c_min >= 0)
return false;
else if (c_max < c_max_min)
c_max_min = c_max;
}
if (l_r == alpha_size) return false;
// the multiplication by 1.1 is sometimes necessary, presumably to
// prevent the upper bound from being too tight:
if (r_max_min > c_max_min)
*ub = 1.1 * log(1.0 * (alpha_size - l_r))/r_max_min;
else
*ub = 1.1 * log(1.0 * (alpha_size - l_c))/c_max_min;
return true;
}
bool LambdaCalculator::binary_search(double** matrix, int alpha_size, double lb, double ub, vector<double>& letprob1, vector<double>& letprob2, double* lambda, int maxiter)
{
double l=0;
double r=0;
double l_sum=0;
double r_sum=0;
int iter=0;
while (iter < maxiter && (l>=r || (l_sum < 1.0 && r_sum < 1.0) || (l_sum > 1.0 && r_sum > 1.0)))
{
l = lb + (ub - lb)*rand()/RAND_MAX;
r = lb + (ub - lb)*rand()/RAND_MAX;
if (!calculate_inv_sum(matrix, alpha_size, l, &l_sum) || !calculate_inv_sum(matrix, alpha_size, r, &r_sum))
{
l = 0;
r = 0;
}
iter++;
}
if (iter >= maxiter)
return false;
while (l_sum != 1.0 && r_sum != 1.0 && (l+r)/2.0 != l && (l+r)/2.0 != r)
{
double mid = (l + r)/2.0;
double mid_sum;
if (!calculate_inv_sum(matrix, alpha_size, mid, &mid_sum))
return false;
if (fabs(mid_sum) >= DBL_MAX)
return false;
if ((l_sum < 1.0 && mid_sum >= 1.0) || (l_sum > 1.0 && mid_sum <= 1.0))
{
r = mid;
r_sum = mid_sum;
}
else if ((r_sum < 1.0 && mid_sum >= 1.0) || (r_sum > 1.0 && mid_sum <= 1.0))
{
l = mid;
l_sum = mid_sum;
}
else
return false;
}
if (fabs(l_sum - 1.0) < fabs(r_sum - 1.0))
{
if (check_lambda(matrix, l, alpha_size, letprob1, letprob2))
{
*lambda = l;
return true;
}
return false;
}
if (check_lambda(matrix, r, alpha_size, letprob1, letprob2))
{
*lambda = r;
return true;
}
return false;
}
double LambdaCalculator::calculate_lambda(double** matrix, int alpha_size, vector<double>& letprob1, vector<double>& letprob2, int maxiter, int max_boundary_search_iter, double lb_ratio)
{
double ub;
if (!find_ub(matrix, alpha_size, &ub))
return -1;
double lb = ub*lb_ratio;
double lambda = -1;
int iter = 0;
bool flag = false;
while (!flag && iter < maxiter)
{
flag = binary_search(matrix, alpha_size, lb, ub, letprob1, letprob2, &lambda, max_boundary_search_iter);
iter++;
}
return lambda;
}
bool LambdaCalculator::check_lambda(double** matrix, double lambda, int alpha_size, vector<double>& letprob1, vector<double>& letprob2)
{
double **m = makematrix(alpha_size, alpha_size, 0);
double **y = makematrix(alpha_size, alpha_size, 0);
for (int i=0; i<alpha_size; i++)
for (int j=0; j<alpha_size; j++)
m[i][j] = exp(lambda * matrix[i][j]);
invert(m, y, alpha_size);
for (int i=0; i<alpha_size; i++)
{
double p = 0;
for (int j=0;j<alpha_size; j++)
p += y[i][j];
if (p < 0 || p > 1)
{
letprob2.clear();
return false;
}
letprob2.push_back(roundToFewDigits(p));
}
for (int j=0; j<alpha_size; j++)
{
double q = 0;
for (int i=0; i<alpha_size; i++)
q += y[i][j];
if (q < 0 || q > 1)
{
letprob2.clear();
letprob1.clear();
return false;
}
letprob1.push_back(roundToFewDigits(q));
}
deletematrix(m, alpha_size);
deletematrix(y, alpha_size);
return true;
}
void LambdaCalculator::calculate(const const_int_ptr *matrix, int alphSize) {
assert(alphSize >= 0);
setBad();
int maxiter = 1000;
int max_boundary_search_iter = 100;
double lb_ratio = 1e-6;
double** mat = makematrix(alphSize, alphSize, 0);
for (int i=0; i<alphSize; i++)
for (int j=0; j<alphSize; j++)
mat[i][j] = matrix[i][j];
lambda_ = calculate_lambda(mat, alphSize, letterProbs1_, letterProbs2_, maxiter, max_boundary_search_iter, lb_ratio);
deletematrix(mat, alphSize);
}
}
// Copyright 2010 Martin C. Frith
// Copyright 2008 Michiaki Hamada
// Modified 2015 Yutaro Konta
// This class calculates the scale factor (lambda), and the letter
// probabilities, that are implicit in a scoring matrix. The
......@@ -9,23 +10,20 @@
// probabilities should be identical. With this code, they might
// differ minutely from exact identity.
#ifndef MCF_SCORE_MATRIX_PROBS_HH
#define MCF_SCORE_MATRIX_PROBS_HH
#ifndef LAMBDA_CALCULATOR_HH
#define LAMBDA_CALCULATOR_HH
#include <vector>
namespace mcf {
namespace cbrc{
typedef const int *const_int_ptr;
class ScoreMatrixProbs {
class LambdaCalculator{
public:
ScoreMatrixProbs() { setBad(); }
LambdaCalculator() { setBad(); }
ScoreMatrixProbs(const const_int_ptr *scoreMatrix, unsigned alphabetSize)
{ init(scoreMatrix, alphabetSize); }
void init(const const_int_ptr *scoreMatrix, unsigned alphabetSize);
void calculate(const const_int_ptr *matrix, int alphSize);
// Put us in the bad/undefined state.
void setBad();
......@@ -37,19 +35,24 @@ class ScoreMatrixProbs {
double lambda() const { return lambda_; }
// The probabilities of letters corresponding to matrix rows (1st index).
// In the bad/undefined state, it is an empty vector.
const std::vector<double>& letterProbs1() const { return letterProbs1_; }
// In the bad/undefined state, it is a null pointer.
const double *letterProbs1() const {return isBad() ? 0 : &letterProbs1_[0];}
// The probabilities of letters corresponding to matrix columns (2nd index).
// In the bad/undefined state, it is an empty vector.
const std::vector<double>& letterProbs2() const { return letterProbs2_; }
// In the bad/undefined state, it is a null pointer.
const double *letterProbs2() const {return isBad() ? 0 : &letterProbs2_[0];}
private:
double lambda_;
std::vector<double> letterProbs1_;
std::vector<double> letterProbs2_;
bool find_ub(double **matrix, int alpha_size, double *ub);
bool binary_search(double** matrix, int alpha_size, double lb, double ub, std::vector<double>& letprob1, std::vector<double>& letprob2, double* lambda, int maxiter);
double calculate_lambda(double** matrix, int alpha_size, std::vector<double>& letprob1, std::vector<double>& letprob2, int maxiter, int max_boundary_search_iter, double lb_ratio);
bool check_lambda(double** matrix, double lambda, int alpha_size, std::vector<double>& letprob1, std::vector<double>& letprob2);
};
}
} // end namespace
#endif
......@@ -2,20 +2,13 @@ CXXFLAGS = -O3 -Wall -W -Wcast-qual -Wswitch-enum -Wundef \
-Wcast-align -Wold-style-cast
# -Wconversion
CFLAGS = -Wall
COBJ = CA_code/lambda_calculator.o
all: tantan
tantan: *.cc *.hh version.hh Makefile $(COBJ)
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ *.cc $(COBJ)
$(COBJ): CA_code/*.c CA_code/*.h Makefile
$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ CA_code/lambda_calculator.c
tantan: *.cc *.hh version.hh Makefile
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ *.cc
clean:
rm -f tantan $(COBJ)
rm -f tantan
VERSION = \"`hg id -n`\"
......
// Copyright 2010 Martin C. Frith
#include "mcf_score_matrix_probs.hh"
namespace lambda {
extern "C" {
#include "CA_code/lambda_calculator.h"
}
}
namespace mcf {
void ScoreMatrixProbs::setBad() {
lambda_ = -1;
letterProbs1_.clear();
letterProbs2_.clear();
}
void ScoreMatrixProbs::init(const const_int_ptr *scoreMatrix,
unsigned alphabetSize) {
// We need to pass the parameters as 1-based pointers, hence the +1s
// and -1s.
std::vector<double> cells(alphabetSize * alphabetSize + 1);
std::vector<const double*> pointers(alphabetSize + 1);
for (unsigned i = 0; i < alphabetSize; ++i) {
pointers[i+1] = &cells[i * alphabetSize];
for (unsigned j = 0; j < alphabetSize; ++j) {
cells[i * alphabetSize + j + 1] = scoreMatrix[i][j];
}
}
letterProbs1_.resize(alphabetSize);
letterProbs2_.resize(alphabetSize);
lambda_ = lambda::calculate_lambda(&pointers[0], alphabetSize,
&letterProbs1_[0] - 1,
&letterProbs2_[0] - 1);
if (lambda_ < 0) setBad();
}
}
......@@ -52,6 +52,8 @@ TantanOptions::TantanOptions() :
repeatEndProb(0.05),
maxCycleLength(-1), // depends on isProtein
repeatOffsetProbDecay(0.9),
matchScore(0),
mismatchCost(0),
gapExistenceCost(0),
gapExtensionCost(-1), // means: no gaps
minMaskProb(0.5),
......@@ -75,6 +77,8 @@ Options (default settings):\n\
-w maximum tandem repeat period to consider (100, but -p selects 50)\n\
-d probability decay per period ("
+ stringify(repeatOffsetProbDecay) + ")\n\
-i match score (1, but -p selects BLOSUM62)\n\
-j mismatch cost (1, but -p selects BLOSUM62)\n\
-a gap existence cost ("
+ stringify(gapExistenceCost) + ")\n\
-b gap extension cost (infinite: no gaps)\n\
......@@ -94,7 +98,7 @@ Home page: http://www.cbrc.jp/tantan/\n\
#include "version.hh"
"\n";
const char *optstring = "px:cm:r:e:w:d:a:b:s:f:h";
const char *optstring = "px:cm:r:e:w:d:i:j:a:b:s:f:h";
int i;
while ((i = myGetopt(argc, argv, optstring, help, version)) != -1) {
......@@ -132,6 +136,16 @@ Home page: http://www.cbrc.jp/tantan/\n\
if (repeatOffsetProbDecay <= 0 || repeatOffsetProbDecay > 1)
badopt(c, optarg);
break;
case 'i':
unstringify(matchScore, optarg);
if (matchScore <= 0)
badopt(c, optarg);
break;
case 'j':
unstringify(mismatchCost, optarg);
if (mismatchCost <= 0)
badopt(c, optarg);
break;
case 'a':
unstringify(gapExistenceCost, optarg);
break;
......
......@@ -18,6 +18,8 @@ struct TantanOptions {
double repeatEndProb;
int maxCycleLength;
double repeatOffsetProbDecay;
int matchScore;
int mismatchCost;
int gapExistenceCost;
int gapExtensionCost;
double minMaskProb;
......
......@@ -99,7 +99,7 @@ struct Tantan {
//f2g = firstGapProb;
//g2f = 1 - otherGapProb;
oneGapProb = firstGapProb * (1 - otherGapProb);
endGapProb = firstGapProb * 1;
endGapProb = firstGapProb * (maxRepeatOffset > 1);
f2f0 = 1 - repeatEndProb;
f2f1 = 1 - repeatEndProb - firstGapProb;
f2f2 = 1 - repeatEndProb - firstGapProb * 2;
......@@ -125,8 +125,7 @@ struct Tantan {
double forwardTotal() {
double fromForeground = std::accumulate(foregroundProbs.begin(),
foregroundProbs.end(), 0.0);
fromForeground *= f2b;
double total = backgroundProb * b2b + fromForeground;
double total = backgroundProb * b2b + fromForeground * f2b;
assert(total > 0);
return total;
}
......@@ -148,9 +147,6 @@ struct Tantan {
double f = *foregroundPtr;
double fromForeground = f;
if (insertionProbs.empty()) {
*foregroundPtr = fromBackground + f * f2f0;
} else {
double *insertionPtr = &insertionProbs.back();
double i = *insertionPtr;
*foregroundPtr = fromBackground + f * f2f1 + i * endGapProb;
......@@ -174,10 +170,8 @@ struct Tantan {
fromForeground += f;
*foregroundPtr = fromBackground + f * f2f1 + d * endGapProb;
*insertionPtr = f;
}
fromForeground *= f2b;
backgroundProb = backgroundProb * b2b + fromForeground;
backgroundProb = backgroundProb * b2b + fromForeground * f2b;
}
void calcBackwardTransitionProbsWithGaps() {
......@@ -186,9 +180,6 @@ struct Tantan {
double f = *foregroundPtr;
double toForeground = f;
if (insertionProbs.empty()) {
*foregroundPtr = toBackground + f2f0 * f;
} else {
double *insertionPtr = &insertionProbs.front();
double i = *insertionPtr;
*foregroundPtr = toBackground + f2f1 * f + i;
......@@ -213,10 +204,8 @@ struct Tantan {
toForeground += f;
*foregroundPtr = toBackground + f2f1 * f + d;
*insertionPtr = endGapProb * f;
}
toForeground *= b2fLast;
backgroundProb = b2b * backgroundProb + toForeground;
backgroundProb = b2b * backgroundProb + b2fLast * toForeground;
}
void calcForwardTransitionProbs() {
......@@ -235,8 +224,7 @@ struct Tantan {
fromBackground *= b2fGrowth;
}
fromForeground *= f2b;
backgroundProb = backgroundProb * b2b + fromForeground;
backgroundProb = backgroundProb * b2b + fromForeground * f2b;
}
void calcBackwardTransitionProbs() {
......@@ -255,8 +243,7 @@ struct Tantan {
++foregroundPtr;
}
toForeground *= b2fLast;
backgroundProb = b2b * backgroundProb + toForeground;
backgroundProb = b2b * backgroundProb + b2fLast * toForeground;
}
void addEndCounts(double forwardProb,
......@@ -281,12 +268,17 @@ struct Tantan {
}
}
void calcEmissionProbs() {
const double *lrRow = likelihoodRatioMatrix[*seqPtr];
bool isNearSeqBeg() {
return seqPtr - seqBeg < maxRepeatOffset;
}
bool isNearSeqBeg = (seqPtr - seqBeg < maxRepeatOffset);
const uchar *seqStop = isNearSeqBeg ? seqBeg : seqPtr - maxRepeatOffset;
const uchar *seqFurthestBack() {
return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset;
}
void calcEmissionProbs() {
const double *lrRow = likelihoodRatioMatrix[*seqPtr];
const uchar *seqStop = seqFurthestBack();
double *foregroundPtr = BEG(foregroundProbs);
const uchar *offsetPtr = seqPtr;
......
......@@ -6,10 +6,10 @@
#include "mcf_alphabet.hh"
#include "mcf_fasta_sequence.hh"
#include "mcf_score_matrix.hh"
#include "mcf_score_matrix_probs.hh"
#include "mcf_tantan_options.hh"
#include "mcf_util.hh"
#include "tantan.hh"
#include "LambdaCalculator.hh"
#include <algorithm> // copy, fill_n
#include <cassert>
......@@ -73,29 +73,42 @@ void initScoresAndProbabilities() {
ScoreMatrix scoreMatrix;
if (options.scoreMatrixFileName)
if (options.scoreMatrixFileName) {
unfilify(scoreMatrix, options.scoreMatrixFileName);
else if (options.isProtein)
} else if (options.isProtein) {
if (options.matchScore || options.mismatchCost) {
scoreMatrix.initMatchMismatch(std::max(options.matchScore, 1),
std::max(options.mismatchCost, 1),
Alphabet::protein);
} else {
unstringify(scoreMatrix, ScoreMatrix::blosum62);
else
scoreMatrix.initMatchMismatch(1, 1, "ACGTU"); // allow for RNA
}
} else {
scoreMatrix.initMatchMismatch(std::max(options.matchScore, 1),
std::max(options.mismatchCost, 1),
"ACGTU"); // allow for RNA
}
scoreMatrix.makeFastMatrix(fastMatrixPointers, scoreMatrixSize,
alphabet.lettersToNumbers,
scoreMatrix.minScore(), false);
ScoreMatrixProbs smp(fastMatrixPointers, alphabet.size);
if (smp.isBad())
cbrc::LambdaCalculator matCalc;
matCalc.calculate(fastMatrixPointers, alphabet.size);
if (matCalc.isBad())
throw Error("can't calculate probabilities for this score matrix");
double matrixLambda = matCalc.lambda();
for (int i = 0; i < scoreMatrixSize; ++i)
for (int j = 0; j < scoreMatrixSize; ++j)
probMatrix[i][j] = std::exp(smp.lambda() * fastMatrix[i][j]);
for (int i = 0; i < scoreMatrixSize; ++i) {
for (int j = 0; j < scoreMatrixSize; ++j) {
probMatrix[i][j] = std::exp(matrixLambda * fastMatrix[i][j]);
}
}
if (options.gapExtensionCost > 0) {
int firstGapCost = options.gapExistenceCost + options.gapExtensionCost;
firstGapProb = std::exp(-smp.lambda() * firstGapCost);
otherGapProb = std::exp(-smp.lambda() * options.gapExtensionCost);
firstGapProb = std::exp(-matrixLambda * firstGapCost);
otherGapProb = std::exp(-matrixLambda * options.gapExtensionCost);
// the gap existence cost includes the gap ending cost:
firstGapProb /= (1 - otherGapProb);
......@@ -103,7 +116,7 @@ void initScoresAndProbabilities() {
// XXX check if firstGapProb is too high
}
//std::cerr << "lambda: " << smp.lambda() << "\n";
//std::cerr << "lambda: " << matrixLambda << "\n";
//std::cerr << "firstGapProb: " << firstGapProb << "\n";
//std::cerr << "otherGapProb: " << otherGapProb << "\n";
}
......