hhhit.h 9.13 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*********************************************************************
 * Clustal Omega - Multiple sequence alignment
 *
 * Copyright (C) 2010 University College Dublin
 *
 * Clustal-Omega is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This file is part of Clustal-Omega.
 *
 ********************************************************************/

/*
 * RCS $Id: hhhit.h 243 2011-05-31 13:49:19Z fabian $
 */

// hhhit.h

//////////////////////////////////////////////////////////////////////////////
/* Describes an alignment of two profiles. 
   Used as list element in Hits : List<Hit> */
//////////////////////////////////////////////////////////////////////////////
class Hit
{
 public:  
  char* longname;       // Name of HMM
  char* name;           // One-word name of HMM
  char* file;           // Basename (with path, without extension) of alignment file that was used to construct the HMM
                        // (path from db-file is prepended to FILE record in HMM file!)
  char fam[IDLEN];      // family ID (derived from name) (FAM field)
  char sfam[IDLEN];     // superfamily ID (derived from name) 
  char fold[IDLEN];     // fold ID (derived from name)
  char cl[IDLEN];       // class ID (derived from name)
  int index;            // index of HMM in order of reading in (first=0)
  char* dbfile;         // full database file name from which HMM was read
  long ftellpos;        // start position of HMM in database file

  float score;          // Score of alignment (i.e. of Viterbi path)
  float score_sort;     // score to sort hits in output list (negative means first/best!)
  float score_aass;     // first: just hit.score, then hit.logPval-SSSCORE2NATLOG*hit.score_ss;(negative means best!)
  float score_ss;       // Part of score due to secondary structure
  float Pval;           // P-value for whole protein based on score distribution of query
  float Pvalt;          // P-value for whole protein based on score distribution of template
  float logPval;        // natural logarithm of Pval
  float logPvalt;       // natural logarithm of Pvalt
  float Eval;           // E-value for whole protein
  float Probab;         // probability in % for a positive (depends only on score)
  float weight;         // weight of hit for P-value calculation (= 1/#HMMs-in-family/#families-in-superfamily)
  double Pforward;      // scaled total forward probability : Pforward * Product_{i=1}^{Lq+1}(scale[i])
  
/*   float score_comp;     // compositional similarity score */
/*   float logPcomp;       // natural logarithm of Pvalue for compositional similarity score */
/*   float Prep;           // P-value for single-repeat hit */
/*   float Erep;           // E-value for single-repeat hit */
/*   float logPrep;        // natural logarithm of P-value for single-repeat hit */
  float E1val;          // E-value for whole protein from transitive scoring
  float logP1val;       // natural logarithm of P1val, the transitive P-value

  int L;                // Number of match states in template
  int irep;             // Index  of single-repeat hit (1: highest scoring repeat hit)
  int nrep;             // Number of single-repeat hits with one template
  
  int n_display;        // number of sequences stored for display of alignment 
  char** sname;         // names of stored sequences 
  char** seq;           // residues of stored sequences (first at pos 1)
  int nss_dssp;         // index of dssp secondary structure sequence in seq[]
  int nsa_dssp;         // index of of dssp solvent accessibility in seq[]
  int nss_pred;         // index of dssp secondary structure sequence in seq[]
  int nss_conf;         // index of dssp secondary structure sequence in seq[]
  int nfirst;           // index of query sequence in seq[]
  int ncons;            // index of consensus sequence
  
  int nsteps;           // index for last step in Viterbi path; (first=1)
  int* i;               // i[step] = query match state at step of Viterbi path
  int* j;               // j[step] = template match state at step of Viterbi path
  char* states;         // state at step of Viterbi path  0: Start  1: M(MM)  2: A(-D)  3: B(IM)  4: C(D-)  5 D(MI)
  float* S;             // S[step] = match-match score contribution at alignment step
  float* S_ss;          // S_ss[step] = secondary structure score contribution
  float* P_posterior;   // P_posterior[step] = posterior prob for MM states (otherwise zero)
  char* Xcons;          // consensus sequence for aligned states in internal representation (A=0 R=1 N=2 D=3 ...)
  int i1;               // First aligned residue in query
  int i2;               // Last aligned residue in query
  int j1;               // First aligned residue in template 
  int j2;               // Last aligned residue in template
  int matched_cols;     // number of matched columns in alignment against query
  int ssm1;             // SS scoring AFTER  alignment? 0:no  1:yes; t->dssp q->psipred  2:yes; q->dssp t->psipred
  int ssm2;             // SS scoring DURING alignment? 0:no  1:yes; t->dssp q->psipred  2:yes; q->dssp t->psipred
  char self;            // 0: align two different HMMs  1: align HMM with itself
  int min_overlap;      // Minimum overlap between query and template
  float sum_of_probs;   // sum of probabilities for Maximum ACcuracy alignment (if dssp states defined, only aligned pairs with defined dssp state contribute to sum)
  float Neff_HMM;       // Diversity of underlying alignment

  // Constructor (only set pointers to NULL)
  Hit();
  ~Hit(){};
  
  // Free all allocated memory (to delete list of hits)
  void Delete();

  // Allocate/delete memory for dynamic programming matrix
  void AllocateBacktraceMatrix(int Nq, int Nt);
  void DeleteBacktraceMatrix(int Nq);
  void AllocateForwardMatrix(int Nq, int Nt);
  void DeleteForwardMatrix(int Nq);
  void AllocateBackwardMatrix(int Nq, int Nt);
  void DeleteBackwardMatrix(int Nq);
  
  // Compare an HMM with overlapping subalignments
  void Viterbi(HMM& q, HMM& t, float** Sstruc=NULL);

  // Compare two HMMs with each other in lin space
  int Forward(HMM& q, HMM& t, float** Pstruc=NULL);

  // Compare two HMMs with each other in lin space
  int Backward(HMM& q, HMM& t);

   // Find maximum accuracy alignment (after running Forward and Backward algorithms)
  void MACAlignment(HMM& q, HMM& t);

  // Trace back alignment of two profiles based on matrices bXX[][]
  void Backtrace(HMM& q, HMM& t);

  // Trace back alignment of two profiles based on matrices bXX[][]
  void StochasticBacktrace(HMM& q, HMM& t, char maximize=0);

  // Trace back MAC alignment of two profiles based on matrix bMM[][]
  void BacktraceMAC(HMM& q, HMM& t);

  // Calculate secondary structure score between columns i and j of two HMMs (query and template)
  inline float ScoreSS(HMM& q, HMM& t, int i, int j, int ssm);

  // Calculate secondary structure score between columns i and j of two HMMs (query and template)
  inline float ScoreSS(HMM& q, HMM& t, int i, int j);

  // Calculate total score (including secondary structure score and compositional bias correction
  inline float ScoreTot(HMM& q, HMM& t, int i, int j);

  // Calculate score (excluding secondary structure score and compositional bias correction
  inline float ScoreAA(HMM& q, HMM& t, int i, int j);

  // Comparison (used to sort list of hits)
  int operator<(const Hit& hit2)  {return score_sort<hit2.score_sort;}

  // Merge HMM with next aligned HMM  
  void MergeHMM(HMM& Q, HMM& t, float wk[]);

#ifdef CLUSTALO
  void ClobberGlobal(void);
#endif  


  double** B_MM;        // Backward matrices
  
private:
  char state;          // 0: Start/STOP state  1: MM state  2: GD state (-D)  3: IM state  4: DG state (D-)  5 MI state
  char** bMM;          // (backtracing) bMM[i][j] = STOP:start of alignment  MM:prev was MM  GD:prev was GD etc
  char** bGD;          // (backtracing) bMM[i][j] = STOP:start of alignment  MM:prev was MM  SAME:prev was GD
  char** bDG;          // (backtracing)
  char** bIM;          // (backtracing)
  char** bMI;          // (backtracing)
  char** cell_off;     // cell_off[i][j]=1 means this cell will get score -infinity

  double** F_MM;        // Forward matrices 
  double** F_GD;        // F_XY[i][j] * Prod_1^i(scale[i]) 
  double** F_DG;        //   = Sum_x1..xl{ P(HMMs aligned up to Xi||Yj co-emmitted x1..xl ) / (Prod_k=1^l f(x_k)) }   
  double** F_IM;        // end gaps are not penalized!
  double** F_MI;        // 
  double* scale;        // 

  double** B_GD;        // B_XY[i][j] * Prod_i+1^(L+1) (scale[i])
  double** B_DG;        //   = Sum_x2..xl{ P(HMMs aligned from Xi||Yj to end co-emmitted x2..xl ) / (Prod_k=2^l f(x_k)) }   
  double** B_IM;        // end gaps are not penalized!
  double** B_MI;        // 

  void InitializeBacktrace(HMM& q, HMM& t);
  void InitializeForAlignment(HMM& q, HMM& t);
};


double Pvalue(double x, double a[]);
double Pvalue(float x, float lamda, float mu);
double logPvalue(float x, float lamda, float mu);
double logPvalue(float x, double a[]);
double Probab(Hit& hit);