model.c 6.88 KB
Newer Older
1 2 3 4 5
/** @file
 * @brief This file contains all functions for the mutation matrix and the
 * estimation of evolutionary distances thereof.
 */

6 7 8
#include "model.h"
#include "global.h"
#include <gsl/gsl_randist.h>
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/**
 * @brief Sum some mutation count specified by `summands`. Intended to be used
 * through the `model_sum` macro.
 *
 * @param MM - The mutation matrix.
 * @param summands - The mutations to add.
 * @returns The sum of mutations.
 */
static size_t model_sum_types(const model *MM, const int summands[]) {
	size_t total = 0;
	for (int i = 0; summands[i] != MUTCOUNTS; ++i) {
		total += MM->counts[summands[i]];
	}
	return total;
}

#define model_sum(MM, ...)                                                     \
	model_sum_types((MM), (int[]){__VA_ARGS__, MUTCOUNTS})

/**
 * @brief Average two mutation matrices.
 *
 * @param MM - One matrix
 * @param NN - Second matrix
 * @returns The average (sum) of two mutation matrices.
 */
model model_average(const model *MM, const model *NN) {
	model ret = *MM;
	for (int i = 0; i != MUTCOUNTS; ++i) {
		ret.counts[i] += NN->counts[i];
	}
	ret.seq_len += NN->seq_len;
	return ret;
}

/**
 * @brief Compute the total number of nucleotides in the pairwise alignment.
 *
 * @param MM - The mutation matrix.
 * @returns The length of the alignment.
 */
size_t model_total(const model *MM) {
	size_t total = 0;
	for (size_t i = 0; i < MUTCOUNTS; ++i) {
		total += MM->counts[i];
	}
	return total;
}

/**
 * @brief Compute the coverage of an alignment.
 *
 * @param MM - The mutation matrix.
 * @returns The relative coverage
 */
double model_coverage(const model *MM) {
	size_t covered = model_total(MM);
	size_t actual = MM->seq_len;

	return (double)covered / (double)actual;
}

/**
 * @brief Estimate the uncorrected distance of a pairwise alignment.
 *
 * @param MM - The mutation matrix.
 * @returns The uncorrected substitution rate.
 */
double estimate_RAW(const model *MM) {
	size_t nucl = model_total(MM);
	size_t SNPs = model_sum(MM, AtoC, AtoG, AtoT, CtoG, CtoT, GtoT);

	// Insignificant results. All abort the fail train.
	if (nucl <= 3) {
		return NAN;
	}

	return (double)SNPs / (double)nucl;
}

/**
 * @brief Compute the Jukes-Cantor distance.
 *
 * @param MM - The mutation matrix.
 * @returns The corrected JC distance.
 */
double estimate_JC(const model *MM) {
	double dist = estimate_RAW(MM);
	dist = -0.75 * log(1.0 - (4.0 / 3.0) * dist); // jukes cantor

	// fix negative zero
	return dist <= 0.0 ? 0.0 : dist;
}

/** @brief computes the evolutionary distance using K80.
 *
 * @param MM - The mutation matrix.
 * @returns The corrected Kimura distance.
 */
double estimate_KIMURA(const model *MM) {
	size_t nucl = model_total(MM);
	size_t transitions = model_sum(MM, AtoG, CtoT);
	size_t transversions = model_sum(MM, AtoC, AtoT, GtoC, GtoT);

	double P = (double)transitions / (double)nucl;
	double Q = (double)transversions / (double)nucl;

	double tmp = 1.0 - 2.0 * P - Q;
	double dist = -0.25 * log((1.0 - 2.0 * Q) * tmp * tmp);

	// fix negative zero
	return dist <= 0.0 ? 0.0 : dist;
}

127
/** @brief Bootstrap a mutation matrix.
128 129 130 131
 *
 * The classical bootstrapping process, as described by Felsenstein, resamples
 * all nucleotides of a MSA. As andi only computes a pairwise alignment, this
 * process boils down to a simple multinomial distribution. We just have to
132 133
 * resample the elements of the mutation matrix. See Klötzl & Haubold (2016)
 * for details. http://www.mdpi.com/2075-1729/6/1/11/htm
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 *
 * @param MM - The original mutation matrix.
 * @returns A bootstrapped mutation matrix.
 */
model model_bootstrap(const model MM) {
	model datum = MM;

	size_t nucl = model_total(&MM);
	double p[MUTCOUNTS];
	for (size_t i = 0; i < MUTCOUNTS; ++i) {
		p[i] = MM.counts[i] / (double)nucl;
	}

	unsigned int n[MUTCOUNTS];

	gsl_ran_multinomial(RNG, MUTCOUNTS, nucl, p, n);

	for (size_t i = 0; i < MUTCOUNTS; ++i) {
		datum.counts[i] = n[i];
	}

	return datum;
}

/**
 * @brief Given an anchor, classify nucleotides.
 *
 * For anchors we already know that the nucleotides of the subject and the query
162 163 164
 * are equal. Thus only one sequence has to be analysed. Most models don't
 * actually care about the individual nucleotides as long as they are equal in
 * the two sequences. For these models, we just assume equal distribution.
165 166 167 168 169 170
 *
 * @param MM - The mutation matrix
 * @param S - The subject
 * @param len - The anchor length
 */
void model_count_equal(model *MM, const char *S, size_t len) {
171 172 173 174 175 176 177 178 179 180 181 182
	if (MODEL == M_RAW || MODEL == M_JC || MODEL == M_KIMURA) {
		size_t fourth = len / 4;
		MM->counts[AtoA] += fourth;
		MM->counts[CtoC] += fourth;
		MM->counts[GtoG] += fourth;
		MM->counts[TtoT] += fourth + (len & 3);
		return;
	}

	// Fall-back algorithm for future models. Note, as this works on a
	// per-character basis it is slow.

183 184 185 186 187
	size_t local_counts[4] = {0};

	for (; len--;) {
		char s = *S++;

188
		// ';!#' are all smaller than 'A'
189
		if (s < 'A') {
190
			// Technically, s can only be ';' at this point.
191 192 193
			continue;
		}

194 195 196
		// The four canonical nucleotides can be uniquely identified by the bits
		// 0x6: A -> 0, C → 1, G → 3, T → 2. Thus the order below is changed.
		local_counts[(s >> 1) & 3]++;
197 198 199 200
	}

	MM->counts[AtoA] += local_counts[0];
	MM->counts[CtoC] += local_counts[1];
201 202
	MM->counts[GtoG] += local_counts[3];
	MM->counts[TtoT] += local_counts[2];
203 204
}

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
/** @brief Convert a nucleotide to a 2bit representation.
 *
 * We want to map characters:
 *  A → 0
 *  C → 1
 *  G → 2
 *  T → 3
 * The trick used below is that the three lower bits of the
 * characters are unique. Thus, they can be used to compute the mapping
 * above. The mapping itself is done via tricky bitwise operations.
 *
 * @param c - input nucleotide
 * @returns 2bit representation.
 */
char nucl2bit(char c) {
	c &= 6;
	c ^= c >> 1;
	return c >> 1;
}

225 226 227 228 229 230 231 232 233 234 235
/**
 * @brief Count the substitutions and add them to the mutation matrix.
 *
 * @param MM - The mutation matrix.
 * @param S - The subject
 * @param Q - The query
 * @param len - The length of the alignment
 */
void model_count(model *MM, const char *S, const char *Q, size_t len) {
	size_t local_counts[MUTCOUNTS] = {0};

236
	for (size_t i = 0; i < len; S++, Q++, i++) {
237 238 239 240 241 242 243 244 245
		char s = *S;
		char q = *Q;

		// Skip special characters.
		if (s < 'A' || q < 'A') {
			continue;
		}

		// Pick the correct two bits representing s and q.
246 247
		unsigned char foo = nucl2bit(s);
		unsigned char baz = nucl2bit(q);
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273

		/*
		 * The mutation matrix is symmetric. For convenience we define the
		 * mutation XtoY with X > Y.
		 */
		if (baz > foo) {
			int temp = foo;
			foo = baz;
			baz = temp;
		}

		/*
		 * Finally, we want to map the indices to the correct mutation. This is
		 * done by utilising the mutation types in model.h.
		 */
		static const unsigned int map4 = 0x9740;
		unsigned int base = (map4 >> (4 * baz)) & 0xf;
		unsigned int index = base + (foo - baz);

		local_counts[index]++;
	}

	for (int i = 0; i != MUTCOUNTS; ++i) {
		MM->counts[i] += local_counts[i];
	}
}