sequence.c 5.55 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/**
 * @file
 * @brief Sequence utilities
 *
 * This file contains utility functions for working with DNA sequences.
 */
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include "global.h"
13 14
#include "sequence.h"
#include <compat-stdlib.h>
15

16
void normalize(seq_t *S);
17 18

/** Create a new dynamic array for sequences. */
19
int dsa_init(dsa_t *A) {
20 21
	// allocate at least 4 slots so the growth by 1.5 below doesn't get stuck
	// at 3 slots.
22 23
	A->data = malloc(sizeof(*A->data) * 4);
	CHECK_MALLOC(A->data);
24 25 26 27 28 29 30

	A->capacity = 4;
	A->size = 0;
	return 0;
}

/** Add a sequence to an array. */
31 32
void dsa_push(dsa_t *A, seq_t S) {
	if (A->size < A->capacity) {
33 34 35
		A->data[A->size++] = S;
	} else {
		// use the near-optimal growth factor of 1.5
36
		seq_t *ptr = reallocarray(A->data, A->capacity / 2, sizeof(seq_t) * 3);
37
		CHECK_MALLOC(ptr);
38 39 40 41 42 43 44 45

		A->capacity = (A->capacity / 2) * 3;
		A->data = ptr;
		A->data[A->size++] = S;
	}
}

/** Frees the array and all sequences stored within. */
46
void dsa_free(dsa_t *A) {
47
	size_t i;
48
	for (i = 0; i < A->size; i++) {
49 50 51 52 53 54 55 56
		seq_free(&A->data[i]);
	}

	free(A->data);
	*A = (dsa_t){};
}

/** Returns the number of sequences stored within an array. */
57
size_t dsa_size(const dsa_t *A) {
58 59 60 61
	return A->size;
}

/** Get the raw C array. */
62
seq_t *dsa_data(dsa_t *A) {
63 64 65 66 67
	return A->data;
}

/**
 * @brief Convert an array of multiple sequences into a single sequence.
68
 *
69 70 71 72 73 74
 * This function joins all sequences contained in an array into one
 * long sequence. The sequences are separated by a `!` character. The
 * caller has to free the initial array.
 *
 * @returns A new sequence representation the union of the array.
 */
75
seq_t dsa_join(dsa_t *A) {
76 77
	seq_t joined = {};

78
	if (A->size == 0) {
79 80 81
		return joined;
	}

82
	if (A->size == 1) {
83 84 85 86 87 88 89 90
		/* If we are to join just one sequence, _move_ its contents. */
		joined = A->data[0];
		A->data[0] = (seq_t){};
		return joined;
	}

	seq_t *data = A->data;
	seq_t *it = data;
91

92 93
	// Compute the total length
	size_t total = 0, i;
94
	for (i = 0; i < A->size; i++, it++) {
95 96
		total += it->len + 1;
	}
97

98
	// A single malloc for the whole new sequence
99
	char *ptr = malloc(total);
100
	CHECK_MALLOC(ptr);
101
	char *next = ptr;
102

103 104 105
	// Copy all old sequences and add a `!` in between

	it = data;
106
	memcpy(next, it->S, it->len);
107 108
	next += it->len;

109
	for (i = 1, it++; i < A->size; i++, it++) {
110
		*next++ = '!';
111
		memcpy(next, it->S, it->len);
112 113
		next += it->len;
	}
114

115 116
	// Don't forget the null byte.
	*next = '\0';
117

118
	joined.S = ptr;
119 120
	joined.len = total - 1; // subtract the null byte

121 122 123 124 125 126 127
	return joined;
}

/**
 * @brief Frees the memory of a given sequence.
 * @param S - The sequence to free.
 */
128 129 130
void seq_free(seq_t *S) {
	free(S->S);
	free(S->name);
131 132 133 134 135 136 137 138 139
	*S = (seq_t){};
}

/**
 * @brief Compute the reverse complement.
 * @param str The master string.
 * @param len The length of the master string
 * @return The reverse complement. The caller has to free it!
 */
140
char *revcomp(const char *str, size_t len) {
141
	if (!str) return NULL;
142
	char *rev = malloc(len + 1);
143
	CHECK_MALLOC(rev);
144

145
	char *r = rev;
146
	const char *s = &str[len - 1];
147
	rev[len] = '\0';
148

149
	do {
150 151 152
		char d;

		switch (*s--) {
153 154 155 156
			case 'A': d = 'T'; break;
			case 'T': d = 'A'; break;
			case 'G': d = 'C'; break;
			case 'C': d = 'G'; break;
157 158 159
			case '!':
				d = ';';
				break; // rosebud
160
			default: continue;
161
		}
162

163
		*r++ = d;
164 165
	} while (--len);

166 167 168 169
	return rev;
}

/**
170 171
 * @brief This function concatenates the reverse complement to a given master
 * string. A `#` sign is used as a separator.
172 173 174 175
 * @param s The master string.
 * @param len Its length.
 * @return The newly concatenated string.
 */
176 177 178 179 180 181
char *catcomp(char *s, size_t len) {
	if (!s) return NULL;

	char *rev = revcomp(s, len);

	char *temp = realloc(rev, 2 * len + 2);
182
	CHECK_MALLOC(temp);
183

184 185
	rev = temp;
	rev[len] = '#';
186 187 188

	memcpy(rev + len + 1, s, len + 1);

189 190 191 192 193 194 195 196
	return rev;
}

/**
 * @brief Calculates the GC content of a sequence.
 *
 * This function computes the relative amount of G and C in the total sequence.
 */
197
double calc_gc(const seq_t *S) {
198
	size_t GC = 0;
199 200 201 202
	const char *p = S->S;

	for (; *p; p++) {
		if (*p == 'G' || *p == 'C') {
203 204 205
			GC++;
		}
	}
206 207

	return (double)GC / S->len;
208 209 210
}

/** @brief Prepares a sequences to be used as the subject in a comparison. */
211 212 213
int seq_subject_init(seq_subject *S, const seq_t *base) {
	S->gc = calc_gc(base);
	S->RS = catcomp(base->S, base->len);
214
	if (!S->RS) return 1;
215
	S->RSlen = 2 * base->len + 1;
216
	return 0;
217 218
}

219 220
/** @brief Frees some memory unused for when a sequence is only used as query.
 */
221
void seq_subject_free(seq_subject *S) {
222 223 224 225 226 227 228 229 230 231
	free(S->RS);
	S->RS = NULL;
	S->RSlen = 0;
	S->gc = 0.0;
}

/** @brief Initializes a sequences
 *
 * @returns 0 iff successful.
 */
232 233
int seq_init(seq_t *S, const char *seq, const char *name) {
	if (!S || !seq || !name) {
234 235 236
		return 1;
	}

237
	*S = (seq_t){.S = strdup(seq), .name = strdup(name)};
238

239 240
	CHECK_MALLOC(S->S);
	CHECK_MALLOC(S->name);
241

242
	normalize(S);
243

244 245
	// recalculate the length because `normalize` might have stripped some
	// characters.
246 247 248 249 250 251 252 253 254
	S->len = strlen(S->S);

	return 0;
}

/**
 * @brief Restricts a sequence characters set to ACGT.
 *
 * This function strips a sequence of non ACGT characters and converts acgt to
255 256
 * the upper case equivalent. A flag is set if a non-canonical character was
 * encountered.
257
 */
258
void normalize(seq_t *S) {
259 260
	char *p, *q;
	char local_non_acgt = 0;
261 262
	for (p = q = S->S; *p; p++) {
		switch (*p) {
263 264 265 266
			case 'A':
			case 'C':
			case 'G':
			case 'T':
267
			case '!': *q++ = *p; break;
268 269 270
			case 'a':
			case 'c':
			case 'g':
271 272
			case 't': *q++ = toupper((unsigned char)*p); break;
			default: local_non_acgt = 1; break;
273 274 275
		}
	}
	*q = '\0';
276 277
	if (local_non_acgt) {
#pragma omp atomic
278 279 280
		FLAGS |= F_NON_ACGT;
	}
}