bwase.c 19.1 KB
Newer Older
1 2 3 4 5 6
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
7
#include <assert.h>
8
#include "bwase.h"
9 10 11 12
#include "bwtaln.h"
#include "bntseq.h"
#include "utils.h"
#include "kstring.h"
13
#include "bwa.h"
14 15 16 17 18
#include "ksw.h"

#ifdef USE_MALLOC_WRAPPERS
#  include "malloc_wrap.h"
#endif
19

20
int g_log_n[256];
21

22
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
23 24 25 26 27 28 29
{
	int i, cnt, best;
	if (n_aln == 0) {
		s->type = BWA_TYPE_NO_MATCH;
		s->c1 = s->c2 = 0;
		return;
	}
30 31 32 33 34 35

	if (set_main) {
		best = aln[0].score;
		for (i = cnt = 0; i < n_aln; ++i) {
			const bwt_aln1_t *p = aln + i;
			if (p->score > best) break;
36
			if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
37
				s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape;
38
				s->ref_shift = (int)p->n_del - (int)p->n_ins;
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
				s->score = p->score;
				s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
			}
			cnt += p->l - p->k + 1;
		}
		s->c1 = cnt;
		for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
		s->c2 = cnt - s->c1;
		s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
	}

	if (n_multi) {
		int k, rest, n_occ, z = 0;
		for (k = n_occ = 0; k < n_aln; ++k) {
			const bwt_aln1_t *q = aln + k;
			n_occ += q->l - q->k + 1;
		}
		if (s->multi) free(s->multi);
		if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
			s->multi = 0; s->n_multi = 0;
			return;
		}
		/* The following code is more flexible than what is required
		 * here. In principle, due to the requirement above, we can
		 * simply output all hits, but the following samples "rest"
		 * number of random hits. */
		rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
66
		s->multi = calloc(rest, sizeof(bwt_multi1_t));
67 68 69 70 71 72 73
		for (k = 0; k < n_aln; ++k) {
			const bwt_aln1_t *q = aln + k;
			if (q->l - q->k + 1 <= rest) {
				bwtint_t l;
				for (l = q->k; l <= q->l; ++l) {
					s->multi[z].pos = l;
					s->multi[z].gap = q->n_gapo + q->n_gape;
74
					s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins;
75
					s->multi[z++].mm = q->n_mm;
76 77 78
				}
				rest -= q->l - q->k + 1;
			} else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. 
79 80
				int j, i;
				for (j = rest, i = q->l - q->k + 1; j > 0; --j) {
81 82 83 84
					double p = 1.0, x = drand48();
					while (x < p) p -= p * j / (i--);
					s->multi[z].pos = q->l - i;
					s->multi[z].gap = q->n_gapo + q->n_gape;
85
					s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins;
86
					s->multi[z++].mm = q->n_mm;
87 88 89 90
				}
				rest = 0;
				break;
			}
91
		}
92
		s->n_multi = z;
93
	}
94 95 96 97 98
}

void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
	bwa_aln2seq_core(n_aln, aln, s, 1, 0);
99 100 101 102 103 104 105 106 107 108 109 110 111
}

int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
{
	int n;
	if (p->c1 == 0) return 23;
	if (p->c1 > 1) return 0;
	if (p->n_mm == mm) return 25;
	if (p->c2 == 0) return 37;
	n = (p->c2 >= 255)? 255 : p->c2;
	return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
}

112
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand)
113 114 115
{
	bwtint_t pos_f;
	int is_rev;
116
	*strand = 0; // initialise strand to 0 otherwise we could return without setting it
117 118 119
	pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate
	if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1;
	pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base
120
	*strand = !is_rev;
121
	if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base
122 123 124
	return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset
}

125 126 127 128 129 130
/**
 * Derive the actual position in the read from the given suffix array
 * coordinates. Note that the position will be approximate based on
 * whether indels appear in the read and whether calculations are
 * performed from the start or end of the read.
 */
131
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
132
{
133
	int max_diff, strand;
134 135
	if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
	max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
136
	seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
137 138
	//fprintf(stderr, "%d\n", seq->ref_shift);
	seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand);
139 140
	seq->strand = strand;
	seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
141
	if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH;
142 143
}

144
void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
145
{
146
	int i, j, strand, n_multi;
147 148 149 150 151 152
	char str[1024];
	bwt_t *bwt;
	// load forward SA
	strcpy(str, prefix); strcat(str, ".bwt");  bwt = bwt_restore_bwt(str);
	strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
	for (i = 0; i != n_seqs; ++i) {
153 154 155 156
		bwa_seq_t *p = &seqs[i];
		bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr);
		for (j = n_multi = 0; j < p->n_multi; ++j) {
			bwt_multi1_t *q = p->multi + j;
157
			q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand);
158
			q->strand = strand;
159
			if (q->pos != p->pos && q->pos != (bwtint_t)-1)
160
				p->multi[n_multi++] = *q;
161
		}
162
		p->n_multi = n_multi;
163 164 165 166
	}
	bwt_destroy(bwt);
}

167 168 169
#define SW_BW 50

bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar)
170
{
171
	bwa_cigar_t *cigar = 0;
172 173 174 175 176 177 178 179 180 181
	uint32_t *cigar32 = 0;
	ubyte_t *rseq;
	int64_t k, rb, re, rlen;
	int8_t mat[25];

	bwa_fill_scmat(1, 3, mat);
	rb = *_rb; re = rb + len + ref_shift;
	assert(re <= l_pac);
	rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
	assert(re - rb == rlen);
182
	ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32);
183 184 185 186 187 188 189 190
	assert(*n_cigar > 0);
	if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping
	if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping
	if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del
	if ((cigar32[0]&0xf) == 2) { // delete beginning del
		*_rb += cigar32[0]>>4;
		--*n_cigar;
		memmove(cigar32, cigar32+1, (*n_cigar) * 4);
191
	}
192 193 194 195
	cigar = (bwa_cigar_t*)cigar32;
	for (k = 0; k < *n_cigar; ++k)
		cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
	free(rseq);
196 197 198
	return cigar;
}

199
char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
200 201 202 203 204 205 206 207 208
				  bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
{
	bwtint_t x, y;
	int z, u, c, nm = 0;
	str->l = 0; // reset
	x = pos; y = 0;
	if (cigar) {
		int k, l;
		for (k = u = 0; k < n_cigar; ++k) {
209 210
			l = __cigar_len(cigar[k]);
			if (__cigar_op(cigar[k]) == FROM_M) {
211 212 213 214 215 216 217 218 219 220
				for (z = 0; z < l && x+z < l_pac; ++z) {
					c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
					if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
						ksprintf(str, "%d", u);
						kputc("ACGTN"[c], str);
						++nm;
						u = 0;
					} else ++u;
				}
				x += l; y += l;
221
			} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
222
				y += l;
223 224
				if (__cigar_op(cigar[k]) == FROM_I) nm += l;
			} else if (__cigar_op(cigar[k]) == FROM_D) {
225 226 227 228 229 230 231 232 233
				ksprintf(str, "%d", u);
				kputc('^', str);
				for (z = 0; z < l && x+z < l_pac; ++z)
					kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
				u = 0;
				x += l; nm += l;
			}
		}
	} else { // no gaps
234
		for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) {
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
			c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
			if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
				ksprintf(str, "%d", u);
				kputc("ACGTN"[c], str);
				++nm;
				u = 0;
			} else ++u;
		}
	}
	ksprintf(str, "%d", u);
	*_nm = nm;
	return strdup(str->s);
}

void bwa_correct_trimmed(bwa_seq_t *s)
{
	if (s->len == s->full_len) return;
	if (s->strand == 0) { // forward
253
		if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
254 255 256 257
			s->cigar[s->n_cigar-1] += s->full_len - s->len;
		} else {
			if (s->cigar == 0) {
				s->n_cigar = 2;
258 259
				s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
				s->cigar[0] = __cigar_create(0, s->len);
260 261
			} else {
				++s->n_cigar;
262
				s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
263
			}
264
			s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
265 266
		}
	} else { // reverse
267
		if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S
268 269 270 271
			s->cigar[0] += s->full_len - s->len;
		} else {
			if (s->cigar == 0) {
				s->n_cigar = 2;
272 273
				s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
				s->cigar[1] = __cigar_create(0, s->len);
274 275
			} else {
				++s->n_cigar;
276 277
				s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
				memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
278
			}
279
			s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
280 281 282 283 284
		}
	}
	s->len = s->full_len;
}

285
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
286
{
287
	ubyte_t *pacseq;
288
	int i, j, k;
289 290 291 292
	kstring_t *str;

	if (!_pacseq) {
		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
293 294
		err_rewind(bns->fp_pac);
		err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
295 296 297 298
	} else pacseq = _pacseq;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = seqs + i;
		seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
299
		for (j = k = 0; j < s->n_multi; ++j) {
300 301
			bwt_multi1_t *q = s->multi + j;
			int n_cigar;
302 303 304 305 306
			if (q->gap) { // gapped alignment
				q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar);
				q->n_cigar = n_cigar;
				if (q->cigar) s->multi[k++] = *q;
			} else s->multi[k++] = *q;
307
		}
308
		s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation
309
		if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
310 311
		s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar);
		if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH;
312 313 314 315 316 317 318
	}
	// generate MD tag
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *s = seqs + i;
		if (s->type != BWA_TYPE_NO_MATCH) {
			int nm;
319
			s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm);
320 321 322 323 324 325
			s->nm = nm;
		}
	}
	free(str->s); free(str);

	// correct for trimmed reads
326
	for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
327 328 329 330

	if (!_pacseq) free(pacseq);
}

331
int64_t pos_end(const bwa_seq_t *p)
332 333 334 335 336
{
	if (p->cigar) {
		int j;
		int64_t x = p->pos;
		for (j = 0; j != p->n_cigar; ++j) {
337 338
			int op = __cigar_op(p->cigar[j]);
			if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
339 340 341 342 343
		}
		return x;
	} else return p->pos + p->len;
}

344 345 346 347 348 349 350 351 352 353 354 355 356
int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end()
{
	if (p->cigar) {
		int j;
		int64_t x = p->pos;
		for (j = 0; j != p->n_cigar; ++j) {
			int op = __cigar_op(p->cigar[j]);
			if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
		}
		return x;
	} else return p->pos + len;
}

357 358 359 360 361 362 363
static int64_t pos_5(const bwa_seq_t *p)
{
	if (p->type != BWA_TYPE_NO_MATCH)
		return p->strand? pos_end(p) : p->pos;
	return -1;
}

364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
void bwa_print_seq(FILE *stream, bwa_seq_t *seq) {
	char buffer[4096];
	const int bsz = sizeof(buffer);
	int i, j, l;
	
	if (seq->strand == 0) {
		for (i = 0; i < seq->full_len; i += bsz) {
			l = seq->full_len - i > bsz ? bsz : seq->full_len - i;
			for (j = 0; j < l; j++) buffer[j] = "ACGTN"[seq->seq[i + j]];
			err_fwrite(buffer, 1, l, stream);
		}
	} else {
		for (i = seq->full_len - 1; i >= 0; i -= bsz) {
			l = i + 1 > bsz ? bsz : i + 1;
			for (j = 0; j < l; j++) buffer[j] = "TGCAN"[seq->seq[i - j]];
			err_fwrite(buffer, 1, l, stream);
		}
	}
}

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
	int j;
	if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
		int seqid, nn, am = 0, flag = p->extra_flag;
		char XT;

		if (p->type == BWA_TYPE_NO_MATCH) {
			p->pos = mate->pos;
			p->strand = mate->strand;
			flag |= SAM_FSU;
			j = 1;
		} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment

		// get seqid
399
		nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
400 401
		if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
			flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
402 403 404 405 406 407 408 409

		// update flag and print it
		if (p->strand) flag |= SAM_FSR;
		if (mate) {
			if (mate->type != BWA_TYPE_NO_MATCH) {
				if (mate->strand) flag |= SAM_FMR;
			} else flag |= SAM_FMU;
		}
410 411
		err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
		err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
412 413 414 415

		// print CIGAR
		if (p->cigar) {
			for (j = 0; j != p->n_cigar; ++j)
416 417 418
				err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
		} else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*");
		else err_printf("%dM", p->len);
419 420 421

		// print mate coordinate
		if (mate && mate->type != BWA_TYPE_NO_MATCH) {
422
			int m_seqid;
423 424 425
			long long isize;
			am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
			// redundant calculation here, but should not matter too much
426
			bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
427
			err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
428 429
			isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
			if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
430 431 432
			err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
		} else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
		else err_printf("\t*\t0\t0\t");
433 434

		// print sequence and quality
435 436
		bwa_print_seq(stdout, p);
		err_putchar('\t');
437 438
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
439 440
			err_printf("%s", p->qual);
		} else err_printf("*");
441

442
		if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
443 444
		if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
445
		if (p->type != BWA_TYPE_NO_MATCH) {
446
			int i;
447 448 449 450
			// calculate XT tag
			XT = "NURM"[p->type];
			if (nn > 10) XT = 'N';
			// print tags
451 452 453
			err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
			if (nn) err_printf("\tXN:i:%d", nn);
			if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
454
			if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
455 456
				err_printf("\tX0:i:%d", p->c1);
				if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2);
457
			}
458 459
			err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
			if (p->md) err_printf("\tMD:Z:%s", p->md);
460 461
			// print multiple hits
			if (p->n_multi) {
462
				err_printf("\tXA:Z:");
463 464 465 466
				for (i = 0; i < p->n_multi; ++i) {
					bwt_multi1_t *q = p->multi + i;
					int k;
					j = pos_end_multi(q, p->len) - q->pos;
467
					nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
468
					err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
469 470 471
						   (int)(q->pos - bns->anns[seqid].offset + 1));
					if (q->cigar) {
						for (k = 0; k < q->n_cigar; ++k)
472 473 474
							err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
					} else err_printf("%dM", p->len);
					err_printf(",%d;", q->gap + q->mm);
475 476
				}
			}
477
		}
478
		err_putchar('\n');
479
	} else { // this read has no match
480
		//ubyte_t *s = p->strand? p->rseq : p->seq;
481 482
		int flag = p->extra_flag | SAM_FSU;
		if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
483
		err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
484 485 486 487
		//Why did this work differently to the version above??
		//for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
		bwa_print_seq(stdout, p);
		err_putchar('\t');
488 489
		if (p->qual) {
			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
490 491
			err_printf("%s", p->qual);
		} else err_printf("*");
492
		if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
493 494
		if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
		if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
495
		err_putchar('\n');
496 497 498
	}
}

499 500 501 502 503 504
void bwase_initialize() 
{
	int i;
	for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
}

505
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
506
{
507
	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
508 509 510 511 512
	int i, n_seqs, tot_seqs = 0, m_aln;
	bwt_aln1_t *aln = 0;
	bwa_seq_t *seqs;
	bwa_seqio_t *ks;
	clock_t t;
513
	bntseq_t *bns;
514 515
	FILE *fp_sa;
	gap_opt_t opt;
516
	char magic[4];
517 518

	// initialization
519
	bwase_initialize();
520 521 522 523 524
	bns = bns_restore(prefix);
	srand48(bns->seed);
	fp_sa = xopen(fn_sa, "r");

	m_aln = 0;
525 526 527 528 529 530
	err_fread_noeof(magic, 1, 4, fp_sa);
	if (strncmp(magic, SAI_MAGIC, 4) != 0) {
		fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
		exit(1);
	}
	err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
531
	bwa_print_sam_hdr(bns, rg_line);
532 533 534
	// set ks
	ks = bwa_open_reads(opt.mode, fn_fa);
	// core loop
535
	while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
536 537 538 539 540 541 542
		tot_seqs += n_seqs;
		t = clock();

		// read alignment
		for (i = 0; i < n_seqs; ++i) {
			bwa_seq_t *p = seqs + i;
			int n_aln;
543
			err_fread_noeof(&n_aln, 4, 1, fp_sa);
544 545 546 547
			if (n_aln > m_aln) {
				m_aln = n_aln;
				aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
			}
548
			err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
549
			bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
550 551 552
		}

		fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
553
		bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
554 555 556
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

		fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
557
		bwa_refine_gapped(bns, n_seqs, seqs, 0);
558 559 560 561 562
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();

		fprintf(stderr, "[bwa_aln_core] print alignments... ");
		for (i = 0; i < n_seqs; ++i)
			bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
563
		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
564 565 566 567 568 569 570 571

		bwa_free_read_seq(n_seqs, seqs);
		fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
	}

	// destroy
	bwa_seq_close(ks);
	bns_destroy(bns);
572
	err_fclose(fp_sa);
573 574 575 576 577
	free(aln);
}

int bwa_sai2sam_se(int argc, char *argv[])
{
578
	int c, n_occ = 3;
579
	char *prefix, *rg_line = 0;
580
	while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
581 582
		switch (c) {
		case 'h': break;
583
		case 'r':
584
			if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
585
			break;
586
		case 'n': n_occ = atoi(optarg); break;
587
		case 'f': xreopen(optarg, "w", stdout); break;
588 589 590 591 592
		default: return 1;
		}
	}

	if (optind + 3 > argc) {
593
		fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
594 595
		return 1;
	}
596
	if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
597
		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
598
		return 1;
599
	}
600
	bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
601
	free(prefix);
602 603
	return 0;
}