Commit e513f4df authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 0.7.8

parent b93d3357
CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -O2
CFLAGS= -g -Wall -O2 -Wno-unused-function
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
......
Release 0.7.8 (31 March, 2014)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes in BWA-MEM:
* Bugfix: off-diagonal X-dropoff (option -d) not working as intended.
Short-read alignment is not affected.
* Bugfix: unnecessarily large bandwidth used during global alignment,
which reduces the mapping speed by ~5% for short reads. Results are not
affected.
* Bugfix: when the matching score is not one, paired-end mapping quality is
inaccurate.
* When the matching score (option -A) is changed, scale all score-related
options accordingly unless overridden by users.
* Allow to specify different gap open (or extension) penalties for deletions
and insertions separately.
* Allow to specify the insert size distribution.
* Better and more detailed debugging information.
With the default setting, 0.7.8 and 0.7.7 gave identical output on one million
100bp read pairs.
(0.7.8: 31 March 2014, r455)
Release 0.7.7 (25 Feburary, 2014)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
.TH bwa 1 "25 Feburary 2014" "bwa-0.7.7" "Bioinformatics tools"
.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
......@@ -148,9 +148,10 @@ not work with split alignments. One may consider to use option
.B -M
to flag shorter split hits as secondary.
.B OPTIONS:
.RS
.TP 10
.B ALGORITHM OPTIONS:
.TP
.BI -t \ INT
Number of threads [1]
.TP
......@@ -202,11 +203,14 @@ Matching score. [1]
.BI -B \ INT
Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
.TP
.BI -O \ INT
Gap open penalty. [6]
.BI -O \ INT[,INT]
Gap open penalty. If two numbers are specified, the first is the penalty of
openning a deletion and the second for openning an insertion. [6]
.TP
.BI -E \ INT
Gap extension penalty. A gap of length k costs O + k*E (i.e.
.BI -E \ INT[,INT]
Gap extension penalty. If two numbers are specified, the first is the penalty
of extending a deletion and second for extending an insertion. A gap of length
k costs O + k*E (i.e.
.B -O
is for opening a zero-length gap). [1]
.TP
......@@ -224,6 +228,9 @@ Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these
two scores to determine whether we should force pairing. A larger value leads to
more aggressive read pair. [17]
.TP
.B INPUT/OUTPUT OPTIONS:
.TP
.B -p
Assume the first input query file is interleaved paired-end FASTA/Q. See the
......@@ -260,6 +267,13 @@ supported throughout BWA. Ideally, a value 0 for disabling all the output to
stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
all normal messages; 4 or higher for debugging. When this option takes value
4, the output is not SAM. [3]
.TP
.BI -I \ FLOAT[,FLOAT[,INT[,INT]]]
Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma
from the mean if absent) and min (4 sigma if absent) of the insert size
distribution. Only applicable to the FR orientation. By default, BWA-MEM infers
these numbers and the pair orientations given enough reads. [inferred]
.RE
.TP
......
......@@ -86,7 +86,7 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25])
}
// Generate CIGAR when the alignment end points are known
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
uint32_t *cigar = 0;
uint8_t tmp, *rseq;
......@@ -106,24 +106,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
}
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
// FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
cigar = malloc(4);
cigar[0] = l_query<<4 | 0;
*n_cigar = 1;
for (i = 0, *score = 0; i < l_query; ++i)
*score += mat[rseq[i]*5 + query[i]];
} else {
int w, max_gap, min_w;
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
int w, max_gap, max_ins, max_del, min_w;
// set the band-width
max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
max_gap = max_ins > max_del? max_ins : max_del;
max_gap = max_gap > 1? max_gap : 1;
w = (max_gap + abs(rlen - l_query) + 1) >> 1;
w = w < w_? w : w_;
min_w = abs(rlen - l_query) + 3;
w = w > min_w? w : min_w;
// NW alignment
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
if (bwa_verbose >= 4) {
printf("* Global bandwidth: %d\n", w);
printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
}
*score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
}
{// compute NM and MD
int k, x, y, u, n_mm = 0, n_gap = 0;
......@@ -165,7 +171,12 @@ ret_gen_cigar:
return cigar;
}
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM);
}
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int is_rev;
int64_t cb, ce, fm;
......@@ -184,7 +195,7 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns,
int64_t x;
cb = cb > *rb? cb : *rb;
ce = ce < *re? ce : *re;
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
int op = cigar[i]&0xf, len = cigar[i]>>4;
if (op == 0) {
......@@ -210,6 +221,11 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns,
return (*qb == *qe || *rb == *re)? -2 : 0;
}
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re);
}
/*********************
* Full index reader *
*********************/
......
......@@ -32,7 +32,9 @@ extern "C" {
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
......
This diff is collapsed.
......@@ -16,9 +16,12 @@ typedef struct __smem_i smem_i;
#define MEM_F_ALL 0x8
#define MEM_F_NO_MULTI 0x10
#define MEM_F_NO_RESCUE 0x20
#define MEM_F_NO_EXACT 0x40
typedef struct {
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
int a, b; // match score and mismatch penalty
int o_del, e_del;
int o_ins, e_ins;
int pen_unpaired; // phred-scaled penalty for unpaired reads
int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
int w; // band width
......
......@@ -144,8 +144,8 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me
if (len == re - rb) { // no funny things happening
kswr_t aln;
mem_alnreg_t b;
int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len;
aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
memset(&b, 0, sizeof(mem_alnreg_t));
if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
......@@ -219,7 +219,9 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
y[v.a[i].y&3] = i;
}
if (u.n) { // found at least one proper pair
int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
int tmp = opt->a + opt->b;
tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del;
tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins;
ks_introsort_128(u.n, u.a);
i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
......@@ -233,6 +235,8 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
return ret;
}
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
......@@ -274,7 +278,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un? subo : score_un;
q_pe = (o - subo) * 6;
q_pe = raw_mapq(o - subo, opt->a);
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
if (q_pe < 0) q_pe = 0;
if (q_pe > 60) q_pe = 60;
......@@ -291,8 +295,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
extra_flag |= 2;
// cap at the tandem repeat score
q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6;
q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6;
q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
} else { // the unpaired alignment is preferred
z[0] = z[1] = 0;
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
......
......@@ -96,14 +96,14 @@ extern "C" {
void bwt_bwtupdate_core(bwt_t *bwt);
inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
void bwt_gen_cnt_table(bwt_t *bwt);
inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
......
......@@ -56,7 +56,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
}
return b;
}
inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
{
uint32_t n, b;
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
......
......@@ -17,9 +17,9 @@ extern "C" {
#endif
bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
void bwtl_destroy(bwtl_t *bwt);
#ifdef __cplusplus
......
This diff is collapsed.
This diff is collapsed.
......@@ -61,6 +61,7 @@ extern "C" {
* query profile will be deallocated in ksw_align().
*/
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry);
/**
* Banded global alignment
......@@ -80,6 +81,7 @@ extern "C" {
* @return score of the alignment
*/
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
/**
* Extend alignment
......@@ -103,6 +105,7 @@ extern "C" {
* @return best semi-local alignment score
*/
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
#ifdef __cplusplus
}
......
......@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.7-r441"
#define PACKAGE_VERSION "0.7.8-r455"
#endif
int bwa_fa2pac(int argc, char *argv[]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment