Commit e12ce072 authored by Charles Plessy's avatar Charles Plessy

Merge commit 'upstream/0.5.4'

parents 4a6b7e2f 0dd4ceab
------------------------------------------------------------------------
r1244 | lh3 | 2009-10-09 10:53:52 +0100 (Fri, 09 Oct 2009) | 5 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/bwaseqio.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/main.c
M /branches/prog/bwa/stdaln.c
* bwa-0.5.3-4 (r1244)
* output the clipped length in XC:i: tag
* skip mate alignment when stdaln is buggy
* fixed a bug in NM:i: tag
------------------------------------------------------------------------
r1243 | lh3 | 2009-10-07 13:15:04 +0100 (Wed, 07 Oct 2009) | 3 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/main.c
* bwa-0.5.3-3 (r1243)
* sampe: fixed a bug when a read sequence is identical its reverse complement.
------------------------------------------------------------------------
r1242 | lh3 | 2009-10-07 12:49:13 +0100 (Wed, 07 Oct 2009) | 4 lines
Changed paths:
M /branches/prog/bwa/bntseq.c
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwtaln.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/main.c
* bwa-0.5.3-2 (r1242)
* sampe: optionall preload the full index into memory
* aln: change the default seed length to 32bp
------------------------------------------------------------------------
r1238 | lh3 | 2009-09-26 23:38:15 +0100 (Sat, 26 Sep 2009) | 2 lines
Changed paths:
M /branches/prog/bwa/khash.h
Improve portability of khash.h
------------------------------------------------------------------------
r1228 | lh3 | 2009-09-15 14:20:22 +0100 (Tue, 15 Sep 2009) | 2 lines
Changed paths:
M /branches/prog/bwa/main.c
fixed a typo
------------------------------------------------------------------------
r1227 | lh3 | 2009-09-15 14:19:35 +0100 (Tue, 15 Sep 2009) | 3 lines
Changed paths:
M /branches/prog/bwa/bwtsw2.h
M /branches/prog/bwa/bwtsw2_aux.c
M /branches/prog/bwa/bwtsw2_main.c
M /branches/prog/bwa/main.c
* bwa-0.5.3-1 (r1226)
* in dBWT-SW, optionall use hard clipping instead of soft clipping
------------------------------------------------------------------------
r1225 | lh3 | 2009-09-15 13:32:30 +0100 (Tue, 15 Sep 2009) | 2 lines
Changed paths:
M /branches/prog/bwa/NEWS
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/main.c
Release bwa-0.5.3 (r1225)
------------------------------------------------------------------------
r1223 | lh3 | 2009-09-13 12:30:41 +0100 (Sun, 13 Sep 2009) | 2 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
M /branches/prog/bwa/NEWS
M /branches/prog/bwa/bwa.1
M /branches/prog/bwa/main.c
Release bwa-0.5.2
------------------------------------------------------------------------
r1222 | lh3 | 2009-09-11 14:11:39 +0100 (Fri, 11 Sep 2009) | 3 lines
Changed paths:
......
Beta Release 0.5.4 (9 October, 2009)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since this version, the default seed length used in the "aln" command is
changed to 32.
Notable changes in bwa-short:
* Added a new tag "XC:i" which gives the length of clipped reads.
* In sampe, skip alignments in case of a bug in the Smith-Waterman
alignment module.
* In sampe, fixed a bug in pairing when the read sequence is identical
to its reverse complement.
* In sampe, optionally preload the entire FM-index into memory to
reduce disk operations.
Notable changes in dBWT-SW/BWA-SW:
* Changed name dBWT-SW to BWA-SW.
* Optionally use "hard clipping" in the SAM output.
(0.5.4: 9 October 2009, r1245)
Beta Release 0.5.3 (15 September, 2009)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
......@@ -90,12 +90,14 @@ bntseq_t *bns_restore(const char *prefix)
char str[1024];
FILE *fp;
bntseq_t *bns;
long long xx;
int i;
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
{ // read .ann
strcpy(str, prefix); strcat(str, ".ann");
fp = xopen(str, "r");
fscanf(fp, "%lld%d%u", &bns->l_pac, &bns->n_seqs, &bns->seed);
fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
bns->l_pac = xx;
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
for (i = 0; i < bns->n_seqs; ++i) {
bntann1_t *p = bns->anns + i;
......@@ -110,7 +112,8 @@ bntseq_t *bns_restore(const char *prefix)
if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
else p->anno = strdup("");
// read the rest
fscanf(fp, "%lld%d%d", &p->offset, &p->len, &p->n_ambs);
fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
p->offset = xx;
}
fclose(fp);
}
......@@ -119,12 +122,14 @@ bntseq_t *bns_restore(const char *prefix)
int32_t n_seqs;
strcpy(str, prefix); strcat(str, ".amb");
fp = xopen(str, "r");
fscanf(fp, "%lld%d%d", &l_pac, &n_seqs, &bns->n_holes);
fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
l_pac = xx;
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
for (i = 0; i < bns->n_holes; ++i) {
bntamb1_t *p = bns->ambs + i;
fscanf(fp, "%lld%d%s", &p->offset, &p->len, str);
fscanf(fp, "%lld%d%s", &xx, &p->len, str);
p->offset = xx;
p->amb = str[0];
}
fclose(fp);
......
.TH bwa 1 "13 September 2009" "bwa-0.5.2" "Bioinformatics tools"
.TH bwa 1 "9 October 2009" "bwa-0.5.4" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
......@@ -23,10 +23,10 @@ Burrows-Wheeler Transform (BWT). The first algorithm is designed for
short queries up to ~200bp with low error rate (<3%). It does gapped
global alignment w.r.t. queries, supports paired-end reads, and is one
of the fastest short read alignment algorithms to date while also
visiting suboptimal hits. The second algorithm, dBWT-SW, is designed for
visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
long reads with more errors. It performs heuristic Smith-Waterman-like
alignment to find high-scoring local hits (and thus chimera). On
low-error short queries, dBWT-SW is slower and less accurate than the
low-error short queries, BWA-SW is slower and less accurate than the
first algorithm, but on long queries, it is better.
.PP
For both algorithms, the database file in the FASTA format must be
......@@ -178,7 +178,7 @@ hits. [-1]
.TP
.B sampe
bwa sampe [-a maxInsSize] [-o maxOcc] <in.db.fasta> <in1.sai> <in2.sai>
bwa sampe [-a maxInsSize] [-o maxOcc] [-P] <in.db.fasta> <in1.sai> <in2.sai>
<in1.fq> <in2.fq> > <out.sam>
Generate alignments in the SAM format given paired-end reads. Repetitive
......@@ -196,6 +196,11 @@ enough good alignment to infer the distribution of insert sizes. [500]
Maximum occurrences of a read for pairing. A read with more occurrneces
will be treated as a single-end read. Reducing this parameter helps
faster pairing. [100000]
.TP
.B -P
Load the entire FM-index into memory to reduce disk operations
(base-space reads only). With this option, at least 1.25N bytes of
memory are required, where N is the length of the genome.
.RE
.TP
......@@ -413,20 +418,20 @@ usually much faster.
.PP
Command
.B `dbwtsw'
is designed for long-read alignment. The algorithm behind, dBWT-SW, is
is designed for long-read alignment. The algorithm behind, BWA-SW, is
similar to BWT-SW, but does not guarantee to find all local hits due to
the heuristic acceleration. It tends to be faster and more accurate if
the resultant alignment is supported by more seeds, and therefore
dBWT-SW usually performs better on long queries than on short ones.
BWA-SW usually performs better on long queries than on short ones.
On 350-1000bp reads, dBWT-SW is several to tens of times faster than the
On 350-1000bp reads, BWA-SW is several to tens of times faster than the
existing programs. Its accuracy is comparable to SSAHA2, more accurate
than BLAT. Like BLAT, dBWT-SW also finds chimera which may pose a
than BLAT. Like BLAT, BWA-SW also finds chimera which may pose a
challenge to SSAHA2. On 10-100kbp queries where chimera detection is
important, dBWT-SW is over 10X faster than BLAT while being more
important, BWA-SW is over 10X faster than BLAT while being more
sensitive.
DBWT-SW can also be used to align ~100bp reads, but it is slower than
BWA-SW can also be used to align ~100bp reads, but it is slower than
the short-read algorithm. Its sensitivity and accuracy is lower than
SSAHA2 especially when the sequencing error rate is above 2%. This is
the trade-off of the 30X speed up in comparison to SSAHA2's -454 mode.
......@@ -458,11 +463,11 @@ Burrows-Wheeler transform. Bioinformatics, 25, 1754-60.
.SH HISTORY
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
and mimics its binary file formats; dBWT-SW resembles BWT-SW in several
and mimics its binary file formats; BWA-SW resembles BWT-SW in several
ways. The initial idea about BWT-based alignment also came from the
group who developed BWT-SW. At the same time, BWA is different enough
from BWT-SW. The short-read alignment algorithm bears no similarity to
Smith-Waterman algorithm any more. While dBWT-SW learns from BWT-SW, it
Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it
introduces heuristics that can hardly be applied to the original
algorithm. In all, BWA does not guarantee to find all local hits as what
BWT-SW is designed to do, but it is much faster than BWT-SW on both
......@@ -478,5 +483,5 @@ BWT-based short read aligner, bowtie, was first released in August
2008. At the time of writing this manual, at least three more BWT-based
short-read aligners are being implemented.
The dBWT-SW algorithm is a new component of BWA. It was conceived in
The BWA-SW algorithm is a new component of BWA. It was conceived in
November 2008 and implemented ten months later.
This diff is collapsed.
......@@ -140,9 +140,9 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
} while (0)
#define __pairing_aux2(q, w) do { \
const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \
(q)->extra_flag |= SAM_FPP; \
if ((q)->pos != (w)>>32) { \
const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \
if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \
(q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \
(q)->score = r->score; (q)->mapQ = mapQ_p; \
(q)->pos = (w)>>32; \
......@@ -229,7 +229,7 @@ typedef struct {
kvec_t(bwt_aln1_t) aln;
} aln_buf_t;
int bwa_cal_pac_pos_pe(const char *prefix, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
{
int i, j, cnt_chg = 0;
......@@ -242,11 +242,12 @@ int bwa_cal_pac_pos_pe(const char *prefix, int n_seqs, bwa_seq_t *seqs[2], FILE
buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
// load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
if (_bwt[0] == 0) { // load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
} else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
// SE
for (i = 0; i != n_seqs; ++i) {
......@@ -334,7 +335,9 @@ int bwa_cal_pac_pos_pe(const char *prefix, int n_seqs, bwa_seq_t *seqs[2], FILE
kv_destroy(buf[1][i].aln);
}
free(buf[0]); free(buf[1]);
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
if (_bwt[0] == 0) {
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
}
kv_destroy(d->arr);
kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
......@@ -351,7 +354,7 @@ uint16_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyt
uint16_t *cigar = 0;
ubyte_t *ref_seq;
bwtint_t k, x, y, l;
int path_len;
int path_len, ret;
AlnParam ap = aln_param_bwa;
path_t *path, *p;
......@@ -368,7 +371,11 @@ uint16_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyt
path = (path_t*)calloc(l+len, sizeof(path_t));
// do alignment
aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0);
ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0);
if (ret < 0) {
free(path); free(cigar); *n_cigar = 0;
return 0;
}
cigar = aln_path2cigar(path, path_len, n_cigar);
// check whether the alignment is good enough
......@@ -426,16 +433,18 @@ uint16_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyt
return cigar;
}
ubyte_t *bwa_paired_sw(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
{
ubyte_t *pacseq;
int i;
uint64_t x, n;
// load reference sequence
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
if (_pacseq == 0) {
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
} else pacseq = (ubyte_t*)_pacseq;
if (!popt->is_sw || ii->avg < 0.0) return pacseq;
// perform mate alignment
......@@ -530,8 +539,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
gap_opt_t opt;
khint_t iter;
isize_info_t last_ii; // this is for the last batch of reads
char str[1024];
bwt_t *bwt[2];
uint8_t *pac;
// initialization
pac = 0; bwt[0] = bwt[1] = 0;
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
bns = bns_restore(prefix);
srand48(bns->seed);
......@@ -542,13 +555,24 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
g_hash = kh_init(64);
last_ii.avg = -1.0;
// core loop
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]);
if (!(opt.mode & BWA_MODE_COMPREAD)) {
popt->type = BWA_PET_SOLID;
ntbns = bwa_open_nt(prefix);
} else { // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
}
}
// core loop
bwa_print_sam_SQ(bns);
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
int cnt_chg;
......@@ -560,19 +584,19 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
t = clock();
fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
cnt_chg = bwa_cal_pac_pos_pe(prefix, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_sai2sam_pe_core] change of coordinates in %d alignments.\n", cnt_chg);
fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
pacseq = bwa_paired_sw(bns, n_seqs, seqs, popt, &ii);
pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
for (j = 0; j < 2; ++j)
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
free(pacseq);
if (pac == 0) free(pacseq);
fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
for (i = 0; i < n_seqs; ++i) {
......@@ -597,6 +621,9 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
kh_destroy(64, g_hash);
if (pac) {
free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
}
}
int bwa_sai2sam_pe(int argc, char *argv[])
......@@ -604,11 +631,12 @@ int bwa_sai2sam_pe(int argc, char *argv[])
int c;
pe_opt_t *popt;
popt = bwa_init_pe_opt();
while ((c = getopt(argc, argv, "a:o:s")) >= 0) {
while ((c = getopt(argc, argv, "a:o:sP")) >= 0) {
switch (c) {
case 'a': popt->max_isize = atoi(optarg); break;
case 'o': popt->max_occ = atoi(optarg); break;
case 's': popt->is_sw = 0; break;
case 'P': popt->is_preload = 1; break;
default: return 1;
}
}
......@@ -618,6 +646,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n\n");
fprintf(stderr, "Notes: 1. For SOLiD read, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
......
......@@ -160,7 +160,8 @@ char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, bwtint_t pos, ubyte_t *
}
x += l; y += l;
} else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) {
y += l; nm += l;
y += l;
if (cigar[k]>>14 == FROM_I) nm += l;
} else if (cigar[k]>>14 == FROM_D) {
ksprintf(str, "%d", u);
kputc('^', str);
......@@ -357,6 +358,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
printf("%s", p->qual);
} else printf("*");
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
if (p->type != BWA_TYPE_NO_MATCH) {
// calculate XT tag
XT = "NURM"[p->type];
......@@ -384,6 +386,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
printf("%s", p->qual);
} else printf("*");
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
putchar('\n');
}
}
......
......@@ -59,7 +59,7 @@ int bwa_trim_read(int trim_qual, bwa_seq_t *p)
max = s; max_l = l;
}
}
p->len = max_l + 1;
p->clip_len = p->len = max_l + 1;
return p->full_len - p->len;
}
......@@ -76,7 +76,7 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int
p = &seqs[n_seqs++];
p->tid = -1; // no assigned to a thread
p->qual = 0;
p->full_len = p->len = l;
p->full_len = p->clip_len = p->len = l;
n_tot += p->full_len;
p->seq = (ubyte_t*)calloc(p->len, 1);
for (i = 0; i != p->full_len; ++i)
......
......@@ -25,7 +25,7 @@ gap_opt_t *gap_init_opt()
o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
o->seed_len = 0x7fffffff; o->max_seed_diff = 2;
o->seed_len = 32; o->max_seed_diff = 2;
o->fnr = 0.04;
o->n_threads = 1;
o->max_top2 = 30;
......
......@@ -43,6 +43,7 @@ typedef struct {
uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
int score;
int clip_len;
// alignments in SA coordinates
int n_aln;
bwt_aln1_t *aln;
......@@ -81,7 +82,7 @@ typedef struct {
typedef struct {
int max_isize;
int max_occ;
int type, is_sw;
int type, is_sw, is_preload;
} pe_opt_t;
struct __bwa_seqio_t;
......
......@@ -8,7 +8,7 @@
typedef struct {
int a, b, q, r, t, qr, bw;
int z, is, t_seeds;
int z, is, t_seeds, hard_clip;
float yita, mask_level, coef;
int n_threads, chunk_size;
} bsw2opt_t;
......
......@@ -47,7 +47,7 @@ bsw2opt_t *bsw2_init_opt()
bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t));
o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30;
o->bw = 50;
o->z = 1; o->is = 3; o->t_seeds = 5;
o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0;
o->mask_level = 0.50f; o->yita = 5.5f; o->coef = 5.5f;
o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
return o;
......@@ -392,6 +392,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
bsw2hit_t *p = b->hits + i;
int32_t seqid = -1, coor = -1;
int j, qual, nn = 0;
int beg, end;
if (p->l == 0) {
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
nn = bns_coor_pac2real(bns, p->k, p->len, &seqid);
......@@ -411,16 +412,21 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
}
ksprintf(&str, "\t%d\t", qual);
for (k = 0; k < b->n_cigar[i]; ++k)
ksprintf(&str, "%d%c", b->cigar[i][k]>>4, "MIDNSHP"[b->cigar[i][k]&0xf]);
ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]);
} else ksprintf(&str, "\t0\t*");
ksprintf(&str, "\t*\t0\t0\t");
for (j = 0; j < ks->l; ++j) {
beg = 0; end = ks->l;
if (opt->hard_clip) {
if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4;
if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4;
}
for (j = beg; j < end; ++j) {
if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
else kputc(ks->seq[j], &str);
}
if (ks->qual) {
kputc('\t', &str);
for (j = 0; j < ks->l; ++j) {
for (j = beg; j < end; ++j) {
if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
else kputc(ks->qual[j], &str);
}
......
......@@ -16,7 +16,7 @@ int bwa_bwtsw2(int argc, char *argv[])
opt = bsw2_init_opt();
srand48(11);
while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:")) >= 0) {
while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:H")) >= 0) {
switch (c) {
case 'q': opt->q = atoi(optarg); break;
case 'r': opt->r = atoi(optarg); break;
......@@ -31,6 +31,7 @@ int bwa_bwtsw2(int argc, char *argv[])
case 'm': opt->mask_level = atof(optarg); break;
case 'c': opt->coef = atof(optarg); break;
case 'N': opt->t_seeds = atoi(optarg); break;
case 'H': opt->hard_clip = 1; break;
}
}
opt->qr = opt->q + opt->r;
......@@ -55,6 +56,7 @@ int bwa_bwtsw2(int argc, char *argv[])
fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds);
fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n");
fprintf(stderr, "\n");
{
......
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
Copyright (c) 2008, 2009 by attractor <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
......@@ -47,6 +47,10 @@ int main() {
*/
/*
2009-09-26 (0.2.4):
* Improve portability
2008-09-19 (0.2.3):
* Corrected the example
......@@ -86,17 +90,35 @@ int main() {
@copyright Heng Li
*/
#define AC_VERSION_KHASH_H "0.2.2"
#define AC_VERSION_KHASH_H "0.2.4"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
/* compipler specific configuration */
#if UINT_MAX == 0xffffffffu
typedef unsigned int khint32_t;
#elif ULONG_MAX == 0xffffffffu
typedef unsigned long khint32_t;
#endif
#if ULONG_MAX == ULLONG_MAX
typedef unsigned long khint64_t;
#else
typedef unsigned long long khint64_t;
#endif
#ifdef _MSC_VER
#define inline __inline
#endif
typedef uint32_t khint_t;
typedef khint32_t khint_t;
typedef khint_t khiter_t;
#define __ac_HASH_PRIME_SIZE 32
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
0ul, 3ul, 11ul, 23ul, 53ul,
97ul, 193ul, 389ul, 769ul, 1543ul,
......@@ -120,7 +142,7 @@ static const double __ac_HASH_UPPER = 0.77;
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
uint32_t *flags; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t; \
......@@ -138,7 +160,7 @@ static const double __ac_HASH_UPPER = 0.77;
static inline void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
......@@ -158,7 +180,7 @@ static const double __ac_HASH_UPPER = 0.77;
} \
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ \
uint32_t *new_flags = 0; \
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
......@@ -166,8 +188,8 @@ static const double __ac_HASH_UPPER = 0.77;
new_n_buckets = __ac_prime_list[t+1]; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
else { \
new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { \
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \
......@@ -266,20 +288,20 @@ static const double __ac_HASH_UPPER = 0.77;
/*! @function
@abstract Integer hash function
@param key The integer [uint32_t]
@param key The integer [khint32_t]
@return The hash value [khint_t]
*/
#define kh_int_hash_func(key) (uint32_t)(key)
#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
@abstract Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract 64-bit integer hash function
@param key The integer [uint64_t]
@param key The integer [khint64_t]
@return The hash value [khint_t]
*/
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
@abstract 64-bit integer comparison function
*/
......@@ -440,7 +462,7 @@ static inline khint_t __ac_X31_hash_string(const char *s)
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
......@@ -448,14 +470,14 @@ static inline khint_t __ac_X31_hash_string(const char *s)
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
......@@ -463,7 +485,7 @@ static inline khint_t __ac_X31_hash_string(const char *s)
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
......
......@@ -3,7 +3,7 @@
#include "main.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.5.3 (r1225)"
#define PACKAGE_VERSION "0.5.4 (r1245)"
#endif
static int usage()
......@@ -17,7 +17,7 @@ static int usage()
fprintf(stderr, " aln gapped/ungapped alignment\n");
fprintf(stderr, " samse generate alignment (single ended)\n");
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " dbwtsw dBWT-SW for long queries\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, "\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");