Commit 11ce18b5 authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 0.7.13

parent 41430b90
*.[oa]
bwa
test
test64
.*.swp
Makefile.bak
bwamem-lite
language: c
compiler:
- gcc
- clang
script: make
......@@ -4,9 +4,10 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwape.o kopen.o pemerge.o maxk.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
......@@ -45,7 +46,8 @@ depend:
QSufSort.o: QSufSort.h
bamlite.o: bamlite.h malloc_wrap.h
bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kvec.h
bwa.o: kseq.h
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h utils.h kbtree.h
bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
......@@ -62,7 +64,7 @@ bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h
bwtindex.o: bntseq.h bwa.h bwt.h utils.h rle.h rope.h malloc_wrap.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
......@@ -79,5 +81,8 @@ kstring.o: kstring.h malloc_wrap.h
ksw.o: ksw.h malloc_wrap.h
main.o: kstring.h malloc_wrap.h utils.h
malloc_wrap.o: malloc_wrap.h
maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
rle.o: rle.h
rope.o: rle.h rope.h
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
Release 0.7.13 (23 February 2016)
---------------------------------
This release fixes a few minor bugs in the previous version and adds a few
minor features. All BWA algorithms should produce identical output to 0.7.12
when there are no ALT contigs.
Detailed changes:
* Fixed a bug in "bwa-postalt.js". The old version may produce 0.5% of wrong
bases for reads mapped to the ALT contigs.
* Fixed a potential bug in the multithreading mode. It may occur when mapping
is much faster than file reading, which should almost never happen in
practice.
* Changed the download URL of GRCh38.
* Removed the read overlap mode. It is not working well.
* Added the ropebwt2 algorithm as an alternative to index large genomes.
Ropebwt2 is slower than the "bwtsw" algorithm, but it has a permissive
license. This allows us to create an Apache2-licensed BWA (in the "Apache2"
branch) for commercial users who are concerned with GPL.
(0.7.13: 23 February 2016, r1126)
Release 0.7.12 (28 December 2014)
---------------------------------
......
......@@ -2,7 +2,7 @@
```sh
# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
| gzip -dc | tar xf -
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
......
......@@ -60,11 +60,12 @@ Index database sequences in the FASTA format.
Prefix of the output database [same as db filename]
.TP
.BI -a \ STR
Algorithm for constructing BWT index. BWA implements two algorithms for BWT
Algorithm for constructing BWT index. BWA implements three algorithms for BWT
construction:
.B is
.BR is ,
.B bwtsw
and
.BR bwtsw .
.BR rb2 .
The first algorithm is a little faster for small databases but requires large
RAM and does not work for databases with total length longer than 2GB. The
second algorithm is adapted from the BWT-SW source code. It in theory works
......
......@@ -12,6 +12,11 @@
#define BWA_CTL_SIZE 0x10000
// BWT construction algorithms, passed as algo_type to bwa_idx_build()
#define BWTALGO_AUTO 0 // no explicit choice; the builder picks an algorithm from the packed sequence length
#define BWTALGO_RB2 1 // selected by "-a rb2" (ropebwt2)
#define BWTALGO_BWTSW 2 // selected by "-a bwtsw"
#define BWTALGO_IS 3 // selected by "-a is"
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
......@@ -41,6 +46,8 @@ extern "C" {
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
......
......@@ -18,7 +18,7 @@ how to use bwakit:
```sh
# Download the bwa-0.7.11 binary package (download link may change)
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
| gzip -dc | tar xf -
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
......
......@@ -78,7 +78,7 @@ Bytes.prototype.revcomp = function()
this[this.length - i - 1] = Bytes.rctab[this[i]];
this[i] = Bytes.rctab[tmp];
}
if (this.length>>1)
if (this.length&1)
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
}
......
......@@ -2,7 +2,7 @@
root=`dirname $0`
url38="ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
if [ $# -eq 0 ]; then
......
......@@ -114,7 +114,7 @@ static void smem_aux_destroy(smem_aux_t *a)
static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)
{
int i, k, x = 0, old_n;
int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1;
int start_width = 1;
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
a->mem.n = 0;
// first pass: find all SMEMs
......@@ -488,13 +488,6 @@ int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_
return m;
}
/*
 * Remove the first alignment region when its true score equals qlen * opt->a,
 * i.e. the score of a full-length exact match. Only active when
 * MEM_F_SELF_OVLP is set in opt->flag.
 *
 * opt   alignment options (flag and match score a are read)
 * n     number of regions in a[]
 * a     array of regions; shifted down by one in place when a[0] is removed
 * qlen  query length
 *
 * Returns the number of regions remaining in a[].
 */
int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen)
{
	int self_ovlp = (opt->flag & MEM_F_SELF_OVLP) != 0;

	/* a[0] is only inspected when the mode is on and the array is non-empty */
	if (self_ovlp && n != 0 && a->truesc == qlen * opt->a) {
		memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t));
		return n - 1;
	}
	return n;
}
typedef kvec_t(int) int_v;
static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
......@@ -1046,8 +1039,6 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
}
free(chn.a);
regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a);
if (opt->flag & MEM_F_SELF_OVLP)
regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq);
if (bwa_verbose >= 4) {
err_printf("* %ld chains remain after removing duplicated chains\n", regs.n);
for (i = 0; i < regs.n; ++i) {
......@@ -1168,12 +1159,8 @@ static void worker2(void *data, int i, int tid)
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
if (w->opt->flag & MEM_F_ALN_REG) {
mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
} else {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
}
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
free(w->regs[i].a);
} else {
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
......
......@@ -16,8 +16,6 @@ typedef struct __smem_i smem_i;
#define MEM_F_ALL 0x8
#define MEM_F_NO_MULTI 0x10
#define MEM_F_NO_RESCUE 0x20
#define MEM_F_SELF_OVLP 0x40
#define MEM_F_ALN_REG 0x80
#define MEM_F_REF_HDR 0x100
#define MEM_F_SOFTCLIP 0x200
#define MEM_F_SMARTPE 0x400
......
......@@ -87,31 +87,6 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
return ar;
}
/*
 * Format the alignment regions of one read as tab-separated text and store
 * the resulting buffer in s->sam, one line per region.
 *
 * Columns, in order: read name, read length, query start, query end,
 * reference name, reference length, reference start, reference start plus
 * (re - rb), and a ratio printed with three decimals. The ratio is
 * truesc / opt->a divided by the larger of (qe - qb) and (re - rb).
 * The query start and end are exchanged when bns_depos() sets is_rev.
 *
 * opt  alignment options (only the match score a is read)
 * bns  reference annotations
 * pac  unused here
 * s    read; its name and l_seq are read, and sam is overwritten
 * a    alignment regions to report
 */
void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a)
{
	kstring_t out = {0,0,0};
	int k;

	for (k = 0; k < a->n; ++k) {
		const mem_alnreg_t *reg = &a->a[k];
		int64_t rb = reg->rb, re = reg->re;
		int64_t ref_span = re - rb, pos, longest;
		int qb = reg->qb, qe = reg->qe;
		int is_rev, rid;

		/* translate the region into a reference id and an offset within it */
		pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
		rid = bns_pos2rid(bns, pos);
		assert(rid == reg->rid);
		pos -= bns->anns[rid].offset;

		if (is_rev) { /* exchange the query coordinates */
			int tmp = qb;
			qb = qe;
			qe = tmp;
		}
		/* evaluated after the exchange, exactly as the printed columns are */
		longest = qe - qb > ref_span? qe - qb : ref_span;

		kputs(s->name, &out);
		kputc('\t', &out);
		kputw(s->l_seq, &out);
		kputc('\t', &out);
		kputw(qb, &out);
		kputc('\t', &out);
		kputw(qe, &out);
		kputc('\t', &out);
		kputs(bns->anns[rid].name, &out);
		kputc('\t', &out);
		kputw(bns->anns[rid].len, &out);
		kputc('\t', &out);
		kputw(pos, &out);
		kputc('\t', &out);
		kputw(pos + ref_span, &out);
		kputc('\t', &out);
		ksprintf(&out, "%.3f", (double)reg->truesc / opt->a / longest);
		kputc('\n', &out);
	}
	s->sam = out.s;
}
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
{
int k = a[i].secondary_all;
......
......@@ -379,7 +379,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
bwt_multi1_t *q = p[j]->multi + k;
q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand);
q->strand = strand;
if (q->pos != p[j]->pos)
if (q->pos != p[j]->pos && q->pos != (bwtint_t)-1)
p[j]->multi[n_multi++] = *q;
}
p[j]->n_multi = n_multi;
......
......@@ -113,6 +113,7 @@ bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int r
{
bwtint_t pos_f;
int is_rev;
*strand = 0; // initialise strand to 0 otherwise we could return without setting it
pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate
if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1;
pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base
......
......@@ -32,8 +32,11 @@
#include <time.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "bwt.h"
#include "utils.h"
#include "rle.h"
#include "rope.h"
#ifdef _DIVBWT
#include "divsufsort.h"
......@@ -63,7 +66,7 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
{
bwt_t *bwt;
ubyte_t *buf, *buf2;
int i, pac_size;
int64_t i, pac_size;
FILE *fp;
// initialization
......@@ -90,11 +93,31 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
if (use_is) {
bwt->primary = is_bwt(buf, bwt->seq_len);
} else {
#ifdef _DIVBWT
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
#else
err_fatal_simple("libdivsufsort is not compiled in.");
#endif
rope_t *r;
int64_t x;
rpitr_t itr;
const uint8_t *blk;
r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
int c = buf[i] + 1;
x = rope_insert_run(r, x, c, 1, 0) + 1;
while (--c >= 0) x += r->c[c];
}
bwt->primary = x;
rope_itr_first(r, &itr);
x = 0;
while ((blk = rope_itr_next_block(&itr)) != 0) {
const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
while (q < end) {
int c = 0;
int64_t l;
rle_dec1(q, c, l);
for (i = 0; i < l; ++i)
buf[x++] = c - 1;
}
}
rope_destroy(r);
}
bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
for (i = 0; i < bwt->seq_len; ++i)
......@@ -186,19 +209,14 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
int bwa_index(int argc, char *argv[]) // the "index" command
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *prefix = 0, *str, *str2, *str3;
int c, algo_type = 0, is_64 = 0, block_size = 10000000;
clock_t t;
int64_t l_pac;
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
char *prefix = 0, *str;
while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1;
else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2;
else if (strcmp(optarg, "is") == 0) algo_type = 3;
if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2;
else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW;
else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS;
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
break;
case 'p': prefix = strdup(optarg); break;
......@@ -216,7 +234,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
if (optind + 1 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
......@@ -230,16 +248,29 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(prefix, argv[optind]);
if (is_64) strcat(prefix, ".64");
}
bwa_idx_build(argv[optind], prefix, algo_type, block_size);
free(prefix);
return 0;
}
int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size)
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *str, *str2, *str3;
clock_t t;
int64_t l_pac;
str = (char*)calloc(strlen(prefix) + 10, 1);
str2 = (char*)calloc(strlen(prefix) + 10, 1);
str3 = (char*)calloc(strlen(prefix) + 10, 1);
{ // nucleotide indexing
gzFile fp = xzopen(argv[optind], "r");
gzFile fp = xzopen(fa, "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack FASTA... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
......@@ -247,7 +278,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(str, prefix); strcat(str, ".pac");
strcpy(str2, prefix); strcat(str2, ".bwt");
t = clock();
fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
else if (algo_type == 1 || algo_type == 3) {
bwt_t *bwt;
......@@ -255,25 +286,25 @@ int bwa_index(int argc, char *argv[]) // the "index" command
bwt_dump_bwt(str2, bwt);
bwt_destroy(bwt);
}
fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
bwt_t *bwt;
strcpy(str, prefix); strcat(str, ".bwt");
t = clock();
fprintf(stderr, "[bwa_index] Update BWT... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... ");
bwt = bwt_restore_bwt(str);
bwt_bwtupdate_core(bwt);
bwt_dump_bwt(str, bwt);
bwt_destroy(bwt);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
gzFile fp = xzopen(argv[optind], "r");
gzFile fp = xzopen(fa, "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 1);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
{
......@@ -281,13 +312,13 @@ int bwa_index(int argc, char *argv[]) // the "index" command
strcpy(str, prefix); strcat(str, ".bwt");
strcpy(str3, prefix); strcat(str3, ".sa");
t = clock();
fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
bwt = bwt_restore_bwt(str);
bwt_cal_sa(bwt, 32);
bwt_dump_sa(str3, bwt);
bwt_destroy(bwt);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
free(str3); free(str2); free(str); free(prefix);
free(str3); free(str2); free(str);
return 0;
}
......@@ -130,7 +130,7 @@ int main_mem(int argc, char *argv[])
aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "1epaFMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
while ((c = getopt(argc, argv, "1paMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
......@@ -145,8 +145,6 @@ int main_mem(int argc, char *argv[])
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
......@@ -251,7 +249,6 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -e discard full-length exact matches\n");
fprintf(stderr, "\nScoring options:\n\n");
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
......@@ -263,7 +260,6 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n");
fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n");
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
......@@ -296,21 +292,14 @@ int main_mem(int argc, char *argv[])
if (!opt0.b) opt->b = 9;
if (!opt0.pen_clip5) opt->pen_clip5 = 5;
if (!opt0.pen_clip3) opt->pen_clip3 = 5;
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0 || strcmp(mode, "ont2d") == 0) {
} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) {
if (!opt0.o_del) opt->o_del = 1;
if (!opt0.e_del) opt->e_del = 1;
if (!opt0.o_ins) opt->o_ins = 1;
if (!opt0.e_ins) opt->e_ins = 1;
if (!opt0.b) opt->b = 1;
if (opt0.split_factor == 0.) opt->split_factor = 10.;
if (strcmp(mode, "pbread") == 0) { // pacbio read-to-read setting; NOT working well!
opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG;
if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
if (!opt0.max_occ) opt->max_occ = 1000;
if (!opt0.min_seed_len) opt->min_seed_len = 13;
if (!opt0.max_chain_extend) opt->max_chain_extend = 25;
if (opt0.drop_ratio == 0.) opt->drop_ratio = .001;
} else if (strcmp(mode, "ont2d") == 0) {
if (strcmp(mode, "ont2d") == 0) {
if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
if (!opt0.min_seed_len) opt->min_seed_len = 14;
if (!opt0.pen_clip5) opt->pen_clip5 = 0;
......@@ -359,8 +348,7 @@ int main_mem(int argc, char *argv[])
opt->flag |= MEM_F_PE;
}
}
if (!(opt->flag & MEM_F_ALN_REG))
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
free(hdr_line);
......@@ -403,7 +391,7 @@ int main_fastmap(int argc, char *argv[])
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
fprintf(stderr, " -l INT max MEM length [%d]\n", max_len);
fprintf(stderr, " -L INT max MEM length [%d]\n", max_len);
fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
fprintf(stderr, "\n");
return 1;
......
......@@ -67,13 +67,15 @@ struct ktp_t;
typedef struct {
struct ktp_t *pl;
int step, running;
int64_t index;
int step;
void *data;
} ktp_worker_t;
typedef struct ktp_t {
void *shared;
void *(*func)(void*, int, void*);
int64_t index;
int n_workers, n_steps;
ktp_worker_t *workers;
pthread_mutex_t mutex;
......@@ -92,13 +94,12 @@ static void *ktp_worker(void *data)
// test whether another worker is doing the same step
for (i = 0; i < p->n_workers; ++i) {
if (w == &p->workers[i]) continue; // ignore itself
if (p->workers[i].running && p->workers[i].step == w->step)
if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
break;
}
if (i == p->n_workers) break; // no other workers doing w->step; then this worker will
if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
pthread_cond_wait(&p->cv, &p->mutex);
}
w->running = 1;
pthread_mutex_unlock(&p->mutex);
// working on w->step
......@@ -107,7 +108,7 @@ static void *ktp_worker(void *data)
// update step and let other workers know
pthread_mutex_lock(&p->mutex);
w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
w->running = 0;
if (w->step == 0) w->index = p->index++;
pthread_cond_broadcast(&p->cv);
pthread_mutex_unlock(&p->mutex);
}
......@@ -125,16 +126,18 @@ void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_d
aux.n_steps = n_steps;
aux.func = func;
aux.shared = shared_data;
aux.index = 0;
pthread_mutex_init(&aux.mutex, 0);
pthread_cond_init(&aux.cv, 0);
aux.workers = alloca(n_threads * sizeof(ktp_worker_t));
aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
for (i = 0; i < n_threads; ++i) {
ktp_worker_t *w = &aux.workers[i];
w->step = w->running = 0; w->pl = &aux; w->data = 0;
w->step = 0; w->pl = &aux; w->data = 0;
w->index = aux.index++;
}
tid = alloca(n_threads * sizeof(pthread_t));
tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
......
......@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.12-r1039"
#define PACKAGE_VERSION "0.7.13-r1126"
#endif
int bwa_fa2pac(int argc, char *argv[]);
......@@ -25,6 +25,7 @@ int main_mem(int argc, char *argv[]);
int main_shm(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
int main_maxk(int argc, char *argv[]);
static int usage()
{
......@@ -84,6 +85,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;
......
#include <zlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <unistd.h>
#include "bwa.h"
#include "bwamem.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
/**
 * The "maxk" command: bwa maxk [-s] <index.prefix> <seq.fa>
 *
 * For every sequence read from <seq.fa> ("-" means standard input), walks the
 * interval lists returned by smem_next() and records, for each query
 * position, the length of the longest interval covering it, capped at 255.
 * A 256-row histogram of these per-position values is printed to stdout as
 * "length<TAB>count".
 *
 * With -s, smem_config(itr, 2, INT_MAX, 0) is applied before iterating.
 *
 * @param argc  argument count, with argv[0] being the sub-command name
 * @param argv  argument vector
 * @return 0 on success; 1 on a usage error, when the sequence file cannot be
 *         opened, or when the per-position buffer cannot be grown
 */
int main_maxk(int argc, char *argv[])
{
	int i, c, self = 0, max_len = 0, ret = 0;
	uint8_t *cnt = 0;
	uint64_t hist[256];
	bwt_t *bwt;
	kseq_t *ks;
	smem_i *itr;
	gzFile fp;

	while ((c = getopt(argc, argv, "s")) >= 0) {
		if (c == 's') self = 1;
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "Usage: bwa maxk [-s] <index.prefix> <seq.fa>\n");
		return 1;
	}
	fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb");
	if (fp == 0) { // gzopen()/gzdopen() return NULL on failure; report it instead of reading from a null handle
		fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, argv[optind+1]);
		return 1;
	}
	ks = kseq_init(fp);
	bwt = bwt_restore_bwt(argv[optind]);
	itr = smem_itr_init(bwt);
	if (self) smem_config(itr, 2, INT_MAX, 0);
	memset(hist, 0, sizeof(hist));
	while (kseq_read(ks) >= 0) {
		const bwtintv_v *a;
		if (ks->seq.l > max_len) { // grow the per-position buffer to the next power of two
			uint8_t *tmp;
			max_len = ks->seq.l;
			kroundup32(max_len);
			tmp = (uint8_t*)realloc(cnt, max_len);
			if (tmp == 0) { // keep cnt valid so that it is still freed below
				fprintf(stderr, "[E::%s] fail to allocate %d bytes\n", __func__, max_len);
				ret = 1;
				break;
			}
			cnt = tmp;
		}
		if (ks->seq.l > 0) memset(cnt, 0, ks->seq.l); // cnt is still null if only empty records were seen
		for (i = 0; i < ks->seq.l; ++i) // index through unsigned char: plain char may be negative for bytes >= 0x80
			ks->seq.s[i] = nst_nt4_table[(uint8_t)ks->seq.s[i]];
		smem_set_query(itr, ks->seq.l, (uint8_t*)ks->seq.s);
		while ((a = smem_next(itr)) != 0) {
			for (i = 0; i < a->n; ++i) {
				bwtintv_t *p = &a->a[i];
				// info packs the query start in the high 32 bits and the end in the low 32 bits
				int j, l, start = p->info>>32, end = (uint32_t)p->info;
				l = end - start < 255? end - start : 255; // cnt[] holds bytes, so cap the length
				for (j = start; j < end; ++j)
					cnt[j] = cnt[j] > l? cnt[j] : l;
			}
		}
		for (i = 0; i < ks->seq.l; ++i) ++hist[cnt[i]];
	}
	if (ret == 0) {
		for (i = 0; i < 256; ++i)
			printf("%d\t%lld\n", i, (long long)hist[i]);
	}
	free(cnt);
	smem_itr_destroy(itr);
	bwt_destroy(bwt);
	kseq_destroy(ks);
	gzclose(fp);
	return ret;
}
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use warnings;
......
#include <string.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include "rle.h"
const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
{
uint16_t *nptr = (uint16_t*)block;
int diff;
block += 2; // skip the first 2 counting bytes
if (*nptr == 0) {
memset(cnt, 0, 48);
diff = rle_enc1(block, a, rl);
} else {
uint8_t *p, *end = block + *nptr, *q;
int64_t pre, z, l = 0, tot, beg_l;