Commit 947c8141 authored by Charles Plessy's avatar Charles Plessy

Imported Upstream version 0.7.0

parent 7ef66e21
CC= gcc
CXX= g++
CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \
bwaseqio.o bwase.o kstring.o
AOBJS= QSufSort.o bwt_gen.o \
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
bwape.o cs2nt.o \
LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
PROG= bwa bwamem-lite
INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
......@@ -26,19 +23,29 @@ SUBDIRS= .
all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
bwa.o:bwa.h
ksw.o:ksw.h
kstring.o:kstring.h
utils.o:utils.h ksort.h kseq.h
bntseq.o:bntseq.h
bwt.o:bwt.h utils.h
bwa.o:bwa.h bwt.h bntseq.h
bwamem.o:ksw.h kbtree.h ksort.h kvec.h kstring.h utils.h bwamem.h
bwamem_pair.o:ksw.h kvec.h kstring.h utils.h bwamem.h
QSufSort.o:QSufSort.h
bwt_gen.o:QSufSort.h
fastmap.o:bwt.h bwamem.h
bwt.o:bwt.h
bwtio.o:bwt.h
bwtaln.o:bwt.h bwtaln.h kseq.h
bntseq.o:bntseq.h
bwtgap.o:bwtgap.h bwtaln.h bwt.h
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
......
Beta Release 0.7.0 (28 Feburary, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
algorithm and extends seeds with banded affine-gap-penalty dynamic programming
(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
to find hits within a certain edit distance, but BWA is not efficient for such
task given longer reads anyway, and the edit-distance criterion is arguably
not as important in long-read alignment.
In addition to the algorithmic improvements, BWA-MEM also implements a few
handy features in practical aspects:
1. BWA-MEM automatically switches between local and glocal (global wrt reads;
local wrt reference) alignment. It reports the end-to-end glocal alignment
if the glocal alignment is not much worse than the optimal local alignment.
Glocal alignment reduces reference bias.
2. BWA-MEM automatically infers pair orientation from a batch of single-end
alignments. It allows more than one orientations if there are sufficient
supporting reads. This feature has not been tested on reads from Illumina
jumping library yet. (EXPERIMENTAL)
3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
is possible to convert a name-sorted BAM to an interleaved fastq on the fly
and feed the data stream to BWA-MEM for mapping.
4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
helps to transfer individual read annotations to the output.
5. BWA-MEM supports more advanced piping. Users can now run:
(bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
files without replying on bash features.
6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
program in the source code directory implements a full single-end mapper in
50 lines of code.
The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
for production use yet. However, when the implementation becomes stable after a
few release cycles, existing BWA users are recommended to migrate to BWA-MEM
for 76bp or longer Illumina reads and long query sequences. The original BWA
short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
reads. Change of mappers will be necessary sooner or later.
(0.7.0 beta: 28 Feburary 2013, r313)
Release 0.6.2 (19 June, 2012)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
......@@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin
qsint_t i, j;
qsint_t s, negatedSortedGroupLength;
qsint_t numSymbolAggregated;
qsint_t maxNumInputSymbol;
qsint_t numSortedPos = 1;
qsint_t newAlphabetSize;
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
if (!skipTransform) {
/* bucketing possible*/
newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
......
......@@ -35,7 +35,7 @@
#include "utils.h"
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
KSEQ_DECLARE(gzFile)
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
......@@ -288,21 +288,26 @@ int bwa_fa2pac(int argc, char *argv[])
return 0;
}
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
{
int left, mid, right;
if (pos_f >= bns->l_pac) return -1;
left = 0; mid = 0; right = bns->n_seqs;
while (left < right) { // binary search
mid = (left + right) >> 1;
if (pos_f >= bns->anns[mid].offset) {
if (mid == bns->n_seqs - 1) break;
if (pos_f < bns->anns[mid+1].offset) break; // bracketed
left = mid + 1;
} else right = mid;
}
return mid;
}
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
{
int left, mid, right, nn;
if (ref_id) {
left = 0; mid = 0; right = bns->n_seqs;
while (left < right) {
mid = (left + right) >> 1;
if (pos_f >= bns->anns[mid].offset) {
if (mid == bns->n_seqs - 1) break;
if (pos_f < bns->anns[mid+1].offset) break; // bracketed
left = mid + 1;
} else right = mid;
}
*ref_id = mid;
}
if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
left = 0; right = bns->n_holes; nn = 0;
while (left < right) {
mid = (left + right) >> 1;
......@@ -321,3 +326,26 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
}
return nn;
}
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
uint8_t *seq = 0;
if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap
if (end > l_pac<<1) end = l_pac<<1;
if (beg < 0) beg = 0;
if (beg >= l_pac || end <= l_pac) {
int64_t k, l = 0;
*len = end - beg;
seq = malloc(end - beg);
if (beg >= l_pac) { // reverse strand
int64_t beg_f = (l_pac<<1) - 1 - end;
int64_t end_f = (l_pac<<1) - 1 - beg;
for (k = end_f; k > beg_f; --k)
seq[l++] = 3 - _get_pac(pac, k);
} else { // forward strand
for (k = beg; k < end; ++k)
seq[l++] = _get_pac(pac, k);
}
} else *len = 0; // if bridging the forward-reverse boundary, return nothing
return seq;
}
......@@ -29,6 +29,7 @@
#define BWT_BNTSEQ_H
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
#ifndef BWA_UBYTE
......@@ -71,7 +72,9 @@ extern "C" {
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
#ifdef __cplusplus
}
......
This diff is collapsed.
This diff is collapsed.
......@@ -2,103 +2,45 @@
#define BWA_H_
#include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
#define BWA_DEF_MAX_SCORE 2048
#define BWA_MAX_QUERY_LEN 1024
#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
// BWA index
struct bwa_idx_t;
typedef struct bwa_idx_t bwa_idx_t;
// Buffer for BWA alignment
struct bwa_buf_t;
typedef struct bwa_buf_t bwa_buf_t;
// BWA alignment options
typedef struct {
int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
float fnr; // parameter for automatic length-adjusted max differences
} bwa_opt_t;
// default BWA alignment options
extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
// an interval hit in the SA coordinate; basic unit in .sai files
typedef struct {
uint32_t n_mm:16, n_gapo:8, n_gape:8;
int score;
uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
} bwa_sai1_t;
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
} bwaidx_t;
// all interval hits in the SA coordinate
typedef struct {
int n; // number of interval hits
bwa_sai1_t *sai;
} bwa_sai_t;
int l_seq;
char *name, *comment, *seq, *qual, *sam;
} bseq1_t;
// an alignment
typedef struct {
uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
uint32_t offset; // coordinate on the reference; zero-based
uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
} bwa_aln_t;
typedef struct {
int mapQs, mapQ, c1, c2;
uint64_t sa;
bwa_sai1_t *which;
bwa_sai_t sai;
bwa_aln_t one;
} bwa_one_t;
typedef struct {
double avg, std, ap_prior;
uint64_t low, high, high_bayesian;
} bwa_pestat_t;
extern int bwa_verbose;
extern char bwa_rg_id[256];
#ifdef __cplusplus
extern "C" {
#endif
// load a BWA index
bwa_idx_t *bwa_idx_load(const char *prefix);
void bwa_idx_destroy(bwa_idx_t *p);
// allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
void bwa_buf_destroy(bwa_buf_t *p);
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
/**
* Find all the SA intervals
*
* @param idx BWA index; multiple threads can share the same index
* @param buf BWA alignment buffer; each thread should have its own buffer
* @param seq NULL terminated C string, consisting of A/C/G/T/N only
*
* @return SA intervals seq is matched to
*/
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
/**
* Construct an alignment in the base-pair coordinate
*
* @param idx BWA index
* @param buf BWA alignment buffer
* @param seq NULL terinated C string
* @param sa Suffix array value
* @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape
*
* @return An alignment
*/
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
void bwa_one_destroy(bwa_one_t *one);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s);
#ifdef __cplusplus
}
......
This diff is collapsed.
This diff is collapsed.
#ifndef BWAMEM_H_
#define BWAMEM_H_
#include "bwt.h"
#include "bntseq.h"
#include "bwa.h"
#define MEM_MAPQ_COEF 30.0
#define MEM_MAPQ_MAX 60
struct __smem_i;
typedef struct __smem_i smem_i;
#define MEM_F_HARDCLIP 0x1
#define MEM_F_PE 0x2
#define MEM_F_NOPAIRING 0x4
#define MEM_F_ALL 0x8
#define MEM_F_NO_MULTI 0x10
typedef struct {
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
int pen_unpaired; // phred-scaled penalty for unpaired reads
int pen_clip; // clipping penalty. This score is not deducted from the DP score.
int w; // band width
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
int split_width; // split into a seed if its occurence is smaller than this value
int max_occ; // skip a seed if its occurence is larger than this value
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int chunk_size; // process chunk_size-bp sequences in a batch
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
typedef struct {
int64_t rb, re; // [rb,re): reference sequence in the alignment
int qb, qe; // [qb,qe): query sequence in the alignment
int score; // best SW score
int sub; // 2nd best SW score
int csub; // SW score of a tandem hit
int sub_n; // approximate number of suboptimal hits
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
} mem_alnreg_t;
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
typedef struct {
int low, high, failed;
double avg, std;
} mem_pestat_t;
typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
int64_t rb, re;
int qb, qe, flag, qual;
// optional info
int score, sub;
} bwahit_t;
typedef struct { // This struct is only used for the convenience of API.
int rid;
int pos;
uint32_t is_rev:1, mapq:8, NM:23;
int n_cigar;
uint32_t *cigar;
} mem_aln_t;
#ifdef __cplusplus
extern "C" {
#endif
smem_i *smem_itr_init(const bwt_t *bwt);
void smem_itr_destroy(smem_i *itr);
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width);
mem_opt_t *mem_opt_init(void);
void mem_fill_scmat(int a, int b, int8_t mat[25]);
/**
* Align a batch of sequences and generate the alignments in the SAM format
*
* This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
* Note that $seqs[i].sam may consist of several SAM lines if the
* corresponding sequence has multiple primary hits.
*
* In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
* sequences must be interleaved: $n must be an even number and the 2i-th
* sequence and the (2i+1)-th sequence constitute a read pair. In this
* mode, there should be enough (typically >50) unique pairs for the
* routine to infer the orientation and insert size.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param n number of query sequences
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR. CIGAR should be
* generated later by bwa_gen_cigar() defined in bwa.c.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param l_seq length of query sequence
* @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied
*
* @return list of aligned regions.
*/
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar);
/**
* Infer the insert size distribution from interleaved alignment regions
*
* This function can be called after mem_align1(), as long as paired-end
* reads are properly interleaved.
*
* @param opt alignment parameters
* @param l_pac length of concatenated reference sequence
* @param n number of query sequences; must be an even number
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
* @param pes inferred insert size distribution (output)
*/
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
#ifdef __cplusplus
}
#endif
#endif
This diff is collapsed.
......@@ -10,6 +10,7 @@
#include "utils.h"
#include "stdaln.h"
#include "bwase.h"
#include "bwa.h"
typedef struct {
int n;
......@@ -21,24 +22,15 @@ typedef struct {
bwtint_t low, high, high_bayesian;
} isize_info_t;
typedef struct {
uint64_t x, y;
} b128_t;
#define b128_lt(a, b) ((a).x < (b).x)
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
#define b128_hash(a) ((uint32_t)(a).x)
#include "khash.h"
KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq)
#include "ksort.h"
KSORT_INIT(b128, b128_t, b128_lt)
KSORT_INIT_GENERIC(uint64_t)
KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
typedef struct {
kvec_t(b128_t) arr;
kvec_t(b128_t) pos[2];
pair64_v arr;
pair64_v pos[2];
kvec_t(bwt_aln1_t) aln[2];
} pe_data_t;
......@@ -69,19 +61,6 @@ pe_opt_t *bwa_init_pe_opt()
po->ap_prior = 1e-5;
return po;
}
static inline uint64_t hash_64(uint64_t key)
{
key += ~(key << 32);
key ^= (key >> 22);
key += ~(key << 13);
key ^= (key >> 8);
key += (key << 3);
key ^= (key >> 15);
key += ~(key << 27);
key ^= (key >> 31);
return key;
}
/*
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
{
......@@ -120,7 +99,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double
free(isizes);
return -1;
}
ks_introsort(uint64_t, tot, isizes);
ks_introsort_64(tot, isizes);
p25 = isizes[(int)(tot*0.25 + 0.5)];
p50 = isizes[(int)(tot*0.50 + 0.5)];
p75 = isizes[(int)(tot*0.75 + 0.5)];
......@@ -170,7 +149,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
{
int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
uint64_t o_score, subo_score;
b128_t last_pos[2][2], o_pos[2];
pair64_t last_pos[2][2], o_pos[2];
max_len = p[0]->full_len;
if (max_len < p[1]->full_len) max_len = p[1]->full_len;
if (low_bound < max_len) low_bound = max_len;
......@@ -206,11 +185,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
o_score = subo_score = (uint64_t)-1;
o_n = subo_n = 0;
ks_introsort(b128, d->arr.n, d->arr.a);
ks_introsort_128(d->arr.n, d->arr.a);
for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
if (opt->type == BWA_PET_STD) {
for (i = 0; i < d->arr.n; ++i) {
b128_t x = d->arr.a[i];
pair64_t x = d->arr.a[i];
int strand = x.y>>1&1;
if (strand == 1) { // reverse strand, then check
int y = 1 - (x.y&1);
......@@ -221,19 +200,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
last_pos[x.y&1][1] = x;
}
}
} else if (opt->type == BWA_PET_SOLID) {
for (i = 0; i < d->arr.n; ++i) {
b128_t x = d->arr.a[i];
int strand = x.y>>1&1;
if ((strand^x.y)&1) { // push
int y = 1 - (x.y&1);
__pairing_aux(last_pos[y][1], x);
__pairing_aux(last_pos[y][0], x);
} else { // check
last_pos[x.y&1][0] = last_pos[x.y&1][1];
last_pos[x.y&1][1] = x;
}
}
} else {
fprintf(stderr, "[paring] not implemented yet!\n");
exit(1);
......@@ -345,7 +311,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
{ // only when both ends mapped
b128_t x;
pair64_t x;
int j, k;
long long n_occ[2];
for (j = 0; j < 2; ++j) {
......@@ -360,7 +326,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
bwt_aln1_t *r = d->aln[j].a + k;
bwtint_t l;
if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
b128_t key;
pair64_t key;
int ret;
key.x = r->k; key.y = r->l;
khint_t iter = kh_put(b128, g_hash, key, &ret);
......@@ -377,14 +343,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
x.x = kh_val(g_hash, iter).a[l]>>1;
x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
kv_push(b128_t, d->arr, x);
kv_push(pair64_t, d->arr, x);
}
} else { // then calculate on the fly
for (l = r->k; l <= r->l; ++l) {
int strand;
x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
x.y = k<<2 | strand<<1 | j;
kv_push(b128_t, d->arr, x);
kv_push(pair64_t, d->arr, x);
}
}
}
......@@ -576,11 +542,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
++n_tot[is_singleton];
cigar[0] = cigar[1] = 0;
n_cigar[0] = n_cigar[1] = 0;
if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
ubyte_t *seq;
if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
if (popt->type == BWA_PET_STD) {
{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
seq = p[k]->rseq;
......@@ -589,17 +555,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
seq = p[k]->seq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
}
} else { // BWA_PET_SOLID
if (p[1-k]->strand == 0) { // R3-F3 pairing
if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
seq = p[k]->rseq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
} else { // F3-R3 pairing
if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
seq = p[k]->seq;
}
}
// perform SW alignment
cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
......@@ -656,14 +611,14 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
return pacseq;
}
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, j, n_seqs, tot_seqs = 0;
bwa_seq_t *seqs[2];
bwa_seqio_t *ks[2];
clock_t t;
bntseq_t *bns, *ntbns = 0;
bntseq_t *bns;
FILE *fp_sa[2];
gap_opt_t opt, opt0;
khint_t iter;
......@@ -688,10 +643,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
opt0 = opt;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
if (!(opt.mode & BWA_MODE_COMPREAD)) {
popt->type = BWA_PET_SOLID;
ntbns = bwa_open_nt(prefix);
} else { // for Illumina alignment only
{ // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
......@@ -702,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
}
// core loop
bwa_print_sam_SQ(bns);
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_PG();
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
int cnt_chg;
......@@ -724,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
for (j = 0; j < 2; ++j)
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
if (pac == 0) free(pacseq);
......@@ -749,7 +701,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
// destroy
bns_destroy(bns);
if (ntbns) bns_destroy(ntbns);
for (i = 0; i < 2; ++i) {
bwa_seq_close(ks[i]);
fclose(fp_sa[i]);
......@@ -764,21 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
int bwa_sai2sam_pe(int argc, char *argv[])
{
extern char *bwa_rg_line, *bwa_rg_id;