Commit 814bc893 authored by Charles Plessy's avatar Charles Plessy

Imported Upstream version 0.5.9~rc1

parent 83bd9a53
------------------------------------------------------------------------
r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwaseqio.c
M /branches/prog/bwa/main.c
* fixed a small memory leak caused by the BAM reader
* fixed a memory violation, also in the BAM reader
------------------------------------------------------------------------
r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
M /branches/prog/bwa/Makefile
change Makefile gcc options
------------------------------------------------------------------------
r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines
Changed paths:
M /branches/prog/bwa/bwtsw2_aux.c
M /branches/prog/bwa/bwtsw2_core.c
M /branches/prog/bwa/main.c
* bwa-0.5.8-6 (r1557)
* added a little more comments to BWA-SW
* randomly choosing a mapping if there are more than one
------------------------------------------------------------------------
r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines
Changed paths:
M /branches/prog/bwa/Makefile
M /branches/prog/bwa/bwtsw2_aux.c
sometimes unmapped reads may not be printed...
------------------------------------------------------------------------
r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines
Changed paths:
M /branches/prog/bwa/Makefile
M /branches/prog/bwa/bwtsw2_aux.c
print unmapped reads
------------------------------------------------------------------------
r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
M /branches/prog/bwa/bwa.1
M /branches/prog/bwa/bwtaln.c
M /branches/prog/bwa/main.c
* bwa-0.5.8-5 (r1555)
* BAM input documentation
------------------------------------------------------------------------
r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/main.c
* bwa-0.5.8-4 (r1544)
* supporting adding RG tags and RG lines
------------------------------------------------------------------------
r1543 | lh3 | 2010-11-23 00:16:40 -0500 (Tue, 23 Nov 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwtaln.c
M /branches/prog/bwa/main.c
* bwa-0.5.8-3 (r1543)
* fixed a memory leak
------------------------------------------------------------------------
r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/main.c
* bwa-0.5.8-2 (r1542)
* fixed a long existing bug in random placement of reads
------------------------------------------------------------------------
r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines
Changed paths:
M /branches/prog/bwa/Makefile
A /branches/prog/bwa/bamlite.c
A /branches/prog/bwa/bamlite.h
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/bwaseqio.c
M /branches/prog/bwa/bwtaln.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/main.c
preliminary BAM input support
------------------------------------------------------------------------
r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 Oct 2010) | 2 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
M /branches/prog/bwa/bwa.1
change version number and ChangeLog
------------------------------------------------------------------------ ------------------------------------------------------------------------
r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines
Changed paths: Changed paths:
......
Simply type `make' to compile and copy the resulting executable `bwa'
anywhere you want. You may also like to copy `solid2fastq.pl' and
`qualfa2fq.pl' for format conversion.
On 32-bit system, you should compile bwa with `make CFLAGS=-O2'.
The GNU building system is also supported, which is necessary for
building Java binding. Nonetheless, directly runing `make' is
recommended in most other cases.
\ No newline at end of file
CC= gcc CC= gcc
CXX= g++ CXX= g++
CFLAGS= -g -Wall -O2 -m64 CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS) CXXFLAGS= $(CFLAGS)
DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64
OBJS= utils.o bwt.o bwtio.o bwtaln.o bwtgap.o is.o \ OBJS= utils.o bwt.o bwtio.o bwtaln.o bwtgap.o is.o \
bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \
bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o bwtsw2_chain.o bamlite.o
PROG= bwa PROG= bwa
INCLUDES= INCLUDES=
LIBS= -lm -lz -lpthread -Lbwt_gen -lbwtgen LIBS= -lm -lz -lpthread -Lbwt_gen -lbwtgen
......
## Makefile.am -- Process this file with automake to produce Makefile.in
ACLOCAL_AMFLAGS=-I m4
AM_CFLAGS = -Wall -m64 -fPIC
bin_PROGRAMS = bwa
noinst_LIBRARIES = libbwacore.a
libbwacore_a_SOURCES = utils.c bwt.c bwtio.c bwtaln.c bwtgap.c bntseq.c \
stdaln.c bwaseqio.c bwase.c kstring.c cs2nt.c
bwa_SOURCES = is.c bwtmisc.c bwtindex.c simple_dp.c bwape.c \
bwtsw2_core.c bwtsw2_main.c bwtsw2_aux.c bwt_lite.c bwtsw2_chain.c \
main.c
bwa_LDADD = bwt_gen/libbwtgen.a libbwacore.a
man_MANS= bwa.1
SUBDIRS= bwt_gen .
Beta Release Candidate 0.5.9rc1 (10 December, 2010)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Notable changes in bwasw:
* Output unmapped reads.
* For a repetitive read, choose a random hit instead of a fixed
one. This is not well tested.
Notable changes in bwa-short:
* Fixed a bug in the SW scoring system, which may lead to unexpected
gaps towards the end of a read.
* Fixed a bug which invalidates the randomness of repetitive reads.
* Fixed a rare memory leak.
* Allowed to specify the read group at the command line.
* Take name-grouped BAM files as input.
Changes to this release are usually safe in that they do not interfere
with the key functionality. However, the release has only been tested on
small samples instead of on large-scale real data. If anything weird
happens, please report the bugs to the bio-bwa-help mailing list.
(0.5.9rc1: 10 December 2010, r1561)
Beta Release 0.5.8 (8 June, 2010) Beta Release 0.5.8 (8 June, 2010)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
#!/bin/sh
autoreconf -vi
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include "bamlite.h"
/*********************
* from bam_endian.c *
*********************/
static inline int bam_is_big_endian()
{
long one= 1;
return !(*((char *)(&one)));
}
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
}
static inline void *bam_swap_endian_2p(void *x)
{
*(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
return x;
}
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
}
static inline void *bam_swap_endian_4p(void *x)
{
*(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
return x;
}
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
}
static inline void *bam_swap_endian_8p(void *x)
{
*(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
return x;
}
/**************
* from bam.c *
**************/
int bam_is_be;
bam_header_t *bam_header_init()
{
bam_is_be = bam_is_big_endian();
return (bam_header_t*)calloc(1, sizeof(bam_header_t));
}
void bam_header_destroy(bam_header_t *header)
{
int32_t i;
if (header == 0) return;
if (header->target_name) {
for (i = 0; i < header->n_targets; ++i)
free(header->target_name[i]);
free(header->target_name);
free(header->target_len);
}
free(header->text);
free(header);
}
bam_header_t *bam_header_read(bamFile fp)
{
bam_header_t *header;
char buf[4];
int magic_len;
int32_t i = 1, name_len;
// read "BAM1"
magic_len = bam_read(fp, buf, 4);
if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
return 0;
}
header = bam_header_init();
// read plain text and the number of reference sequences
bam_read(fp, &header->l_text, 4);
if (bam_is_be) bam_swap_endian_4p(&header->l_text);
header->text = (char*)calloc(header->l_text + 1, 1);
bam_read(fp, header->text, header->l_text);
bam_read(fp, &header->n_targets, 4);
if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
// read reference sequence names and lengths
header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
header->target_len = (uint32_t*)calloc(header->n_targets, 4);
for (i = 0; i != header->n_targets; ++i) {
bam_read(fp, &name_len, 4);
if (bam_is_be) bam_swap_endian_4p(&name_len);
header->target_name[i] = (char*)calloc(name_len, 1);
bam_read(fp, header->target_name[i], name_len);
bam_read(fp, &header->target_len[i], 4);
if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
}
return header;
}
static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
{
uint8_t *s;
uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
while (s < data + data_len) {
uint8_t type;
s += 2; // skip key
type = toupper(*s); ++s; // skip type
if (type == 'C' || type == 'A') ++s;
else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
}
}
int bam_read1(bamFile fp, bam1_t *b)
{
bam1_core_t *c = &b->core;
int32_t block_len, ret, i;
uint32_t x[8];
if ((ret = bam_read(fp, &block_len, 4)) != 4) {
if (ret == 0) return -1; // normal end-of-file
else return -2; // truncated
}
if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3;
if (bam_is_be) {
bam_swap_endian_4p(&block_len);
for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
}
c->tid = x[0]; c->pos = x[1];
c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
c->l_qseq = x[4];
c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
b->data_len = block_len - sizeof(bam1_core_t);
if (b->m_data < b->data_len) {
b->m_data = b->data_len;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
}
if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
return 4 + block_len;
}
#ifndef BAMLITE_H_
#define BAMLITE_H_
#include <stdint.h>
#include <zlib.h>
typedef gzFile bamFile;
#define bam_open(fn, mode) gzopen(fn, mode)
#define bam_dopen(fd, mode) gzdopen(fd, mode)
#define bam_close(fp) gzclose(fp)
#define bam_read(fp, buf, size) gzread(fp, buf, size)
typedef struct {
int32_t n_targets;
char **target_name;
uint32_t *target_len;
size_t l_text, n_text;
char *text;
} bam_header_t;
#define BAM_FPAIRED 1
#define BAM_FPROPER_PAIR 2
#define BAM_FUNMAP 4
#define BAM_FMUNMAP 8
#define BAM_FREVERSE 16
#define BAM_FMREVERSE 32
#define BAM_FREAD1 64
#define BAM_FREAD2 128
#define BAM_FSECONDARY 256
#define BAM_FQCFAIL 512
#define BAM_FDUP 1024
#define BAM_CIGAR_SHIFT 4
#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
#define BAM_CMATCH 0
#define BAM_CINS 1
#define BAM_CDEL 2
#define BAM_CREF_SKIP 3
#define BAM_CSOFT_CLIP 4
#define BAM_CHARD_CLIP 5
#define BAM_CPAD 6
typedef struct {
int32_t tid;
int32_t pos;
uint32_t bin:16, qual:8, l_qname:8;
uint32_t flag:16, n_cigar:16;
int32_t l_qseq;
int32_t mtid;
int32_t mpos;
int32_t isize;
} bam1_core_t;
typedef struct {
bam1_core_t core;
int l_aux, data_len, m_data;
uint8_t *data;
} bam1_t;
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
#define bam1_qname(b) ((char*)((b)->data))
#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
#define bam_destroy1(b) do { \
if (b) { free((b)->data); free(b); } \
} while (0)
extern int bam_is_be;
#ifdef __cplusplus
extern "C" {
#endif
bam_header_t *bam_header_init(void);
void bam_header_destroy(bam_header_t *header);
bam_header_t *bam_header_read(bamFile fp);
int bam_read1(bamFile fp, bam1_t *b);
#ifdef __cplusplus
}
#endif
#endif
.TH bwa 1 "16 October 2010" "bwa-0.5.8c" "Bioinformatics tools" .TH bwa 1 "10 December 2010" "bwa-0.5.9rc1" "Bioinformatics tools"
.SH NAME .SH NAME
.PP .PP
bwa - Burrows-Wheeler Alignment Tool bwa - Burrows-Wheeler Alignment Tool
...@@ -154,6 +154,36 @@ differences will be found. This mode is much slower than the default. ...@@ -154,6 +154,36 @@ differences will be found. This mode is much slower than the default.
Parameter for read trimming. BWA trims a read down to Parameter for read trimming. BWA trims a read down to
argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original
read length. [0] read length. [0]
.TP
.B -b
Specify the input read sequence file is the BAM format. For paired-end
data, two ends in a pair must be grouped together and options
.B -1
or
.B -2
are usually applied to specify which end should be mapped. Typical
command lines for mapping pair-end data in the BAM format are:
bwa aln ref.fa -b1 reads.bam > 1.sai
bwa aln ref.fa -b2 reads.bam > 2.sai
bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam
.TP
.B -0
When
.B -b
is specified, only use single-end reads in mapping.
.TP
.B -1
When
.B -b
is specified, only use the first read in a read pair in mapping (skip
single-end reads and the second reads).
.TP
.B -2
When
.B -b
is specified, only use the second read in a read pair in mapping.
.B
.RE .RE
.TP .TP
...@@ -166,10 +196,13 @@ hits will be randomly chosen. ...@@ -166,10 +196,13 @@ hits will be randomly chosen.
.B OPTIONS: .B OPTIONS:
.RS .RS
.TP 10 .TP 10
.B -n INT .BI -n \ INT
Maximum number of alignments to output in the XA tag for reads paired Maximum number of alignments to output in the XA tag for reads paired
properly. If a read has more than INT hits, the XA tag will not be properly. If a read has more than INT hits, the XA tag will not be
written. [3] written. [3]
.TP
.BI -r \ STR
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
.RE .RE
.TP .TP
...@@ -183,12 +216,12 @@ read pairs will be placed randomly. ...@@ -183,12 +216,12 @@ read pairs will be placed randomly.
.B OPTIONS: .B OPTIONS:
.RS .RS
.TP 8 .TP 8
.B -a INT .BI -a \ INT
Maximum insert size for a read pair to be considered being mapped Maximum insert size for a read pair to be considered being mapped
properly. Since 0.4.5, this option is only used when there are not properly. Since 0.4.5, this option is only used when there are not
enough good alignment to infer the distribution of insert sizes. [500] enough good alignment to infer the distribution of insert sizes. [500]
.TP .TP
.B -o INT .BI -o \ INT
Maximum occurrences of a read for pairing. A read with more occurrneces Maximum occurrences of a read for pairing. A read with more occurrneces
will be treated as a single-end read. Reducing this parameter helps will be treated as a single-end read. Reducing this parameter helps
faster pairing. [100000] faster pairing. [100000]
...@@ -198,15 +231,18 @@ Load the entire FM-index into memory to reduce disk operations ...@@ -198,15 +231,18 @@ Load the entire FM-index into memory to reduce disk operations
(base-space reads only). With this option, at least 1.25N bytes of (base-space reads only). With this option, at least 1.25N bytes of
memory are required, where N is the length of the genome. memory are required, where N is the length of the genome.
.TP .TP
.B -n INT .BI -n \ INT
Maximum number of alignments to output in the XA tag for reads paired Maximum number of alignments to output in the XA tag for reads paired
properly. If a read has more than INT hits, the XA tag will not be properly. If a read has more than INT hits, the XA tag will not be
written. [3] written. [3]
.TP .TP
.B -N INT .BI -N \ INT
Maximum number of alignments to output in the XA tag for disconcordant Maximum number of alignments to output in the XA tag for disconcordant
read pairs (excluding singletons). If a read has more than INT hits, the read pairs (excluding singletons). If a read has more than INT hits, the
XA tag will not be written. [10] XA tag will not be written. [10]
.TP
.BI -r \ STR
Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
.RE .RE
.TP .TP
......
...@@ -643,6 +643,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, ...@@ -643,6 +643,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
{ {
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, j, n_seqs, tot_seqs = 0; int i, j, n_seqs, tot_seqs = 0;
bwa_seq_t *seqs[2]; bwa_seq_t *seqs[2];
bwa_seqio_t *ks[2]; bwa_seqio_t *ks[2];
...@@ -662,15 +663,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f ...@@ -662,15 +663,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
bns = bns_restore(prefix); bns = bns_restore(prefix);
srand48(bns->seed); srand48(bns->seed);
for (i = 0; i < 2; ++i) { fp_sa[0] = xopen(fn_sa[0], "r");
ks[i] = bwa_seq_open(fn_fa[i]); fp_sa[1] = xopen(fn_sa[1], "r");
fp_sa[i] = xopen(fn_sa[i], "r");
}
g_hash = kh_init(64); g_hash = kh_init(64);
last_ii.avg = -1.0; last_ii.avg = -1.0;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
if (!(opt.mode & BWA_MODE_COMPREAD)) { if (!(opt.mode & BWA_MODE_COMPREAD)) {
popt->type = BWA_PET_SOLID; popt->type = BWA_PET_SOLID;
ntbns = bwa_open_nt(prefix); ntbns = bwa_open_nt(prefix);
...@@ -742,11 +743,19 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f ...@@ -742,11 +743,19 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
int bwa_sai2sam_pe(int argc, char *argv[]) int bwa_sai2sam_pe(int argc, char *argv[])
{ {
extern char *bwa_rg_line, *bwa_rg_id;
extern int bwa_set_rg(const char *s);
int c; int c;
pe_opt_t *popt; pe_opt_t *popt;
popt = bwa_init_pe_opt(); popt = bwa_init_pe_opt();
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:A")) >= 0) { while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
switch (c) { switch (c) {
case 'r':
if (bwa_set_rg(optarg) < 0) {
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
return 1;
}
break;
case 'a': popt->max_isize = atoi(optarg); break; case 'a': popt->max_isize = atoi(optarg); break;
case 'o': popt->max_occ = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break;
case 's': popt->is_sw = 0; break; case 's': popt->is_sw = 0; break;
...@@ -768,7 +777,8 @@ int bwa_sai2sam_pe(int argc, char *argv[]) ...@@ -768,7 +777,8 @@ int bwa_sai2sam_pe(int argc, char *argv[])
fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi); fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior); fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
fprintf(stderr, " -f FILE sam file to output results to [stdout]\n\n"); fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
...@@ -779,6 +789,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) ...@@ -779,6 +789,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
return 1; return 1;
} }
bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt); bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt);
free(bwa_rg_line); free(bwa_rg_id);
free(popt); free(popt);
return 0; return 0;
} }
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "kstring.h" #include "kstring.h"
int g_log_n[256]; int g_log_n[256];
char *bwa_rg_line, *bwa_rg_id;
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
{ {
...@@ -27,7 +28,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma ...@@ -27,7 +28,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
for (i = cnt = 0; i < n_aln; ++i) { for (i = cnt = 0; i < n_aln; ++i) {
const bwt_aln1_t *p = aln + i; const bwt_aln1_t *p = aln + i;
if (p->score > best) break; if (p->score > best) break;
if (drand48() * (p->l - p->k + 1) > (double)cnt) { if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a; s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a;
s->score = p->score; s->score = p->score;
s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
...@@ -468,6 +469,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in ...@@ -468,6 +469,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
printf("%s", p->qual); printf("%s", p->qual);
} else printf("*"); } else printf("*");
if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
if (p->type != BWA_TYPE_NO_MATCH) { if (p->type != BWA_TYPE_NO_MATCH) {
int i; int i;
...@@ -535,6 +537,7 @@ void bwa_print_sam_SQ(const bntseq_t *bns) ...@@ -535,6 +537,7 @@ void bwa_print_sam_SQ(const bntseq_t *bns)
int i; int i;
for (i = 0; i < bns->n_seqs; ++i) for (i = 0; i < bns->n_seqs; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (bwa_rg_line) printf("%s\n", bwa_rg_line);
} }
void bwase_initialize() void bwase_initialize()
...@@ -543,8 +546,44 @@ void bwase_initialize() ...@@ -543,8 +546,44 @@ void bwase_initialize()
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
} }
char *bwa_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}
int bwa_set_rg(const char *s)
{
char *p, *q, *r;
if (strstr(s, "@RG") != s) return -1;
if (bwa_rg_line) free(bwa_rg_line);
if (bwa_rg_id) free(bwa_rg_id);
bwa_rg_line = strdup(s);
bwa_rg_id = 0;