Commit 89fdd1c4 authored by Charles Plessy's avatar Charles Plessy

Merge commit 'upstream/0.5.6'

parents dc7f9c4a 723987e1
------------------------------------------------------------------------
r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
M /branches/prog/bwa/NEWS
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/main.c
* bwa-0.5.5-10 (r1302)
* improve max insert size estimate (method suggested by Gerton Lunter)
------------------------------------------------------------------------
r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/main.c
* bwa-0.5.5-9 (r1301)
* improve mapping quality calculation for abnomalous pairs
* fixed a bug in multiple hits
* SOLiD multiple hits should work now
------------------------------------------------------------------------
r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/main.c
* bwa-0.5.5-8 (r1300)
* output kurtosis
------------------------------------------------------------------------
r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/main.c
* bwa-0.5.5-7 (r1299)
* calculate skewness in sampe
* increase min_len in SW to 20
* perform more SW to fix discordant pairs
------------------------------------------------------------------------
r1298 | lh3 | 2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/bwtaln.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/cs2nt.c
M /branches/prog/bwa/main.c
M /branches/prog/bwa/stdaln.h
* bwa-0.5.5-6 (r1297)
* prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya)
------------------------------------------------------------------------
r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines
Changed paths:
M /branches/prog/bwa/solid2fastq.pl
the old fix seems not working!
------------------------------------------------------------------------
r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwa.1
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/main.c
* bwa-0.5.5-5 (r1296)
* fixed a minor issue that the lower bound of insert size is not correctly set.
------------------------------------------------------------------------
r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/bwaseqio.c
M /branches/prog/bwa/main.c
* bwa-0.5.5-4 (r1295)
* fixed a memory leak
* change the behaviour of -n (samse and sampe)
* change the default of -n
------------------------------------------------------------------------
r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/bwaseqio.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/main.c
* bwa-0.5.5-3 (r1294)
* improved multi-hit report
------------------------------------------------------------------------
r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines
Changed paths:
M /branches/prog/bwa/bwape.c
M /branches/prog/bwa/bwase.c
M /branches/prog/bwa/cs2nt.c
M /branches/prog/bwa/main.c
M /branches/prog/bwa/solid2fastq.pl
* bwa-0.5.5-2 (r1293)
* bugfix: truncated quality string
* bugfix: quality -1 in solid->fastq conversion
* bugfix: color reads on the reverse strand is not complemented
------------------------------------------------------------------------
r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines
Changed paths:
M /branches/prog/bwa/bntseq.c
M /branches/prog/bwa/bntseq.h
M /branches/prog/bwa/bwase.c
A /branches/prog/bwa/bwase.h
M /branches/prog/bwa/bwtaln.c
M /branches/prog/bwa/bwtaln.h
M /branches/prog/bwa/bwtsw2_aux.c
M /branches/prog/bwa/main.c
* bwa-0.5.5-1 (r1279)
* incorporate changes from Matt Hanna for Java bindings.
------------------------------------------------------------------------
r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines
Changed paths:
M /branches/prog/bwa/ChangeLog
update ChangeLog
------------------------------------------------------------------------
r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines
Changed paths:
......
Installation Instructions
*************************
Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
2006 Free Software Foundation, Inc.
This file is free documentation; the Free Software Foundation gives
unlimited permission to copy, distribute and modify it.
Basic Installation
==================
Briefly, the shell commands `./configure; make; make install' should
configure, build, and install this package. The following
more-detailed instructions are generic; see the `README' file for
instructions specific to this package.
The `configure' shell script attempts to guess correct values for
various system-dependent variables used during compilation. It uses
those values to create a `Makefile' in each directory of the package.
It may also create one or more `.h' files containing system-dependent
definitions. Finally, it creates a shell script `config.status' that
you can run in the future to recreate the current configuration, and a
file `config.log' containing compiler output (useful mainly for
debugging `configure').
It can also use an optional file (typically called `config.cache'
and enabled with `--cache-file=config.cache' or simply `-C') that saves
the results of its tests to speed up reconfiguring. Caching is
disabled by default to prevent problems with accidental use of stale
cache files.
If you need to do unusual things to compile the package, please try
to figure out how `configure' could check whether to do them, and mail
diffs or instructions to the address given in the `README' so they can
be considered for the next release. If you are using the cache, and at
some point `config.cache' contains results you don't want to keep, you
may remove or edit it.
The file `configure.ac' (or `configure.in') is used to create
`configure' by a program called `autoconf'. You need `configure.ac' if
you want to change it or regenerate `configure' using a newer version
of `autoconf'.
The simplest way to compile this package is:
1. `cd' to the directory containing the package's source code and type
`./configure' to configure the package for your system.
Running `configure' might take a while. While running, it prints
some messages telling which features it is checking for.
2. Type `make' to compile the package.
3. Optionally, type `make check' to run any self-tests that come with
the package.
4. Type `make install' to install the programs and any data files and
documentation.
5. You can remove the program binaries and object files from the
source code directory by typing `make clean'. To also remove the
files that `configure' created (so you can compile the package for
a different kind of computer), type `make distclean'. There is
also a `make maintainer-clean' target, but that is intended mainly
for the package's developers. If you use it, you may have to get
all sorts of other programs in order to regenerate files that came
with the distribution.
Compilers and Options
=====================
Some systems require unusual options for compilation or linking that the
`configure' script does not know about. Run `./configure --help' for
details on some of the pertinent environment variables.
You can give `configure' initial values for configuration parameters
by setting variables in the command line or in the environment. Here
is an example:
./configure CC=c99 CFLAGS=-g LIBS=-lposix
*Note Defining Variables::, for more details.
Compiling For Multiple Architectures
====================================
You can compile the package for more than one kind of computer at the
same time, by placing the object files for each architecture in their
own directory. To do this, you can use GNU `make'. `cd' to the
directory where you want the object files and executables to go and run
the `configure' script. `configure' automatically checks for the
source code in the directory that `configure' is in and in `..'.
With a non-GNU `make', it is safer to compile the package for one
architecture at a time in the source code directory. After you have
installed the package for one architecture, use `make distclean' before
reconfiguring for another architecture.
Installation Names
==================
By default, `make install' installs the package's commands under
`/usr/local/bin', include files under `/usr/local/include', etc. You
can specify an installation prefix other than `/usr/local' by giving
`configure' the option `--prefix=PREFIX'.
You can specify separate installation prefixes for
architecture-specific files and architecture-independent files. If you
pass the option `--exec-prefix=PREFIX' to `configure', the package uses
PREFIX as the prefix for installing programs and libraries.
Documentation and other data files still use the regular prefix.
In addition, if you use an unusual directory layout you can give
options like `--bindir=DIR' to specify different values for particular
kinds of files. Run `configure --help' for a list of the directories
you can set and what kinds of files go in them.
If the package supports it, you can cause programs to be installed
with an extra prefix or suffix on their names by giving `configure' the
option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
Optional Features
=================
Some packages pay attention to `--enable-FEATURE' options to
`configure', where FEATURE indicates an optional part of the package.
They may also pay attention to `--with-PACKAGE' options, where PACKAGE
is something like `gnu-as' or `x' (for the X Window System). The
`README' should mention any `--enable-' and `--with-' options that the
package recognizes.
For packages that use the X Window System, `configure' can usually
find the X include and library files automatically, but if it doesn't,
you can use the `configure' options `--x-includes=DIR' and
`--x-libraries=DIR' to specify their locations.
Specifying the System Type
==========================
There may be some features `configure' cannot figure out automatically,
but needs to determine by the type of machine the package will run on.
Usually, assuming the package is built to be run on the _same_
architectures, `configure' can figure that out, but if it prints a
message saying it cannot guess the machine type, give it the
`--build=TYPE' option. TYPE can either be a short name for the system
type, such as `sun4', or a canonical name which has the form:
CPU-COMPANY-SYSTEM
where SYSTEM can have one of these forms:
OS KERNEL-OS
See the file `config.sub' for the possible values of each field. If
`config.sub' isn't included in this package, then this package doesn't
need to know the machine type.
If you are _building_ compiler tools for cross-compiling, you should
use the option `--target=TYPE' to select the type of system they will
produce code for.
If you want to _use_ a cross compiler, that generates code for a
platform different from the build platform, you should specify the
"host" platform (i.e., that on which the generated programs will
eventually be run) with `--host=TYPE'.
Sharing Defaults
================
If you want to set default values for `configure' scripts to share, you
can create a site shell script called `config.site' that gives default
values for variables like `CC', `cache_file', and `prefix'.
`configure' looks for `PREFIX/share/config.site' if it exists, then
`PREFIX/etc/config.site' if it exists. Or, you can set the
`CONFIG_SITE' environment variable to the location of the site script.
A warning: not all `configure' scripts look for a site script.
Defining Variables
==================
Variables not defined in a site shell script can be set in the
environment passed to `configure'. However, some packages may run
configure again during the build, and the customized values of these
variables may be lost. In order to avoid this problem, you should set
them in the `configure' command line, using `VAR=value'. For example:
./configure CC=/usr/local2/bin/gcc
causes the specified `gcc' to be used as the C compiler (unless it is
overridden in the site shell script).
Unfortunately, this technique does not work for `CONFIG_SHELL' due to
an Autoconf bug. Until the bug is fixed you can use this workaround:
CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
`configure' Invocation
======================
`configure' recognizes the following options to control how it operates.
`--help'
`-h'
Print a summary of the options to `configure', and exit.
`--version'
`-V'
Print the version of Autoconf used to generate the `configure'
script, and exit.
`--cache-file=FILE'
Enable the cache: use and save the results of the tests in FILE,
traditionally `config.cache'. FILE defaults to `/dev/null' to
disable caching.
`--config-cache'
`-C'
Alias for `--cache-file=config.cache'.
`--quiet'
`--silent'
`-q'
Do not print messages saying which checks are being made. To
suppress all normal output, redirect it to `/dev/null' (any error
messages will still be shown).
`--srcdir=DIR'
Look for the package's source code in directory DIR. Usually
`configure' can determine that directory automatically.
`configure' also accepts some other, not widely useful, options. Run
`configure --help' for more details.
## Makefile.am -- Process this file with automake to produce Makefile.in
ACLOCAL_AMFLAGS=-I m4
AM_CFLAGS = -Wall -m64 -fPIC
bin_PROGRAMS = bwa
noinst_LIBRARIES = libbwacore.a
libbwacore_a_SOURCES = utils.c bwt.c bwtio.c bwtaln.c bwtgap.c bntseq.c \
stdaln.c bwaseqio.c bwase.c kstring.c cs2nt.c
bwa_SOURCES = is.c bwtmisc.c bwtindex.c simple_dp.c bwape.c \
bwtsw2_core.c bwtsw2_main.c bwtsw2_aux.c bwt_lite.c bwtsw2_chain.c \
main.c
bwa_LDADD = bwt_gen/libbwtgen.a libbwacore.a
man_MANS= bwa.1
SUBDIRS= bwt_gen .
Beta Release 0.5.6 (10 Feburary, 2010)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Notable changes in bwa-short:
* Report multiple hits in the SAM format at a new tag XA encoded as:
(chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has
4 or fewer hits, they will all be reported; if a read in a anomalous
pair has 11 or fewer hits, all of them will be reported.
* Perform Smith-Waterman alignment also for anomalous read pairs when
both ends have quality higher than 17. This reduces false positives
for some SV discovery algorithms.
* Do not report "weird pairing" when the insert size distribution is
too fat or has a mean close to zero.
* If a read is bridging two adjacent chromsomes, flag it as unmapped.
* Fixed a small but long existing memory leak in paired-end mapping.
* Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly
parsed by solid2fastq.pl; b) truncated quality string is resolved; c)
SOLiD read mapped to the reverse strand is complemented.
* Bwa now calculates skewness and kurtosis of the insert size
distribution.
* Deploy a Bayesian method to estimate the maximum distance for a read
pair considered to be paired properly. The method is proposed by
Gerton Lunter, but bwa only implements a simplified version.
* Export more functions for Java bindings, by Matt Hanna (See:
http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings)
* Abstract bwa CIGAR for further extension, by Rodrigo Goya.
(0.5.6: 10 Feburary 2010, r1303)
Beta Release 0.5.5 (10 November, 2009)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
#!/bin/sh
autoreconf -vi
......@@ -85,7 +85,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
}
}
bntseq_t *bns_restore(const char *prefix)
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{
char str[1024];
FILE *fp;
......@@ -94,8 +94,7 @@ bntseq_t *bns_restore(const char *prefix)
int i;
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
{ // read .ann
strcpy(str, prefix); strcat(str, ".ann");
fp = xopen(str, "r");
fp = xopen(ann_filename, "r");
fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
bns->l_pac = xx;
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
......@@ -120,8 +119,7 @@ bntseq_t *bns_restore(const char *prefix)
{ // read .amb
int64_t l_pac;
int32_t n_seqs;
strcpy(str, prefix); strcat(str, ".amb");
fp = xopen(str, "r");
fp = xopen(amb_filename, "r");
fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
l_pac = xx;
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
......@@ -135,12 +133,20 @@ bntseq_t *bns_restore(const char *prefix)
fclose(fp);
}
{ // open .pac
strcpy(str, prefix); strcat(str, ".pac");
bns->fp_pac = xopen(str, "rb");
bns->fp_pac = xopen(pac_filename, "rb");
}
return bns;
}
bntseq_t *bns_restore(const char *prefix)
{
char ann_filename[1024], amb_filename[1024], pac_filename[1024];
strcat(strcpy(ann_filename, prefix), ".ann");
strcat(strcpy(amb_filename, prefix), ".amb");
strcat(strcpy(pac_filename, prefix), ".pac");
return bns_restore_core(ann_filename, amb_filename, pac_filename);
}
void bns_destroy(bntseq_t *bns)
{
if (bns == 0) return;
......
......@@ -68,6 +68,7 @@ extern "C" {
void bns_dump(const bntseq_t *bns, const char *prefix);
bntseq_t *bns_restore(const char *prefix);
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix);
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq);
......
.TH bwa 1 "10 November 2009" "bwa-0.5.5" "Bioinformatics tools"
.TH bwa 1 "10 Feburuary 2010" "bwa-0.5.6" "Bioinformatics tools"
.SH NAME
.PP
bwa - Burrows-Wheeler Alignment Tool
......@@ -12,7 +12,7 @@ bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
.PP
bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
.PP
bwa dbwtsw database.fasta long_read.fastq > aln.sam
bwa bwasw database.fasta long_read.fastq > aln.sam
.SH DESCRIPTION
.PP
......@@ -161,25 +161,21 @@ read length. [0]
bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
Generate alignments in the SAM format given single-end reads. Repetitive
hits will be randomly chosen. When
.B -n
is applied, multiple positions will be printed in a concise format. The
`>' line in the output shows the read name, the number of printed hits
and the number of stored hits; the following lines give the chromosomal
coordinates and the edit distance.
hits will be randomly chosen.
.B OPTIONS:
.RS
.TP 10
.B -n INT
Output up to INT top hits. Value -1 to disable outputting multiple
hits. [-1]
Maximum number of alignments to output in the XA tag for reads paired
properly. If a read has more than INT hits, the XA tag will not be
written. [3]
.RE
.TP
.B sampe
bwa sampe [-a maxInsSize] [-o maxOcc] [-P] <in.db.fasta> <in1.sai> <in2.sai>
<in1.fq> <in2.fq> > <out.sam>
bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis]
[-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
Generate alignments in the SAM format given paired-end reads. Repetitive
read pairs will be placed randomly.
......@@ -201,11 +197,21 @@ faster pairing. [100000]
Load the entire FM-index into memory to reduce disk operations
(base-space reads only). With this option, at least 1.25N bytes of
memory are required, where N is the length of the genome.
.TP
.B -n INT
Maximum number of alignments to output in the XA tag for reads paired
properly. If a read has more than INT hits, the XA tag will not be
written. [3]
.TP
.B -N INT
Maximum number of alignments to output in the XA tag for disconcordant
read pairs (excluding singletons). If a read has more than INT hits, the
XA tag will not be written. [10]
.RE
.TP
.B dbwtsw
bwa dbwtsw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
.B bwasw
bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N
nHspRev] [-c thresCoef] <in.db.fasta> <in.fq>
......@@ -327,6 +333,7 @@ XM Number of mismatches in the alignment
XO Number of gap opens
XG Number of gap extentions
XT Type: Unique/Repeat/N/Mate-sw
XA Alternative hits; format: (chr,pos,CIGAR,NM;)*
_
XS Suboptimal alignment score
XF Support from forward/reverse alignment
......@@ -368,12 +375,27 @@ behaviour will be avoided in future releases.
By default, if the best hit is no so repetitive (controlled by -R), BWA
also finds all hits contains one more mismatch; otherwise, BWA finds all
equally best hits only. Base quality is NOT considered in evaluating
hits. In pairing, BWA searches, among the found hits under the
constraint of the
.I maxOcc
option, for pairs within
.I maxInsSize
and with proper orientation.
hits. In paired-end alignment, BWA pairs all hits it found. It further
performs Smith-Waterman alignment for unmapped reads with mates mapped
to rescue mapped mates, and for high-quality anomalous pairs to fix
potential alignment errors.
.SS Estimating Insert Size Distribution
.PP
BWA estimates the insert size distribution per 256*1024 read pairs. It
first collects pairs of reads with both ends mapped with a single-end
quality 20 or higher and then calculates median (Q2), lower and higher
quartile (Q1 and Q3). It estimates the mean and the variance of the
insert size distribution from pairs whose insert sizes are within
interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
considered to be properly paired (SAM flag 0x2) is calculated by solving
equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the
standard error of the insert size distribution, L is the length of the
genome, p0 is prior of anomalous pair and Phi() is the standard
cumulative distribution function. For mapping Illumina short-insert
reads to the human genome, x is about 6-7 sigma away from the
mean. Quartiles, mean, variance and x will be printed to the standard
error output.
.SS Memory Requirement
.PP
......@@ -417,7 +439,7 @@ usually much faster.
.SH NOTES ON LONG-READ ALIGNMENT
.PP
Command
.B `dbwtsw'
.B `bwasw'
is designed for long-read alignment. The algorithm behind, BWA-SW, is
similar to BWT-SW, but does not guarantee to find all local hits due to
the heuristic acceleration. It tends to be faster and more accurate if
......@@ -459,7 +481,12 @@ If you use the short-read alignment component, please cite the following
paper:
.PP
Li H. and Durbin R. (2009) Fast and accurate short read alignment with
Burrows-Wheeler transform. Bioinformatics, 25, 1754-60.
Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. [PMID: 19451168]
.PP
If you use the long-read component (BWA-SW), please cite:
.PP
Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
Burrows-Wheeler transform. Bioinformatics. [PMID: 20080505]
.SH HISTORY
BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef BWASE_H
#define BWASE_H
#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"
#ifdef __cplusplus
extern "C" {
#endif
// Initialize mapping tables in the bwa single-end mapper.
void bwase_initialize();
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate position of the sequence to an actual placement for the sequence.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Calculate the end position of a read given a certain sequence.
int64_t pos_end(const bwa_seq_t *p);
#ifdef __cplusplus
}
#endif
#endif // BWASE_H
......@@ -108,11 +108,13 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
int i;
int i, j;
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
for (j = 0; j < p->n_multi; ++j)
if (p->multi[j].cigar) free(p->multi[j].cigar);
free(p->name);
free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md);
free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
free(p->cigar);
}
free(seqs);
......
## Makefile.am -- Process this file with automake to produce Makefile.in
AM_CFLAGS = -g -Wall -O2 -m64
noinst_LIBRARIES = libbwtgen.a
libbwtgen_a_SOURCES = QSufSort.c bwt_gen.c
......@@ -5,6 +5,9 @@
#include <string.h>
#include <time.h>
#include <stdint.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "bwtaln.h"
#include "bwtgap.h"
#include "utils.h"
......@@ -74,7 +77,7 @@ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_wid
return bid;
}
static void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
int i, max_l = 0, max_len;
gap_stack_t *stack;
......@@ -293,3 +296,18 @@ int bwa_aln(int argc, char *argv[])
free(opt);
return 0;
}
/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t,
__cigar_op and __cigar_len while keeping stdaln stand alone */
bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
{
uint32_t *cigar32;
bwa_cigar_t *cigar;
int i;
cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar);
cigar = (bwa_cigar_t*)cigar32;
for (i = 0; i < *n_cigar; ++i)
cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) );
return cigar;
}
......@@ -37,6 +37,23 @@ typedef struct {
int score;
} bwt_aln1_t;
typedef uint16_t bwa_cigar_t;
/* rgoya: If changing order of bytes, beware of operations like:
* s->cigar[0] += s->full_len - s->len;
*/
#define CIGAR_OP_SHIFT 14
#define CIGAR_LN_MASK 0x3fff
#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT)
#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK)
#define __cigar_create(__op, __len) ((__op)<<CIGAR_OP_SHIFT | (__len))
typedef struct {
uint32_t pos;
uint32_t n_cigar:15, gap:8, mm:8, strand:1;
bwa_cigar_t *cigar;
} bwt_multi1_t;
typedef struct {
char *name;
ubyte_t *seq, *rseq, *qual;
......@@ -47,11 +64,14 @@ typedef struct {
// alignments in SA coordinates
int n_aln;
bwt_aln1_t *aln;
// multiple hits
int n_multi;
bwt_multi1_t *multi;
// alignment information
bwtint_t sa, pos;
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
int n_cigar;
uint16_t *cigar;
bwa_cigar_t *cigar;
// for multi-threading only
int tid;
// NM an