Commit 9ce640c8 authored by Charles Plessy's avatar Charles Plessy

Imported Upstream version 0.6.0

parent a01f6ca1
......@@ -3,14 +3,14 @@ CXX= g++
CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64
OBJS= utils.o bwt.o bwtio.o bwtaln.o bwtgap.o is.o \
bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \
OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \
is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \
bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o bamlite.o
bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread -Lbwt_gen -lbwtgen
LIBS= -lm -lz -lpthread
SUBDIRS= . bwt_gen
.SUFFIXES:.c .o .cc
......@@ -22,21 +22,11 @@ SUBDIRS= . bwt_gen
all:$(PROG)
lib-recur all-recur clean-recur cleanlocal-recur install-recur:
@target=`echo $@ | sed s/-recur//`; \
wdir=`pwd`; \
list='$(SUBDIRS)'; for subdir in $$list; do \
cd $$subdir; \
$(MAKE) CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
INCLUDES="$(INCLUDES)" $$target || exit 1; \
cd $$wdir; \
done;
lib:
bwa:lib-recur $(OBJS) main.o
bwa:$(OBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(OBJS) main.o -o $@ $(LIBS)
QSufSort.o:QSufSort.h
bwt.o:bwt.h
bwtio.o:bwt.h
bwtaln.o:bwt.h bwtaln.h kseq.h
......@@ -44,12 +34,11 @@ bwt1away.o:bwt.h bwtaln.h
bwt2fmv.o:bwt.h
bntseq.o:bntseq.h
bwtgap.o:bwtgap.h bwtaln.h bwt.h
fastmap:bwt.h
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
bwtsw2_main.o:bwtsw2.h
cleanlocal:
clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a
clean:cleanlocal-recur
Release 0.5.10 and 0.6.0 (12 November, 2011)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The 0.6.0 release comes with two major changes. Firstly, the index data
structure has been changed to support genomes longer than 4GB. The forward and
reverse backward genome is now integrated in one index. This change speeds up
BWA-short by about 20% and BWA-SW by 90% with the mapping accuracy largely
unchanged. The tradeoff is that BWA requires more memory, but this is the price almost
all mappers that index the genome have to pay.
Secondly, BWA-SW in 0.6.0 now works with paired-end data. It is more accurate
for highly unique reads and more robust to long indels and structural
variations. However, BWA-short still has an edge for reads with many suboptimal
hits. It is not yet known which algorithm is best for variant calling.
0.5.10 is a bugfix release only and is likely to be the last release in the 0.5
branch unless I find critical bugs in future.
Other notable changes:
* Added the `fastmap' command that finds super-maximal exact matches. It does
not give the final alignment, but runs much faster. It can be a building
block for other alignment algorithms. [0.6.0 only]
* Output the timing information before BWA exits. This also tells users that
the task has been finished instead of being killed or aborted. [0.6.0 only]
* Sped up multi-threading when using many (>20) CPU cores.
* Check I/O error.
* Increased the maximum barcode length to 63bp.
* Automatically choose the indexing algorithm.
* Bugfix: very rare segfault due to an uninitialized variable. The bug also
affects the placement of suboptimal alignments. The effect is very minor.
This release involves quite a lot of tricky changes. Although it has been
tested on a few data sets, subtle bugs may still be hidden. It is *NOT*
recommended to use this release in a production pipeline. In the future, however,
BWA-SW may be better as reads continue to get longer. I would encourage users
to try the 0.6 release. I would also like to hear about users' experience. Thank
you.
(0.6.0: 12 November 2011, r85)
Beta Release 0.5.9 (24 January, 2011)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
......@@ -32,63 +32,36 @@
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include "bwt_gen.h"
#include "QSufSort.h"
// Static functions
static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
const int highestPos, const int numSortedChar);
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
const int highestPos, const int numSortedChar);
static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
const int highestPos, const int numSortedChar);
static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize);
static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated);
// from MiscUtilities.c
static unsigned int leadingZero(const unsigned int input) {
unsigned int l;
const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
if (input & 0xFFFF0000) {
if (input & 0xFF000000) {
l = leadingZero8bit[input >> 24];
} else {
l = 8 + leadingZero8bit[input >> 16];
}
} else {
if (input & 0x0000FF00) {
l = 16 + leadingZero8bit[input >> 8];
} else {
l = 24 + leadingZero8bit[input];
}
}
return l;
#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) )
#define med3(a, b, c) ( a<b ? (b<c ? b : a<c ? c : a) : (b>c ? b : a>c ? c : a))
#define swap(a, b, t); t = a; a = b; b = t;
}
// Static functions
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar);
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize);
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated);
/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
contents of x[n] is disregarded, the n-th symbol being regarded as
end-of-string smaller than all other symbols.*/
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
const int smallestInputSymbol, const int skipTransform) {
int i, j;
int s, negatedSortedGroupLength;
int numSymbolAggregated;
int maxNumInputSymbol;
int numSortedPos = 1;
int newAlphabetSize;
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const int skipTransform)
{
qsint_t i, j;
qsint_t s, negatedSortedGroupLength;
qsint_t numSymbolAggregated;
qsint_t maxNumInputSymbol;
qsint_t numSortedPos = 1;
qsint_t newAlphabetSize;
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
......@@ -102,7 +75,7 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar,
numSortedPos = numSymbolAggregated;
}
while ((int)(I[0]) >= -(int)numChar) {
while ((qsint_t)(I[0]) >= -(qsint_t)numChar) {
i = 0;
negatedSortedGroupLength = 0;
do {
......@@ -126,16 +99,13 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar,
}
numSortedPos *= 2; /* double sorted-depth.*/
}
}
void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) {
int i;
for (i=0; i<=numChar; i++) {
void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar)
{
qsint_t i;
for (i=0; i<=numChar; i++)
I[V[i]] = i + 1;
}
}
/* Sorting routine called for each unsorted group. Sorts the array of integers
......@@ -143,21 +113,14 @@ void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int nu
quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
function is based on Program 7.*/
static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
const int highestPos, const int numSortedChar) {
int a, b, c, d;
int l, m;
int f, v, s, t;
int tmp;
int numItem;
#ifdef DEBUG
if (lowestPos > highestPos) {
fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n");
exit(1);
}
#endif
static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar) {
qsint_t a, b, c, d;
qsint_t l, m;
qsint_t f, v, s, t;
qsint_t tmp;
qsint_t numItem;
numItem = highestPos - lowestPos + 1;
......@@ -171,7 +134,7 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo
a = b = lowestPos;
c = d = highestPos;
while (TRUE) {
while (1) {
while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) {
if (f == v) {
swap(I[a], I[b], tmp);
......@@ -186,9 +149,8 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo
}
c--;
}
if (b > c) {
if (b > c)
break;
}
swap(I[b], I[c], tmp);
b++;
c--;
......@@ -210,9 +172,8 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo
s = b - a;
t = d - c;
if (s > 0) {
if (s > 0)
QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar);
}
// Update group number for equal portion
a = lowestPos + s;
......@@ -223,42 +184,26 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo
I[a] = -1;
} else {
// Unsorted group
for (c=a; c<=b; c++) {
for (c=a; c<=b; c++)
V[I[c]] = b;
}
}
if (t > 0) {
if (t > 0)
QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar);
}
}
/* Algorithm by Bentley & McIlroy.*/
static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos,
const int highestPos, const int numSortedChar) {
int m;
int keyl, keym, keyn;
int key1, key2, key3;
int s;
int numItem;
#ifdef DEBUG
if (lowestPos > highestPos) {
fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n");
exit(1);
}
#endif
static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar) {
numItem = highestPos - lowestPos + 1;
qsint_t m;
qsint_t keyl, keym, keyn;
qsint_t key1, key2, key3;
qsint_t s;
qsint_t numItem;
#ifdef DEBUG
if (numItem <= INSERT_SORT_NUM_ITEM) {
fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n");
exit(1);
}
#endif
numItem = highestPos - lowestPos + 1;
m = lowestPos + numItem / 2;
......@@ -282,39 +227,19 @@ static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int l
}
/* Quadratic sorting method to use for small subarrays. */
static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos,
const int highestPos, const int numSortedChar) {
int i, j;
int tmpKey, tmpPos;
int numItem;
int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
int negativeSortedLength;
int groupNum;
#ifdef DEBUG
if (lowestPos > highestPos) {
fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n");
exit(1);
}
#endif
static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos,
const qsint_t highestPos, const qsint_t numSortedChar)
{
qsint_t i, j;
qsint_t tmpKey, tmpPos;
qsint_t numItem;
qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM];
qsint_t negativeSortedLength;
qsint_t groupNum;
numItem = highestPos - lowestPos + 1;
#ifdef DEBUG
if (numItem > INSERT_SORT_NUM_ITEM) {
fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n");
exit(1);
}
#endif
for (i=0; i<numItem; i++) {
#ifdef DEBUG
if (I[lowestPos + i] < 0) {
fprintf(stderr, "QSufSortInsertSortSplit(): I < 0 in unsorted region!\n");
exit(1);
}
#endif
pos[i] = I[lowestPos + i];
key[i] = V[pos[i] + numSortedChar];
}
......@@ -340,9 +265,8 @@ static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const
if (key[i-1] == key[i]) {
negativeSortedLength = 0;
} else {
if (negativeSortedLength < 0) {
if (negativeSortedLength < 0)
I[i+lowestPos] = negativeSortedLength;
}
groupNum = i + lowestPos - 1;
negativeSortedLength--;
}
......@@ -351,10 +275,8 @@ static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const
I[lowestPos] = pos[0];
V[I[lowestPos]] = groupNum;
if (negativeSortedLength < 0) {
if (negativeSortedLength < 0)
I[lowestPos] = negativeSortedLength;
}
}
/* Bucketsort for first iteration.
......@@ -366,29 +288,28 @@ static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const
Output: x is V and p is I after the initial sorting stage of the refined
suffix sorting algorithm.*/
static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize) {
int i, c;
int d;
int groupNum;
int currentIndex;
static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize)
{
qsint_t i, c;
qsint_t d;
qsint_t groupNum;
qsint_t currentIndex;
// mark linked list empty
for (i=0; i<alphabetSize; i++) {
for (i=0; i<alphabetSize; i++)
I[i] = -1;
}
// insert to linked list
for (i=0; i<=numChar; i++) {
c = V[i];
V[i] = (int)(I[c]);
V[i] = (qsint_t)(I[c]);
I[c] = i;
}
currentIndex = numChar;
for (i=alphabetSize; i>0; i--) {
c = I[i-1];
d = (int)(V[c]);
d = (qsint_t)(V[c]);
groupNum = currentIndex;
V[c] = groupNum;
if (d >= 0) {
......@@ -406,7 +327,6 @@ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int n
}
currentIndex--;
}
}
/* Transforms the alphabet of x by attempting to aggregate several symbols into
......@@ -424,20 +344,20 @@ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int n
Output: Returns an integer j in the range 1...q representing the size of the
new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
set to the number of old symbols grouped into one. Only x[n] is 0.*/
static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) {
int c, i, j;
int a; // numSymbolAggregated
int mask;
int minSymbolInChunk = 0, maxSymbolInChunk = 0;
int newAlphabetSize;
int maxNumInputSymbol, maxNumBit, maxSymbol;
static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated)
{
qsint_t c, i, j;
qsint_t a; // numSymbolAggregated
qsint_t mask;
qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0;
qsint_t newAlphabetSize;
qsint_t maxNumInputSymbol, maxNumBit, maxSymbol;
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol);
maxSymbol = INT_MAX >> maxNumBit;
for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit;
maxSymbol = QSINT_MAX >> maxNumBit;
c = maxNumInputSymbol;
for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) {
......@@ -449,18 +369,9 @@ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int num
mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/
V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/
#ifdef DEBUG
// Section of code for maxSymbolInChunk > numChar removed!
if (maxSymbolInChunk > numChar) {
fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n");
exit(1);
}
#endif
/* bucketing possible, compact alphabet.*/
for (i=0; i<=maxSymbolInChunk; i++) {
for (i=0; i<=maxSymbolInChunk; i++)
I[i] = 0; /* zero transformation table.*/
}
c = minSymbolInChunk;
for (i=a; i<=numChar; i++) {
I[c] = 1; /* mark used chunk symbol.*/
......@@ -491,6 +402,4 @@ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int num
*numSymbolAggregated = a;
return newAlphabetSize;
}
......@@ -29,12 +29,17 @@
#ifndef __QSUFSORT_H__
#define __QSUFSORT_H__
#include <stdint.h>
#define KEY(V, I, p, h) ( V[ I[p] + h ] )
#define INSERT_SORT_NUM_ITEM 16
void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol,
const int smallestInputSymbol, const int skipTransform);
void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar);
typedef int64_t qsint_t;
#define QSINT_MAX INT64_MAX
void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol,
const qsint_t smallestInputSymbol, const int skipTransform);
void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar);
#endif
......@@ -163,16 +163,67 @@ void bns_destroy(bntseq_t *bns)
}
}
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
/* Store the 2-bit code `c` for base index `l` in the packed array `pac`.
 * Four bases share one byte (byte index l>>2); the shift (~l & 3) << 1 puts
 * base l%4 == 0 in the two most significant bits and base l%4 == 3 in the two
 * least significant bits. The value is OR-ed in, so the target bits must
 * already be zero for the stored code to equal `c`. */
#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
/* Read back the 2-bit code of base index `l` from `pac`: shift the containing
 * byte right by the same amount used by _set_pac, then mask with 3. */
#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
/* Abort with a diagnostic when an allocation made by add1() failed; otherwise
 * hand the pointer back unchanged so the call can be used inline. */
static void *add1_check_alloc(void *ptr)
{
	if (ptr == NULL) {
		fprintf(stderr, "[add1] out of memory\n");
		abort();
	}
	return ptr;
}

/* Append one sequence to the annotation tables of `bns` and to the 2-bit
 * packed buffer `pac`.
 *
 *   seq      sequence record to add (name, optional comment, bases)
 *   bns      container updated in place: anns/n_seqs, ambs/n_holes, l_pac
 *   pac      packed buffer; may be moved by realloc
 *   m_pac    capacity of `pac` in bases; doubled whenever l_pac reaches it
 *   m_seqs   capacity of bns->anns in entries; doubled when full
 *   m_holes  capacity of bns->ambs in entries; doubled when full
 *   q        in/out cursor to the most recently opened ambiguity run, so that
 *            a contiguous run of the same ambiguous character extends it
 *
 * Returns the (possibly relocated) `pac` pointer; the caller must use the
 * returned value from then on. Allocation failures terminate the program
 * instead of dereferencing a NULL pointer.
 */
static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
{
	bntann1_t *p;
	int i, lasts;
	if (bns->n_seqs == *m_seqs) { // annotation array full: double it
		*m_seqs <<= 1;
		bns->anns = add1_check_alloc(realloc(bns->anns, *m_seqs * sizeof(bntann1_t)));
	}
	p = bns->anns + bns->n_seqs;
	p->name = add1_check_alloc(strdup((char*)seq->name.s));
	p->anno = add1_check_alloc(seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"));
	p->gi = 0; p->len = seq->seq.l;
	// sequences are laid out back to back, so this one starts where the previous one ended
	p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
	p->n_ambs = 0;
	for (i = lasts = 0; i < seq->seq.l; ++i) {
		// Index through unsigned char: with a signed plain char, a byte >= 0x80
		// would otherwise become a negative table index.
		int c = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (c >= 4) { // N
			if (lasts == seq->seq.s[i]) { // contiguous N
				++(*q)->len;
			} else {
				if (bns->n_holes == *m_holes) { // hole array full: double it
					(*m_holes) <<= 1;
					bns->ambs = add1_check_alloc(realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)));
				}
				// *q is recomputed from bns->ambs after any realloc, so it never dangles here
				*q = bns->ambs + bns->n_holes;
				(*q)->len = 1;
				(*q)->offset = p->offset + i;
				(*q)->amb = seq->seq.s[i];
				++p->n_ambs;
				++bns->n_holes;
			}
		}
		lasts = seq->seq.s[i];
		{ // fill buffer
			if (c >= 4) c = lrand48()&3; // ambiguous base: store a random 2-bit code
			if (bns->l_pac == *m_pac) { // double the pac size
				*m_pac <<= 1;
				pac = add1_check_alloc(realloc(pac, *m_pac/4));
				// _set_pac ORs bits in, so the newly added tail must start zeroed
				memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
			}
			_set_pac(pac, bns->l_pac, c);
			++bns->l_pac;
		}
	}
	++bns->n_seqs;
	return pac;
}
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
{
extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
kseq_t *seq;
char name[1024];
bntseq_t *bns;
uint8_t *pac = 0;
int32_t m_seqs, m_holes;
int64_t ret = -1, m_pac, l;
bntamb1_t *q;
int l_buf;
unsigned char buf[0x10000];
int32_t m_seqs, m_holes, l, i;
int64_t ret = -1;
FILE *fp;
// initialization
......@@ -180,66 +231,26 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
bns->seed = 11; // fixed seed for random generator
srand48(bns->seed);
m_seqs = m_holes = 8;
m_seqs = m_holes = 8; m_pac = 0x10000;
bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
pac = calloc(m_pac/4, 1);
q = bns->ambs;
l_buf = 0;
strcpy(name, prefix); strcat(name, ".pac");
fp = xopen(name, "wb");
memset(buf, 0, 0x10000);
// read sequences
while ((l = kseq_read(seq)) >= 0) {
bntann1_t *p;
int lasts;
if (bns->n_seqs == m_seqs) {
m_seqs <<= 1;
bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
}
p = bns->anns + bns->n_seqs;
p->name = strdup((char*)seq->name.s);
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
p->gi = 0; p->len = l;
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0;
for (i = 0, lasts = 0; i < l; ++i) {
int c = nst_nt4_table[(int)seq->seq.s[i]];
if (c >= 4) { // N
if (lasts == seq->seq.s[i]) { // contiguous N
++q->len;
} else {
if (bns->n_holes == m_holes) {
m_holes <<= 1;
bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
}
q = bns->ambs + bns->n_holes;
q->len = 1;
q->offset = p->offset + i;
q->amb = seq->seq.s[i];
++p->n_ambs;
++bns->n_holes;
}
}
lasts = seq->seq.s[i];
{ // fill buffer
if (c >= 4) c = lrand48()&0x3;
if (l_buf == 0x40000) {
fwrite(buf, 1, 0x10000, fp);
memset(buf, 0, 0x10000);
l_buf = 0;
}
buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
++l_buf;
}
}
++bns->n_seqs;
bns->l_pac += seq->seq.l;
while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
if (!for_only) { // add the reverse complemented sequence
m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
pac = realloc(pac, m_pac/4);
memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
_set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
}
xassert(bns->l_pac, "zero length sequence.");
ret = bns->l_pac;
{ // finalize .pac file
ubyte_t ct;
fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
// the following codes make the pac file size always (l_pac/4+1+1)
if (bns->l_pac % 4 == 0) {
ct = 0;
......@@ -253,51 +264,56 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
bns_dump(bns, prefix);
bns_destroy(bns);
kseq_destroy(seq);
free(pac);
return ret;
}
int bwa_fa2pac(int argc, char *argv[])
{
int c, for_only = 0;
gzFile fp;
if (argc < 2) {
fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
while ((c = getopt(argc, argv, "f")) >= 0) {
switch (c) {
case 'f': for_only = 1; break;
}
}
if (argc == optind) {
fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
return 1;
}
fp = xzopen(argv[1], "r");
bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
fp = xzopen(argv[optind], "r");
bns_fasta2bntseq(fp