Commit ec169025 authored by Ognyan Kulev's avatar Ognyan Kulev

Imported Upstream version 0.7.5a

parent 947c8141
CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o \
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa bwamem-lite
PROG= bwa
INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
......@@ -17,40 +18,60 @@ SUBDIRS= .
.c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
.cc.o:
$(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
ksw.o:ksw.h
kstring.o:kstring.h
utils.o:utils.h ksort.h kseq.h
bntseq.o:bntseq.h
bwt.o:bwt.h utils.h
bwa.o:bwa.h bwt.h bntseq.h
bwamem.o:ksw.h kbtree.h ksort.h kvec.h kstring.h utils.h bwamem.h
bwamem_pair.o:ksw.h kvec.h kstring.h utils.h bwamem.h
QSufSort.o:QSufSort.h
bwt_gen.o:QSufSort.h
fastmap.o:bwt.h bwamem.h
clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a
bwtaln.o:bwt.h bwtaln.h kseq.h
bwtgap.o:bwtgap.h bwtaln.h bwt.h
depend:
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
bwtsw2_main.o:bwtsw2.h
# DO NOT DELETE THIS LINE -- make depend depends on it.
clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a
QSufSort.o: QSufSort.h
bamlite.o: bamlite.h malloc_wrap.h
bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h malloc_wrap.h kseq.h
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h utils.h kbtree.h
bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
bwamem_pair.o: utils.h ksw.h
bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
bwape.o: ksw.h khash.h
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
bwase.o: bwa.h ksw.h
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h
bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
bwtsw2_core.o: khash.h ksort.h
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
bwtsw2_pair.o: malloc_wrap.h ksw.h
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
is.o: malloc_wrap.h
kopen.o: malloc_wrap.h
kstring.o: kstring.h malloc_wrap.h
ksw.o: ksw.h malloc_wrap.h
main.o: utils.h
malloc_wrap.o: malloc_wrap.h
pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
Release 0.7.5a (30 May, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare
cases.
(0.7.5a: 30 May 2013, r405)
Release 0.7.5 (29 May, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes in all components:
* Improved error checking on memory allocation and file I/O. Patches provided
by Rob Davies.
* Updated README.
* Bugfix: return code is zero upon errors.
Changes in BWA-MEM:
* Changed the way a chimeric alignment is reported (conforming to the upcoming
SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired
or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100
bits. All the other hits part of the chimeric alignment will use hard
clipping and be marked with 0x800 if option "-M" is not in use, or marked
with 0x100 otherwise.
* Other hits part of a chimeric alignment are now reported in the SA tag,
conforming to the SAM spec v1.5.
* Better method for resolving an alignment bridging two or more short
reference sequences. The current strategy maps the query to the reference
sequence that covers the middle point of the alignment. For most
applications, this change has no effects.
Changes in BWA-backtrack:
* Added a magic number to .sai files. This prevents samse/sampe from reading
corrupted .sai (e.g. a .sai file containing LSF log) or incompatible .sai
generated by a different version of bwa.
* Bugfix: alignments in the XA:Z: tag were wrong.
* Keep track of #ins and #del during backtracking. This simplifies the code
and reduces errors in rare corner cases. I should have done this in the
early days of bwa.
In addition, if you use BWA-MEM or the fastmap command of BWA, please cite:
- Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs
with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN].
Thank you.
(0.7.5: 29 May 2013, r404)
Release 0.7.4 (23 April, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This is a bugfix release. Most of bugs are considered to be minor which only
occur very rarely.
* Bugfix: wrong CIGAR when a query sequence bridges three or more target
sequences. This only happens when aligning reads to short assembly contigs.
* Bugfix: leading "D" operator in CIGAR.
* Extend more seeds for better alignment around tandem repeats. This is also
a cause of the leading "D" operator in CIGAR.
* Bugfix: SSE2-SSW may occasionally find incorrect query starting position
around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and
a wrong CIGAR in BWA.
* Bugfix: clipping penalty does not work as is intended when there is a gap
towards the end of a read.
* Fixed an issue caused by a bug in the libc from Mac/Darwin. In Darwin,
fread() is unable to read a data block longer than 2GB due to an integer
overflow bug in its implementation.
Since version 0.7.4, BWA-MEM is considered to reach similar stability to
BWA-backtrack for short-read mapping.
(0.7.4: 23 April, r385)
Release 0.7.3a (15 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed
in another corner case.
(0.7.3a: 15 March 2013, r367)
Release 0.7.3 (15 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes to BWA-MEM:
* Bugfix: pairing score is inaccurate when option -A does not take the default
value. This is a very minor issue even if it happens.
* Bugfix: occasionally wrong CIGAR. This happens when in the alignment there
is a 1bp deletion and a 1bp insertion which are close to the end of the
reads, and there are no other substitutions or indels. BWA-MEM would not do
a gapped alignment due to the bug.
* New feature: output other non-overlapping alignments in the XP tag such that
we can see the entire picture of alignment from one SAM line. XP gives the
position, CIGAR, NM and mapQ of each aligned subsequence of the query.
BWA-MEM has been used to align ~300Gbp 100-700bp SE/PE reads. SNP/indel calling
has also been evaluated on part of these data. BWA-MEM generally gives better
pre-filtered SNP calls than BWA. No significant issues have been observed since
0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release)
are still possible. If you find potential issues, please send bug reports to
<bio-bwa-help@lists.sourceforge.net> (free registration required).
In addition, more detailed description of the BWA-MEM algorithm can be found at
<https://github.com/lh3/mem-paper>.
(0.7.3: 15 March 2013, r366)
Release 0.7.2 (9 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition,
flagging `properly paired' also gets improved a little.
(0.7.2: 9 March 2013, r351)
Release 0.7.1 (8 March, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes to BWA-MEM:
* Bugfix: rare segmentation fault caused by a partial hit to the end of the
last sequence.
* Bugfix: occasional mis-pairing given an interleaved fastq.
* Bugfix: wrong mate information when the mate is unmapped. SAM generated by
BWA-MEM can now be validated with Picard.
* Improved the performance and accuracy for ultra-long query sequences.
Short-read alignment is not affected.
Changes to other components:
* In BWA-backtrack and BWA-SW, replaced the code for global alignment,
Smith-Waterman and SW extension. The performance and accuracy of the two
algorithms stay the same.
* Added an experimental subcommand to merge overlapping paired ends. The
algorithm is very conservative: it may miss true overlaps but rarely makes
mistakes.
An important note is that like BWA-SW, BWA-MEM may output multiple primary
alignments for a read, which may cause problems to some tools. For aligning
sequence reads, it is advised to use `-M' to flag extra hits as secondary. This
option is not the default because multiple primary alignments are theoretically
possible in sequence alignment.
(0.7.1: 8 March 2013, r347)
Beta Release 0.7.0 (28 Feburary, 2013)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
Released packages can be downloaded from SourceForge.net:
http://sourceforge.net/projects/bio-bwa/files/
Introduction and FAQ are available at:
http://bio-bwa.sourceforge.net
Manual page at:
http://bio-bwa.sourceforge.net/bwa.shtml
Mailing list:
bio-bwa-help@lists.sourceforge.net
To sign up:
http://sourceforge.net/mail/?group_id=276243
Publications (Open Access):
http://www.ncbi.nlm.nih.gov/pubmed/20080505
http://www.ncbi.nlm.nih.gov/pubmed/19451168
Incomplete list of citations (via HubMed.org):
http://www.hubmed.org/references.cgi?uids=20080505
http://www.hubmed.org/references.cgi?uids=19451168
Related projects:
http://pbwa.sourceforge.net/
http://www.many-core.group.cam.ac.uk/projects/lam.shtml
http://biodoop-seal.sourceforge.net/
http://gitorious.org/bwa-cuda
###Getting started
git clone https://github.com/lh3/bwa.git
cd bwa; make
./bwa index ref.fa
./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz
###Introduction
BWA is a software package for mapping low-divergent sequences against a large
reference genome, such as the human genome. It consists of three algorithms:
BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
sequence reads up to 100bp, while the rest two for longer sequences ranged from
70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as the support of
long reads and chimeric alignment, but BWA-MEM, which is the latest, is
generally recommended for high-quality queries as it is faster and more
accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp
Illumina reads.
For all the algorithms, BWA first needs to construct the FM-index for the
reference genome (the **index** command). Alignment algorithms are invoked with
different sub-commands: **aln/samse/sampe** for BWA-backtrack,
**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm.
###Availability
BWA is released under [GPLv3][1]. The latest souce code is [freely
available][2] at github. Released packages can [be downloaded ][3] at
SourceForge. After you acquire the source code, simply use `make` to compile
and copy the single executable `bwa` to the destination you want.
###Seeking helps
The detailed usage is described in the man page available together with the
source code. You can use `man ./bwa.1` to view the man page in a terminal. The
[HTML version][4] of the man page can be found at the [BWA website][5]. If you
have questions about BWA, you may [sign up the mailing list][6] and then send
the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions
in forums such as [BioStar][8] and [SEQanswers][9].
###Citing BWA
* Li H. and Durbin R. (2009) Fast and accurate short read alignment with
Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID:
[19451168][10]]. (if you use the BWA-backtrack algorithm)
* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID:
[20080505][11]]. (if you use the BWA-SW algorithm)
* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs
with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM
algorithm or the **fastmap** command, or want to cite the whole BWA package)
Please note that the last reference is a preprint hosted at [arXiv.org][13]. I
do not have plan to submit it to a peer-reviewed journal in the near future.
[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
[2]: https://github.com/lh3/bwa
[3]: http://sourceforge.net/projects/bio-bwa/files/
[4]: http://bio-bwa.sourceforge.net/bwa.shtml
[5]: http://bio-bwa.sourceforge.net/
[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help
[7]: mailto:bio-bwa-help@sourceforge.net
[8]: http://biostars.org
[9]: http://seqanswers.com/
[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168
[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505
[12]: http://arxiv.org/abs/1303.3997
[13]: http://arxiv.org/
......@@ -2,8 +2,13 @@
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "bamlite.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
/*********************
* from bam_endian.c *
*********************/
......@@ -62,11 +67,11 @@ void bam_header_destroy(bam_header_t *header)
if (header == 0) return;
if (header->target_name) {
for (i = 0; i < header->n_targets; ++i)
free(header->target_name[i]);
if (header->target_name[i]) free(header->target_name[i]);
if (header->target_len) free(header->target_len);
free(header->target_name);
free(header->target_len);
}
free(header->text);
if (header->text) free(header->text);
free(header);
}
......@@ -80,28 +85,33 @@ bam_header_t *bam_header_read(bamFile fp)
magic_len = bam_read(fp, buf, 4);
if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
return 0;
return NULL;
}
header = bam_header_init();
// read plain text and the number of reference sequences
bam_read(fp, &header->l_text, 4);
if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
if (bam_is_be) bam_swap_endian_4p(&header->l_text);
header->text = (char*)calloc(header->l_text + 1, 1);
bam_read(fp, header->text, header->l_text);
bam_read(fp, &header->n_targets, 4);
if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail;
if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
// read reference sequence names and lengths
header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
header->target_len = (uint32_t*)calloc(header->n_targets, 4);
for (i = 0; i != header->n_targets; ++i) {
bam_read(fp, &name_len, 4);
if (bam_read(fp, &name_len, 4) != 4) goto fail;
if (bam_is_be) bam_swap_endian_4p(&name_len);
header->target_name[i] = (char*)calloc(name_len, 1);
bam_read(fp, header->target_name[i], name_len);
bam_read(fp, &header->target_len[i], 4);
if (bam_read(fp, header->target_name[i], name_len) != name_len) {
goto fail;
}
if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
}
return header;
fail:
bam_header_destroy(header);
return NULL;
}
static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
......@@ -153,3 +163,48 @@ int bam_read1(bamFile fp, bam1_t *b)
if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
return 4 + block_len;
}
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
// Versions of gzopen, gzread and gzclose that print up error messages
gzFile bamlite_gzopen(const char *fn, const char *mode) {
gzFile fp;
if (strcmp(fn, "-") == 0) {
fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode);
if (!fp) {
fprintf(stderr, "Couldn't open %s : %s",
(strstr(mode, "r"))? "stdin" : "stdout",
strerror(errno));
}
return fp;
}
if ((fp = gzopen(fn, mode)) == 0) {
fprintf(stderr, "Couldn't open %s : %s\n", fn,
errno ? strerror(errno) : "Out of memory");
}
return fp;
}
int bamlite_gzread(gzFile file, void *ptr, unsigned int len) {
int ret = gzread(file, ptr, len);
if (ret < 0) {
int errnum = 0;
const char *msg = gzerror(file, &errnum);
fprintf(stderr, "gzread error: %s\n",
Z_ERRNO == errnum ? strerror(errno) : msg);
}
return ret;
}
int bamlite_gzclose(gzFile file) {
int ret = gzclose(file);
if (Z_OK != ret) {
fprintf(stderr, "gzclose error: %s\n",
Z_ERRNO == ret ? strerror(errno) : zError(ret));
}
return ret;
}
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
......@@ -4,11 +4,25 @@
#include <stdint.h>
#include <zlib.h>
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
#define USE_VERBOSE_ZLIB_WRAPPERS
typedef gzFile bamFile;
#define bam_open(fn, mode) gzopen(fn, mode)
#define bam_dopen(fd, mode) gzdopen(fd, mode)
#define bam_close(fp) gzclose(fp)
#define bam_read(fp, buf, size) gzread(fp, buf, size)
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
/* These print error messages on failure */
# define bam_open(fn, mode) bamlite_gzopen(fn, mode)
# define bam_dopen(fd, mode) gzdopen(fd, mode)
# define bam_close(fp) bamlite_gzclose(fp)
# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size)
#else
# define bam_open(fn, mode) gzopen(fn, mode)
# define bam_dopen(fd, mode) gzdopen(fd, mode)
# define bam_close(fp) gzclose(fp)
# define bam_read(fp, buf, size) gzread(fp, buf, size)
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
typedef struct {
int32_t n_targets;
......@@ -87,6 +101,12 @@ extern "C" {
bam_header_t *bam_header_read(bamFile fp);
int bam_read1(bamFile fp, bam1_t *b);
#ifdef USE_VERBOSE_ZLIB_WRAPPERS
gzFile bamlite_gzopen(const char *fn, const char *mode);
int bamlite_gzread(gzFile file, void *ptr, unsigned int len);
int bamlite_gzclose(gzFile file);
#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
#ifdef __cplusplus
}
#endif
......
......@@ -30,13 +30,17 @@
#include <string.h>
#include <zlib.h>
#include <unistd.h>
#include <errno.h>
#include "bntseq.h"
#include "main.h"
#include "utils.h"
#include "kseq.h"
KSEQ_DECLARE(gzFile)
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
......@@ -64,25 +68,27 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
{ // dump .ann
strcpy(str, prefix); strcat(str, ".ann");
fp = xopen(str, "w");
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
for (i = 0; i != bns->n_seqs; ++i) {
bntann1_t *p = bns->anns + i;
fprintf(fp, "%d %s", p->gi, p->name);
if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
else fprintf(fp, "\n");
fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
err_fprintf(fp, "%d %s", p->gi, p->name);
if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno);
else err_fprintf(fp, "\n");
err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
}
fclose(fp);
err_fflush(fp);
err_fclose(fp);
}
{ // dump .amb
strcpy(str, prefix); strcat(str, ".amb");
fp = xopen(str, "w");
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
for (i = 0; i != bns->n_holes; ++i) {
bntamb1_t *p = bns->ambs + i;
fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
}
fclose(fp);
err_fflush(fp);
err_fclose(fp);
}
}
......@@ -90,13 +96,16 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
{
char str[1024];
FILE *fp;
const char *fname;
bntseq_t *bns;
long long xx;
int i;
int scanres;
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
{ // read .ann
fp = xopen(ann_filename, "r");
fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
fp = xopen(fname = ann_filename, "r");
scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
if (scanres != 3) goto badread;
bns->l_pac = xx;
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
for (i = 0; i < bns->n_seqs; ++i) {
......@@ -104,39 +113,54 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
char *q = str;
int c;
// read gi and sequence name
fscanf(fp, "%u%s", &p->gi, str);
scanres = fscanf(fp, "%u%s", &p->gi, str);
if (scanres != 2) goto badread;
p->name = strdup(str);
// read fasta comments
while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
while (c != '\n' && c != EOF) c = fgetc(fp);
if (c == EOF) {
scanres = EOF;
goto badread;
}
*q = 0;
if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
else p->anno = strdup("");
// read the rest
fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
if (scanres != 3) goto badread;
p->offset = xx;
}
fclose(fp);
err_fclose(fp);
}
{ // read .amb
int64_t l_pac;
int32_t n_seqs;
fp = xopen(amb_filename, "r");
fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
fp = xopen(fname = amb_filename, "r");
scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
if (scanres != 3) goto badread;
l_pac = xx;
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0;
for (i = 0; i < bns->n_holes; ++i) {
bntamb1_t *p = bns->ambs + i;
fscanf(fp, "%lld%d%s", &xx, &p->len, str);
scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str);
if (scanres != 3) goto badread;
p->offset = xx;
p->amb = str[0];
}
fclose(fp);
err_fclose(fp);
}
{ // open .pac
bns->fp_pac = xopen(pac_filename, "rb");
}
return bns;
badread:
if (EOF == scanres) {
err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file");
}
err_fatal(__func__, "Parse error reading %s\n", fname);
}
bntseq_t *bns_restore(const char *prefix)
......@@ -153,7 +177,7 @@ void bns_destroy(bntseq_t *bns)
if (bns == 0) return;
else {
int i;
if (bns->fp_pac) fclose(bns->fp_pac);
if (bns->fp_pac) err_fclose(bns->fp_pac);
free(bns->ambs);
for (i = 0; i < bns->n_seqs; ++i) {
free(bns->anns[i].name);
......@@ -251,16 +275,17 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
ret = bns->l_pac;
{ // finalize .pac file
ubyte_t ct;
fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
// the following codes make the pac file size always (l_pac/4+1+1)
if (bns->l_pac % 4 == 0) {
ct = 0;
fwrite(&ct, 1, 1, fp);
err_fwrite(&ct, 1, 1, fp);
}
ct = bns->l_pac % 4;
fwrite(&ct, 1, 1, fp);
err_fwrite(&ct, 1, 1, fp);
// close .pac file
fclose(fp);
err_fflush(fp);
err_fclose(fp);
}
bns_dump(bns, prefix);
bns_destroy(bns);
......@@ -284,7 +309,7 @@ int bwa_fa2pac(int argc, char *argv[])
}
fp = xzopen(argv[optind], "r");
bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
gzclose(fp);
err_gzclose(fp);
return 0;