Skip to content
Commits on Source (10)
r805:
* fixed fastq-mcf 'start' bug for ubuntu 14.04
r780:
* fixed fastq-mcf output
* added support for fastq-multx reading from headers
* bundled entire sparsehash library, changed Makefile to include it
* fastq-join corrected quality in overlapped regions
* added a proper test suite. requires perl & Test::More
* fastq-multx minimum distance
* fastq-join allow insert < read-length
* fastq-mcf corrected low-complexity filtering
r181 - CASAVA purity filtering
r154 - fixed major bug in RMN's that would invalidate reads
r152 - allowed short adapters to work at the 'begin' of reads
......
#
# $Id$
# $Id: Makefile 670 2013-12-13 16:47:07Z earonesty $
CC=g++
PREFIX?=/usr
BINDIR?=$(PREFIX)/bin
CFLAGS?=-O3 -I.
CPPFLAGS?=-O3 -I.
# for debugging:
# CFLAGS?=-g -I.
# CPPFLAGS?=-g -I.
PKG=ea-utils
REL := $(shell svnversion 2>/dev/null | perl -ne 'print $$1 if /(\d+)/' )
REL := $(shell svnversion 2>/dev/null | perl -ne 'print $$1 if /:(\d+)/' )
VER := $(shell grep '%define ver' ${PKG}.spec | perl -ne 'print $$1 if / (\S+) *$$/')
SRC=fastq-clipper.cpp fastq-mcf.cpp fastq-multx.cpp fastq-join.cpp fastq-stats.cpp gcModel.cpp
SRC=fastq-clipper.c fastq-mcf.c fastq-multx.c fastq-join.c fastq-stats.cpp gcModel.c
BIN=fastq-mcf fastq-multx fastq-join fastq-stats fastq-clipper sam-stats varcall
TOOLS=fastx-graph gtf2bed determine-phred randomFQ alc
all: $(BIN) check
all: $(BIN)
debug:
CFLAGS="-g -I." ${MAKE} $(MFLAGS) varcall
CPPFLAGS=-g ${MAKE} $(MFLAGS) varcall
install: $(BIN) $(BINDIR)/fastq-clipper $(BINDIR)/fastq-mcf $(BINDIR)/fastq-multx $(BINDIR)/fastq-join $(BINDIR)/fastq-stats $(BINDIR)/sam-stats $(BINDIR)/varcall $(BINDIR)/fastx-graph $(BINDIR)/determine-phred $(BINDIR)/randomFQ $(BINDIR)/alc
......@@ -37,12 +39,12 @@ dist: getrel $(PKG).${VER}-${REL}.tar.gz
getrel:
grep "${REL}" $(PKG).spec || touch $(PKG).spex
.PHONY: getrel debug $(PKG).spec
.PHONY: getrel debug
$(PKG).spec:
$(PKG).spec: $(PKG).spex
perl -pe 's/%RELEASE%/${REL}/' $(PKG).spex > $(PKG).spec
$(PKG).tar.gz: Makefile $(TOOLS) $(SRC) $(PKG).spec fastq-lib.cpp fastq-lib.h sam-stats.cpp fastq-stats.cpp gcModel.cpp gcModel.h varcall.cpp utils.h README CHANGES sparsehash-2.0.2 samtools/*.c t
$(PKG).tar.gz: Makefile $(TOOLS) $(SRC) $(PKG).spec fastq-lib.cpp fastq-lib.h sam-stats.cpp fastq-stats.cpp gcModel.c gcModel.h varcall.cpp utils.h README CHANGES google sparsehash samtools/*.c
rm -rf $(PKG).${VER}-${REL}
mkdir $(PKG).${VER}-${REL}
mkdir $(PKG).${VER}-${REL}/tidx
......@@ -53,28 +55,29 @@ $(PKG).tar.gz: Makefile $(TOOLS) $(SRC) $(PKG).spec fastq-lib.cpp fastq-lib.h sa
tar --exclude=".svn" -cvzf $(PKG).tar.gz $(PKG).${VER}-${REL}
rm -rf $(PKG).${VER}-${REL}
check: $(BIN)
prove -j 4 t
disttest: $(PKG).tar.gz
rm -rf $(PKG).${VER}-${REL}
tar -xzvf $(PKG).tar.gz
cd $(PKG).${VER}-${REL} && make check
cd $(PKG).${VER}-${REL} && make
rm -rf $(PKG).${VER}-${REL}
$(PKG).${VER}-${REL}.tar.gz: $(PKG).tar.gz
cp $< $@
%: %.cpp fastq-lib.cpp fastq-lib.h sparsehash
$(CC) $(CFLAGS) $< fastq-lib.cpp -o $@
%: %.c fastq-lib.cpp fastq-lib.h
$(CC) $(CFLAGS) fastq-lib.cpp -o $@ $<
%: %.cpp fastq-lib.cpp fastq-lib.h
$(CC) $(CFLAGS) fastq-lib.cpp -o $@ $<
sparsehash: sparsehash-2.0.2
cd sparsehash-2.0.2; ./configure; make
mkdir sparsehash
cp -r sparsehash-2.0.2/src/sparsehash/* sparsehash/
%: %.c gcModel.c gcModel.h
$(CC) $(CFLAGS) gcModel.c -o $@ $<
%: %.cpp gcModel.c gcModel.h
$(CC) $(CFLAGS) gcModel.c -o $@ $<
# why the libbam.a doesn't work? not sure... *.o works
sam-stats: sam-stats.cpp samtools/libbam.a samtools/bam.h fastq-lib.h sparsehash
sam-stats: sam-stats.cpp samtools/libbam.a samtools/bam.h fastq-lib.h
ifeq ($(OS),Windows_NT)
$(CC) $(CFLAGS) samtools/*.o -lz -lpthread -lws2_32 fastq-lib.cpp $< -o $@
else
......@@ -84,18 +87,15 @@ endif
samtools/libbam.a: samtools/*.c samtools/*.h
cd samtools && make libbam.a
ea-bcl2fastq: ea-bcl2fastq.cpp
$(CC) $(CFLAGS) $< -lz -o $@
varcall: varcall.cpp fastq-lib.cpp tidx/tidx-lib.cpp sparsehash
varcall: varcall.cpp fastq-lib.cpp tidx/tidx-lib.cpp
ifeq ($(OS),Windows_NT)
echo varcall: not supported yet
else
$(CC) $(CFLAGS) fastq-lib.cpp tidx/tidx-lib.cpp -o $@ $< -lgsl -lgslcblas
endif
fastq-stats: fastq-stats.cpp fastq-lib.cpp gcModel.cpp sparsehash
$(CC) $(CFLAGS) fastq-lib.cpp gcModel.cpp -o $@ $<
fastq-stats: fastq-stats.cpp fastq-lib.cpp gcModel.c
$(CC) $(CFLAGS) fastq-lib.cpp gcModel.c -o $@ $<
bam-filter: bam-filter.cpp
$(CC) $(CFLAGS) fastq-lib.cpp -o $@ $< -lbamtools
......
......@@ -22,10 +22,6 @@ Variant caller, takes bam or pileup output and does variant calling with advance
REQUIRES:
On Ubuntu:
sudo apt-get install subversion zlib1g-dev libgsl0-dev
For building sam-stats, please install this first!
https://github.com/pezmaster31/bamtools/wiki/Building-and-installing
......
# ea-utils
This project was automatically exported from code.google.com/p/ea-utils and will now be maintained in this repository on GitHub.
To build ea-utils, follow these [build instructions](https://github.com/ExpressionAnalysis/ea-utils/blob/wiki/Compiling.md). Please see http://expressionanalysis.github.io/ea-utils/ for further help information.
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>
#include "bam.h"
#include "bam_endian.h"
#include "kstring.h"
#include "sam_header.h"
int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0;
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
/**************************
* CIGAR related routines *
**************************/
uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
{
int k, end = c->pos;
for (k = 0; k < c->n_cigar; ++k) {
int op = bam_cigar_op(cigar[k]);
int len = bam_cigar_oplen(cigar[k]);
if (op == BAM_CBACK) { // move backward
int l, u, v;
if (k == c->n_cigar - 1) break; // skip trailing 'B'
for (l = k - 1, u = v = 0; l >= 0; --l) {
int op1 = bam_cigar_op(cigar[l]);
int len1 = bam_cigar_oplen(cigar[l]);
if (bam_cigar_type(op1)&1) { // consume query
if (u + len1 >= len) { // stop
if (bam_cigar_type(op1)&2) v += len - u;
break;
} else u += len1;
}
if (bam_cigar_type(op1)&2) v += len1;
}
end = l < 0? c->pos : end - v;
} else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]);
}
return end;
}
int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
{
uint32_t k;
int32_t l = 0;
for (k = 0; k < c->n_cigar; ++k)
if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
l += bam_cigar_oplen(cigar[k]);
return l;
}
/********************
* BAM I/O routines *
********************/
bam_header_t *bam_header_init()
{
bam_is_be = bam_is_big_endian();
return (bam_header_t*)calloc(1, sizeof(bam_header_t));
}
void bam_header_destroy(bam_header_t *header)
{
int32_t i;
extern void bam_destroy_header_hash(bam_header_t *header);
if (header == 0) return;
if (header->target_name) {
for (i = 0; i < header->n_targets; ++i)
free(header->target_name[i]);
free(header->target_name);
free(header->target_len);
}
free(header->text);
if (header->dict) sam_header_free(header->dict);
if (header->rg2lib) sam_tbl_destroy(header->rg2lib);
bam_destroy_header_hash(header);
free(header);
}
bam_header_t *bam_header_read(bamFile fp)
{
bam_header_t *header;
char buf[4];
int magic_len;
int32_t i = 1, name_len;
// check EOF
i = bgzf_check_EOF(fp);
if (i < 0) {
// If the file is a pipe, checking the EOF marker will *always* fail
// with ESPIPE. Suppress the error message in this case.
if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
}
else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
// read "BAM1"
magic_len = bam_read(fp, buf, 4);
if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
return 0;
}
header = bam_header_init();
// read plain text and the number of reference sequences
bam_read(fp, &header->l_text, 4);
if (bam_is_be) bam_swap_endian_4p(&header->l_text);
header->text = (char*)calloc(header->l_text + 1, 1);
bam_read(fp, header->text, header->l_text);
bam_read(fp, &header->n_targets, 4);
if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
// read reference sequence names and lengths
header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
header->target_len = (uint32_t*)calloc(header->n_targets, 4);
for (i = 0; i != header->n_targets; ++i) {
bam_read(fp, &name_len, 4);
if (bam_is_be) bam_swap_endian_4p(&name_len);
header->target_name[i] = (char*)calloc(name_len, 1);
bam_read(fp, header->target_name[i], name_len);
bam_read(fp, &header->target_len[i], 4);
if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
}
return header;
}
int bam_header_write(bamFile fp, const bam_header_t *header)
{
char buf[4];
int32_t i, name_len, x;
// write "BAM1"
strncpy(buf, "BAM\001", 4);
bam_write(fp, buf, 4);
// write plain text and the number of reference sequences
if (bam_is_be) {
x = bam_swap_endian_4(header->l_text);
bam_write(fp, &x, 4);
if (header->l_text) bam_write(fp, header->text, header->l_text);
x = bam_swap_endian_4(header->n_targets);
bam_write(fp, &x, 4);
} else {
bam_write(fp, &header->l_text, 4);
if (header->l_text) bam_write(fp, header->text, header->l_text);
bam_write(fp, &header->n_targets, 4);
}
// write sequence names and lengths
for (i = 0; i != header->n_targets; ++i) {
char *p = header->target_name[i];
name_len = strlen(p) + 1;
if (bam_is_be) {
x = bam_swap_endian_4(name_len);
bam_write(fp, &x, 4);
} else bam_write(fp, &name_len, 4);
bam_write(fp, p, name_len);
if (bam_is_be) {
x = bam_swap_endian_4(header->target_len[i]);
bam_write(fp, &x, 4);
} else bam_write(fp, &header->target_len[i], 4);
}
bgzf_flush(fp);
return 0;
}
static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
{
uint8_t *s;
uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
while (s < data + data_len) {
uint8_t type;
s += 2; // skip key
type = toupper(*s); ++s; // skip type
if (type == 'C' || type == 'A') ++s;
else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
else if (type == 'B') {
int32_t n, Bsize = bam_aux_type2size(*s);
memcpy(&n, s + 1, 4);
if (1 == Bsize) {
} else if (2 == Bsize) {
for (i = 0; i < n; i += 2)
bam_swap_endian_2p(s + 5 + i);
} else if (4 == Bsize) {
for (i = 0; i < n; i += 4)
bam_swap_endian_4p(s + 5 + i);
}
bam_swap_endian_4p(s+1);
}
}
}
int bam_read1(bamFile fp, bam1_t *b)
{
bam1_core_t *c = &b->core;
int32_t block_len, ret, i;
uint32_t x[8];
assert(BAM_CORE_SIZE == 32);
if ((ret = bam_read(fp, &block_len, 4)) != 4) {
if (ret == 0) return -1; // normal end-of-file
else return -2; // truncated
}
if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
if (bam_is_be) {
bam_swap_endian_4p(&block_len);
for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
}
c->tid = x[0]; c->pos = x[1];
c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
c->l_qseq = x[4];
c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
b->data_len = block_len - BAM_CORE_SIZE;
if (b->m_data < b->data_len) {
b->m_data = b->data_len;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
}
if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
if (bam_no_B) bam_remove_B(b);
return 4 + block_len;
}
inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
{
uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y;
int i;
assert(BAM_CORE_SIZE == 32);
x[0] = c->tid;
x[1] = c->pos;
x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
x[4] = c->l_qseq;
x[5] = c->mtid;
x[6] = c->mpos;
x[7] = c->isize;
bgzf_flush_try(fp, 4 + block_len);
if (bam_is_be) {
for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
y = block_len;
bam_write(fp, bam_swap_endian_4p(&y), 4);
swap_endian_data(c, data_len, data);
} else bam_write(fp, &block_len, 4);
bam_write(fp, x, BAM_CORE_SIZE);
bam_write(fp, data, data_len);
if (bam_is_be) swap_endian_data(c, data_len, data);
return 4 + block_len;
}
int bam_write1(bamFile fp, const bam1_t *b)
{
return bam_write1_core(fp, &b->core, b->data_len, b->data);
}
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
{
uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
int i;
const bam1_core_t *c = &b->core;
kstring_t str;
str.l = str.m = 0; str.s = 0;
kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
else { // BAM_OFSTR
for (i = 0; i < 16; ++i)
if ((c->flag & 1<<i) && bam_flag2char_table[i])
kputc(bam_flag2char_table[i], &str);
kputc('\t', &str);
}
if (c->tid < 0) kputsn("*\t", 2, &str);
else {
if (header) kputs(header->target_name[c->tid] , &str);
else kputw(c->tid, &str);
kputc('\t', &str);
}
kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
if (c->n_cigar == 0) kputc('*', &str);
else {
uint32_t *cigar = bam1_cigar(b);
for (i = 0; i < c->n_cigar; ++i) {
kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
kputc(bam_cigar_opchr(cigar[i]), &str);
}
}
kputc('\t', &str);
if (c->mtid < 0) kputsn("*\t", 2, &str);
else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
else {
if (header) kputs(header->target_name[c->mtid], &str);
else kputw(c->mtid, &str);
kputc('\t', &str);
}
kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
if (c->l_qseq) {
for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
kputc('\t', &str);
if (t[0] == 0xff) kputc('*', &str);
else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
} else kputsn("*\t*", 3, &str);
s = bam1_aux(b);
while (s < b->data + b->data_len) {
uint8_t type, key[2];
key[0] = s[0]; key[1] = s[1];
s += 2; type = *s; ++s;
kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; }
else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; }
else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; }
else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
else if (type == 'B') {
uint8_t sub_type = *(s++);
int32_t n;
memcpy(&n, s, 4);
s += 4; // no point to the start of the array
kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing
for (i = 0; i < n; ++i) {
kputc(',', &str);
if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
}
}
}
return str.s;
}
char *bam_format1(const bam_header_t *header, const bam1_t *b)
{
return bam_format1_core(header, b, BAM_OFDEC);
}
void bam_view1(const bam_header_t *header, const bam1_t *b)
{
char *s = bam_format1(header, b);
puts(s);
free(s);
}
int bam_validate1(const bam_header_t *header, const bam1_t *b)
{
char *s;
if (b->core.tid < -1 || b->core.mtid < -1) return 0;
if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0;
if (b->data_len < b->core.l_qname) return 0;
s = memchr(bam1_qname(b), '\0', b->core.l_qname);
if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0;
// FIXME: Other fields could also be checked, especially the auxiliary data
return 1;
}
// FIXME: we should also check the LB tag associated with each alignment
const char *bam_get_library(bam_header_t *h, const bam1_t *b)
{
const uint8_t *rg;
if (h->dict == 0) h->dict = sam_header_parse2(h->text);
if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
rg = bam_aux_get(b, "RG");
return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
}
/************
* Remove B *
************/
int bam_remove_B(bam1_t *b)
{
int i, j, end_j, k, l, no_qual;
uint32_t *cigar, *new_cigar;
uint8_t *seq, *qual, *p;
// test if removal is necessary
if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
cigar = bam1_cigar(b);
for (k = 0; k < b->core.n_cigar; ++k)
if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
if (k == b->core.n_cigar) return 0; // no 'B'
if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
// allocate memory for the new CIGAR
if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
b->m_data = b->data_len + b->core.n_cigar * 4;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
cigar = bam1_cigar(b); // after realloc, cigar may be changed
}
new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
// the core loop
seq = bam1_seq(b); qual = bam1_qual(b);
no_qual = (qual[0] == 0xff); // test whether base quality is available
i = j = 0; end_j = -1;
for (k = l = 0; k < b->core.n_cigar; ++k) {
int op = bam_cigar_op(cigar[k]);
int len = bam_cigar_oplen(cigar[k]);
if (op == BAM_CBACK) { // the backward operation
int t, u;
if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
if (len > j) goto rmB_err; // an excessively long backward
for (t = l - 1, u = 0; t >= 0; --t) { // look back
int op1 = bam_cigar_op(new_cigar[t]);
int len1 = bam_cigar_oplen(new_cigar[t]);
if (bam_cigar_type(op1)&1) { // consume the query
if (u + len1 >= len) { // stop
new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
break;
} else u += len1;
}
}
if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
l = t + 1;
end_j = j; j -= len;
} else { // other CIGAR operations
new_cigar[l++] = cigar[k];
if (bam_cigar_type(op)&1) { // consume the query
if (i != j) { // no need to copy if i == j
int u, c, c0;
for (u = 0; u < len; ++u) { // construct the consensus
c = bam1_seqi(seq, i+u);
if (j + u < end_j) { // in an overlap
c0 = bam1_seqi(seq, j+u);
if (c != c0) { // a mismatch; choose the better base
if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
bam1_seq_seti(seq, j+u, c);
qual[j+u] = qual[i+u] - qual[j+u];
} else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
} else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
} else { // not in an overlap; copy over
bam1_seq_seti(seq, j+u, c);
qual[j+u] = qual[i+u];
}
}
}
i += len, j += len;
}
}
}
if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
// merge adjacent operations if possible
for (k = 1; k < l; ++k)
if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
// kill zero length operations
for (k = i = 0; k < l; ++k)
if (new_cigar[k] >> BAM_CIGAR_SHIFT)
new_cigar[i++] = new_cigar[k];
l = i;
// update b
memcpy(cigar, new_cigar, l * 4); // set CIGAR
p = b->data + b->core.l_qname + l * 4;
memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
memmove(p, qual, j); p += j; // set QUAL
memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
b->data_len = p - b->data; // update record length
return 0;
rmB_err:
b->core.flag |= BAM_FUNMAP;
return -1;
}
#include <ctype.h>
#include "bam.h"
#include "khash.h"
typedef char *str_p;
KHASH_MAP_INIT_STR(s, int)
KHASH_MAP_INIT_STR(r2l, str_p)
void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
{
int ori_len = b->data_len;
b->data_len += 3 + len;
b->l_aux += 3 + len;
if (b->m_data < b->data_len) {
b->m_data = b->data_len;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
}
b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
b->data[ori_len + 2] = type;
memcpy(b->data + ori_len + 3, data, len);
}
uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
{
return bam_aux_get(b, tag);
}
#define __skip_tag(s) do { \
int type = toupper(*(s)); \
++(s); \
if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \
else (s) += bam_aux_type2size(type); \
} while(0)
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
{
uint8_t *s;
int y = tag[0]<<8 | tag[1];
s = bam1_aux(b);
while (s < b->data + b->data_len) {
int x = (int)s[0]<<8 | s[1];
s += 2;
if (x == y) return s;
__skip_tag(s);
}
return 0;
}
// s MUST BE returned by bam_aux_get()
int bam_aux_del(bam1_t *b, uint8_t *s)
{
uint8_t *p, *aux;
aux = bam1_aux(b);
p = s - 2;
__skip_tag(s);
memmove(p, s, b->l_aux - (s - aux));
b->data_len -= s - p;
b->l_aux -= s - p;
return 0;
}
int bam_aux_drop_other(bam1_t *b, uint8_t *s)
{
if (s) {
uint8_t *p, *aux;
aux = bam1_aux(b);
p = s - 2;
__skip_tag(s);
memmove(aux, p, s - p);
b->data_len -= b->l_aux - (s - p);
b->l_aux = s - p;
} else {
b->data_len -= b->l_aux;
b->l_aux = 0;
}
return 0;
}
void bam_init_header_hash(bam_header_t *header)
{
if (header->hash == 0) {
int ret, i;
khiter_t iter;
khash_t(s) *h;
header->hash = h = kh_init(s);
for (i = 0; i < header->n_targets; ++i) {
iter = kh_put(s, h, header->target_name[i], &ret);
kh_value(h, iter) = i;
}
}
}
void bam_destroy_header_hash(bam_header_t *header)
{
if (header->hash)
kh_destroy(s, (khash_t(s)*)header->hash);
}
int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
{
khint_t k;
khash_t(s) *h = (khash_t(s)*)header->hash;
k = kh_get(s, h, seq_name);
return k == kh_end(h)? -1 : kh_value(h, k);
}
int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
{
char *s;
int i, l, k, name_end;
khiter_t iter;
khash_t(s) *h;
bam_init_header_hash(header);
h = (khash_t(s)*)header->hash;
*ref_id = *beg = *end = -1;
name_end = l = strlen(str);
s = (char*)malloc(l+1);
// remove space
for (i = k = 0; i < l; ++i)
if (!isspace(str[i])) s[k++] = str[i];
s[k] = 0; l = k;
// determine the sequence name
for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
if (i >= 0) name_end = i;
if (name_end < l) { // check if this is really the end
int n_hyphen = 0;
for (i = name_end + 1; i < l; ++i) {
if (s[i] == '-') ++n_hyphen;
else if (!isdigit(s[i]) && s[i] != ',') break;
}
if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
s[name_end] = 0;
iter = kh_get(s, h, s);
if (iter == kh_end(h)) { // cannot find the sequence name
iter = kh_get(s, h, str); // try str as the name
if (iter == kh_end(h)) {
if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
free(s); return -1;
} else s[name_end] = ':', name_end = l;
}
} else iter = kh_get(s, h, str);
*ref_id = kh_val(h, iter);
// parse the interval
if (name_end < l) {
for (i = k = name_end + 1; i < l; ++i)
if (s[i] != ',') s[k++] = s[i];
s[k] = 0;
*beg = atoi(s + name_end + 1);
for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
*end = i < k? atoi(s + i + 1) : 1<<29;
if (*beg > 0) --*beg;
} else *beg = 0, *end = 1<<29;
free(s);
return *beg <= *end? 0 : -1;
}
int32_t bam_aux2i(const uint8_t *s)
{
int type;
if (s == 0) return 0;
type = *s++;
if (type == 'c') return (int32_t)*(int8_t*)s;
else if (type == 'C') return (int32_t)*(uint8_t*)s;
else if (type == 's') return (int32_t)*(int16_t*)s;
else if (type == 'S') return (int32_t)*(uint16_t*)s;
else if (type == 'i' || type == 'I') return *(int32_t*)s;
else return 0;
}
float bam_aux2f(const uint8_t *s)
{
int type;
type = *s++;
if (s == 0) return 0.0;
if (type == 'f') return *(float*)s;
else return 0.0;
}
double bam_aux2d(const uint8_t *s)
{
int type;
type = *s++;
if (s == 0) return 0.0;
if (type == 'd') return *(double*)s;
else return 0.0;
}
char bam_aux2A(const uint8_t *s)
{
int type;
type = *s++;
if (s == 0) return 0;
if (type == 'A') return *(char*)s;
else return 0;
}
char *bam_aux2Z(const uint8_t *s)
{
int type;
type = *s++;
if (s == 0) return 0;
if (type == 'Z' || type == 'H') return (char*)s;
else return 0;
}
#ifdef _WIN32
double drand48()
{
return (double)rand() / RAND_MAX;
}
#endif
/*
bam_cat -- efficiently concatenates bam files
bam_cat can be used to concatenate BAM files. Under special
circumstances, it can be used as an alternative to 'samtools merge' to
concatenate multiple sorted files into a single sorted file. For this
to work each file must be sorted, and the sorted files must be given
as command line arguments in order such that the final read in file i
is less than or equal to the first read in file i+1.
This code is derived from the bam_reheader function in samtools 0.1.8
and modified to perform concatenation by Chris Saunders on behalf of
Illumina.
########## License:
The MIT License
Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
Modified SAMtools work copyright (c) 2010 Illumina, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
makefile:
"""
CC=gcc
CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)
LDFLAGS+=-L$(SAMTOOLS_DIR)
LDLIBS+=-lbam -lz
all:bam_cat
"""
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "knetfile.h"
#include "bgzf.h"
#include "bam.h"
#define BUF_SIZE 0x10000
#define GZIPID1 31
#define GZIPID2 139
#define BGZF_EMPTY_BLOCK_SIZE 28
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
{
BGZF *fp;
FILE* fp_file;
uint8_t *buf;
uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
const int es=BGZF_EMPTY_BLOCK_SIZE;
int i;
fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
if (fp == 0) {
fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
return 1;
}
if (h) bam_header_write(fp, h);
buf = (uint8_t*) malloc(BUF_SIZE);
for(i = 0; i < nfn; ++i){
BGZF *in;
bam_header_t *old;
int len,j;
in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");
if (in == 0) {
fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
return -1;
}
if (in->is_write) return -1;
old = bam_header_read(in);
if (h == 0 && i == 0) bam_header_write(fp, old);
if (in->block_offset < in->block_length) {
bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
bgzf_flush(fp);
}
j=0;
#ifdef _USE_KNETFILE
fp_file = fp->fp;
while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
#else
fp_file = fp->fp;
while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
#endif
if(len<es){
int diff=es-len;
if(j==0) {
fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
return -1;
}
fwrite(ebuf, 1, len, fp_file);
memcpy(ebuf,ebuf+len,diff);
memcpy(ebuf+diff,buf,len);
} else {
if(j!=0) fwrite(ebuf, 1, es, fp_file);
len-= es;
memcpy(ebuf,buf+len,es);
fwrite(buf, 1, len, fp_file);
}
j=1;
}
/* check final gzip block */
{
const uint8_t gzip1=ebuf[0];
const uint8_t gzip2=ebuf[1];
const uint32_t isize=*((uint32_t*)(ebuf+es-4));
if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
fprintf(stderr, " Possible output corruption.\n");
fwrite(ebuf, 1, es, fp_file);
}
}
bam_header_destroy(old);
bgzf_close(in);
}
free(buf);
bgzf_close(fp);
return 0;
}
int main_cat(int argc, char *argv[])
{
bam_header_t *h = 0;
char *outfn = 0;
int c, ret;
while ((c = getopt(argc, argv, "h:o:")) >= 0) {
switch (c) {
case 'h': {
tamFile fph = sam_open(optarg);
if (fph == 0) {
fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
return 1;
}
h = sam_header_read(fph);
sam_close(fph);
break;
}
case 'o': outfn = strdup(optarg); break;
}
}
if (argc - optind < 2) {
fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
return 1;
}
ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
free(outfn);
return ret;
}
#include <zlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#ifdef _WIN32
#include <fcntl.h>
#endif
#include "kstring.h"
#include "bam.h"
#include "sam_header.h"
#include "kseq.h"
#include "khash.h"
KSTREAM_INIT(gzFile, gzread, 16384)
KHASH_MAP_INIT_STR(ref, uint64_t)
void bam_init_header_hash(bam_header_t *header);
void bam_destroy_header_hash(bam_header_t *header);
int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
unsigned char bam_nt16_table[256] = {
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
};
unsigned short bam_char2flag_table[256] = {
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
};
char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
struct __tamFile_t {
gzFile fp;
kstream_t *ks;
kstring_t *str;
uint64_t n_lines;
int is_first;
};
char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
{
char **list = 0, *s;
int n = 0, dret, m = 0;
gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
kstream_t *ks;
kstring_t *str;
str = (kstring_t*)calloc(1, sizeof(kstring_t));
ks = ks_init(fp);
while (ks_getuntil(ks, '\n', str, &dret) > 0) {
if (n == m) {
m = m? m << 1 : 16;
list = (char**)realloc(list, m * sizeof(char*));
}
if (str->s[str->l-1] == '\r')
str->s[--str->l] = '\0';
s = list[n++] = (char*)calloc(str->l + 1, 1);
strcpy(s, str->s);
}
ks_destroy(ks);
gzclose(fp);
free(str->s); free(str);
*_n = n;
return list;
}
static bam_header_t *hash2header(const kh_ref_t *hash)
{
bam_header_t *header;
khiter_t k;
header = bam_header_init();
header->n_targets = kh_size(hash);
header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
for (k = kh_begin(hash); k != kh_end(hash); ++k) {
if (kh_exist(hash, k)) {
int i = (int)kh_value(hash, k);
header->target_name[i] = (char*)kh_key(hash, k);
header->target_len[i] = kh_value(hash, k)>>32;
}
}
bam_init_header_hash(header);
return header;
}
bam_header_t *sam_header_read2(const char *fn)
{
bam_header_t *header;
int c, dret, ret, error = 0;
gzFile fp;
kstream_t *ks;
kstring_t *str;
kh_ref_t *hash;
khiter_t k;
if (fn == 0) return 0;
fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
if (fp == 0) return 0;
hash = kh_init(ref);
ks = ks_init(fp);
str = (kstring_t*)calloc(1, sizeof(kstring_t));
while (ks_getuntil(ks, 0, str, &dret) > 0) {
char *s = strdup(str->s);
int len, i;
i = kh_size(hash);
ks_getuntil(ks, 0, str, &dret);
len = atoi(str->s);
k = kh_put(ref, hash, s, &ret);
if (ret == 0) {
fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
error = 1;
}
kh_value(hash, k) = (uint64_t)len<<32 | i;
if (dret != '\n')
while ((c = ks_getc(ks)) != '\n' && c != -1);
}
ks_destroy(ks);
gzclose(fp);
free(str->s); free(str);
fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
if (error) return 0;
header = hash2header(hash);
kh_destroy(ref, hash);
return header;
}
static inline uint8_t *alloc_data(bam1_t *b, int size)
{
if (b->m_data < size) {
b->m_data = size;
kroundup32(b->m_data);
b->data = (uint8_t*)realloc(b->data, b->m_data);
}
return b->data;
}
static inline void parse_error(int64_t n_lines, const char * __restrict msg)
{
fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg);
abort();
}
static inline void append_text(bam_header_t *header, kstring_t *str)
{
size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
kroundup32(x); kroundup32(y);
if (x < y)
{
header->n_text = y;
header->text = (char*)realloc(header->text, y);
if ( !header->text )
{
fprintf(stderr,"realloc failed to alloc %ld bytes\n", y);
abort();
}
}
// Sanity check
if ( header->l_text+str->l+1 >= header->n_text )
{
fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,(long)header->n_text,x,y);
abort();
}
strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
header->l_text += str->l + 1;
header->text[header->l_text] = 0;
}
int sam_header_parse(bam_header_t *h)
{
char **tmp;
int i;
free(h->target_len); free(h->target_name);
h->n_targets = 0; h->target_len = 0; h->target_name = 0;
if (h->l_text < 3) return 0;
if (h->dict == 0) h->dict = sam_header_parse2(h->text);
tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
if (h->n_targets == 0) return 0;
h->target_name = calloc(h->n_targets, sizeof(void*));
for (i = 0; i < h->n_targets; ++i)
h->target_name[i] = strdup(tmp[i]);
free(tmp);
tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets);
h->target_len = calloc(h->n_targets, 4);
for (i = 0; i < h->n_targets; ++i)
h->target_len[i] = atoi(tmp[i]);
free(tmp);
return h->n_targets;
}
bam_header_t *sam_header_read(tamFile fp)
{
int ret, dret;
bam_header_t *header = bam_header_init();
kstring_t *str = fp->str;
while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
str->s[str->l] = dret; // note that str->s is NOT null terminated!!
append_text(header, str);
if (dret != '\n') {
ret = ks_getuntil(fp->ks, '\n', str, &dret);
str->s[str->l] = '\n'; // NOT null terminated!!
append_text(header, str);
}
++fp->n_lines;
}
sam_header_parse(header);
bam_init_header_hash(header);
fp->is_first = 1;
return header;
}
int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
{
int ret, doff, doff0, dret, z = 0;
bam1_core_t *c = &b->core;
kstring_t *str = fp->str;
kstream_t *ks = fp->ks;
if (fp->is_first) {
fp->is_first = 0;
ret = str->l;
} else {
do { // special consideration for empty lines
ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
if (ret >= 0) z += str->l + 1;
} while (ret == 0);
}
if (ret < 0) return -1;
++fp->n_lines;
doff = 0;
{ // name
c->l_qname = strlen(str->s) + 1;
memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
doff += c->l_qname;
}
{ // flag
long flag;
char *s;
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
flag = strtol((char*)str->s, &s, 0);
if (*s) { // not the end of the string
flag = 0;
for (s = str->s; *s; ++s)
flag |= bam_char2flag_table[(int)*s];
}
c->flag = flag;
}
{ // tid, pos, qual
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
if (c->tid < 0 && strcmp(str->s, "*")) {
if (header->n_targets == 0) {
fprintf(stderr, "[sam_read1] missing header? Abort!\n");
exit(1);
} else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
}
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
if (ret < 0) return -2;
}
{ // cigar
char *s, *t;
int i, op;
long x;
c->n_cigar = 0;
if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
z += str->l + 1;
if (str->s[0] != '*') {
uint32_t *cigar;
for (s = str->s; *s; ++s) {
if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
}
b->data = alloc_data(b, doff + c->n_cigar * 4);
cigar = bam1_cigar(b);
for (i = 0, s = str->s; i != c->n_cigar; ++i) {
x = strtol(s, &t, 10);
op = toupper(*t);
if (op == 'M') op = BAM_CMATCH;
else if (op == 'I') op = BAM_CINS;
else if (op == 'D') op = BAM_CDEL;
else if (op == 'N') op = BAM_CREF_SKIP;
else if (op == 'S') op = BAM_CSOFT_CLIP;
else if (op == 'H') op = BAM_CHARD_CLIP;
else if (op == 'P') op = BAM_CPAD;
else if (op == '=') op = BAM_CEQUAL;
else if (op == 'X') op = BAM_CDIFF;
else if (op == 'B') op = BAM_CBACK;
else parse_error(fp->n_lines, "invalid CIGAR operation");
s = t + 1;
cigar[i] = bam_cigar_gen(x, op);
}
if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar));
doff += c->n_cigar * 4;
} else {
if (!(c->flag&BAM_FUNMAP)) {
fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
c->flag |= BAM_FUNMAP;
}
c->bin = bam_reg2bin(c->pos, c->pos + 1);
}
}
{ // mtid, mpos, isize
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
if (ret < 0) return -4;
}
{ // seq and qual
int i;
uint8_t *p = 0;
if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
z += str->l + 1;
if (strcmp(str->s, "*")) {
c->l_qseq = strlen(str->s);
if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
(long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
}
p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
memset(p, 0, (c->l_qseq+1)/2);
for (i = 0; i < c->l_qseq; ++i)
p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
} else c->l_qseq = 0;
if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
z += str->l + 1;
if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
parse_error(fp->n_lines, "sequence and quality are inconsistent");
p += (c->l_qseq+1)/2;
if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
doff += c->l_qseq + (c->l_qseq+1)/2;
}
doff0 = doff;
if (dret != '\n' && dret != '\r') { // aux
while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
uint8_t *s, type, key[2];
z += str->l + 1;
if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
parse_error(fp->n_lines, "missing colon in auxiliary data");
key[0] = str->s[0]; key[1] = str->s[1];
type = str->s[3];
s = alloc_data(b, doff + 3) + doff;
s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
s = alloc_data(b, doff + 2) + doff;
*s++ = 'A'; *s = str->s[5];
doff += 2;
} else if (type == 'I' || type == 'i') {
long long x;
s = alloc_data(b, doff + 5) + doff;
x = (long long)atoll(str->s + 5);
if (x < 0) {
if (x >= -127) {
*s++ = 'c'; *(int8_t*)s = (int8_t)x;
s += 1; doff += 2;
} else if (x >= -32767) {
*s++ = 's'; *(int16_t*)s = (int16_t)x;
s += 2; doff += 3;
} else {
*s++ = 'i'; *(int32_t*)s = (int32_t)x;
s += 4; doff += 5;
if (x < -2147483648ll)
fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
(long long)fp->n_lines, x);
}
} else {
if (x <= 255) {
*s++ = 'C'; *s++ = (uint8_t)x;
doff += 2;
} else if (x <= 65535) {
*s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
s += 2; doff += 3;
} else {
*s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
s += 4; doff += 5;
if (x > 4294967295ll)
fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.",
(long long)fp->n_lines, x);
}
}
} else if (type == 'f') {
s = alloc_data(b, doff + 5) + doff;
*s++ = 'f';
*(float*)s = (float)atof(str->s + 5);
s += 4; doff += 5;
} else if (type == 'd') {
s = alloc_data(b, doff + 9) + doff;
*s++ = 'd';
*(float*)s = (float)atof(str->s + 9);
s += 8; doff += 9;
} else if (type == 'Z' || type == 'H') {
int size = 1 + (str->l - 5) + 1;
if (type == 'H') { // check whether the hex string is valid
int i;
if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
for (i = 0; i < str->l - 5; ++i) {
int c = toupper(str->s[5 + i]);
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
parse_error(fp->n_lines, "invalid hex character");
}
}
s = alloc_data(b, doff + size) + doff;
*s++ = type;
memcpy(s, str->s + 5, str->l - 5);
s[str->l - 5] = 0;
doff += size;
} else if (type == 'B') {
int32_t n = 0, Bsize, k = 0, size;
char *p;
if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B");
Bsize = bam_aux_type2size(str->s[5]); // the size of each element
for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array
if (*p == ',') ++n;
p = str->s + 7; // now p points to the first number in the array
size = 6 + Bsize * n; // total number of bytes allocated to this tag
s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory
*s++ = 'B'; *s++ = str->s[5];
memcpy(s, &n, 4); s += 4; // write the number of elements
if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory
else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p;
else parse_error(fp->n_lines, "unrecognized array type");
s += Bsize * n; doff += size;
} else parse_error(fp->n_lines, "unrecognized type");
if (dret == '\n' || dret == '\r') break;
}
}
b->l_aux = doff - doff0;
b->data_len = doff;
if (bam_no_B) bam_remove_B(b);
return z;
}
tamFile sam_open(const char *fn)
{
tamFile fp;
gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb");
if (gzfp == 0) return 0;
fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
fp->fp = gzfp;
fp->ks = ks_init(fp->fp);
return fp;
}
void sam_close(tamFile fp)
{
if (fp) {
ks_destroy(fp->ks);
gzclose(fp->fp);
free(fp->str->s); free(fp->str);
free(fp);
}
}
This diff is collapsed.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "bam.h"
#include "ksort.h"
#define TV_GAP 2
typedef struct __freenode_t {
uint32_t level:28, cnt:4;
struct __freenode_t *next;
} freenode_t, *freenode_p;
#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level))
KSORT_INIT(node, freenode_p, freenode_lt)
/* Memory pool, similar to the one in bam_pileup.c */
typedef struct {
int cnt, n, max;
freenode_t **buf;
} mempool_t;
static mempool_t *mp_init()
{
return (mempool_t*)calloc(1, sizeof(mempool_t));
}
static void mp_destroy(mempool_t *mp)
{
int k;
for (k = 0; k < mp->n; ++k) free(mp->buf[k]);
free(mp->buf); free(mp);
}
static inline freenode_t *mp_alloc(mempool_t *mp)
{
++mp->cnt;
if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t));
else return mp->buf[--mp->n];
}
static inline void mp_free(mempool_t *mp, freenode_t *p)
{
--mp->cnt; p->next = 0; p->cnt = TV_GAP;
if (mp->n == mp->max) {
mp->max = mp->max? mp->max<<1 : 256;
mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
}
mp->buf[mp->n++] = p;
}
/* core part */
struct __bam_lplbuf_t {
int max, n_cur, n_pre;
int max_level, *cur_level, *pre_level;
mempool_t *mp;
freenode_t **aux, *head, *tail;
int n_nodes, m_aux;
bam_pileup_f func;
void *user_data;
bam_plbuf_t *plbuf;
};
void bam_lplbuf_reset(bam_lplbuf_t *buf)
{
freenode_t *p, *q;
bam_plbuf_reset(buf->plbuf);
for (p = buf->head; p->next;) {
q = p->next;
mp_free(buf->mp, p);
p = q;
}
buf->head = buf->tail;
buf->max_level = 0;
buf->n_cur = buf->n_pre = 0;
buf->n_nodes = 0;
}
static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
{
bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
freenode_t *p;
int i, l, max_level;
// allocate memory if necessary
if (tv->max < n) { // enlarge
tv->max = n;
kroundup32(tv->max);
tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
}
tv->n_cur = n;
// update cnt
for (p = tv->head; p->next; p = p->next)
if (p->cnt > 0) --p->cnt;
// calculate cur_level[]
max_level = 0;
for (i = l = 0; i < n; ++i) {
const bam_pileup1_t *p = pl + i;
if (p->is_head) {
if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
freenode_t *p = tv->head->next;
tv->cur_level[i] = tv->head->level;
mp_free(tv->mp, tv->head);
tv->head = p;
--tv->n_nodes;
} else tv->cur_level[i] = ++tv->max_level;
} else {
tv->cur_level[i] = tv->pre_level[l++];
if (p->is_tail) { // then return a free slot
tv->tail->level = tv->cur_level[i];
tv->tail->next = mp_alloc(tv->mp);
tv->tail = tv->tail->next;
++tv->n_nodes;
}
}
if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
((bam_pileup1_t*)p)->level = tv->cur_level[i];
}
assert(l == tv->n_pre);
tv->func(tid, pos, n, pl, tv->user_data);
// sort the linked list
if (tv->n_nodes) {
freenode_t *q;
if (tv->n_nodes + 1 > tv->m_aux) { // enlarge
tv->m_aux = tv->n_nodes + 1;
kroundup32(tv->m_aux);
tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
}
for (p = tv->head, i = l = 0; p->next;) {
if (p->level > max_level) { // then discard this entry
q = p->next;
mp_free(tv->mp, p);
p = q;
} else {
tv->aux[i++] = p;
p = p->next;
}
}
tv->aux[i] = tv->tail; // add a proper tail for the loop below
tv->n_nodes = i;
if (tv->n_nodes) {
ks_introsort(node, tv->n_nodes, tv->aux);
for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1];
tv->head = tv->aux[0];
} else tv->head = tv->tail;
}
// clean up
tv->max_level = max_level;
memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4);
// squeeze out terminated levels
for (i = l = 0; i < n; ++i) {
const bam_pileup1_t *p = pl + i;
if (!p->is_tail)
tv->pre_level[l++] = tv->pre_level[i];
}
tv->n_pre = l;
/*
fprintf(stderr, "%d\t", pos+1);
for (i = 0; i < n; ++i) {
const bam_pileup1_t *p = pl + i;
if (p->is_head) fprintf(stderr, "^");
if (p->is_tail) fprintf(stderr, "$");
fprintf(stderr, "%d,", p->level);
}
fprintf(stderr, "\n");
*/
return 0;
}
bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
{
bam_lplbuf_t *tv;
tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
tv->mp = mp_init();
tv->head = tv->tail = mp_alloc(tv->mp);
tv->func = func;
tv->user_data = data;
tv->plbuf = bam_plbuf_init(tview_func, tv);
return (bam_lplbuf_t*)tv;
}
void bam_lplbuf_destroy(bam_lplbuf_t *tv)
{
freenode_t *p, *q;
free(tv->cur_level); free(tv->pre_level);
bam_plbuf_destroy(tv->plbuf);
free(tv->aux);
for (p = tv->head; p->next;) {
q = p->next;
mp_free(tv->mp, p); p = q;
}
mp_free(tv->mp, p);
assert(tv->mp->cnt == 0);
mp_destroy(tv->mp);
free(tv);
}
int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
{
return bam_plbuf_push(b, tv->plbuf);
}
#include <unistd.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include "faidx.h"
#include "sam.h"
#include "kstring.h"
#include "kaln.h"
#include "kprobaln.h"
#define USE_EQUAL 1
#define DROP_TAG 2
#define BIN_QUAL 4
#define UPDATE_NM 8
#define UPDATE_MD 16
#define HASH_QNM 32
char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm)
{
uint8_t *seq = bam1_seq(b);
uint32_t *cigar = bam1_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, u = 0;
kstring_t *str;
int32_t old_nm_i = -1, nm = 0;
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
if (ref[x+j] == 0) break; // out of boundary
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
++u;
} else {
kputw(u, str); kputc(ref[x+j], str);
u = 0; ++nm;
}
}
if (j < l) break;
x += l; y += l;
} else if (op == BAM_CDEL) {
kputw(u, str); kputc('^', str);
for (j = 0; j < l; ++j) {
if (ref[x+j] == 0) break;
kputc(ref[x+j], str);
}
u = 0;
if (j < l) break;
x += l; nm += l;
} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
y += l;
if (op == BAM_CINS) nm += l;
} else if (op == BAM_CREF_SKIP) {
x += l;
}
}
kputw(u, str);
// apply max_nm
if (max_nm > 0 && nm >= max_nm) {
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
if (ref[x+j] == 0) break; // out of boundary
if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
seq[z/2] |= (z&1)? 0x0f : 0xf0;
bam1_qual(b)[z] = 0;
}
}
if (j < l) break;
x += l; y += l;
} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
}
}
// update NM
if (flag & UPDATE_NM) {
uint8_t *old_nm = bam_aux_get(b, "NM");
if (c->flag & BAM_FUNMAP) return;
if (old_nm) old_nm_i = bam_aux2i(old_nm);
if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
else if (nm != old_nm_i) {
fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
bam_aux_del(b, old_nm);
bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
}
}
// update MD
if (flag & UPDATE_MD) {
uint8_t *old_md = bam_aux_get(b, "MD");
if (c->flag & BAM_FUNMAP) return;
if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
else {
int is_diff = 0;
if (strlen((char*)old_md+1) == str->l) {
for (i = 0; i < str->l; ++i)
if (toupper(old_md[i+1]) != toupper(str->s[i]))
break;
if (i < str->l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s);
bam_aux_del(b, old_md);
bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
}
}
}
// drop all tags but RG
if (flag&DROP_TAG) {
uint8_t *q = bam_aux_get(b, "RG");
bam_aux_drop_other(b, q);
}
// reduce the resolution of base quality
if (flag&BIN_QUAL) {
uint8_t *qual = bam1_qual(b);
for (i = 0; i < b->core.l_qseq; ++i)
if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
}
free(str->s); free(str);
}
void bam_fillmd1(bam1_t *b, char *ref, int flag)
{
bam_fillmd1_core(b, ref, flag, 0);
}
int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
{
uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
uint32_t *cigar = bam1_cigar(b);
bam1_core_t *c = &b->core;
int i, x, y, mm, q, len, clip_l, clip_q;
double t;
if (thres < 0) thres = 40; // set the default
mm = q = len = clip_l = clip_q = 0;
for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
int j, l = cigar[i]>>4, op = cigar[i]&0xf;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (j = 0; j < l; ++j) {
int z = y + j;
int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
if (ref[x+j] == 0) break; // out of boundary
if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
++len;
if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
++mm;
q += qual[z] > 33? 33 : qual[z];
}
}
}
if (j < l) break;
x += l; y += l; len += l;
} else if (op == BAM_CDEL) {
for (j = 0; j < l; ++j)
if (ref[x+j] == 0) break;
if (j < l) break;
x += l;
} else if (op == BAM_CSOFT_CLIP) {
for (j = 0; j < l; ++j) clip_q += qual[y+j];
clip_l += l;
y += l;
} else if (op == BAM_CHARD_CLIP) {
clip_q += 13 * l;
clip_l += l;
} else if (op == BAM_CINS) y += l;
else if (op == BAM_CREF_SKIP) x += l;
}
for (i = 0, t = 1; i < mm; ++i)
t *= (double)len / (i+1);
t = q - 4.343 * log(t) + clip_q / 5.;
if (t > thres) return -1;
if (t < 0) t = 0;
t = sqrt((thres - t) / thres) * thres;
// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
return (int)(t + .499);
}
int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
{
int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
uint32_t *cigar = bam1_cigar(b);
bam1_core_t *c = &b->core;
kpa_par_t conf = kpa_par_def;
uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
// test if BQ or ZQ is present
if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
if (bq && redo_baq)
{
bam_aux_del(b, bq-1);
bq = 0;
}
if (bq && zq) { // remove the ZQ tag
bam_aux_del(b, zq-1);
zq = 0;
}
if (bq || zq) {
if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
if (bq && apply_baq) { // then convert BQ to ZQ
for (i = 0; i < c->l_qseq; ++i)
qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
*(bq - 3) = 'Z';
} else if (zq && !apply_baq) { // then convert ZQ to BQ
for (i = 0; i < c->l_qseq; ++i)
qual[i] += (int)zq[i] - 64;
*(zq - 3) = 'B';
}
return 0;
}
// find the start and end of the alignment
x = c->pos, y = 0, yb = ye = xb = xe = -1;
for (k = 0; k < c->n_cigar; ++k) {
int op, l;
op = cigar[k]&0xf; l = cigar[k]>>4;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
if (yb < 0) yb = y;
if (xb < 0) xb = x;
ye = y + l; xe = x + l;
x += l; y += l;
} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
else if (op == BAM_CDEL) x += l;
else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
}
// set bandwidth and the start and the end
bw = 7;
if (abs((xe - xb) - (ye - yb)) > bw)
bw = abs((xe - xb) - (ye - yb)) + 3;
conf.bw = bw;
xb -= yb + bw/2; if (xb < 0) xb = 0;
xe += c->l_qseq - ye + bw/2;
if (xe - xb - c->l_qseq > bw)
xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
{ // glocal
uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq;
int *state;
bq = calloc(c->l_qseq + 1, 1);
memcpy(bq, qual, c->l_qseq);
s = calloc(c->l_qseq, 1);
for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
r = calloc(xe - xb, 1);
for (i = xb; i < xe; ++i) {
if (ref[i] == 0) { xe = i; break; }
r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
}
state = calloc(c->l_qseq, sizeof(int));
q = calloc(c->l_qseq, 1);
kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
int op = cigar[k]&0xf, l = cigar[k]>>4;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (i = y; i < y + l; ++i) {
if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
else bq[i] = bq[i] < q[i]? bq[i] : q[i];
}
x += l; y += l;
} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
else if (op == BAM_CDEL) x += l;
}
for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
} else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
uint8_t *left, *rght;
left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
int op = cigar[k]&0xf, l = cigar[k]>>4;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
for (i = y; i < y + l; ++i)
bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
for (i = y; i < y + l; ++i)
bq[i] = left[i] < rght[i]? left[i] : rght[i];
x += l; y += l;
} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
else if (op == BAM_CDEL) x += l;
}
for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
free(left); free(rght);
}
if (apply_baq) {
for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
} else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
free(bq); free(s); free(r); free(q); free(state);
}
return 0;
}
int bam_prob_realn(bam1_t *b, const char *ref)
{
return bam_prob_realn_core(b, ref, 1);
}
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
samfile_t *fp, *fpout = 0;
faidx_t *fai;
char *ref = 0, mode_w[8], mode_r[8];
bam1_t *b;
flt_flag = UPDATE_NM | UPDATE_MD;
is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
mode_w[0] = mode_r[0] = 0;
strcpy(mode_r, "r"); strcpy(mode_w, "w");
while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
case 'd': flt_flag |= DROP_TAG; break;
case 'q': flt_flag |= BIN_QUAL; break;
case 'h': flt_flag |= HASH_QNM; break;
case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break;
case 'b': is_bam_out = 1; break;
case 'u': is_uncompressed = is_bam_out = 1; break;
case 'S': is_sam_in = 1; break;
case 'n': max_nm = atoi(optarg); break;
case 'C': capQ = atoi(optarg); break;
case 'A': baq_flag |= 1; break;
case 'E': baq_flag |= 2; break;
default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
}
}
if (!is_sam_in) strcat(mode_r, "b");
if (is_bam_out) strcat(mode_w, "b");
else strcat(mode_w, "h");
if (is_uncompressed) strcat(mode_w, "u");
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
fprintf(stderr, "Options: -e change identical bases to '='\n");
fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
fprintf(stderr, " -b compressed BAM output\n");
fprintf(stderr, " -S the input is SAM with header\n");
fprintf(stderr, " -A modify the quality string\n");
fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n");
return 1;
}
fp = samopen(argv[optind], mode_r, 0);
if (fp == 0) return 1;
if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) {
fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
return 1;
}
fpout = samopen("-", mode_w, fp->header);
fai = fai_load(argv[optind+1]);
b = bam_init1();
while ((ret = samread(fp, b)) >= 0) {
if (b->core.tid >= 0) {
if (tid != b->core.tid) {
free(ref);
ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len);
tid = b->core.tid;
if (ref == 0)
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
fp->header->target_name[tid]);
}
if (is_realn) bam_prob_realn_core(b, ref, baq_flag);
if (capQ > 10) {
int q = bam_cap_mapQ(b, ref, capQ);
if (b->core.qual > q) b->core.qual = q;
}
if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm);
}
samwrite(fpout, b);
}
bam_destroy1(b);
free(ref);
fai_destroy(fai);
samclose(fp); samclose(fpout);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
#include "sam.h"
typedef struct {
int k, x, y, end;
} cstate_t;
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
typedef struct __linkbuf_t {
bam1_t b;
uint32_t beg, end;
cstate_t s;
struct __linkbuf_t *next;
} lbnode_t;
/* --- BEGIN: Memory pool */
typedef struct {
int cnt, n, max;
lbnode_t **buf;
} mempool_t;
static mempool_t *mp_init()
{
mempool_t *mp;
mp = (mempool_t*)calloc(1, sizeof(mempool_t));
return mp;
}
static void mp_destroy(mempool_t *mp)
{
int k;
for (k = 0; k < mp->n; ++k) {
free(mp->buf[k]->b.data);
free(mp->buf[k]);
}
free(mp->buf);
free(mp);
}
static inline lbnode_t *mp_alloc(mempool_t *mp)
{
++mp->cnt;
if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
else return mp->buf[--mp->n];
}
static inline void mp_free(mempool_t *mp, lbnode_t *p)
{
--mp->cnt; p->next = 0; // clear lbnode_t::next here
if (mp->n == mp->max) {
mp->max = mp->max? mp->max<<1 : 256;
mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
}
mp->buf[mp->n++] = p;
}
/* --- END: Memory pool */
/* --- BEGIN: Auxiliary functions */
/* s->k: the index of the CIGAR operator that has just been processed.
s->x: the reference coordinate of the start of s->k
s->y: the query coordiante of the start of s->k
*/
static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
{
#define _cop(c) ((c)&BAM_CIGAR_MASK)
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
bam1_t *b = p->b;
bam1_core_t *c = &b->core;
uint32_t *cigar = bam1_cigar(b);
int k, is_head = 0;
// determine the current CIGAR operation
// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y);
if (s->k == -1) { // never processed
is_head = 1;
if (c->n_cigar == 1) { // just one operation, save a loop
if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
} else { // find the first match or deletion
for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
int op = _cop(cigar[k]);
int l = _cln(cigar[k]);
if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
else if (op == BAM_CREF_SKIP) s->x += l;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
}
assert(k < c->n_cigar);
s->k = k;
}
} else { // the read has been processed before
int op, l = _cln(cigar[s->k]);
if (pos - s->x >= l) { // jump to the next operation
assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
op = _cop(cigar[s->k+1]);
if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
s->x += l;
++s->k;
} else { // find the next M/D/N/=/X
if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
s->x += l;
for (k = s->k + 1; k < c->n_cigar; ++k) {
op = _cop(cigar[k]), l = _cln(cigar[k]);
if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
}
s->k = k;
}
assert(s->k < c->n_cigar); // otherwise a bug
} // else, do nothing
}
{ // collect pileup information
int op, l;
op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
p->is_del = p->indel = p->is_refskip = 0;
if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
int op2 = _cop(cigar[s->k+1]);
int l2 = _cln(cigar[s->k+1]);
if (op2 == BAM_CDEL) p->indel = -(int)l2;
else if (op2 == BAM_CINS) p->indel = l2;
else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
int l3 = 0;
for (k = s->k + 2; k < c->n_cigar; ++k) {
op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
if (op2 == BAM_CINS) l3 += l2;
else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
}
if (l3 > 0) p->indel = l3;
}
}
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
p->qpos = s->y + (pos - s->x);
} else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
p->is_refskip = (op == BAM_CREF_SKIP);
} // cannot be other operations; otherwise a bug
p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
}
return 1;
}
/* --- END: Auxiliary functions */
/*******************
* pileup iterator *
*******************/
struct __bam_plp_t {
mempool_t *mp;
lbnode_t *head, *tail, *dummy;
int32_t tid, pos, max_tid, max_pos;
int is_eof, flag_mask, max_plp, error, maxcnt;
bam_pileup1_t *plp;
// for the "auto" interface only
bam1_t *b;
bam_plp_auto_f func;
void *data;
};
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
{
bam_plp_t iter;
iter = calloc(1, sizeof(struct __bam_plp_t));
iter->mp = mp_init();
iter->head = iter->tail = mp_alloc(iter->mp);
iter->dummy = mp_alloc(iter->mp);
iter->max_tid = iter->max_pos = -1;
iter->flag_mask = BAM_DEF_MASK;
iter->maxcnt = 8000;
if (func) {
iter->func = func;
iter->data = data;
iter->b = bam_init1();
}
return iter;
}
void bam_plp_destroy(bam_plp_t iter)
{
mp_free(iter->mp, iter->dummy);
mp_free(iter->mp, iter->head);
if (iter->mp->cnt != 0)
fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
mp_destroy(iter->mp);
if (iter->b) bam_destroy1(iter->b);
free(iter->plp);
free(iter);
}
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
if (iter->error) { *_n_plp = -1; return 0; }
*_n_plp = 0;
if (iter->is_eof && iter->head->next == 0) return 0;
while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
int n_plp = 0;
lbnode_t *p, *q;
// write iter->plp at iter->pos
iter->dummy->next = iter->head;
for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
q->next = p->next; mp_free(iter->mp, p); p = q;
} else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
if (n_plp == iter->max_plp) { // then double the capacity
iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
}
iter->plp[n_plp].b = &p->b;
if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
}
}
iter->head = iter->dummy->next; // dummy->next may be changed
*_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
// update iter->tid and iter->pos
if (iter->head->next) {
if (iter->tid > iter->head->b.core.tid) {
fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
iter->error = 1;
*_n_plp = -1;
return 0;
}
}
if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
} else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
iter->pos = iter->head->beg; // jump to the next position
} else ++iter->pos; // scan contiguously
// return
if (n_plp) return iter->plp;
if (iter->is_eof && iter->head->next == 0) break;
}
return 0;
}
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
if (iter->error) return -1;
if (b) {
if (b->core.tid < 0) return 0;
if (b->core.flag & iter->flag_mask) return 0;
if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;
bam_copy1(&iter->tail->b, b);
iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
if (b->core.tid < iter->max_tid) {
fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
iter->error = 1;
return -1;
}
if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
iter->error = 1;
return -1;
}
iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
iter->tail->next = mp_alloc(iter->mp);
iter->tail = iter->tail->next;
}
} else iter->is_eof = 1;
return 0;
}
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
const bam_pileup1_t *plp;
if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
else { // no pileup line can be obtained; read alignments
*_n_plp = 0;
if (iter->is_eof) return 0;
while (iter->func(iter->data, iter->b) >= 0) {
if (bam_plp_push(iter, iter->b) < 0) {
*_n_plp = -1;
return 0;
}
if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
// otherwise no pileup line can be returned; read the next alignment.
}
bam_plp_push(iter, 0);
if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
return 0;
}
}
void bam_plp_reset(bam_plp_t iter)
{
lbnode_t *p, *q;
iter->max_tid = iter->max_pos = -1;
iter->tid = iter->pos = 0;
iter->is_eof = 0;
for (p = iter->head; p->next;) {
q = p->next;
mp_free(iter->mp, p);
p = q;
}
iter->head = iter->tail;
}
void bam_plp_set_mask(bam_plp_t iter, int mask)
{
iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask);
}
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
{
iter->maxcnt = maxcnt;
}
/*****************
* callback APIs *
*****************/
int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
{
bam_plbuf_t *buf;
int ret;
bam1_t *b;
b = bam_init1();
buf = bam_plbuf_init(func, func_data);
bam_plbuf_set_mask(buf, mask);
while ((ret = bam_read1(fp, b)) >= 0)
bam_plbuf_push(b, buf);
bam_plbuf_push(0, buf);
bam_plbuf_destroy(buf);
bam_destroy1(b);
return 0;
}
void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
{
bam_plp_set_mask(buf->iter, mask);
}
void bam_plbuf_reset(bam_plbuf_t *buf)
{
bam_plp_reset(buf->iter);
}
bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
{
bam_plbuf_t *buf;
buf = calloc(1, sizeof(bam_plbuf_t));
buf->iter = bam_plp_init(0, 0);
buf->func = func;
buf->data = data;
return buf;
}
void bam_plbuf_destroy(bam_plbuf_t *buf)
{
bam_plp_destroy(buf->iter);
free(buf);
}
int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
{
int ret, n_plp, tid, pos;
const bam_pileup1_t *plp;
ret = bam_plp_push(buf->iter, b);
if (ret < 0) return ret;
while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0)
buf->func(tid, pos, n_plp, plp, buf->data);
return 0;
}
/***********
* mpileup *
***********/
struct __bam_mplp_t {
int n;
uint64_t min, *pos;
bam_plp_t *iter;
int *n_plp;
const bam_pileup1_t **plp;
};
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
{
int i;
bam_mplp_t iter;
iter = calloc(1, sizeof(struct __bam_mplp_t));
iter->pos = calloc(n, 8);
iter->n_plp = calloc(n, sizeof(int));
iter->plp = calloc(n, sizeof(void*));
iter->iter = calloc(n, sizeof(void*));
iter->n = n;
iter->min = (uint64_t)-1;
for (i = 0; i < n; ++i) {
iter->iter[i] = bam_plp_init(func, data[i]);
iter->pos[i] = iter->min;
}
return iter;
}
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
{
int i;
for (i = 0; i < iter->n; ++i)
iter->iter[i]->maxcnt = maxcnt;
}
void bam_mplp_destroy(bam_mplp_t iter)
{
int i;
for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
free(iter);
}
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
{
int i, ret = 0;
uint64_t new_min = (uint64_t)-1;
for (i = 0; i < iter->n; ++i) {
if (iter->pos[i] == iter->min) {
int tid, pos;
iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
iter->pos[i] = (uint64_t)tid<<32 | pos;
}
if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
}
iter->min = new_min;
if (new_min == (uint64_t)-1) return 0;
*_tid = new_min>>32; *_pos = (uint32_t)new_min;
for (i = 0; i < iter->n; ++i) {
if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
++ret;
} else n_plp[i] = 0, plp[i] = 0;
}
return ret;
}
#include <stdio.h>
#include <stdlib.h>
#include "knetfile.h"
#include "bgzf.h"
#include "bam.h"
#define BUF_SIZE 0x10000
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
BGZF *fp;
bam_header_t *old;
int len;
uint8_t *buf;
if (in->is_write) return -1;
buf = malloc(BUF_SIZE);
old = bam_header_read(in);
fp = bgzf_fdopen(fd, "w");
bam_header_write(fp, h);
if (in->block_offset < in->block_length) {
bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
bgzf_flush(fp);
}
#ifdef _USE_KNETFILE
while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0)
fwrite(buf, 1, len, fp->fp);
#else
while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0)
fwrite(buf, 1, len, fp->file);
#endif
free(buf);
fp->block_offset = in->block_offset = 0;
bgzf_close(fp);
return 0;
}
int main_reheader(int argc, char *argv[])
{
bam_header_t *h;
BGZF *in;
if (argc != 3) {
fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n");
return 1;
}
{ // read the header
tamFile fph = sam_open(argv[1]);
if (fph == 0) {
fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]);
return 1;
}
h = sam_header_read(fph);
sam_close(fph);
}
in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r");
if (in == 0) {
fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]);
return 1;
}
bam_reheader(in, h, fileno(stdout));
bgzf_close(in);
return 0;
}
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "bam.h"
#include "ksort.h"
static int g_is_by_qname = 0;
static int strnum_cmp(const char *_a, const char *_b)
{
const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
const unsigned char *pa = a, *pb = b;
while (*pa && *pb) {
if (isdigit(*pa) && isdigit(*pb)) {
while (*pa == '0') ++pa;
while (*pb == '0') ++pb;
while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb;
if (isdigit(*pa) && isdigit(*pb)) {
int i = 0;
while (isdigit(pa[i]) && isdigit(pb[i])) ++i;
return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb;
} else if (isdigit(*pa)) return 1;
else if (isdigit(*pb)) return -1;
else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1;
} else {
if (*pa != *pb) return (int)*pa - (int)*pb;
++pa; ++pb;
}
}
return *pa? 1 : *pb? -1 : 0;
}
#define HEAP_EMPTY 0xffffffffffffffffull
typedef struct {
int i;
uint64_t pos, idx;
bam1_t *b;
} heap1_t;
#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx))))
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
if (g_is_by_qname) {
int t;
if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0;
t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0)));
} else return __pos_cmp(a, b);
}
KSORT_INIT(heap, heap1_t, heap_lt)
static void swap_header_targets(bam_header_t *h1, bam_header_t *h2)
{
bam_header_t t;
t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets;
t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name;
t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len;
}
static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
{
int tempi;
char *temps;
tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi;
temps = h1->text, h1->text = h2->text, h2->text = temps;
}
#define MERGE_RG 1
#define MERGE_UNCOMP 2
#define MERGE_LEVEL1 4
#define MERGE_FORCE 8
/*!
@abstract Merge multiple sorted BAM.
@param is_by_qname whether to sort by query name
@param out output BAM file name
@param headers name of SAM file from which to copy '@' header lines,
or NULL to copy them from the first file to be merged
@param n number of files to be merged
@param fn names of files to be merged
@discussion Padding information may NOT correctly maintained. This
function is NOT thread safe.
*/
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level)
{
bamFile fpout, *fp;
heap1_t *heap;
bam_header_t *hout = 0;
bam_header_t *hheaders = NULL;
int i, j, *RG_len = 0;
uint64_t idx = 0;
char **RG = 0, mode[8];
bam_iter_t *iter = 0;
if (headers) {
tamFile fpheaders = sam_open(headers);
if (fpheaders == 0) {
const char *message = strerror(errno);
fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
return -1;
}
hheaders = sam_header_read(fpheaders);
sam_close(fpheaders);
}
g_is_by_qname = by_qname;
fp = (bamFile*)calloc(n, sizeof(bamFile));
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
// prepare RG tag
if (flag & MERGE_RG) {
RG = (char**)calloc(n, sizeof(void*));
RG_len = (int*)calloc(n, sizeof(int));
for (i = 0; i != n; ++i) {
int l = strlen(fn[i]);
const char *s = fn[i];
if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
++j; l -= j;
RG[i] = calloc(l + 1, 1);
RG_len[i] = l;
strncpy(RG[i], s + j, l);
}
}
// read the first
for (i = 0; i != n; ++i) {
bam_header_t *hin;
fp[i] = bam_open(fn[i], "r");
if (fp[i] == 0) {
int j;
fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
for (j = 0; j < i; ++j) bam_close(fp[j]);
free(fp); free(heap);
// FIXME: possible memory leak
return -1;
}
hin = bam_header_read(fp[i]);
if (i == 0) { // the first BAM
hout = hin;
} else { // validate multiple baf
int min_n_targets = hout->n_targets;
if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
for (j = 0; j < min_n_targets; ++j)
if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
hout->target_name[j], hin->target_name[j], fn[i]);
return -1;
}
// If this input file has additional target reference sequences,
// add them to the headers to be output
if (hin->n_targets > hout->n_targets) {
swap_header_targets(hout, hin);
// FIXME Possibly we should also create @SQ text headers
// for the newly added reference sequences
}
bam_header_destroy(hin);
}
}
if (hheaders) {
// If the text headers to be swapped in include any @SQ headers,
// check that they are consistent with the existing binary list
// of reference information.
if (hheaders->n_targets > 0) {
if (hout->n_targets != hheaders->n_targets) {
fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
if (!reg) return -1;
}
for (j = 0; j < hout->n_targets; ++j)
if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
if (!reg) return -1;
}
}
swap_header_text(hout, hheaders);
bam_header_destroy(hheaders);
}
if (reg) {
int tid, beg, end;
if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
return -1;
}
for (i = 0; i < n; ++i) {
bam_index_t *idx;
idx = bam_index_load(fn[i]);
iter[i] = bam_iter_query(idx, tid, beg, end);
bam_index_destroy(idx);
}
}
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
h->i = i;
h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
h->idx = idx++;
}
else h->pos = HEAP_EMPTY;
}
if (flag & MERGE_UNCOMP) level = 0;
else if (flag & MERGE_LEVEL1) level = 1;
strcpy(mode, "w");
if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) {
fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
return -1;
}
bam_header_write(fpout, hout);
bam_header_destroy(hout);
if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
ks_heapmake(heap, n, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->b;
if (flag & MERGE_RG) {
uint8_t *rg = bam_aux_get(b, "RG");
if (rg) bam_aux_del(b, rg);
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
bam_write1_core(fpout, &b->core, b->data_len, b->data);
if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
heap->idx = idx++;
} else if (j == -1) {
heap->pos = HEAP_EMPTY;
free(heap->b->data); free(heap->b);
heap->b = 0;
} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
ks_heapadjust(heap, 0, n, heap);
}
if (flag & MERGE_RG) {
for (i = 0; i != n; ++i) free(RG[i]);
free(RG); free(RG_len);
}
for (i = 0; i != n; ++i) {
bam_iter_destroy(iter[i]);
bam_close(fp[i]);
}
bam_close(fpout);
free(fp); free(heap); free(iter);
return 0;
}
int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg)
{
return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1);
}
int bam_merge(int argc, char *argv[])
{
int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
char *fn_headers = NULL, *reg = 0;
while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) {
switch (c) {
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
case 'h': fn_headers = strdup(optarg); break;
case 'n': is_by_qname = 1; break;
case '1': flag |= MERGE_LEVEL1; break;
case 'u': flag |= MERGE_UNCOMP; break;
case 'R': reg = strdup(optarg); break;
case 'l': level = atoi(optarg); break;
case '@': n_threads = atoi(optarg); break;
}
}
if (optind + 2 >= argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
fprintf(stderr, "Options: -n sort by read names\n");
fprintf(stderr, " -r attach RG tag (inferred from file names)\n");
fprintf(stderr, " -u uncompressed BAM output\n");
fprintf(stderr, " -f overwrite the output BAM if exist\n");
fprintf(stderr, " -1 compress level 1\n");
fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
fprintf(stderr, " -@ INT number of BAM compression threads [0]\n");
fprintf(stderr, " -R STR merge file in the specified region STR [all]\n");
fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n");
fprintf(stderr, " the header dictionary in merging.\n\n");
return 1;
}
if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) {
FILE *fp = fopen(argv[optind], "rb");
if (fp != NULL) {
fclose(fp);
fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]);
return 1;
}
}
if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1;
free(reg);
free(fn_headers);
return ret;
}
/***************
* BAM sorting *
***************/
#include <pthread.h>
typedef bam1_t *bam1_p;
static int change_SO(bam_header_t *h, const char *so)
{
char *p, *q, *beg = 0, *end = 0, *newtext;
if (h->l_text > 3) {
if (strncmp(h->text, "@HD", 3) == 0) {
if ((p = strchr(h->text, '\n')) == 0) return -1;
*p = '\0';
if ((q = strstr(h->text, "\tSO:")) != 0) {
*p = '\n'; // change back
if (strncmp(q + 4, so, p - q - 4) != 0) {
beg = q;
for (q += 4; *q != '\n' && *q != '\t'; ++q);
end = q;
} else return 0; // no need to change
} else beg = end = p, *p = '\n';
}
}
if (beg == 0) { // no @HD
h->l_text += strlen(so) + 15;
newtext = malloc(h->l_text + 1);
sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so);
strcat(newtext, h->text);
} else { // has @HD but different or no SO
h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end);
newtext = malloc(h->l_text + 1);
strncpy(newtext, h->text, beg - h->text);
sprintf(newtext + (beg - h->text), "\tSO:%s", so);
strcat(newtext, end);
}
free(h->text);
h->text = newtext;
return 0;
}
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
if (g_is_by_qname) {
int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0)));
} else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam1_strand(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam1_strand(b)));
}
KSORT_INIT(sort, bam1_p, bam1_lt)
typedef struct {
size_t buf_len;
const char *prefix;
bam1_p *buf;
const bam_header_t *h;
int index;
} worker_t;
static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads)
{
size_t i;
bamFile fp;
fp = strcmp(fn, "-")? bam_open(fn, mode) : bam_dopen(fileno(stdout), mode);
if (fp == 0) return;
bam_header_write(fp, h);
if (n_threads > 1) bgzf_mt(fp, n_threads, 256);
for (i = 0; i < l; ++i)
bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);
bam_close(fp);
}
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
char *name;
ks_mergesort(sort, w->buf_len, w->buf, 0);
name = (char*)calloc(strlen(w->prefix) + 20, 1);
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
write_buffer(name, "w1", w->buf_len, w->buf, w->h, 0);
free(name);
return 0;
}
static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads)
{
int i;
size_t rest;
bam1_p *b;
pthread_t *tid;
pthread_attr_t attr;
worker_t *w;
if (n_threads < 1) n_threads = 1;
if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
w = calloc(n_threads, sizeof(worker_t));
tid = calloc(n_threads, sizeof(pthread_t));
b = buf; rest = k;
for (i = 0; i < n_threads; ++i) {
w[i].buf_len = rest / (n_threads - i);
w[i].buf = b;
w[i].prefix = prefix;
w[i].h = h;
w[i].index = n_files + i;
b += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
free(tid); free(w);
return n_files + n_threads;
}
/*!
@abstract Sort an unsorted BAM file based on the chromosome order
and the leftmost position of an alignment
@param is_by_qname whether to sort by query name
@param fn name of the file to be sorted
@param prefix prefix of the output and the temporary files; upon
sucessess, prefix.bam will be written.
@param max_mem approxiate maximum memory (very inaccurate)
@discussion It may create multiple temporary subalignment files
and then merge them by calling bam_merge_core(). This function is
NOT thread safe.
*/
void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level)
{
int ret, i, n_files = 0;
size_t mem, max_k, k, max_mem;
bam_header_t *header;
bamFile fp;
bam1_t *b, **buf;
char *fnout = 0;
if (n_threads < 2) n_threads = 1;
g_is_by_qname = is_by_qname;
max_k = k = 0; mem = 0;
max_mem = _max_mem * n_threads;
buf = 0;
fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
if (fp == 0) {
fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
return;
}
header = bam_header_read(fp);
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
// write sub files
for (;;) {
if (k == max_k) {
size_t old_max = max_k;
max_k = max_k? max_k<<1 : 0x10000;
buf = realloc(buf, max_k * sizeof(void*));
memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
}
if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
b = buf[k];
if ((ret = bam_read1(fp, b)) < 0) break;
if (b->data_len < b->m_data>>2) { // shrink
b->m_data = b->data_len;
kroundup32(b->m_data);
b->data = realloc(b->data, b->m_data);
}
mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
++k;
if (mem >= max_mem) {
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
mem = k = 0;
}
}
if (ret != -1)
fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
// output file name
fnout = calloc(strlen(prefix) + 20, 1);
if (is_stdout) sprintf(fnout, "-");
else sprintf(fnout, "%s.bam", prefix);
// write the final output
if (n_files == 0) { // a single block
char mode[8];
strcpy(mode, "w");
if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
ks_mergesort(sort, k, buf, 0);
write_buffer(fnout, mode, k, buf, header, n_threads);
} else { // then merge
char **fns;
n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads);
fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
fns = (char**)calloc(n_files, sizeof(char*));
for (i = 0; i < n_files; ++i) {
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
for (i = 0; i < n_files; ++i) {
unlink(fns[i]);
free(fns[i]);
}
free(fns);
}
free(fnout);
// free
for (k = 0; k < max_k; ++k) {
if (!buf[k]) continue;
free(buf[k]->data);
free(buf[k]);
}
free(buf);
bam_header_destroy(header);
bam_close(fp);
}
void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
{
bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0, 0, -1);
}
int bam_sort(int argc, char *argv[])
{
size_t max_mem = 768<<20; // 512MB
int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1;
while ((c = getopt(argc, argv, "nom:@:l:")) >= 0) {
switch (c) {
case 'o': is_stdout = 1; break;
case 'n': is_by_qname = 1; break;
case 'm': {
char *q;
max_mem = strtol(optarg, &q, 0);
if (*q == 'k' || *q == 'K') max_mem <<= 10;
else if (*q == 'm' || *q == 'M') max_mem <<= 20;
else if (*q == 'g' || *q == 'G') max_mem <<= 30;
break;
}
case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
}
}
if (optind + 2 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools sort [options] <in.bam> <out.prefix>\n\n");
fprintf(stderr, "Options: -n sort by read name\n");
fprintf(stderr, " -o final output to stdout\n");
fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n");
fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n");
fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n");
fprintf(stderr, "\n");
return 1;
}
bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level);
return 0;
}
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <zlib.h>
#ifdef _WIN32
#define drand48() ((double)rand() / RAND_MAX)
#endif
#include "ksort.h"
KSORT_INIT_GENERIC(uint64_t)
#include "kseq.h"
KSTREAM_INIT(gzFile, gzread, 8192)
typedef struct {
int n, m;
uint64_t *a;
int *idx;
} bed_reglist_t;
#include "khash.h"
KHASH_MAP_INIT_STR(reg, bed_reglist_t)
#define LIDX_SHIFT 13
typedef kh_reg_t reghash_t;
int *bed_index_core(int n, uint64_t *a, int *n_idx)
{
int i, j, m, *idx;
m = *n_idx = 0; idx = 0;
for (i = 0; i < n; ++i) {
int beg, end;
beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
if (m < end + 1) {
int oldm = m;
m = end + 1;
kroundup32(m);
idx = realloc(idx, m * sizeof(int));
for (j = oldm; j < m; ++j) idx[j] = -1;
}
if (beg == end) {
if (idx[beg] < 0) idx[beg] = i;
} else {
for (j = beg; j <= end; ++j)
if (idx[j] < 0) idx[j] = i;
}
*n_idx = end + 1;
}
return idx;
}
void bed_index(void *_h)
{
reghash_t *h = (reghash_t*)_h;
khint_t k;
for (k = 0; k < kh_end(h); ++k) {
if (kh_exist(h, k)) {
bed_reglist_t *p = &kh_val(h, k);
if (p->idx) free(p->idx);
ks_introsort(uint64_t, p->n, p->a);
p->idx = bed_index_core(p->n, p->a, &p->m);
}
}
}
int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
{
int i, min_off;
if (p->n == 0) return 0;
min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
int n = beg>>LIDX_SHIFT;
if (n > p->n) n = p->n;
for (i = n - 1; i >= 0; --i)
if (p->idx[i] >= 0) break;
min_off = i >= 0? p->idx[i] : 0;
}
for (i = min_off; i < p->n; ++i) {
if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
return 1; // find the overlap; return
}
return 0;
}
int bed_overlap(const void *_h, const char *chr, int beg, int end)
{
const reghash_t *h = (const reghash_t*)_h;
khint_t k;
if (!h) return 0;
k = kh_get(reg, h, chr);
if (k == kh_end(h)) return 0;
return bed_overlap_core(&kh_val(h, k), beg, end);
}
void *bed_read(const char *fn)
{
reghash_t *h = kh_init(reg);
gzFile fp;
kstream_t *ks;
int dret;
kstring_t *str;
// read the list
fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (fp == 0) return 0;
str = calloc(1, sizeof(kstring_t));
ks = ks_init(fp);
while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
int beg = -1, end = -1;
bed_reglist_t *p;
khint_t k = kh_get(reg, h, str->s);
if (k == kh_end(h)) { // absent from the hash table
int ret;
char *s = strdup(str->s);
k = kh_put(reg, h, s, &ret);
memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
}
p = &kh_val(h, k);
if (dret != '\n') { // if the lines has other characters
if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
beg = atoi(str->s); // begin
if (dret != '\n') {
if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
end = atoi(str->s); // end
if (end < beg) end = -1;
}
}
}
}
if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
if (beg >= 0 && end > beg) {
if (p->n == p->m) {
p->m = p->m? p->m<<1 : 4;
p->a = realloc(p->a, p->m * 8);
}
p->a[p->n++] = (uint64_t)beg<<32 | end;
}
}
ks_destroy(ks);
gzclose(fp);
free(str->s); free(str);
bed_index(h);
return h;
}
void bed_destroy(void *_h)
{
reghash_t *h = (reghash_t*)_h;
khint_t k;
for (k = 0; k < kh_end(h); ++k) {
if (kh_exist(h, k)) {
free(kh_val(h, k).a);
free(kh_val(h, k).idx);
free((char*)kh_key(h, k));
}
}
kh_destroy(reg, h);
}
This diff is collapsed.
This diff is collapsed.
package Grun;
use Exporter;
use JSON::XS;
use File::Temp qw(tempfile);
use Carp;
our @ISA=qw(Exporter);
our @EXPORT=qw(grun grun_wait grun_kill);
my %JTMP;
sub grun_wait {
my ($jid) = @_;
my $ret = system("grun -q wait $jid 2>&1");
if ($ret) {
$ret =(($ret<<8)&255) if $ret > 255;
$ret = 1 if !$ret;
}
open (my $fh, '<', $JTMP{$jid} . ".out");
local $/=undef;
my $out=<$fh>;
close $fh;
# copy pasted from below... make a function!
$out=decode_json($out);
if ($out->{err}) {
# remote eval died... so we do too
die $out->{err};
};
if (wantarray) {
# return array
return @{$out->{ret}};
} else {
# return single value
return $out->{ret}->[0];
}
}
sub grun {
# this is only required on the execution node....so don't use it everywhere if not needed
require B::RecDeparse;
# at most 9 levels deep
my $deparse=B::RecDeparse->new(level=>9);
my ($op, $func, @args) = @_;
croak("usage: grun({options}, \\\&function, \@args)") unless ref($func) eq 'CODE' && defined(wantarray);
($fh, $filename) = tempfile(".grun.XXXXXX", DIR=>".");
my $code=$deparse->coderef2text($func);
my $def=encode_json({code=>$code, args=>\@args, wantarray=>wantarray});
print $fh $def;
close $fh;
my $opts;
if ($op->{nowait}) {
$opts = "-o $filename.out -nowait";
}
my $cmd = "grun $opts $^X -MGrun -e \"\\\"Grun::exec('$filename')\\\"\"";
# get output (json string)
my $out = `$cmd`;
if ($op->{nowait}) {
my ($jid) = $out =~ /job_id.*:\s*(\d+)/i;
$JTMP{$jid}=$filename;
return $jid;
}
$out=decode_json($out);
if ($out->{err}) {
# remote eval died... so we do too
die $out->{err};
};
if (wantarray) {
# return array
return @{$out->{ret}};
} else {
# return single value
return $out->{ret}->[0];
}
}
sub exec {
my ($fil) = @_;
local $/ = undef;
open( my $fh, '<', $fil );
my $json = <$fh>;
close $fh;
my $hash=decode_json($json);
my $sub = "sub " . $hash->{code};
$sub = eval($sub);
my (@ret, $ret, $err);
eval {
if ($hash->{wantarray}) {
@ret=&{$sub}(@{$hash->{args}});
} else {
# scalar context
$ret=&{$sub}(@{$hash->{args}});
@ret=(($ret));
}
};
my $err=$@;
my $out=encode_json({ret=>\@ret, err=>$err});
# return output via STDOUT
print $out;
}
1;
# wildcards
.deps
.dirstamp
*.la
.libs
*.lo
*.o
*_test
# specific files
/aclocal.m4
/autom4te.cache/
/benchmark
/config.guess
/config.h
/config.h.in
/config.log
/config.status
/config.sub
/configure
/db_bench
/depcomp
/hyperleveldb.upack
/install-sh
/leveldbutil
/leveldb-verify
/libhyperleveldb.pc
/libtool
/ltmain.sh
/m4/
/Makefile
/Makefile.in
/Makefile.old
/missing
/stamp-h1