Skip to content
Commits on Source (9)
# version format.
# you can use {branch} name in version format too
# version: 1.0.{build}-{branch}
version: 'vers.{build}'
# branches to build
branches:
# Blacklist
except:
- gh-pages
# Do not build on tags (GitHub and BitBucket)
skip_tags: true
# Skipping commits affecting specific files (GitHub only). More details here: /docs/appveyor-yml
#skip_commits:
# files:
# - docs/*
# - '**/*.html'
# We use Mingw/Msys, so use pacman for installs
install:
- set HOME=.
- set MSYSTEM=MINGW64
- set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
- set MINGWPREFIX=x86_64-w64-mingw32
- "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\""
# The user may have e.g. jkbonfield/samtools branch FOO and an associated
# jkbonfield/htslib branch FOO. If so use that related htslib, obtained by
# munging $APPVEYOR_REPO_NAME. Otherwise we assume this is a PR only to
# samtools and should be linked against samtools(org)/htslib develop branch.
clone_script:
- "sh -lc \"git clone --branch=$APPVEYOR_REPO_BRANCH https://github.com/$APPVEYOR_REPO_NAME $APPVEYOR_BUILD_FOLDER\""
- "sh -lc \"git clone --branch=$APPVEYOR_REPO_BRANCH https://github.com/`echo $APPVEYOR_REPO_NAME|sed 's#/samtools#/htslib#'`.git $APPVEYOR_BUILD_FOLDER/htslib || git clone https://github.com/samtools/htslib.git $APPVEYOR_BUILD_FOLDER/htslib \""
build_script:
- set HOME=.
- set MSYSTEM=MINGW64
- set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
- "sh -lc \"(cd htslib; aclocal && autoheader && autoconf)\""
- "sh -lc \"aclocal && autoheader && autoconf && ./configure && make -j2\""
test_script:
- set HOME=.
- set MSYSTEM=MINGW64
- set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
- "sh -lc \"make test\""
Release 1.8 (3rd April 2018)
----------------------------
* samtools calmd now has a quiet mode. This can be enabled by passing `-Q` to
calmd. (Thanks to Colin Davenport)
* In samtools depth `-d 0` will effectively remove the depth limit. (#764)
* Improvements made to samtools collate's interface and documentation. It is
now possible to specify an output file name using `-o`, instead of deriving
it from the prefix used for temporary files. The prefix itself is now
optional if `-o` or `-O` (to stdout) is used. (#780)
* Bug-fixes:
- Make samtools addreplacerg choose output format by file extension. (#767;
reported by Argy Megalios)
- Merge tests now work on ungzipped data, allowing tests to be run against
different deflate libraries.
- samtools markdup error messages about missing tags have been updated with
the suggestion that samtools fixmate is run beforehand. (#765; reported by
Yudong Cai)
- Enables the `--reference` option for samtools fastq. Now works like other
programs when a reference sequence is needed for CRAM files. (#791,
reported by Milana Kaljevic)
Release 1.7 (26th January 2018)
--------------------
......
......@@ -9,7 +9,7 @@ Building samtools
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
cd .../samtools-1.7 # Within the unpacked release directory
cd .../samtools-1.8 # Within the unpacked release directory
./configure
make
......@@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. Building for
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
cd .../samtools-1.7 # Within the unpacked release directory
cd .../samtools-1.8 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
......
......@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */
@copyright Genome Research Ltd.
*/
#define BAM_VERSION "1.7"
#define BAM_VERSION "1.8"
#include <stdint.h>
#include <stdlib.h>
......
......@@ -81,7 +81,8 @@ static int usage() {
fprintf(stderr, " -b <bed> list of positions or regions\n");
fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
fprintf(stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
fprintf(stderr, " -d/-m <int> maximum coverage depth [8000]. If 0, depth is set to the maximum\n"
" integer value, effectively removing any depth limit.\n"); // the htslib's default
fprintf(stderr, " -q <int> base quality threshold [0]\n");
fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(stderr, " -r <chr:from-to> region\n");
......@@ -206,6 +207,8 @@ int main_depth(int argc, char *argv[])
mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
if (0 < max_depth)
bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth
else if (!max_depth)
bam_mplp_set_maxcnt(mplp,INT_MAX);
n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
......
......@@ -358,7 +358,9 @@ static void orphan_only_func(const state_t* state, bam1_t* file_read)
}
static bool init(const parsed_opts_t* opts, state_t** state_out) {
char output_mode[8] = "w";
state_t* retval = (state_t*) calloc(1, sizeof(state_t));
if (retval == NULL) {
fprintf(stderr, "[init] Out of memory allocating state struct.\n");
return false;
......@@ -374,7 +376,9 @@ static bool init(const parsed_opts_t* opts, state_t** state_out) {
retval->input_header = sam_hdr_read(retval->input_file);
retval->output_header = bam_hdr_dup(retval->input_header);
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
if (opts->output_name) // File format auto-detection
sam_open_mode(output_mode + 1, opts->output_name, NULL);
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, output_mode, &opts->ga.out);
if (retval->output_file == NULL) {
print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
......
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
Copyright (C) 2017 Genome Research Ltd.
Copyright (C) 2017-18 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
......@@ -276,7 +276,7 @@ static int64_t get_mate_score(bam1_t *b) {
if ((data = bam_aux_get(b, "ms"))) {
score = bam_aux2i(data);
} else {
fprintf(stderr, "[markdup] error: no ms score tag.\n");
fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
return -1;
}
......@@ -323,7 +323,7 @@ static int make_pair_key(key_data_t *key, bam1_t *bam) {
other_end = unclipped_other_end(bam->core.mpos, cig);
other_coord = unclipped_other_start(bam->core.mpos, cig);
} else {
fprintf(stderr, "[markdup] error: no MC tag.\n");
fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
return 1;
}
......@@ -634,14 +634,14 @@ static int bam_mark_duplicates(samFile *in, samFile *out, char *prefix, int remo
bp = &kh_val(pair_hash, k);
if ((mate_tmp = get_mate_score(bp->p)) == -1) {
fprintf(stderr, "[markdup] error: no ms score tag.\n");
fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
return 1;
} else {
old_score = calc_score(bp->p) + mate_tmp;
}
if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
fprintf(stderr, "[markdup] error: no ms score tag.\n");
fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
return 1;
} else {
new_score = calc_score(in_read->b) + mate_tmp;
......
......@@ -254,7 +254,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
{
bam_hdr_t *header;
bam1_t *b[2] = { NULL, NULL };
int curr, has_prev, pre_end = 0, cur_end = 0;
int curr, has_prev, pre_end = 0, cur_end = 0, result;
kstring_t str;
str.l = str.m = 0; str.s = 0;
......@@ -280,7 +280,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
b[0] = bam_init1();
b[1] = bam_init1();
curr = 0; has_prev = 0;
while (sam_read1(in, header, b[curr]) >= 0) {
while ((result = sam_read1(in, header, b[curr])) >= 0) {
bam1_t *cur = b[curr], *pre = b[1-curr];
if (cur->core.flag & BAM_FSECONDARY)
{
......@@ -365,6 +365,7 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop
curr = 1 - curr;
pre_end = cur_end;
}
if (result < -1) goto fail;
if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
bam1_t *pre = b[1-curr];
if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
......
......@@ -46,7 +46,7 @@ DEALINGS IN THE SOFTWARE. */
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm, int quiet_mode)
{
uint8_t *seq = bam_get_seq(b);
uint32_t *cigar = bam_get_cigar(b);
......@@ -116,7 +116,9 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
if (old_nm) old_nm_i = bam_aux2i(old_nm);
if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
else if (nm != old_nm_i) {
if (!quiet_mode) {
fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
}
bam_aux_del(b, old_nm);
bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
}
......@@ -134,7 +136,9 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
if (i < str->l) is_diff = 1;
} else is_diff = 1;
if (is_diff) {
if (!quiet_mode) {
fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
}
bam_aux_del(b, old_md);
bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
}
......@@ -156,20 +160,21 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
free(str->s); free(str);
}
void bam_fillmd1(bam1_t *b, char *ref, int flag)
void bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode)
{
bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
bam_fillmd1_core(b, ref, INT_MAX, flag, 0, quiet_mode);
}
int calmd_usage() {
fprintf(stderr,
"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
"Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
"Options:\n"
" -e change identical bases to '='\n"
" -u uncompressed BAM output (for piping)\n"
" -b compressed BAM output\n"
" -S ignored (input format is auto-detected)\n"
" -A modify the quality string\n"
" -Q use quiet mode to output less debug info to stdout\n"
" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
" -E extended BAQ for better sensitivity but lower specificity\n");
......@@ -179,7 +184,7 @@ int calmd_usage() {
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode;
htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
bam_hdr_t *header = NULL;
......@@ -194,9 +199,9 @@ int bam_fillmd(int argc, char *argv[])
};
flt_flag = UPDATE_NM | UPDATE_MD;
is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = quiet_mode = 0;
strcpy(mode_w, "w");
while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "EqQreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
......@@ -211,6 +216,7 @@ int bam_fillmd(int argc, char *argv[])
case 'C': capQ = atoi(optarg); break;
case 'A': baq_flag |= 1; break;
case 'E': baq_flag |= 2; break;
case 'Q': quiet_mode = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
/* else fall-through */
......@@ -283,7 +289,7 @@ int bam_fillmd(int argc, char *argv[])
int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm, quiet_mode);
}
if (sam_write1(fpout, header, b) < 0) {
print_error_errno("calmd", "failed to write to output file");
......
......@@ -123,6 +123,7 @@ static int strnum_cmp(const char *_a, const char *_b)
typedef struct {
int i;
uint32_t rev;
uint64_t pos, idx;
bam1_tag entry;
} heap1_t;
......@@ -150,6 +151,7 @@ static inline int heap_lt(const heap1_t a, const heap1_t b)
if (fa != fb) return fa > fb;
} else {
if (a.pos != b.pos) return a.pos > b.pos;
if (a.rev != b.rev) return a.rev > b.rev;
}
// This compares by position in the input file(s)
if (a.i != b.i) return a.i > b.i;
......@@ -1366,7 +1368,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
res = iter[i] ? sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record);
if (res >= 0) {
bam_translate(h->entry.bam_record, translation_tbl + i);
h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1)<<1 | bam_is_rev(h->entry.bam_record);
h->pos = ((uint64_t)h->entry.bam_record->core.tid<<32) | (uint32_t)((int32_t)h->entry.bam_record->core.pos+1);
h->rev = bam_is_rev(h->entry.bam_record);
h->idx = idx++;
if (g_is_by_tag) {
h->entry.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
......@@ -1413,7 +1416,8 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
}
if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
bam_translate(b, translation_tbl + heap->i);
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1);
heap->rev = bam_is_rev(b);
heap->idx = idx++;
if (g_is_by_tag) {
heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
......@@ -1649,8 +1653,8 @@ static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp,
}
if (res >= 0) {
heap->pos = (((uint64_t)heap->entry.bam_record->core.tid<<32)
| (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1)<<1
| bam_is_rev(heap->entry.bam_record));
| (uint32_t)((int32_t)heap->entry.bam_record->core.pos+1));
heap->rev = bam_is_rev(heap->entry.bam_record);
heap->idx = (*idx)++;
if (g_is_by_tag) {
heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
......@@ -1804,8 +1808,14 @@ static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
if (t != 0) return t;
return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
} else {
pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1)<<1|bam_is_rev(a.bam_record);
pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1)<<1|bam_is_rev(b.bam_record);
pa = (uint64_t)a.bam_record->core.tid<<32|(a.bam_record->core.pos+1);
pb = (uint64_t)b.bam_record->core.tid<<32|(b.bam_record->core.pos+1);
if (pa == pb) {
pa = bam_is_rev(a.bam_record);
pb = bam_is_rev(b.bam_record);
}
return pa < pb ? -1 : (pa > pb ? 1 : 0);
}
}
......@@ -2215,6 +2225,7 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
bam_destroy1(b);
free(buf);
free(bam_mem);
free(in_mem);
bam_hdr_destroy(header);
if (fp) sam_close(fp);
return ret;
......
......@@ -30,6 +30,11 @@ DEALINGS IN THE SOFTWARE. */
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <errno.h>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#include "htslib/sam.h"
#include "htslib/hts.h"
#include "htslib/ksort.h"
......@@ -78,12 +83,12 @@ static inline int elem_lt(elem_t x, elem_t y)
KSORT_INIT(bamshuf, elem_t, elem_lt)
static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
int is_stdout, sam_global_args *ga)
int is_stdout, const char *output_file, sam_global_args *ga)
{
samFile *fp, *fpw = NULL, **fpt = NULL;
char **fnt = NULL, modew[8];
bam1_t *b = NULL;
int i, l, r;
int i, counter, l, r;
bam_hdr_t *h = NULL;
int64_t j, max_cnt = 0, *cnt = NULL;
elem_t *a = NULL;
......@@ -131,15 +136,18 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
l = strlen(pre);
for (i = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 10, 1);
for (i = counter = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 20, 1);
if (!fnt[i]) goto mem_fail;
sprintf(fnt[i], "%s.%.4d.bam", pre, i);
fpt[i] = sam_open(fnt[i], "wb1");
do {
sprintf(fnt[i], "%s.%04d.bam", pre, counter++);
fpt[i] = sam_open(fnt[i], "wxb1");
} while (!fpt[i] && errno == EEXIST);
if (fpt[i] == NULL) {
print_error_errno("collate", "Cannot open intermediate file \"%s\"", fnt[i]);
goto fail;
}
if (p.pool) hts_set_opt(fpt[i], HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpt[i], h) < 0) {
print_error_errno("collate", "Couldn't write header to intermediate file \"%s\"", fnt[i]);
goto fail;
......@@ -180,7 +188,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
fp = NULL;
// merge
sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
if (!is_stdout) { // output to a file
if (!is_stdout && !output_file) { // output to a file (name based on prefix)
char *fnw = (char*)calloc(l + 5, 1);
if (!fnw) goto mem_fail;
if (ga->out.format == unknown_format)
......@@ -189,6 +197,13 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out));
fpw = sam_open_format(fnw, modew, &ga->out);
free(fnw);
} else if (output_file) { // output to a given file
modew[0] = 'w'; modew[1] = '\0';
sam_open_mode(modew + 1, output_file, NULL);
j = strlen(modew);
snprintf(modew + j, sizeof(modew) - j, "%d",
(clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
fpw = sam_open_format(output_file, modew, &ga->out);
} else fpw = sam_open_format("-", modew, &ga->out); // output to stdout
if (fpw == NULL) {
if (is_stdout) print_error_errno("collate", "Cannot open standard output");
......@@ -281,42 +296,87 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
static int usage(FILE *fp, int n_files) {
fprintf(fp,
"Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] <in.bam> <out.prefix>\n\n"
"Usage: samtools collate [-Ou] [-o <name>] [-n nFiles] [-l cLevel] <in.bam> [<prefix>]\n\n"
"Options:\n"
" -O output to stdout\n"
" -o output file name (use prefix if not set)\n"
" -u uncompressed BAM output\n"
" -l INT compression level [%d]\n" // DEF_CLEVEL
" -n INT number of temporary files [%d]\n", // n_files
DEF_CLEVEL, n_files);
sam_global_opt_help(fp, "-....@");
fprintf(fp,
" <prefix> is required unless the -o or -O options are used.\n");
return 1;
}
char * generate_prefix() {
char *prefix;
unsigned int pid = getpid();
#ifdef _WIN32
# define PREFIX_LEN (MAX_PATH + 16)
DWORD ret;
prefix = calloc(PREFIX_LEN, sizeof(*prefix));
if (!prefix) {
perror("collate");
return NULL;
}
ret = GetTempPathA(MAX_PATH, prefix);
if (ret > MAX_PATH || ret == 0) {
fprintf(stderr,
"[E::collate] Couldn't get path for temporary files.\n");
free(prefix);
return NULL;
}
snprintf(prefix + ret, PREFIX_LEN - ret, "\\%x", pid);
return prefix;
#else
# define PREFIX_LEN 64
prefix = malloc(PREFIX_LEN);
if (!prefix) {
perror("collate");
return NULL;
}
snprintf(prefix, PREFIX_LEN, "/tmp/collate%x", pid);
return prefix;
#endif
}
int main_bamshuf(int argc, char *argv[])
{
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
const char *output_file = NULL, *prefix = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "n:l:uOo:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'u': is_un = 1; break;
case 'O': is_stdout = 1; break;
case 'o': output_file = optarg; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return usage(stderr, n_files);
}
}
if (is_un) clevel = 0;
if (optind + 2 > argc)
if (argc >= optind + 2) prefix = argv[optind+1];
if (!(prefix || is_stdout || output_file))
return usage(stderr, n_files);
if (is_stdout && output_file) {
fprintf(stderr, "collate: -o and -O options cannot be used together.\n");
return usage(stderr, n_files);
}
if (!prefix) prefix = generate_prefix();
if (!prefix) return EXIT_FAILURE;
return bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout, &ga);
return bamshuf(argv[optind], n_files, prefix, clevel, is_stdout,
output_file, &ga);
}
samtools (1.8-1) experimental; urgency=medium
* New upstream version
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
* hardening=+all
* Bump versioned Build-Depends: libhts-dev (>= 1.8)
-- Andreas Tille <tille@debian.org> Thu, 03 May 2018 12:02:20 +0200
samtools (1.7-2) unstable; urgency=medium
* Install examples (since it is used in dwgsim package and might be
......
......@@ -9,16 +9,16 @@ Build-Depends: debhelper (>= 11~),
# libio-pty-perl is needed by the regression test.
libio-pty-perl,
libncurses5-dev,
libhts-dev (>= 1.7),
libhts-dev (>= 1.8),
zlib1g-dev,
automake,
autoconf-archive,
pkg-config,
tabix (>= 1.0)
# tabix is needed for the regression tests.
Standards-Version: 4.1.3
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/samtools.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/samtools.git
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/samtools
Vcs-Git: https://salsa.debian.org/med-team/samtools.git
Homepage: http://www.htslib.org/
Package: samtools
......
......@@ -6,7 +6,7 @@ Last-Update: 2017-12-22
--- a/test/test.pl
+++ b/test/test.pl
@@ -2472,22 +2472,22 @@ sub test_sort
@@ -2476,22 +2476,22 @@ sub test_sort
# Pos sort
......@@ -34,4 +34,4 @@ Last-Update: 2017-12-22
+ test_cmd($opts, out=>"sort/tag.fi.sort.expected.sam", cmd=>"$$opts{bin}/samtools sort${threads} -m 192M -t FI $$opts{path}/dat/test_input_1_d.sam -O SAM -o -");
}
sub test_fixmate
sub test_collate
......@@ -3,6 +3,8 @@
export DH_VERBOSE=1
export CURSES_LIB=-lcurses
export DEB_BUILD_MAINT_OPTIONS=hardening=+all
%:
dh $@
......
.TH wgsim 1 "26 January 2018" "samtools-1.7" "Bioinformatics tools"
.TH wgsim 1 "3 April 2018" "samtools-1.8" "Bioinformatics tools"
.SH NAME
wgsim \- Whole-genome sequencing read simulator
.SH SYNOPSIS
......
......@@ -1392,6 +1392,14 @@ static bool init_state(const bam2fq_opts_t* opts, bam2fq_state_t** state_out)
}
}
if (opts->ga.reference) {
if (hts_set_fai_filename(state->fp, opts->ga.reference) != 0) {
print_error_errno("bam2fq", "cannot load reference \"%s\"", opts->ga.reference);
free(state);
return false;
}
}
int i;
for (i = 0; i < 3; ++i) {
if (opts->fnr[i]) {
......
'\" t
.TH samtools 1 "26 January 2018" "samtools-1.7" "Bioinformatics tools"
.TH samtools 1 "3 April 2018" "samtools-1.8" "Bioinformatics tools"
.SH NAME
samtools \- Utilities for the Sequence Alignment/Map (SAM) format
.\"
......@@ -85,7 +85,7 @@ samtools fasta input.bam > output.fasta
.PP
samtools addreplacerg -r 'ID:fish' -r 'LB:1334' -r 'SM:alpha' -o output.bam input.bam
.PP
samtools collate aln.sorted.bam aln.name_collated.bam
samtools collate -o aln.name_collated.bam aln.sorted.bam
.PP
samtools depad input.bam
.PP
......@@ -830,7 +830,7 @@ has coverage outside of the region specified in the BED file.
.TP
.BI "-m, -d " INT
.RI "Truncate reported depth at a maximum of " INT " reads."
[8000]
[8000]. If 0, depth is set to the maximum integer value, effectively removing any depth limit.
.TP
.BI "-q " INT
.RI "Only count reads with base quality greater than " INT
......@@ -1566,7 +1566,7 @@ ignore the left part of the tag until the separator, then use the second part
.B collate
samtools collate
.RI [ options ]
.IR in.sam | in.bam | in.cram " [" out.prefix "]"
.IR in.sam | in.bam | in.cram " [" <prefix> "]"
Shuffles and groups reads together by their names.
A faster alternative to a full query name sort,
......@@ -1577,11 +1577,23 @@ but doesn't make any guarantees about the order of read names between groups.
The output from this command should be suitable for any operation that
requires all reads from the same template to be grouped together.
If present, <prefix> is used to name the temporary files that collate
uses when sorting the data. If neither the '-O' nor '-o' options are used,
<prefix> must be present and collate will use it to make an output file name
by appending a suffix depending on the format written (.bam by default).
If either the -O or -o option is used, <prefix> is optional. If <prefix>
is absent, collate will write the temporary files to a system-dependent
location (/tmp on UNIX).
.B Options:
.RS
.TP 8
.B -O
Output to stdout rather than to files starting with out.prefix
Output to stdout. This option cannot be used with '-o'.
.TP
.B -o FILE
Write output to FILE. This option cannot be used with '-O'.
.TP
.B -u
Write uncompressed BAM output
......@@ -1954,7 +1966,18 @@ MD5 not being obtainable via the REF_PATH or REF_CACHE environment variables.
.BI decode_md= 0|1
CRAM input only; defaults to 1 (on). CRAM does not typically store
MD and NM tags, preferring to generate them on the fly. This option
controls this behaviour.
controls this behaviour. It can be particularly useful when combined
with a file encoded using store_md=1 and store_nm=1.
.TP
.BI store_md= 0|1
CRAM output only; defaults to 0 (off). CRAM normally only stores MD
tags when no reference is unknown and lets the decoder generate these
values on-the-fly (see decode_md).
.TP
.BI store_nm= 0|1
CRAM output only; defaults to 0 (off). CRAM normally only stores NM
tags when no reference is unknown and lets the decoder generate these
values on-the-fly (see decode_md).
.TP
.BI ignore_md5= 0|1
CRAM input only; defaults to 0 (off). When enabled, md5 checksum
......
......@@ -60,6 +60,7 @@ DEALINGS IN THE SOFTWARE. */
#include <htslib/kstring.h>
#include "stats_isize.h"
#include "sam_opts.h"
#include "bedidx.h"
#define BWA_MIN_RDLEN 35
// From the spec
......@@ -213,6 +214,8 @@ typedef struct
char* split_name;
stats_info_t* info; // Pointer to options and settings struct
pos_t *chunks;
uint32_t nchunks;
}
stats_t;
......@@ -222,6 +225,12 @@ static void error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
void realloc_buffers(stats_t *stats, int seq_len);
static int regions_lt(const void *r1, const void *r2) {
int64_t from_diff = (int64_t)((pos_t *)r1)->from - (int64_t)((pos_t *)r2)->from;
int64_t to_diff = (int64_t)((pos_t *)r1)->to - (int64_t)((pos_t *)r2)->to;
return from_diff > 0 ? 1 : from_diff < 0 ? -1 : to_diff > 0 ? 1 : to_diff < 0 ? -1 : 0;
}
// Coverage distribution methods
static inline int coverage_idx(int min, int max, int n, int step, int depth)
......@@ -875,7 +884,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
int ncig = bam_cigar_oplen(bam_get_cigar(bam_line)[i]);
if ( !ncig ) continue; // curiously, this can happen: 0D
if ( cig==BAM_CDEL ) readlen += ncig;
else if ( cig==BAM_CMATCH )
else if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
{
if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
......@@ -958,7 +967,52 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
// Coverage distribution graph
round_buffer_flush(stats,bam_line->core.pos);
round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
if ( stats->regions ) {
uint32_t p = bam_line->core.pos, pnew, pmin, pmax, j;
pmin = pmax = i = j = 0;
while ( j < bam_line->core.n_cigar && i < stats->nchunks ) {
int op = bam_cigar_op(bam_get_cigar(bam_line)[j]);
int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]);
switch(op) {
case BAM_CMATCH:
case BAM_CEQUAL:
case BAM_CDIFF:
pmin = MAX(p, stats->chunks[i].from-1);
pmax = MIN(p+oplen, stats->chunks[i].to);
if ( pmax >= pmin )
round_buffer_insert_read(&(stats->cov_rbuf), pmin, pmax-1);
break;
case BAM_CDEL:
break;
}
pnew = p + (bam_cigar_type(op)&2 ? oplen : 0); // consumes reference
if ( pnew >= stats->chunks[i].to ) {
// go to the next chunk
i++;
} else {
// go to the next CIGAR op
j++;
p = pnew;
}
}
} else {
uint32_t p = bam_line->core.pos, j;
for (j = 0; j < bam_line->core.n_cigar; j++) {
int op = bam_cigar_op(bam_get_cigar(bam_line)[j]);
int oplen = bam_cigar_oplen(bam_get_cigar(bam_line)[j]);
switch(op) {
case BAM_CMATCH:
case BAM_CEQUAL:
case BAM_CDIFF:
round_buffer_insert_read(&(stats->cov_rbuf), p, p+oplen-1);
break;
case BAM_CDEL:
break;
}
p += bam_cigar_type(op)&2 ? oplen : 0; // consumes reference
}
}
}
}
......@@ -1225,7 +1279,7 @@ void init_regions(stats_t *stats, const char *file)
if ( !fp ) error("%s: %s\n",file,strerror(errno));
kstring_t line = { 0, 0, NULL };
int warned = 0;
int warned = 0, r, p, new_p;
int prev_tid=-1, prev_pos=-1;
while (line.l = 0, kgetline(&line, (kgets_func *)fgets, fp) >= 0)
{
......@@ -1272,10 +1326,27 @@ void init_regions(stats_t *stats, const char *file)
if ( prev_pos>stats->regions[tid].pos[npos].from )
error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line.s,stats->regions[tid].pos[npos].from,prev_pos);
stats->regions[tid].npos++;
if ( stats->regions[tid].npos > stats->nchunks )
stats->nchunks = stats->regions[tid].npos;
}
free(line.s);
if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
fclose(fp);
// sort region intervals and remove duplicates
for (r = 0; r < stats->nregions; r++) {
regions_t *reg = &stats->regions[r];
qsort(reg->pos, reg->npos, sizeof(pos_t), regions_lt);
for (new_p = 0, p = 1; p < reg->npos; p++) {
if ( reg->pos[new_p].to < reg->pos[p].from )
reg->pos[++new_p] = reg->pos[p];
else if ( reg->pos[new_p].to < reg->pos[p].to )
reg->pos[new_p].to = reg->pos[p].to;
}
reg->npos = ++new_p;
}
stats->chunks = calloc(stats->nchunks, sizeof(pos_t));
}
void destroy_regions(stats_t *stats)
......@@ -1287,6 +1358,7 @@ void destroy_regions(stats_t *stats)
free(stats->regions[i].pos);
}
if ( stats->regions ) free(stats->regions);
if ( stats->chunks ) free(stats->chunks);
}
void reset_regions(stats_t *stats)
......@@ -1311,14 +1383,67 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats)
int i = reg->cpos;
while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
if ( bam_endpos(bam_line) < reg->pos[i].from ) return 0;
//found a read overlapping a region
reg->cpos = i;
stats->reg_from = reg->pos[i].from;
stats->reg_to = reg->pos[i].to;
//now find all the overlapping chunks
stats->nchunks = 0;
while (i < reg->npos) {
if (bam_line->core.pos < reg->pos[i].to && bam_endpos(bam_line) >= reg->pos[i].from) {
int32_t endpos = bam_endpos(bam_line);
stats->chunks[stats->nchunks].from = MAX(bam_line->core.pos+1, reg->pos[i].from);
stats->chunks[stats->nchunks].to = MIN(endpos, reg->pos[i].to);
stats->nchunks++;
}
i++;
}
return 1;
}
int replicate_regions(stats_t *stats, hts_itr_multi_t *iter) {
if ( !stats || !iter)
return 1;
int i, j, tid;
stats->nregions = iter->n_reg;
stats->regions = calloc(stats->nregions, sizeof(regions_t));
stats->chunks = calloc(stats->nchunks, sizeof(pos_t));
if ( !stats->regions || !stats->chunks )
return 1;
for (i = 0; i < stats->nregions; i++) {
tid = iter->reg_list[i].tid;
if ( tid < 0 )
continue;
if ( tid >= stats->nregions ) {
stats->nregions = tid+10;
regions_t *tmp = realloc(stats->regions, stats->nregions * sizeof(regions_t));
if (!tmp)
return 1;
stats->regions = tmp;
memset(stats->regions + tid, 0, 10 * sizeof(regions_t));
}
stats->regions[tid].mpos = stats->regions[tid].npos = iter->reg_list[i].count;
stats->regions[tid].pos = calloc(stats->regions[tid].mpos, sizeof(pos_t));
if ( !stats->regions[tid].pos )
return 1;
for (j = 0; j < stats->regions[tid].npos; j++) {
stats->regions[tid].pos[j].from = iter->reg_list[i].intervals[j].beg+1;
stats->regions[tid].pos[j].to = iter->reg_list[i].intervals[j].end;
}
}
return 0;
}
void init_group_id(stats_t *stats, const char *id)
{
#if 0
......@@ -1507,6 +1632,7 @@ stats_t* stats_init()
stats->is_sorted = 1;
stats->nindels = stats->nbases;
stats->split_name = NULL;
stats->nchunks = 0;
return stats;
}
......@@ -1661,7 +1787,10 @@ int main_stats(int argc, char *argv[])
bam_fname = "-";
}
if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
if (init_stat_info_fname(info, bam_fname, &ga.in)) {
free(info);
return 1;
}
if (ga.nthreads > 0)
hts_set_threads(info->sam, ga.nthreads);
......@@ -1676,26 +1805,56 @@ int main_stats(int argc, char *argv[])
bam1_t *bam_line = bam_init1();
if ( optind<argc )
{
int filter = 1;
// Prepare the region hash table for the multi-region iterator
void *region_hash = bed_hash_regions(NULL, argv, optind, argc, &filter);
if (region_hash) {
// Collect stats in selected regions only
hts_idx_t *bam_idx = sam_index_load(info->sam,bam_fname);
if (bam_idx == 0)
error("Random alignment retrieval only works for indexed BAM files.\n");
if (bam_idx) {
int i;
for (i=optind; i<argc; i++)
{
hts_itr_t* iter = bam_itr_querys(bam_idx, info->sam_header, argv[i]);
while (sam_itr_next(info->sam, iter, bam_line) >= 0) {
int regcount = 0;
hts_reglist_t *reglist = bed_reglist(region_hash, ALL, &regcount);
if (reglist) {
hts_itr_multi_t *iter = sam_itr_regions(bam_idx, info->sam_header, reglist, regcount);
if (iter) {
if (!targets) {
all_stats->nchunks = argc-optind;
if ( replicate_regions(all_stats, iter) )
fprintf(stderr, "Replications of the regions failed.");
}
if ( all_stats->nregions && all_stats->regions ) {
while (sam_itr_multi_next(info->sam, iter, bam_line) >= 0) {
if (info->split_tag) {
curr_stats = get_curr_split_stats(bam_line, split_hash, info, targets);
collect_stats(bam_line, curr_stats);
}
collect_stats(bam_line, all_stats);
}
reset_regions(all_stats);
bam_itr_destroy(iter);
}
hts_itr_multi_destroy(iter);
} else {
fprintf(stderr, "Creation of the region iterator failed.");
hts_reglist_free(reglist, regcount);
}
} else {
fprintf(stderr, "Creation of the region list failed.");
}
hts_idx_destroy(bam_idx);
} else {
fprintf(stderr, "Random alignment retrieval only works for indexed BAM files.\n");
}
bed_destroy(region_hash);
} else {
fprintf(stderr, "Creation of the region hash table failed.\n");
}
}
else
{
......
@HD VN:1.4 SO:unsorted GO:query
@SQ SN:insert LN:599
@SQ SN:ref1 LN:45
@SQ SN:ref2 LN:40
@SQ SN:ref3 LN:4
@PG ID:llama
@RG ID:fish PG:llama
@RG ID:cow PU:13_&^&&*(:332 PG:donkey
@RG PU:*9u8jkjjkjd: ID:colt
@PG ID:bull PP:donkey
@PG ID:donkey
@CO Do you know?
r006 0 ref1 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * XA:Z:abc XB:i:-10 RG:Z:colt PG:Z:donkey AS:i:20 FI:f:4.5
r006 16 ref1 29 30 6H5M * 0 0 TAGGC * RG:Z:colt PG:Z:donkey FI:i:3
x8 0 ref2 2 30 21M * 0 0 GGTTTTATAAAACAAATAATT ????????????????????? RG:Z:cow PG:Z:bull AS:i:10 FI:f:1.5
x10 0 ref2 10 30 25M * 0 0 CAAATAATTAAGTCTACAGAGCAAC ????????????????????????? RG:Z:cow PG:Z:bull AS:i:0 FI:A:b
x7 0 ref2 1 30 20M * 0 0 AGGTTTTATAAAACAAATAA * RG:Z:cow PG:Z:bull AS:i:50 FI:i:2
x11 0 ref2 12 30 24M * 0 0 AATAATTAAGTCTACAGAGCAACT ???????????????????????? RG:Z:cow PG:Z:bull FI:Z:a
r005 83 ref1 37 30 9M = 7 -39 CAGCGCCAT * RG:Z:colt PG:Z:donkey AS:i:100 FI:f:2.5
r005 163 ref1 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 YY:i:100 RG:Z:colt PG:Z:donkey AS:i:10 FI:i:5
r007 0 ref1 9 30 5H6M * 0 0 AGCTAA * RG:Z:colt PG:Z:donkey AS:i:1 FI:i:4
r007 0 ref1 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * RG:Z:colt PG:Z:donkey AS:i:-5 FI:f:3.5
x12 0 ref2 14 30 23M * 0 0 TAATTAAGTCTACAGAGCAACTA ??????????????????????? RG:Z:cow PG:Z:bull AS:i:65100
x9 0 ref2 6 30 9M4I13M * 0 0 TTATAAAACAAATAATTAAGTCTACA ?????????????????????????? RG:Z:cow PG:Z:bull AS:i:20 FI:i:1