Skip to content
Commits on Source (5)
System Requirements
===================
Samtools requires the zlib library <http://zlib.net>, the bzip2
library <http://bzip.org/>, liblzma <http://tukaani.org/xz/> and (optionally)
a curses or GNU ncurses library <http://www.gnu.org/software/ncurses/>.
Samtools and HTSlib depend on the following libraries:
Samtools:
zlib <http://zlib.net>
curses or GNU ncurses (optional, for the 'tview' command)
<http://www.gnu.org/software/ncurses/>
HTSlib:
zlib <http://zlib.net>
libbz2 <http://bzip.org/>
liblzma <http://tukaani.org/xz/>
libcurl <https://curl.haxx.se/>
(optional but strongly recommended, for network access)
libcrypto <https://www.openssl.org/>
(optional, for Amazon S3 support; not needed on MacOS)
See the "System Specific Details" below for guidance on how to install
these.
The bzip2 and liblzma dependencies can be removed if full CRAM support
is not needed - see HTSlib's INSTALL file for details.
......@@ -142,6 +158,16 @@ for details. For example,
would specify that samtools is to be built with icc and installed into bin,
lib, etc subdirectories under /opt/icc-compiled.
If dependencies have been installed in non-standard locations (i.e. not on
the normal include and library search paths) then the CPPFLAGS and LDFLAGS
environment variables can be used to set the options needed to find them.
For example, NetBSD users may use:
./configure CPPFLAGS=-I/usr/pkg/include \
LDFLAGS='-L/usr/pkg/lib -Wl,-R/usr/pkg/lib'
to allow compiling and linking against dependencies installed via the ports
collection.
Installation Locations
======================
......@@ -166,3 +192,42 @@ For example,
make DESTDIR=/tmp/staging prefix=/opt
would install into bin and share/man subdirectories under /tmp/staging/opt.
System Specific Details
=======================
Installing the prerequisites is system dependent and there is more
than one correct way of satisfying these, including downloading them
from source, compiling and installing them yourself.
For people with super-user access, we provide an example set of commands
below for installing the dependencies on a variety of operating system
distributions. Note these are not specific recommendations on distribution,
compiler or SSL implementation. It is assumed you already have the core set
of packages for the given distribution - the lists may be incomplete if
this is not the case.
Debian / Ubuntu
---------------
sudo apt-get update # Ensure the package list is up to date
sudo apt-get install autoconf automake make gcc perl zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev libncurses5-dev
Note: libcurl4-openssl-dev can be used as an alternative to libcurl4-gnutls-dev.
RedHat / CentOS
---------------
sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel
Alpine Linux
------------
sudo apk update # Ensure the package list is up to date
sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev ncurses-dev
OpenSUSE
--------
sudo zypper install autoconf automake make gcc perl zlib-devel libbz2-devel xz-devel libcurl-devel libopenssl-devel ncurses-devel
The MIT/Expat License
Copyright (C) 2008-2014 Genome Research Ltd.
Copyright (C) 2008-2018 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
......@@ -154,56 +154,58 @@ bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h)
bam_lpileup_h = bam_lpileup.h $(htslib_sam_h)
bam_plbuf_h = bam_plbuf.h $(htslib_sam_h)
bam_tview_h = bam_tview.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) $(bam2bcf_h) $(htslib_khash_h) $(bam_lpileup_h)
bedidx_h = bedidx.h $(htslib_hts_h)
sam_h = sam.h $(htslib_sam_h) $(bam_h)
sam_opts_h = sam_opts.h $(htslib_hts_h)
sample_h = sample.h $(htslib_kstring_h)
stats_isize_h = stats_isize.h $(htslib_khash_h)
tmp_file_h = tmp_file.h $(htslib_sam_h) $(LZ4DIR)/lz4.h
bam.o: bam.c config.h $(bam_h) $(htslib_kstring_h) sam_header.h
bam2bcf.o: bam2bcf.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h)
bam2bcf_indel.o: bam2bcf_indel.c config.h $(htslib_hts_h) $(htslib_sam_h) $(bam2bcf_h) $(htslib_khash_h) $(htslib_ksort_h)
bam2depth.o: bam2depth.c config.h $(htslib_sam_h) samtools.h $(sam_opts_h)
bam_addrprg.o: bam_addrprg.c config.h $(htslib_sam_h) $(htslib_kstring_h) samtools.h $(sam_opts_h)
bam_addrprg.o: bam_addrprg.c config.h $(htslib_sam_h) $(htslib_kstring_h) samtools.h $(htslib_thread_pool_h) $(sam_opts_h)
bam_aux.o: bam_aux.c config.h $(bam_h)
bam_cat.o: bam_cat.c config.h $(htslib_bgzf_h) $(htslib_sam_h) $(htslib_cram_h) $(htslib_khash_h) samtools.h
bam_color.o: bam_color.c config.h $(bam_h)
bam_import.o: bam_import.c config.h $(htslib_kstring_h) $(bam_h) $(htslib_kseq_h)
bam_index.o: bam_index.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_h) samtools.h
bam_index.o: bam_index.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_h) samtools.h $(sam_opts_h)
bam_lpileup.o: bam_lpileup.c config.h $(bam_plbuf_h) $(bam_lpileup_h) $(htslib_ksort_h)
bam_mate.o: bam_mate.c config.h $(sam_opts_h) $(htslib_kstring_h) $(htslib_sam_h) samtools.h
bam_md.o: bam_md.c config.h $(htslib_faidx_h) $(htslib_sam_h) $(htslib_kstring_h) $(sam_opts_h) samtools.h
bam_mate.o: bam_mate.c config.h $(htslib_thread_pool_h) $(sam_opts_h) $(htslib_kstring_h) $(htslib_sam_h) samtools.h
bam_md.o: bam_md.c config.h $(htslib_faidx_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_thread_pool_h) $(sam_opts_h) samtools.h
bam_plbuf.o: bam_plbuf.c config.h $(htslib_hts_h) $(htslib_sam_h) $(bam_plbuf_h)
bam_plcmd.o: bam_plcmd.c config.h $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) sam_header.h samtools.h $(sam_opts_h) $(bam2bcf_h) $(sample_h)
bam_quickcheck.o: bam_quickcheck.c config.h $(htslib_hts_h) $(htslib_sam_h)
bam_reheader.o: bam_reheader.c config.h $(htslib_bgzf_h) $(htslib_sam_h) $(htslib_hfile_h) $(htslib_cram_h) samtools.h
bam_rmdup.o: bam_rmdup.c config.h $(htslib_sam_h) $(sam_opts_h) samtools.h $(bam_h) $(htslib_khash_h)
bam_rmdupse.o: bam_rmdupse.c config.h $(bam_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_klist_h) samtools.h
bam_sort.o: bam_sort.c config.h $(htslib_ksort_h) $(htslib_khash_h) $(htslib_klist_h) $(htslib_kstring_h) $(htslib_sam_h) $(sam_opts_h) samtools.h
bam_split.o: bam_split.c config.h $(htslib_sam_h) $(htslib_khash_h) $(htslib_kstring_h) $(htslib_cram_h) $(sam_opts_h) samtools.h
bam_stat.o: bam_stat.c config.h $(htslib_sam_h) samtools.h
bam_sort.o: bam_sort.c config.h $(htslib_ksort_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_klist_h) $(htslib_kstring_h) $(htslib_sam_h) $(sam_opts_h) samtools.h
bam_split.o: bam_split.c config.h $(htslib_sam_h) $(htslib_khash_h) $(htslib_kstring_h) $(htslib_cram_h) $(htslib_thread_pool_h) $(sam_opts_h) samtools.h
bam_stat.o: bam_stat.c config.h $(htslib_sam_h) samtools.h $(sam_opts_h)
bam_tview.o: bam_tview.c config.h $(bam_tview_h) $(htslib_faidx_h) $(htslib_sam_h) $(htslib_bgzf_h) samtools.h $(sam_opts_h)
bam_tview_curses.o: bam_tview_curses.c config.h $(bam_tview_h)
bam_tview_html.o: bam_tview_html.c config.h $(bam_tview_h)
bam_flags.o: bam_flags.c config.h $(htslib_sam_h)
bamshuf.o: bamshuf.c config.h $(htslib_sam_h) $(htslib_hts_h) $(htslib_ksort_h) samtools.h $(sam_opts_h)
bamshuf.o: bamshuf.c config.h $(htslib_sam_h) $(htslib_hts_h) $(htslib_ksort_h) samtools.h $(htslib_thread_pool_h) $(sam_opts_h) $(htslib_khash_h)
bamtk.o: bamtk.c config.h $(htslib_hts_h) samtools.h version.h
bedcov.o: bedcov.c config.h $(htslib_kstring_h) $(htslib_sam_h) $(sam_opts_h) $(htslib_kseq_h)
bedidx.o: bedidx.c config.h $(htslib_ksort_h) $(htslib_kseq_h) $(htslib_khash_h)
bedcov.o: bedcov.c config.h $(htslib_kstring_h) $(htslib_sam_h) $(htslib_thread_pool_h) $(sam_opts_h) $(htslib_kseq_h)
bedidx.o: bedidx.c config.h $(bedidx_h) $(htslib_ksort_h) $(htslib_kseq_h) $(htslib_khash_h)
cut_target.o: cut_target.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) samtools.h $(sam_opts_h)
dict.o: dict.c config.h $(htslib_kseq_h) $(htslib_hts_h)
faidx.o: faidx.c config.h $(htslib_faidx_h) samtools.h
faidx.o: faidx.c config.h $(htslib_faidx_h) $(htslib_hts_h) $(htslib_hfile_h) $(htslib_kstring_h) samtools.h
padding.o: padding.c config.h $(htslib_kstring_h) $(htslib_sam_h) $(htslib_faidx_h) sam_header.h $(sam_opts_h) samtools.h
phase.o: phase.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(sam_opts_h) samtools.h $(htslib_kseq_h) $(htslib_khash_h) $(htslib_ksort_h)
phase.o: phase.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(sam_opts_h) samtools.h $(htslib_hts_os_h) $(htslib_kseq_h) $(htslib_khash_h) $(htslib_ksort_h)
sam.o: sam.c config.h $(htslib_faidx_h) $(sam_h)
sam_header.o: sam_header.c config.h sam_header.h $(htslib_khash_h)
sam_opts.o: sam_opts.c config.h $(sam_opts_h)
sam_utils.o: sam_utils.c config.h samtools.h
sam_view.o: sam_view.c config.h $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_h) samtools.h $(sam_opts_h)
sam_view.o: sam_view.c config.h $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_h) $(htslib_klist_h) $(htslib_thread_pool_h) $(htslib_bgzf_h) samtools.h $(sam_opts_h) $(bedidx_h)
sample.o: sample.c config.h $(sample_h) $(htslib_khash_h)
stats_isize.o: stats_isize.c config.h stats_isize.h $(htslib_khash_h)
stats.o: stats.c config.h $(htslib_faidx_h) $(htslib_sam_h) $(htslib_hts_h) sam_header.h $(htslib_khash_str2int_h) samtools.h $(htslib_khash_h) $(htslib_kstring_h) stats_isize.h $(sam_opts_h)
bam_markdup.o: bam_markdup.c config.h $(htslib_sam_h) $(sam_opts_h) samtools.h $(bam_h) $(htslib_khash_h) $(tmp_file_h)
tmp_file.o: tmp_file.c config.h $(tmp_file_h)
stats_isize.o: stats_isize.c config.h $(stats_isize_h) $(htslib_khash_h)
stats.o: stats.c config.h $(htslib_faidx_h) $(htslib_sam_h) $(htslib_hts_h) sam_header.h $(htslib_khash_str2int_h) samtools.h $(htslib_khash_h) $(htslib_kstring_h) $(stats_isize_h) $(sam_opts_h) $(bedidx_h)
bam_markdup.o: bam_markdup.c config.h $(htslib_thread_pool_h) $(htslib_sam_h) $(sam_opts_h) samtools.h $(htslib_khash_h) $(htslib_klist_h) $(htslib_kstring_h) $(tmp_file_h)
tmp_file.o: tmp_file.c config.h $(tmp_file_h) $(htslib_sam_h)
# test programs
......@@ -287,7 +289,7 @@ misc/wgsim: misc/wgsim.o $(HTSLIB)
misc/ace2sam.o: misc/ace2sam.c config.h $(htslib_kstring_h) $(htslib_kseq_h)
misc/md5fa.o: misc/md5fa.c config.h $(htslib_kseq_h) $(htslib_hts_h)
misc/md5sum-lite.o: misc/md5sum-lite.c config.h $(htslib_hts_h)
misc/wgsim.o: misc/wgsim.c config.h version.h $(htslib_kseq_h)
misc/wgsim.o: misc/wgsim.c config.h version.h $(htslib_kseq_h) $(htslib_hts_os_h)
misc/maq2sam-short.o: misc/maq2sam.c config.h version.h
$(CC) $(CFLAGS) $(ALL_CPPFLAGS) -c -o $@ misc/maq2sam.c
......
Release 1.9 (18th July 2018)
----------------------------
* Samtools mpileup VCF and BCF output is now deprecated. It is still
functional, but will warn. Please use bcftools mpileup instead. (#884)
* Samtools mpileup now handles the '-d' max_depth option differently. There
is no longer an enforced minimum, and '-d 0' is interpreted as limitless
(no maximum - warning this may be slow). The default per-file depth is
now 8000, which matches the value mpileup used to use when processing
a single sample. To get the previous default behaviour use the higher
of 8000 divided by the number of samples across all input files, or 250.
(#859)
* Samtools stats new features:
- The '--remove-overlaps' option discounts overlapping portions of
templates when computing coverage and mapped base counting. (#855)
- When a target file is in use, the number of bases inside the
target is printed and the percentage of target bases with coverage
above a given threshold specified by the '--cov-threshold' option. (#855)
- Split base composition and length statistics by first and last reads.
(#814, #816)
* Samtools faidx new features:
- Now takes long options. (#509, thanks to Pierre Lindenbaum)
- Now warns about zero-length and truncated sequences due to the
requested range being beyond the end of the sequence. (#834)
- Gets a new option (--continue) that allows it to carry on
when a requested sequence was not in the index. (#834)
- It is now possible to supply the list of regions to output in a text
file using the new '--region-file' option. (#840)
- New '-i' option to make faidx return the reverse complement of
the regions requested. (#878)
- faidx now works on FASTQ (returning FASTA) and added a new
fqidx command to index and return FASTQ. (#852)
* Samtools collate now has a fast option '-f' that only operates on
primary pairs, dropping secondary and supplementary. It tries to write
pairs to the final output file as soon as both reads have been found. (#818)
* Samtools bedcov gets a new '-j' option to make it ignore deletions (D) and
reference skips (N) when computing coverage. (#843)
* Small speed up to samtools coordinate sort, by converting it to use
radix sort. (#835, thanks to Zhuravleva Aleksandra)
* Samtools idxstats now works on SAM and CRAM files, however this
isn't fast due to some information lacking from indices. (#832)
* Compression levels may now be specified with the level=N
output-fmt-option. E.g. with -O bam,level=3.
* Various documentation improvements.
* Bug-fixes:
- Improved error reporting in several places. (#827, #834, #877, cd7197)
- Various test improvements.
- Fixed failures in the multi-region iterator (view -M) when regions
provided via BED files include overlaps (#819, reported by Dave Larson).
- Samtools stats now counts '=' and 'X' CIGAR operators when
counting mapped bases. (#855)
- Samtools stats has fixes for insert size filtering (-m, -i). (#845; #697
reported by Soumitra Pal)
- Samtools stats -F now longer negates an earlier -d option. (#830)
- Fix samtools stats crash when using a target region. (#875, reported by
John Marshall)
- Samtools sort now keeps to a single thread when the -@ option is absent.
Previously it would spawn a writer thread, which could cause the CPU
usage to go slightly over 100%. (#833, reported by Matthias Bernt)
- Fixed samtools phase '-A' option which was incorrectly defined to take
a parameter. (#850; #846 reported by Dianne Velasco)
- Fixed compilation problems when using C_INCLUDE_PATH. (#870; #817 reported
by Robert Boissy)
- Fixed --version when built from a Git repository. (#844, thanks to John
Marshall)
- Use noenhanced mode for title in plot-bamstats. Prevents unwanted
interpretation of characters like underscore in gnuplot version 5. (#829,
thanks to M. Zapukhlyak)
- blast2sam.pl now reports perfect match hits (no indels or mismatches).
(#873, thanks to Nils Homer)
- Fixed bug in fasta and fastq subcommands where stdout would not be flushed
correctly if the -0 option was used.
- Fixed invalid memory access in mpileup and depth on alignment records
where the sequence is absent.
Release 1.8 (3rd April 2018)
----------------------------
......
......@@ -9,7 +9,7 @@ Building samtools
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
cd .../samtools-1.8 # Within the unpacked release directory
cd .../samtools-1.9 # Within the unpacked release directory
./configure
make
......@@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. Building for
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
cd .../samtools-1.8 # Within the unpacked release directory
cd .../samtools-1.9 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
......@@ -30,6 +30,53 @@ You will likely wish to add /path/to/location/bin to your $PATH.
See INSTALL for full building and installation instructions and details.
Building with HTSlib plug-in support
====================================
Enabling plug-ins causes some parts of HTSlib to be built as separate modules.
There are two advantages to this:
* The static library libhts.a has fewer dependencies, which makes linking
third-party code against it easier.
* It is possible to build extra plug-ins in addition to the ones that are
bundled with HTSlib. For example, the hts-plugins repository
<https://github.com/samtools/htslib-plugins> includes a module that
allows direct access to files stored in an iRODS data management
repository (see <https://irods.org/>).
To build with plug-ins, you need to use the --enable-plugins configure option
as follows:
cd .../samtools-1.9 # Within the unpacked release directory
./configure --enable-plugins --prefix=/path/to/location
make all all-htslib
make install install-htslib
There are two other configure options that affect plug-ins. These are:
--with-plugin-dir=DIR plug-in installation location
--with-plugin-path=PATH default plug-in search path
The default for --with-plugin-dir is <prefix>/libexec/htslib.
--with-plugin-path sets the built-in search path used to find the plug-ins. By
default this is the directory set by the --with-plugin-dir option. Multiple
directories should be separated by colons.
Setting --with-plugin-path is useful if you want to run directly from
the source distribution instead of installing the package. In that case
you can use:
cd .../samtools-1.9 # Within the unpacked release directory
./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.9
make all all-htslib
It is possible to override the built-in search path using the HTS_PATH
environment variable. Directories should be separated by colons. To
include the built-in path, add an empty entry to HTS_PATH:
export HTS_PATH=:/my/path # Search built-in path first
export HTS_PATH=/my/path: # Search built-in path last
export HTS_PATH=/my/path1::/my/path2 # Search built-in path between others
Using an optimised zlib library
===============================
......
......@@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */
@copyright Genome Research Ltd.
*/
#define BAM_VERSION "1.8"
#define BAM_VERSION "1.9"
#include <stdint.h>
#include <stdlib.h>
......
......@@ -255,7 +255,8 @@ int main_depth(int argc, char *argv[])
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
else if (p->qpos < p->b->core.l_qseq &&
bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
}
printf("\t%d", n_plp[i] - m); // this the depth to output
}
......
......@@ -542,14 +542,14 @@ int main_cat(int argc, char *argv[])
case 'h': {
samFile *fph = sam_open(optarg, "r");
if (fph == 0) {
fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);
fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, optarg);
return 1;
}
h = sam_hdr_read(fph);
if (h == NULL) {
fprintf(stderr,
"[%s] ERROR: failed to read the header for '%s'.\n",
__func__, argv[1]);
"[%s] ERROR: failed to read the header from '%s'.\n",
__func__, optarg);
return 1;
}
sam_close(fph);
......
......@@ -34,8 +34,10 @@ DEALINGS IN THE SOFTWARE. */
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <unistd.h>
#include <getopt.h>
#include "samtools.h"
#include "sam_opts.h"
#define BAM_LIDX_SHIFT 14
......@@ -101,31 +103,128 @@ int bam_index(int argc, char *argv[])
return EXIT_FAILURE;
}
/*
* Cram indices do not contain mapped/unmapped record counts, so we have to
* decode each record and count. However we can speed this up as much as
* possible by using the required fields parameter.
*
* This prints the stats to stdout in the same manner than the BAM function
* does.
*
* Returns 0 on success,
* -1 on failure.
*/
int slow_idxstats(samFile *fp, bam_hdr_t *header) {
int ret, last_tid = -2;
bam1_t *b = bam_init1();
if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_FLAG))
return -1;
uint64_t (*count0)[2] = calloc(header->n_targets+1, sizeof(*count0));
uint64_t (*counts)[2] = count0+1;
if (!count0)
return -1;
while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= header->n_targets || b->core.tid < -1) {
free(count0);
return -1;
}
if (b->core.tid != last_tid) {
if (last_tid >= -1) {
if (counts[b->core.tid][0] + counts[b->core.tid][1]) {
print_error("idxstats", "file is not position sorted");
free(count0);
return -1;
}
}
last_tid = b->core.tid;
}
counts[b->core.tid][(b->core.flag & BAM_FUNMAP) ? 1 : 0]++;
}
if (ret == -1) {
int i;
for (i = 0; i < header->n_targets; i++) {
printf("%s\t%d\t%"PRIu64"\t%"PRIu64"\n",
header->target_name[i],
header->target_len[i],
counts[i][0], counts[i][1]);
}
printf("*\t0\t%"PRIu64"\t%"PRIu64"\n", counts[-1][0], counts[-1][1]);
}
free(count0);
bam_destroy1(b);
return (ret == -1) ? 0 : -1;
}
static void usage_exit(FILE *fp, int exit_status)
{
fprintf(fp, "Usage: samtools idxstats [options] <in.bam>\n");
sam_global_opt_help(fp, "-.---@");
exit(exit_status);
}
int bam_idxstats(int argc, char *argv[])
{
hts_idx_t* idx;
bam_hdr_t* header;
samFile* fp;
int c;
if (argc < 2) {
fprintf(stderr, "Usage: samtools idxstats <in.bam>\n");
return 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
{NULL, 0, NULL, 0}
};
while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
switch (c) {
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?':
usage_exit(stderr, EXIT_FAILURE);
}
}
fp = sam_open(argv[1], "r");
if (argc != optind+1) {
if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
else usage_exit(stderr, EXIT_FAILURE);
}
fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == NULL) {
print_error_errno("idxstats", "failed to open \"%s\"", argv[1]);
print_error_errno("idxstats", "failed to open \"%s\"", argv[optind]);
return 1;
}
header = sam_hdr_read(fp);
if (header == NULL) {
print_error("idxstats", "failed to read header for \"%s\"", argv[1]);
print_error("idxstats", "failed to read header for \"%s\"", argv[optind]);
return 1;
}
idx = sam_index_load(fp, argv[1]);
if (idx == NULL) {
print_error("idxstats", "fail to load index for \"%s\"", argv[1]);
if (hts_get_format(fp)->format != bam) {
slow_method:
if (ga.nthreads)
hts_set_threads(fp, ga.nthreads);
if (slow_idxstats(fp, header) < 0) {
print_error("idxstats", "failed to process \"%s\"", argv[optind]);
return 1;
}
} else {
idx = sam_index_load(fp, argv[optind]);
if (idx == NULL) {
print_error("idxstats", "fail to load index for \"%s\", "
"reverting to slow method", argv[optind]);
goto slow_method;
}
int i;
for (i = 0; i < header->n_targets; ++i) {
......@@ -138,8 +237,10 @@ int bam_idxstats(int argc, char *argv[])
}
// Dump information about unmapped reads
printf("*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx));
bam_hdr_destroy(header);
hts_idx_destroy(idx);
}
bam_hdr_destroy(header);
sam_close(fp);
return 0;
}
......@@ -29,7 +29,6 @@ DEALINGS IN THE SOFTWARE. */
#include <assert.h>
#include "bam_plbuf.h"
#include "bam_lpileup.h"
#include "samtools.h"
#include <htslib/ksort.h>
#define TV_GAP 2
......
......@@ -115,6 +115,9 @@ static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref
#define MPLP_SMART_OVERLAPS (1<<12)
#define MPLP_PRINT_QNAME (1<<13)
#define MPLP_MAX_DEPTH 8000
#define MPLP_MAX_INDEL_DEPTH 250
void *bed_read(const char *fn);
void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);
......@@ -381,8 +384,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
exit(EXIT_FAILURE);
}
bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
if (conf->flag & MPLP_BCF) {
// Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
}
if (conf->reg) {
hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
if (idx == NULL) {
......@@ -409,17 +414,18 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
data[i]->h = h;
}
}
fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
if (conf->flag & MPLP_BCF)
{
const char *mode;
// allocate data storage proportionate to number of samples being studied sm->n
gplp.n = sm->n;
gplp.n_plp = calloc(sm->n, sizeof(int));
gplp.m_plp = calloc(sm->n, sizeof(int));
gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*));
fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
// write the VCF header
if (conf->flag & MPLP_BCF)
{
const char *mode;
if ( conf->flag & MPLP_VCF )
mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF
else
......@@ -554,13 +560,16 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
// init pileup
iter = bam_mplp_init(n, mplp_func, (void**)data);
if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
if ( !conf->max_depth ) {
max_depth = INT_MAX;
fprintf(stderr, "[%s] Max depth set to maximum value (%d)\n", __func__, INT_MAX);
} else {
max_depth = conf->max_depth;
if (max_depth * sm->n > 1<<20)
fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
if (max_depth * sm->n < 8000) {
max_depth = 8000 / sm->n;
fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
if ( max_depth * n > 1<<20 )
fprintf(stderr, "[%s] Combined max depth is above 1M. Potential memory hog!\n", __func__);
}
// Only used when writing BCF
max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
......@@ -677,7 +686,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = bam_get_qual(p->b)[p->qpos];
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if ( c < conf->min_baseQ ) continue;
c = plp[i][j].b->core.qual + 33;
if (c > 126) c = 126;
......@@ -692,7 +703,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = bam_get_qual(p->b)[p->qpos];
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if ( c < conf->min_baseQ ) continue;
if (n > 0) putc(',', pileup_fp);
......@@ -707,7 +720,9 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = &plp[i][j];
int c = bam_get_qual(p->b)[p->qpos];
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if ( c < conf->min_baseQ ) continue;
if (n > 0) putc(',', pileup_fp);
......@@ -910,46 +925,28 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
"\n"
"Output options:\n"
" -o, --output FILE write output to FILE [standard output]\n"
" -g, --BCF generate genotype likelihoods in BCF format\n"
" -v, --VCF generate genotype likelihoods in VCF format\n"
"\n"
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
" --output-QNAME output read names\n"
" -a output all positions (including zero depth)\n"
" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
" -t, --output-tags LIST optional tags to output:\n"
" DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR []\n"
" -u, --uncompressed generate uncompressed VCF/BCF output\n"
"\n"
"SNP/INDEL genotype likelihoods options (effective with -g/-v):\n"
" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
fprintf(fp,
" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
fprintf(fp,
" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
fprintf(fp,
" -I, --skip-indels do not perform indel calling\n"
" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
fprintf(fp,
" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
fprintf(fp,
" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
" -P, --platforms STR comma separated list of platforms for indels [all]\n");
"Generic options:\n");
sam_global_opt_help(fp, "-.--.-");
fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
fprintf(fp, "\n"
"Note that using \"samtools mpileup\" to generate BCF or VCF files is now\n"
"deprecated. To output these formats, please use \"bcftools mpileup\" instead.\n");
free(tmp_require);
free(tmp_filter);
}
static void deprecated(char opt) {
fprintf(stderr, "[warning] samtools mpileup option `%c` is functional, "
"but deprecated. Please switch to using bcftools mpileup in future.\n", opt);
}
int bam_mpileup(int argc, char *argv[])
{
int c;
......@@ -960,7 +957,8 @@ int bam_mpileup(int argc, char *argv[])
memset(&mplp, 0, sizeof(mplp_conf_t));
mplp.min_baseQ = 13;
mplp.capQ_thres = 0;
mplp.max_depth = 250; mplp.max_indel_depth = 250;
mplp.max_depth = MPLP_MAX_DEPTH;
mplp.max_indel_depth = MPLP_MAX_INDEL_DEPTH;
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
......@@ -1052,16 +1050,16 @@ int bam_mpileup(int argc, char *argv[])
mplp.bed = bed_read(optarg);
if (!mplp.bed) { print_error_errno("mpileup", "Could not read file \"%s\"", optarg); return 1; }
break;
case 'P': mplp.pl_list = strdup(optarg); break;
case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
case 'g': mplp.flag |= MPLP_BCF; break;
case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; break;
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; break;
case 'P': mplp.pl_list = strdup(optarg); deprecated(c); break;
case 'p': mplp.flag |= MPLP_PER_SAMPLE; deprecated(c); break;
case 'g': mplp.flag |= MPLP_BCF; deprecated(c); break;
case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; deprecated(c); break;
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; deprecated(c); break;
case 'B': mplp.flag &= ~MPLP_REALN; break;
case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(stderr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break;
case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(stderr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break;
case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(stderr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break;
case 'I': mplp.flag |= MPLP_NO_INDEL; break;
case 'D': mplp.fmt_flag |= B2B_FMT_DP; deprecated(c); break;
case 'S': mplp.fmt_flag |= B2B_FMT_SP; deprecated(c); break;
case 'V': mplp.fmt_flag |= B2B_FMT_DV; deprecated(c); break;
case 'I': mplp.flag |= MPLP_NO_INDEL; deprecated(c); break;
case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
case '6': mplp.flag |= MPLP_ILLUMINA13; break;
case 'R': mplp.flag |= MPLP_IGNORE_RG; break;
......@@ -1075,28 +1073,34 @@ int bam_mpileup(int argc, char *argv[])
char *end;
long value = strtol(optarg, &end, 10);
// Distinguish between -o INT and -o FILE (a bit of a hack!)
if (*end == '\0') mplp.openQ = value;
else mplp.output_fname = optarg;
if (*end == '\0') {
mplp.openQ = value;
fprintf(stderr, "[warning] samtools mpileup option "
"'--open-prob INT' is functional, but deprecated. "
"Please switch to using bcftools mpileup in future.\n");
} else {
mplp.output_fname = optarg;
}
}
break;
case 'e': mplp.extQ = atoi(optarg); break;
case 'h': mplp.tandemQ = atoi(optarg); break;
case 'e': mplp.extQ = atoi(optarg); deprecated(c); break;
case 'h': mplp.tandemQ = atoi(optarg); deprecated(c); break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
case 'L': mplp.max_indel_depth = atoi(optarg); break;
case 'F': mplp.min_frac = atof(optarg); deprecated(c); break;
case 'm': mplp.min_support = atoi(optarg); deprecated(c); break;
case 'L': mplp.max_indel_depth = atoi(optarg); deprecated(c); break;
case 'G': {
FILE *fp_rg;
char buf[1024];
mplp.rghash = khash_str2int_init();
if ((fp_rg = fopen(optarg, "r")) == NULL)
fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg);
fprintf(stderr, "[%s] Fail to open file %s. Continue anyway.\n", __func__, optarg);
while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me...
khash_str2int_inc(mplp.rghash, strdup(buf));
fclose(fp_rg);
}
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); deprecated(c); break;
case 'a': mplp.all++; break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
......
......@@ -38,7 +38,6 @@ DEALINGS IN THE SOFTWARE. */
#include <getopt.h>
#include <assert.h>
#include <pthread.h>
#include "htslib/bgzf.h"
#include "htslib/ksort.h"
#include "htslib/hts_os.h"
#include "htslib/khash.h"
......@@ -49,11 +48,15 @@ DEALINGS IN THE SOFTWARE. */
#include "samtools.h"
// Struct which contains the a record, and the pointer to the sort tag (if any)
// Used to speed up sort-by-tag.
// Struct which contains the a record, and the pointer to the sort tag (if any) or
// a combined ref / position / strand.
// Used to speed up tag and position sorts.
typedef struct bam1_tag {
bam1_t *bam_record;
union {
const uint8_t *tag;
uint64_t pos;
} u;
} bam1_tag;
/* Minimum memory required in megabytes before sort will attempt to run. This
......@@ -1363,7 +1366,7 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
int res;
h->i = i;
h->entry.bam_record = bam_init1();
h->entry.tag = NULL;
h->entry.u.tag = NULL;
if (!h->entry.bam_record) goto mem_fail;
res = iter[i] ? sam_itr_next(fp[i], iter[i], h->entry.bam_record) : sam_read1(fp[i], hdr[i], h->entry.bam_record);
if (res >= 0) {
......@@ -1372,16 +1375,16 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
h->rev = bam_is_rev(h->entry.bam_record);
h->idx = idx++;
if (g_is_by_tag) {
h->entry.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
} else {
h->entry.tag = NULL;
h->entry.u.tag = NULL;
}
}
else if (res == -1 && (!iter[i] || iter[i]->finished)) {
h->pos = HEAP_EMPTY;
bam_destroy1(h->entry.bam_record);
h->entry.bam_record = NULL;
h->entry.tag = NULL;
h->entry.u.tag = NULL;
} else {
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
......@@ -1420,15 +1423,15 @@ int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *m
heap->rev = bam_is_rev(b);
heap->idx = idx++;
if (g_is_by_tag) {
heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
} else {
heap->entry.tag = NULL;
heap->entry.u.tag = NULL;
}
} else if (j == -1 && (!iter[heap->i] || iter[heap->i]->finished)) {
heap->pos = HEAP_EMPTY;
bam_destroy1(heap->entry.bam_record);
heap->entry.bam_record = NULL;
heap->entry.tag = NULL;
heap->entry.u.tag = NULL;
} else {
print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
......@@ -1657,15 +1660,15 @@ static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp,
heap->rev = bam_is_rev(heap->entry.bam_record);
heap->idx = (*idx)++;
if (g_is_by_tag) {
heap->entry.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
} else {
heap->entry.tag = NULL;
heap->entry.u.tag = NULL;
}
} else if (res == -1) {
heap->pos = HEAP_EMPTY;
if (i < nfiles) bam_destroy1(heap->entry.bam_record);
heap->entry.bam_record = NULL;
heap->entry.tag = NULL;
heap->entry.u.tag = NULL;
} else {
return -1;
}
......@@ -1720,7 +1723,7 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
// Get a read into the heap
h->i = i;
h->entry.tag = NULL;
h->entry.u.tag = NULL;
if (i < n) {
h->entry.bam_record = bam_init1();
if (!h->entry.bam_record) goto mem_fail;
......@@ -1738,7 +1741,7 @@ static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
return -1;
}
hts_set_threads(fpout, n_threads);
if (n_threads > 1) hts_set_threads(fpout, n_threads);
if (sam_hdr_write(fpout, hout) != 0) {
print_error_errno(cmd, "failed to write header to \"%s\"", out);
......@@ -1838,8 +1841,8 @@ uint8_t normalize_type(const uint8_t* aux) {
// equal to or greater than b, respectively.
static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b)
{
const uint8_t* aux_a = a.tag;
const uint8_t* aux_b = b.tag;
const uint8_t* aux_a = a.u.tag;
const uint8_t* aux_b = b.u.tag;
if (aux_a == NULL && aux_b != NULL) {
return -1;
......@@ -1936,12 +1939,76 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu
return -1;
}
#define NUMBASE 256
#define STEP 8
static int ks_radixsort(size_t n, bam1_tag *buf, const bam_hdr_t *h)
{
int curr = 0, ret = -1;
ssize_t i;
bam1_tag *buf_ar2[2], *bam_a, *bam_b;
uint64_t max_pos = 0, max_digit = 0, shift = 0;
for (i = 0; i < n; i++) {
bam1_t *b = buf[i].bam_record;
int32_t tid = b->core.tid == -1 ? h->n_targets : b->core.tid;
buf[i].u.pos = (uint64_t)tid<<32 | (b->core.pos+1)<<1 | bam_is_rev(b);
if (max_pos < buf[i].u.pos)
max_pos = buf[i].u.pos;
}
while (max_pos) {
++max_digit;
max_pos = max_pos >> 1;
}
buf_ar2[0] = buf;
buf_ar2[1] = (bam1_tag *)malloc(sizeof(bam1_tag) * n);
if (buf_ar2[1] == NULL) {
print_error("sort", "couldn't allocate memory for temporary buf");
goto err;
}
while (shift < max_digit){
size_t remainders[NUMBASE] = { 0 };
bam_a = buf_ar2[curr]; bam_b = buf_ar2[1-curr];
for (i = 0; i < n; ++i)
remainders[(bam_a[i].u.pos >> shift) % NUMBASE]++;
for (i = 1; i < NUMBASE; ++i)
remainders[i] += remainders[i - 1];
for (i = n - 1; i >= 0; i--) {
size_t j = --remainders[(bam_a[i].u.pos >> shift) % NUMBASE];
bam_b[j] = bam_a[i];
}
shift += STEP;
curr = 1 - curr;
}
if (curr == 1) {
bam1_tag *end = buf + n;
bam_a = buf_ar2[0]; bam_b = buf_ar2[1];
while (bam_a < end) *bam_a++ = *bam_b++;
}
ret = 0;
err:
free(buf_ar2[1]);
return ret;
}
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
char *name;
w->error = 0;
if (!g_is_by_qname && !g_is_by_tag) {
if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
w->error = errno;
return NULL;
}
} else {
ks_mergesort(sort, w->buf_len, w->buf, 0);
}
if (w->no_save)
return 0;
......@@ -2148,11 +2215,12 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
mem_full = 1;
}
// Pull out the pointer to the sort tag if applicable
// Pull out the value of the position
// or the pointer to the sort tag if applicable
if (g_is_by_tag) {
buf[k].tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
} else {
buf[k].tag = NULL;
buf[k].u.tag = NULL;
}
++k;
......@@ -2184,7 +2252,6 @@ int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const
// write the final output
if (n_files == 0 && num_in_mem < 2) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
goto err;
......
/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
Copyright (C) 2013, 2015 Genome Research Ltd.
Copyright (C) 2013, 2015, 2018 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
......@@ -41,6 +41,7 @@ DEALINGS IN THE SOFTWARE. */
#include "samtools.h"
#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/khash.h"
#define DEF_CLEVEL 1
......@@ -82,8 +83,105 @@ static inline int elem_lt(elem_t x, elem_t y)
KSORT_INIT(bamshuf, elem_t, elem_lt)
typedef struct {
int written;
bam1_t *b;
} bam_item_t;
typedef struct {
bam1_t *bam_pool;
bam_item_t *items;
size_t size;
size_t index;
} bam_list_t;
typedef struct {
bam_item_t *bi;
} store_item_t;
KHASH_MAP_INIT_STR(bam_store, store_item_t)
static bam_item_t *store_bam(bam_list_t *list) {
size_t old_index = list->index;
list->items[list->index++].written = 0;
if (list->index >= list->size)
list->index = 0;
return &list->items[old_index];
}
static int write_bam_needed(bam_list_t *list) {
return !list->items[list->index].written;
}
static void mark_bam_as_written(bam_list_t *list) {
list->items[list->index].written = 1;
}
static int create_bam_list(bam_list_t *list, size_t max_size) {
size_t i;
list->size = list->index = 0;
list->items = NULL;
list->bam_pool = NULL;
if ((list->items = malloc(max_size * sizeof(bam_item_t))) == NULL) {
return 1;
}
if ((list->bam_pool = calloc(max_size, sizeof(bam1_t))) == NULL) {
return 1;
}
for (i = 0; i < max_size; i++) {
list->items[i].b = &list->bam_pool[i];
list->items[i].written = 1;
}
list->size = max_size;
list->index = 0;
return 0;
}
static void destroy_bam_list(bam_list_t *list) {
size_t i;
for (i = 0; i < list->size; i++) {
free(list->bam_pool[i].data);
}
free(list->bam_pool);
free(list->items);
}
static inline int write_to_bin_file(bam1_t *bam, int64_t *count, samFile **bin_files, char **names, bam_hdr_t *header, int files) {
uint32_t x;
x = hash_X31_Wang(bam_get_qname(bam)) % files;
if (sam_write1(bin_files[x], header, bam) < 0) {
print_error_errno("collate", "Couldn't write to intermediate file \"%s\"", names[x]);
return 1;
}
++count[x];
return 0;
}
static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
int is_stdout, const char *output_file, sam_global_args *ga)
int is_stdout, const char *output_file, int fast, int store_max, sam_global_args *ga)
{
samFile *fp, *fpw = NULL, **fpt = NULL;
char **fnt = NULL, modew[8];
......@@ -127,6 +225,40 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
goto fail;
}
// open final output file
l = strlen(pre);
sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
if (!is_stdout && !output_file) { // output to a file (name based on prefix)
char *fnw = (char*)calloc(l + 5, 1);
if (!fnw) goto mem_fail;
if (ga->out.format == unknown_format)
sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default
else
sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out));
fpw = sam_open_format(fnw, modew, &ga->out);
free(fnw);
} else if (output_file) { // output to a given file
modew[0] = 'w'; modew[1] = '\0';
sam_open_mode(modew + 1, output_file, NULL);
j = strlen(modew);
snprintf(modew + j, sizeof(modew) - j, "%d",
(clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
fpw = sam_open_format(output_file, modew, &ga->out);
} else fpw = sam_open_format("-", modew, &ga->out); // output to stdout
if (fpw == NULL) {
if (is_stdout) print_error_errno("collate", "Cannot open standard output");
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpw, h) < 0) {
print_error_errno("collate", "Couldn't write header");
goto fail;
}
fnt = (char**)calloc(n_files, sizeof(char*));
if (!fnt) goto mem_fail;
fpt = (samFile**)calloc(n_files, sizeof(samFile*));
......@@ -134,8 +266,6 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
cnt = (int64_t*)calloc(n_files, 8);
if (!cnt) goto mem_fail;
l = strlen(pre);
for (i = counter = 0; i < n_files; ++i) {
fnt[i] = (char*)calloc(l + 20, 1);
if (!fnt[i]) goto mem_fail;
......@@ -153,19 +283,145 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
goto fail;
}
}
if (fast) {
khash_t(bam_store) *stored = kh_init(bam_store);
khiter_t itr;
bam_list_t list;
int err = 0;
if (!stored) goto mem_fail;
if (store_max < 2) store_max = 2;
if (create_bam_list(&list, store_max)) {
fprintf(stderr, "[collate[ ERROR: unable to create bam list.\n");
err = 1;
goto fast_fail;
}
while ((r = sam_read1(fp, h, list.items[list.index].b)) >= 0) {
int ret;
bam1_t *b = list.items[list.index].b;
int readflag = b->core.flag & (BAM_FREAD1 | BAM_FREAD2);
// strictly paired reads only
if (!(b->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) && (readflag == BAM_FREAD1 || readflag == BAM_FREAD2)) {
itr = kh_get(bam_store, stored, bam_get_qname(b));
if (itr == kh_end(stored)) {
// new read
itr = kh_put(bam_store, stored, bam_get_qname(b), &ret);
if (ret > 0) { // okay to go ahead store it
kh_value(stored, itr).bi = store_bam(&list);
// see if the next one on the list needs to be written out
if (write_bam_needed(&list)) {
if (write_to_bin_file(list.items[list.index].b, cnt, fpt, fnt, h, n_files) < 0) {
fprintf(stderr, "[collate] ERROR: could not write line.\n");
err = 1;
goto fast_fail;
} else {
mark_bam_as_written(&list);
itr = kh_get(bam_store, stored, bam_get_qname(list.items[list.index].b));
if (itr != kh_end(stored)) {
kh_del(bam_store, stored, itr);
} else {
fprintf(stderr, "[collate] ERROR: stored value not in hash.\n");
err = 1;
goto fast_fail;
}
}
}
} else if (ret == 0) {
fprintf(stderr, "[collate] ERROR: value already in hash.\n");
err = 1;
goto fast_fail;
} else {
fprintf(stderr, "[collate] ERROR: unable to store in hash.\n");
err = 1;
goto fast_fail;
}
} else { // we have a match
// write out the reads in R1 R2 order
bam1_t *r1, *r2;
if (b->core.flag & BAM_FREAD1) {
r1 = b;
r2 = kh_value(stored, itr).bi->b;
} else {
r1 = kh_value(stored, itr).bi->b;
r2 = b;
}
if (sam_write1(fpw, h, r1) < 0) {
fprintf(stderr, "[collate] ERROR: could not write r1 alignment.\n");
err = 1;
goto fast_fail;
}
if (sam_write1(fpw, h, r2) < 0) {
fprintf(stderr, "[collate] ERROR: could not write r2 alignment.\n");
err = 1;
goto fast_fail;
}
mark_bam_as_written(&list);
// remove stored read
kh_value(stored, itr).bi->written = 1;
kh_del(bam_store, stored, itr);
}
}
}
for (list.index = 0; list.index < list.size; list.index++) {
if (write_bam_needed(&list)) {
bam1_t *b = list.items[list.index].b;
if (write_to_bin_file(b, cnt, fpt, fnt, h, n_files)) {
err = 1;
goto fast_fail;
} else {
itr = kh_get(bam_store, stored, bam_get_qname(b));
kh_del(bam_store, stored, itr);
}
}
}
fast_fail:
if (err) {
for (itr = kh_begin(stored); itr != kh_end(stored); ++itr) {
if (kh_exist(stored, itr)) {
kh_del(bam_store, stored, itr);
}
}
kh_destroy(bam_store, stored);
destroy_bam_list(&list);
goto fail;
} else {
kh_destroy(bam_store, stored);
destroy_bam_list(&list);
}
} else {
b = bam_init1();
if (!b) goto mem_fail;
while ((r = sam_read1(fp, h, b)) >= 0) {
uint32_t x;
x = hash_X31_Wang(bam_get_qname(b)) % n_files;
if (sam_write1(fpt[x], h, b) < 0) {
print_error_errno("collate", "Couldn't write to intermediate file \"%s\"", fnt[x]);
if (write_to_bin_file(b, cnt, fpt, fnt, h, n_files)) {
bam_destroy1(b);
goto fail;
}
++cnt[x];
}
bam_destroy1(b);
b = NULL;
}
if (r < -1) {
fprintf(stderr, "Error reading input file\n");
goto fail;
......@@ -186,37 +442,8 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
fpt = NULL;
sam_close(fp);
fp = NULL;
// merge
sprintf(modew, "wb%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
if (!is_stdout && !output_file) { // output to a file (name based on prefix)
char *fnw = (char*)calloc(l + 5, 1);
if (!fnw) goto mem_fail;
if (ga->out.format == unknown_format)
sprintf(fnw, "%s.bam", pre); // "wb" above makes BAM the default
else
sprintf(fnw, "%s.%s", pre, hts_format_file_extension(&ga->out));
fpw = sam_open_format(fnw, modew, &ga->out);
free(fnw);
} else if (output_file) { // output to a given file
modew[0] = 'w'; modew[1] = '\0';
sam_open_mode(modew + 1, output_file, NULL);
j = strlen(modew);
snprintf(modew + j, sizeof(modew) - j, "%d",
(clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
fpw = sam_open_format(output_file, modew, &ga->out);
} else fpw = sam_open_format("-", modew, &ga->out); // output to stdout
if (fpw == NULL) {
if (is_stdout) print_error_errno("collate", "Cannot open standard output");
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpw, h) < 0) {
print_error_errno("collate", "Couldn't write header");
goto fail;
}
// merge
a = malloc(max_cnt * sizeof(elem_t));
if (!a) goto mem_fail;
for (j = 0; j < max_cnt; ++j) {
......@@ -277,7 +504,6 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
if (fp) sam_close(fp);
if (fpw) sam_close(fpw);
if (h) bam_hdr_destroy(h);
if (b) bam_destroy1(b);
for (i = 0; i < n_files; ++i) {
if (fnt) free(fnt[i]);
if (fpt && fpt[i]) sam_close(fpt[i]);
......@@ -294,16 +520,18 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel,
return 1;
}
static int usage(FILE *fp, int n_files) {
static int usage(FILE *fp, int n_files, int reads_store) {
fprintf(fp,
"Usage: samtools collate [-Ou] [-o <name>] [-n nFiles] [-l cLevel] <in.bam> [<prefix>]\n\n"
"Options:\n"
" -O output to stdout\n"
" -o output file name (use prefix if not set)\n"
" -u uncompressed BAM output\n"
" -f fast (only primary alignments)\n"
" -r working reads stored (with -f) [%d]\n" // reads_store
" -l INT compression level [%d]\n" // DEF_CLEVEL
" -n INT number of temporary files [%d]\n", // n_files
DEF_CLEVEL, n_files);
reads_store, DEF_CLEVEL, n_files);
sam_global_opt_help(fp, "-....@");
fprintf(fp,
......@@ -346,37 +574,48 @@ char * generate_prefix() {
int main_bamshuf(int argc, char *argv[])
{
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
const char *output_file = NULL, *prefix = NULL;
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0, fast_coll = 0, reads_store = 10000, ret, pre_mem = 0;
const char *output_file = NULL;
char *prefix = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, "n:l:uOo:@:", lopts, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'u': is_un = 1; break;
case 'O': is_stdout = 1; break;
case 'o': output_file = optarg; break;
case 'f': fast_coll = 1; break;
case 'r': reads_store = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return usage(stderr, n_files);
case '?': return usage(stderr, n_files, reads_store);
}
}
if (is_un) clevel = 0;
if (argc >= optind + 2) prefix = argv[optind+1];
if (!(prefix || is_stdout || output_file))
return usage(stderr, n_files);
return usage(stderr, n_files, reads_store);
if (is_stdout && output_file) {
fprintf(stderr, "collate: -o and -O options cannot be used together.\n");
return usage(stderr, n_files);
return usage(stderr, n_files, reads_store);
}
if (!prefix) prefix = generate_prefix();
if (!prefix) {
prefix = generate_prefix();
pre_mem = 1;
}
if (!prefix) return EXIT_FAILURE;
return bamshuf(argv[optind], n_files, prefix, clevel, is_stdout,
output_file, &ga);
ret = bamshuf(argv[optind], n_files, prefix, clevel, is_stdout,
output_file, fast_coll, reads_store, &ga);
if (pre_mem) free(prefix);
return ret;
}
......@@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */
#include "htslib/hts.h"
#include "samtools.h"
#include "version.h"
int bam_taf2baf(int argc, char *argv[]);
int bam_mpileup(int argc, char *argv[]);
......@@ -62,7 +63,12 @@ int main_quickcheck(int argc, char *argv[]);
int main_addreplacerg(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int dict_main(int argc, char *argv[]);
int fqidx_main(int argc, char *argv[]);
const char *samtools_version()
{
return SAMTOOLS_VERSION;
}
static void usage(FILE *fp)
{
......@@ -79,6 +85,7 @@ static void usage(FILE *fp)
" -- Indexing\n"
" dict create a sequence dictionary file\n"
" faidx index/extract FASTA\n"
" fqidx index/extract FASTQ\n"
" index index alignment\n"
"\n"
" -- Editing\n"
......@@ -161,6 +168,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "index") == 0) ret = bam_index(argc-1, argv+1);
else if (strcmp(argv[1], "idxstats") == 0) ret = bam_idxstats(argc-1, argv+1);
else if (strcmp(argv[1], "faidx") == 0) ret = faidx_main(argc-1, argv+1);
else if (strcmp(argv[1], "fqidx") == 0) ret = fqidx_main(argc-1, argv+1);
else if (strcmp(argv[1], "dict") == 0) ret = dict_main(argc-1, argv+1);
else if (strcmp(argv[1], "fixmate") == 0) ret = bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) ret = bam_rmdup(argc-1, argv+1);
......
......@@ -68,7 +68,7 @@ int main_bedcov(int argc, char *argv[])
kstream_t *ks;
hts_idx_t **idx;
aux_t **aux;
int *n_plp, dret, i, n, c, min_mapQ = 0;
int *n_plp, dret, i, j, m, n, c, min_mapQ = 0, skip_DN = 0;
int64_t *cnt;
const bam_pileup1_t **plp;
int usage = 0;
......@@ -79,9 +79,10 @@ int main_bedcov(int argc, char *argv[])
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "Q:j", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
case 'j': skip_DN = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': usage = 1; break;
......@@ -92,6 +93,7 @@ int main_bedcov(int argc, char *argv[])
fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
sam_global_opt_help(stderr, "-.--.-");
return 1;
}
......@@ -155,8 +157,16 @@ int main_bedcov(int argc, char *argv[])
bam_mplp_set_maxcnt(mplp, 64000);
memset(cnt, 0, 8 * n);
while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
if (pos >= beg && pos < end)
for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
if (pos >= beg && pos < end) {
for (i = 0, m = 0; i < n; ++i) {
if (skip_DN)
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *pi = plp[i] + j;
if (pi->is_del || pi->is_refskip) ++m;
}
cnt[i] += n_plp[i] - m;
}
}
for (i = 0; i < n; ++i) {
kputc('\t', &str);
kputl(cnt[i], &str);
......
/* bedidx.c -- BED file indexing.
Copyright (C) 2011 Broad Institute.
Copyright (C) 2014 Genome Research Ltd.
Copyright (C) 2014,2017 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
......@@ -186,7 +186,7 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end)
* @param reg_hash the region hash table with interval lists as values
*/
static void bed_unify(void *reg_hash) {
void bed_unify(void *reg_hash) {
int i, j, new_n;
reghash_t *h;
......@@ -251,7 +251,7 @@ void *bed_read(const char *fn)
gzFile fp;
kstream_t *ks = NULL;
int dret;
unsigned int line = 0;
unsigned int line = 0, save_errno;
kstring_t str = { 0, 0, NULL };
if (NULL == h) return NULL;
......@@ -286,9 +286,18 @@ void *bed_read(const char *fn)
// has called their reference "browser" or "track".
if (0 == strcmp(ref, "browser")) continue;
if (0 == strcmp(ref, "track")) continue;
fprintf(stderr, "[bed_read] Parse error reading %s at line %u\n",
if (num < 1) {
fprintf(stderr,
"[bed_read] Parse error reading \"%s\" at line %u\n",
fn, line);
goto fail_no_msg;
} else {
fprintf(stderr,
"[bed_read] Parse error reading \"%s\" at line %u : "
"end (%u) must not be less than start (%u)\n",
fn, line, end, beg);
}
errno = 0; // Prevent caller from printing misleading error messages
goto fail;
}
// Put reg in the hash table if not already there
......@@ -324,12 +333,12 @@ void *bed_read(const char *fn)
//bed_unify(h);
return h;
fail:
fprintf(stderr, "[bed_read] Error reading %s : %s\n", fn, strerror(errno));
fail_no_msg:
save_errno = errno;
if (ks) ks_destroy(ks);
if (fp) gzclose(fp);
free(str.s);
bed_destroy(h);
errno = save_errno;
return NULL;
}
......
/* bedidx.h -- BED file indexing header file.
Copyright (C) 2017 Genome Research Ltd.
Author: Valeriu Ohan <vo2@sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */
#ifndef BEDIDX_H
#define BEDIDX_H
......@@ -16,5 +40,6 @@ int bed_overlap(const void *_h, const char *chr, int beg, int end);
void *bed_hash_regions(void *reg_hash, char **regs, int first, int last, int *op);
const char* bed_get(void *reg_hash, int index, int filter);
hts_reglist_t *bed_reglist(void *reg_hash, int filter, int *count_regs);
void bed_unify(void *_h);
#endif
samtools (1.8-2) UNRELEASED; urgency=medium
samtools (1.9-1) UNRELEASED; urgency=medium
* New upstream version
* Mention the CRAM format in the description.
* Fix Perl interpreter line
-- Charles Plessy <plessy@debian.org> Fri, 11 May 2018 09:43:42 +0900
......
......@@ -9,14 +9,14 @@ Build-Depends: debhelper (>= 11~),
# libio-pty-perl is needed by the regression test.
libio-pty-perl,
libncurses5-dev,
libhts-dev (>= 1.8),
libhts-dev (>= 1.9),
zlib1g-dev,
automake,
autoconf-archive,
pkg-config,
tabix (>= 1.0)
# tabix is needed for the regression tests.
Standards-Version: 4.1.4
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/samtools
Vcs-Git: https://salsa.debian.org/med-team/samtools.git
Homepage: http://www.htslib.org/
......
......@@ -33,3 +33,10 @@ override_dh_auto_install:
dh_auto_install -- \
prefix=/usr
make clean
override_dh_install:
dh_install
# Fix Perl interpreter PATH
for pscript in `grep -Rl '#!/usr/bin/env \+perl' debian/*` ; do \
sed -i '1s?#!/usr/bin/env \+perl?#!/usr/bin/perl?' $${pscript} ; \
done