Skip to content
Commits on Source (8)
System Requirements
===================
BCFtools and HTSlib depend on the zlib library <http://zlib.net>, the bzip2
library <http://bzip.org/> and liblzma <http://tukaani.org/xz/>. Building
them requires development files to be installed on the build machine;
BCFtools and HTSlib depend on the following libraries:
BCFtools:
zlib <http://zlib.net>
gsl <https://www.gnu.org/software/gsl/>
(optional, for the 'polysomy' command)
libperl <http://www.perl.org/>
(optional, to support filters using perl syntax)
HTSlib:
zlib <http://zlib.net>
libbz2 <http://bzip.org/>
liblzma <http://tukaani.org/xz/>
libcurl <https://curl.haxx.se/>
(optional but strongly recommended, for network access)
libcrypto <https://www.openssl.org/>
(optional, for Amazon S3 support; not needed on MacOS)
Building them requires development files to be installed on the build machine;
note that some Linux distributions package these separately from the library
itself (see below).
itself. See the "System Specific Details" below for guidance on how to install
these on a variety of systems.
The bzip2 and liblzma dependencies can be removed if full CRAM support
is not needed - see HTSlib's INSTALL file for details.
Packages for dpkg-based Linux distributions (Debian / Ubuntu) are:
zlib1g-dev
libbz2-dev
liblzma-dev
Packages for rpm or yum-based Linux distributions (RedHat / Fedora / CentOS)
are:
zlib-devel
bzip2-devel
xz-devel
To build BCFtools, you will need:
GNU make
......@@ -85,12 +89,6 @@ sophisticated filtering. This option can be enabled by supplying the
./configure --enable-perl-filters
Note that enabling this option changes the license from MIT to GPL
because bcftools need to be built with
perl -MExtUtils::Embed -e ccopts -e ldopts
Optional Compilation with GSL
=============================
......@@ -136,3 +134,42 @@ The bgzip and tabix utilities are provided by HTSlib. If you have not also
installed HTSlib separately, you may wish to install these utilities by hand
by copying bcftools-1.x/htslib-1.x/{bgzip,tabix} to the same bin directory
to which you have installed bcftools et al.
System Specific Details
=======================
Installing the prerequisites is system dependent and there is more
than one correct way of satisfying these, including downloading them
from source, compiling and installing them yourself.
For people with super-user access, we provide an example set of commands
below for installing the dependencies on a variety of operating system
distributions. Note these are not specific recommendations on distribution,
compiler or SSL implementation. It is assumed you already have the core set
of packages for the given distribution - the lists may be incomplete if
this is not the case.
Debian / Ubuntu
---------------
sudo apt-get update # Ensure the package list is up to date
sudo apt-get install autoconf automake make gcc perl zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev libperl-dev libgsl0-dev
Note: libcurl4-openssl-dev can be used as an alternative to libcurl4-gnutls-dev.
RedHat / CentOS
---------------
sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel gsl-devel perl-ExtUtils-Embed
Alpine Linux
------------
sudo apk update # Ensure the package list is up to date
sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev
OpenSUSE
--------
sudo zypper install autoconf automake make gcc perl zlib-devel libbz2-devel xz-devel libcurl-devel libopenssl-devel gsl-devel
......@@ -93,7 +93,7 @@ endif
include config.mk
PACKAGE_VERSION = 1.8
PACKAGE_VERSION = 1.9
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
......@@ -175,63 +175,67 @@ endif # PLUGINS_ENABLED
plugins: $(PLUGINS)
bcftools_h = bcftools.h $(htslib_hts_defs_h) $(htslib_vcf_h)
bin_h = bin.h $(htslib_hts_h)
call_h = call.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcmp.h
convert_h = convert.h $(htslib_vcf_h)
tsv2vcf_h = tsv2vcf.h $(htslib_vcf_h)
filter_h = filter.h $(htslib_vcf_h)
gvcf_h = gvcf.h $(bcftools_h)
khash_str2str_h = khash_str2str.h $(htslib_khash_h)
ploidy_h = ploidy.h regidx.h
prob1_h = prob1.h $(htslib_vcf_h) $(call_h)
roh_h = HMM.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(bcftools_h)
cnv_h = HMM.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h)
smpl_ilist_h = smpl_ilist.h $(htslib_vcf_h)
vcfbuf_h = vcfbuf.h $(htslib_vcf_h)
bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h)
bam_sample_h = bam_sample.h $(htslib_sam_h)
main.o: main.c $(htslib_hts_h) config.h version.h $(bcftools_h)
vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(bcftools_h) vcmp.h $(filter_h)
vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(bcftools_h) vcmp.h $(filter_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h)
vcfannotate.o: vcfannotate.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h) $(convert_h) $(smpl_ilist_h) $(htslib_khash_h)
vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(bcftools_h) vcmp.h $(filter_h)
vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h) $(gvcf_h)
vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(bcftools_h)
vcfconvert.o: vcfconvert.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h)
vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h
vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) hclust.h
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h)
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h)
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(bcftools_h) rbuf.h
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h)
vcfroh.o: vcfroh.c $(roh_h)
vcfcnv.o: vcfcnv.c $(cnv_h)
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h)
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) $(bin_h)
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) kheap.h $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(htslib_thread_pool_h) $(bcftools_h) $(khash_str2str_h)
tabix.o: tabix.c $(htslib_bgzf_h) $(htslib_tbx_h)
ccall.o: ccall.c $(htslib_kfunc_h) $(call_h) kmin.h $(prob1_h)
convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h)
em.o: em.c $(htslib_vcf_h) kmin.h $(call_h)
filter.o: filter.c config.h $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
filter.o: filter.c $(htslib_khash_str2int_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h) $(htslib_kfunc_h) config.h $(filter_h) $(bcftools_h)
$(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) -c -o $@ $<
gvcf.o: gvcf.c gvcf.h $(call_h)
gvcf.o: gvcf.c $(gvcf_h) $(bcftools_h)
kmin.o: kmin.c kmin.h
mcall.o: mcall.c $(htslib_kfunc_h) $(call_h)
prob1.o: prob1.c $(prob1_h)
vcmp.o: vcmp.c $(htslib_hts_h) vcmp.h
ploidy.o: ploidy.c regidx.h $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $(bcftools_h) $(ploidy_h)
vcmp.o: vcmp.c $(htslib_hts_h) $(htslib_vcf_h) vcmp.h
ploidy.o: ploidy.c $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $(bcftools_h) $(ploidy_h)
polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h
peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(htslib_kstring_h)
bin.o: bin.c $(bin_h)
bin.o: bin.c $(bcftools_h) bin.h
regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h
consensus.o: consensus.c $(htslib_hts_h) $(htslib_kseq_h) rbuf.h $(bcftools_h) regidx.h
mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) regidx.h $(bcftools_h) $(call_h) $(bam2bcf_h) $(bam_sample_h)
bam_sample.o: $(bam_sample_h) $(htslib_hts_h) $(htslib_khash_str2int_h)
consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h)
mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h)
bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h
bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h)
bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(khash_str2str_h) $(bam_sample_h) $(bcftools_h)
version.o: version.h version.c
hclust.o: hclust.c hclust.h
vcfbuf.o: vcfbuf.c vcfbuf.h rbuf.h
smpl_ilist.o: smpl_ilist.c smpl_ilist.h
csq.o: csq.c smpl_ilist.h regidx.h filter.h kheap.h rbuf.h
hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h
HMM.o: HMM.c $(htslib_hts_h) HMM.h
vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h
smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h)
csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h
# test programs
......@@ -253,7 +257,7 @@ test/test-rbuf.o: test/test-rbuf.c rbuf.h
test/test-rbuf: test/test-rbuf.o
$(CC) $(LDFLAGS) -o $@ $^ $(ALL_LIBS)
test/test-regidx.o: test/test-regidx.c regidx.h
test/test-regidx.o: test/test-regidx.c $(htslib_kstring_h) regidx.h
test/test-regidx: test/test-regidx.o regidx.o $(HTSLIB)
$(CC) $(ALL_LDFLAGS) -o $@ $^ $(HTSLIB) -lpthread $(HTSLIB_LIB) $(ALL_LIBS)
......
## Release 1.9 (18th July 2018)
* `annotate`
- REF and ALT columns can be now transferred from the annotation file.
- fixed bug when setting vector_end values.
* `consensus`
- new -M option to control output at missing genotypes
- variants immediately following insersions should not be skipped. Note
however, that the current fix requires normalized VCF and may still
falsely skip variants adjacent to multiallelic indels.
- bug fixed in -H selection handling
* `convert`
- the --tsv2vcf option now makes the missing genotypes diploid, "./." instead of "."
- the behavior of -i/-e with --gvcf2vcf changed. Previously only sites with
FILTER set to "PASS" or "." were expanded and the -i/-e options dropped
sites completely. The new behavior is to let the -i/-e options control which
records will be expanded. In order to drop records completely, one can stream
through "bcftools view" first.
* `csq`
- since the real consequence of start/splice events are not known, the aminoacid
positions at subsequent variants should stay unchanged
- add `--force` option to skip malformatted transcripts in GFFs with out-of-phase
CDS exons.
* `+dosage`: output all alleles and all their dosages at multiallelic sites
* `+fixref`: fix serious bug in -m top conversion
* `-i/-e` filtering expressions:
- add two-tailed binomial test
- add functions N_PASS() and F_PASS()
- add support for lists of samples in filtering expressions, with many
samples it was impractical to list them all on the command line. Samples
can be now in a file as, e.g., GT[@samples.txt]="het"
- allow multiple perl functions in the expressions and some bug fixes
- fix a parsing problem, '@' was not removed from '@filename' expressions
* `mpileup`: fixed bug where, if samples were renamed using the `-G` (`--read-groups`)
option, some samples could be omitted from the output file.
* `norm`: update INFO/END when normalizing indels
* `+split`: new -S option to subset samples and to use custom file names instead of the defaults
* `+smpl-stats`: new plugin
* `+trio-stats`: new plugin
* Fixed build problems with non-functional configure script produced on some platforms
## Release 1.8 (April 2018)
* `-i, -e` filtering: Support for custom perl scripts
......
/* bam_sample.c -- group data by sample.
Copyright (C) 2010, 2011 Broad Institute.
Copyright (C) 2013, 2016 Genome Research Ltd.
Copyright (C) 2013, 2016-2018 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
......@@ -167,10 +167,14 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
void *bam_smpls = khash_str2int_init();
int first_smpl = -1, nskipped = 0;
const char *p = bam_hdr, *q, *r;
while ((q = strstr(p, "@RG")) != 0)
while (p != NULL && (q = strstr(p, "@RG")) != 0)
{
char *eol = strchr(q + 3, '\n');
if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
p = eol;
continue;
}
p = q + 3;
r = q = 0;
if ((q = strstr(p, "\tID:")) != 0) q += 4;
if ((r = strstr(p, "\tSM:")) != 0) r += 4;
if (r && q)
......@@ -220,7 +224,7 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
}
else
break;
p = q > r ? q : r;
p = eol;
}
int nsmpls = khash_str2int_size(bam_smpls);
khash_str2int_destroy_free(bam_smpls);
......@@ -234,6 +238,7 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
// no suitable read group is available in this bam: ignore the whole file.
free(file->fname);
if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
bsmpl->nfiles--;
return -1;
}
......
......@@ -39,7 +39,7 @@ THE SOFTWARE. */
#define FT_STDIN (1<<3)
char *bcftools_version(void);
void error(const char *format, ...) HTS_NORETURN;
void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
......
......@@ -48,9 +48,9 @@ bin_t *bin_init(const char *list_def, float min, float max)
{
char *tmp;
bin->bins[i] = strtod(list[i],&tmp);
if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
if ( *tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
error("Expected values from the interval [%f,%f], found %s\n", list[i]);
error("Expected values from the interval [%f,%f], found %s\n", min, max, list[i]);
free(list[i]);
}
free(list);
......
......@@ -23,13 +23,14 @@
# DEALINGS IN THE SOFTWARE.
dnl Process this file with autoconf to produce a configure script
AC_INIT([BCFtools], m4_esyscmd_s([make print-version]),
AC_INIT([BCFtools], m4_esyscmd_s([./version.sh 2>/dev/null]),
[samtools-help@lists.sourceforge.net], [], [http://www.htslib.org/])
AC_PREREQ([2.63]) dnl This version introduced 4-argument AC_CHECK_HEADER
AC_CONFIG_SRCDIR([main.c])
AC_CONFIG_HEADERS([config.h])
m4_include([m4/ax_with_htslib.m4])
m4_include([m4/hts_prog_cc_warnings.m4])
dnl Copyright notice to be copied into the generated configure script
AC_COPYRIGHT([Portions copyright (C) 2015,2017 Genome Research Ltd.
......@@ -39,6 +40,12 @@ redistribute it. There is NO WARRANTY, to the extent permitted by law.])
AC_PROG_CC
dnl Turn on compiler warnings, if possible
HTS_PROG_CC_WARNINGS
dnl Flags to treat warnings as errors. These need to be applied to CFLAGS
dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS)
HTS_PROG_CC_WERROR(hts_late_cflags)
AC_SYS_LARGEFILE
AC_ARG_ENABLE([bcftools-plugins],
......@@ -214,10 +221,11 @@ AS_IF([test "$enable_perl_filters" != "no" ], [dnl
PERL_LIBS="`perl -MExtUtils::Embed -e ldopts`"
AC_SUBST([PERL_CCOPTS])
AC_SUBST([PERL_LIBS])
USE_GPL=1
AC_SUBST([USE_GPL])
])
dnl Apply value from HTS_PROG_CC_WERROR (if set)
AS_IF([test "x$hts_late_cflags" != x],[CFLAGS="$CFLAGS $hts_late_cflags"])
AC_CONFIG_FILES([config.mk])
AC_OUTPUT
......
......@@ -36,6 +36,7 @@
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
#include <htslib/bgzf.h>
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
......@@ -73,6 +74,8 @@ typedef struct
int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
int fa_case; // output upper case or lower case?
int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
int prev_base_pos; // the position of prev_base
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
......@@ -96,7 +99,7 @@ typedef struct
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
}
args_t;
......@@ -237,7 +240,7 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = stdout;
if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n");
if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
}
......@@ -264,7 +267,7 @@ static void init_region(args_t *args, char *line)
char *ss, *se = line;
while ( *se && !isspace(*se) && *se!=':' ) se++;
int from = 0, to = 0;
char tmp, *tmp_ptr = NULL;
char tmp = 0, *tmp_ptr = NULL;
if ( *se )
{
tmp = *se; *se = 0; tmp_ptr = se;
......@@ -284,6 +287,7 @@ static void init_region(args_t *args, char *line)
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->prev_base_pos = -1;
args->fa_buf.l = 0;
args->fa_length = 0;
args->fa_end_pos = to;
......@@ -371,13 +375,10 @@ static void flush_fa_buffer(args_t *args, int len)
}
static void apply_variant(args_t *args, bcf1_t *rec)
{
if ( rec->n_allele==1 ) return;
static int warned_haplotype = 0;
if ( rec->n_allele==1 && !args->missing_allele ) return;
if ( rec->pos <= args->fa_frz_pos )
{
fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
if ( args->mask )
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
......@@ -396,28 +397,65 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( fmt->type!=BCF_BT_INT8 )
error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
uint8_t *ptr = fmt->p + fmt->size*args->isample;
if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
if ( args->haplotype > fmt->n )
{
if ( bcf_gt_is_missing(ptr[fmt->n-1]) || bcf_gt_is_missing(ptr[0]) )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
{
if ( !warned_haplotype )
{
fprintf(stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
warned_haplotype = 1;
}
return;
}
}
else
{
ialt = (int8_t)ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int8_vector_end )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
ialt = bcf_gt_allele(ialt);
}
}
else if ( args->output_iupac )
{
ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
ialt = bcf_gt_allele(ialt);
int jalt;
if ( fmt->n>1 )
{
jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
if ( bcf_gt_is_missing(jalt) )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
else
jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
if ( ialt>=0 )
{
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
......@@ -427,13 +465,19 @@ static void apply_variant(args_t *args, bcf1_t *rec)
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
}
else
{
int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
if ( ptr[i]==bcf_int32_vector_end ) break;
if ( bcf_gt_is_missing(ptr[i]) )
{
if ( !args->missing_allele ) return; // ignore missing or half-missing genotypes
ialt = -1;
break;
}
if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
ialt = bcf_gt_allele(ptr[i]);
if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
}
......@@ -442,7 +486,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int prev_len = 0, jalt;
for (i=0; i<fmt->n; i++)
{
if ( ptr[i]==bcf_int32_vector_end ) break;
if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
jalt = bcf_gt_allele(ptr[i]);
if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( args->allele & (PICK_LONG|PICK_SHORT) )
......@@ -475,6 +519,25 @@ static void apply_variant(args_t *args, bcf1_t *rec)
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
if ( ialt==-1 )
{
char alleles[4];
alleles[0] = rec->d.allele[0][0];
alleles[1] = ',';
alleles[2] = args->missing_allele;
alleles[3] = 0;
bcf_update_alleles_str(args->hdr, rec, alleles);
ialt = 1;
}
// Overlapping variant? Can be still OK iff this is an insertion
if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) )
{
fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
if ( idx<0 )
......@@ -493,7 +556,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
}
if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
if ( rec->d.allele[ialt][0]=='<' )
......@@ -507,7 +570,20 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
{
// fprintf(stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
// This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA),
// the reference base in fa_buf is lost and the check fails. We do not keep a buffer
// with the original sequence as it should not be necessary, we should encounter max
// one base overlap
int fail = 1;
if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
{
if ( rec->rlen==1 ) fail = 0;
else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
}
if ( fail )
{
char tmp = 0;
if ( args->fa_buf.l - idx > rec->rlen )
{
......@@ -523,6 +599,9 @@ static void apply_variant(args_t *args, bcf1_t *rec)
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
alen = strlen(rec->d.allele[ialt]);
len_diff = alen - rec->rlen;
}
else
{
alen = strlen(rec->d.allele[ialt]);
......@@ -540,8 +619,12 @@ static void apply_variant(args_t *args, bcf1_t *rec)
for (i=0; i<alen; i++)
args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
if ( len_diff )
{
args->prev_base = rec->d.allele[0][rec->rlen - 1];
args->prev_base_pos = rec->pos + rec->rlen - 1;
memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
}
}
else
{
// insertion
......@@ -590,10 +673,10 @@ static void mask_region(args_t *args, char *seq, int len)
static void consensus(args_t *args)
{
htsFile *fasta = hts_open(args->ref_fname, "rb");
BGZF *fasta = bgzf_open(args->ref_fname, "r");
if ( !fasta ) error("Error reading %s\n", args->ref_fname);
kstring_t str = {0,0,0};
while ( hts_getline(fasta, KS_SEP_LINE, &str) > 0 )
while ( bgzf_getline(fasta, '\n', &str) > 0 )
{
if ( str.s[0]=='>' )
{
......@@ -670,7 +753,7 @@ static void consensus(args_t *args)
destroy_chain(args);
}
flush_fa_buffer(args, 0);
hts_close(fasta);
bgzf_close(fasta);
free(str.s);
}
......@@ -682,7 +765,7 @@ static void usage(args_t *args)
fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
......@@ -698,6 +781,7 @@ static void usage(args_t *args)
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -m, --mask <file> replace regions with N\n");
fprintf(stderr, " -M, --missing <char> output <char> instead of skipping the missing genotypes\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
......@@ -723,11 +807,12 @@ int main_consensus(int argc, char *argv[])
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
{"mask",1,0,'m'},
{"missing",1,0,'M'},
{"chain",1,0,'c'},
{0,0,0,0}
};
int c;
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0)
{
switch (c)
{
......@@ -738,6 +823,10 @@ int main_consensus(int argc, char *argv[])
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'M':
args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
break;
case 'c': args->chain_fname = optarg; break;
case 'H':
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
......
......@@ -30,6 +30,7 @@ THE SOFTWARE. */
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <inttypes.h>
#include <math.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
......@@ -756,7 +757,7 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( line->n_allele > 100 )
error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8));
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
......@@ -773,7 +774,7 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
{
str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==bcf_int32_missing ) /* . */
else if ( ptr[0]==bcf_int8_missing ) /* . */
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
......@@ -910,7 +911,7 @@ static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( line->n_allele > 100 )
error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8));
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
......
......@@ -595,6 +595,7 @@ typedef struct _args_t
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
faidx_t *fai;
kstring_t str, str2;
......@@ -1111,15 +1112,26 @@ void tscript_init_cds(args_t *args)
tr->cds[0]->len -= tr->cds[0]->phase;
tr->cds[0]->phase = 0;
// sanity check phase
// sanity check phase; the phase number in gff tells us how many bases to skip in this
// feature to reach the first base of the next codon
int tscript_ok = 1;
for (i=0; i<tr->ncds; i++)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
assert( phase == len%3 );
{
if ( args->force )
{
if ( args->quiet < 2 )
fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
else
{
......@@ -1140,13 +1152,24 @@ void tscript_init_cds(args_t *args)
tr->cds[i]->phase = 0;
// sanity check phase
int tscript_ok = 1;
for (i=tr->ncds-1; i>=0; i--)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
{
if ( args->force )
{
if ( args->quiet < 2 )
fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
// set len. At the same check that CDS within a transcript do not overlap
......@@ -1876,7 +1899,7 @@ fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
{
splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
return SPLICE_OVERLAP;
}
}
......@@ -2074,7 +2097,6 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
child->var = str.s;
child->type = HAP_SSS;
child->csq = splice.csq;
child->prev = parent->type==HAP_SSS ? parent->prev : parent;
child->rec = rec;
return 0;
}
......@@ -2092,7 +2114,7 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
assert( dbeg <= splice.kalt.l );
}
if ( parent->type==HAP_SSS ) parent = parent->prev;
assert( parent->type!=HAP_SSS );
if ( parent->type==HAP_CDS )
{
i = parent->icds;
......@@ -2402,12 +2424,12 @@ fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
#endif
khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
int i;
for (i=0; i<vbuf->n; i++)
if ( vbuf->vrec[i]->line==rec ) break;
if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
vrec_t *vrec = vbuf->vrec[i];
// if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
......@@ -2769,26 +2791,20 @@ void hap_finalize(args_t *args, hap_t *hap)
hap->upstream_stop = 0;
int i = 1, dlen = 0, ibeg, indel = 0;
while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
hap->sbeg = hap->stack[i].node->sbeg;
assert( hap->stack[istack].node->type != HAP_SSS );
if ( tr->strand==STRAND_FWD )
{
i = 0, ibeg = -1;
while ( ++i <= istack )
{
if ( hap->stack[i].node->type == HAP_SSS )
{
// start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
hap_add_csq(args,hap,node,0,i,i,0,0);
continue;
}
assert( hap->stack[i].node->type != HAP_SSS );
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
// This condition extends compound variants.
if ( i<istack )
{
if ( dlen%3 ) // frameshift
{
......@@ -2839,14 +2855,10 @@ void hap_finalize(args_t *args, hap_t *hap)
i = istack + 1, ibeg = -1;
while ( --i > 0 )
{
if ( hap->stack[i].node->type == HAP_SSS )
{
hap_add_csq(args,hap,node,0,i,i,0,0);
continue;
}
assert ( hap->stack[i].node->type != HAP_SSS );
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
if ( i>1 )
{
if ( dlen%3 )
{
......@@ -3352,7 +3364,8 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
hap_ret = hap_init(args, parent, child, cds, rec, 1);
if ( hap_ret!=0 )
{
// overlapping or intron variant, cannot apply
if ( hap_ret==1 )
......@@ -3363,7 +3376,22 @@ int test_cds(args_t *args, bcf1_t *rec)
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
free(child);
hap_destroy(child);
continue;
}
if ( child->type==HAP_SSS )
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
hap_destroy(child);
ret = 1;
continue;
}
parent->nend--;
......@@ -3434,7 +3462,8 @@ int test_cds(args_t *args, bcf1_t *rec)
}
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
hap_ret = hap_init(args, parent, child, cds, rec, ial);
if ( hap_ret!=0 )
{
// overlapping or intron variant, cannot apply
if ( hap_ret==1 )
......@@ -3446,10 +3475,23 @@ int test_cds(args_t *args, bcf1_t *rec)
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
}
free(child);
hap_destroy(child);
continue;
}
if ( child->type==HAP_SSS )
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
hap_destroy(child);
continue;
}
if ( parent->cur_rec!=rec )
{
hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
......@@ -3708,6 +3750,7 @@ static const char *usage(void)
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" --force run even if some sanity checks fail\n"
" -i, --include <expr> select sites for which the expression is true\n"
" -o, --output <file> write output to a file [standard output]\n"
" -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
......@@ -3739,6 +3782,7 @@ int main_csq(int argc, char *argv[])
static struct option loptions[] =
{
{"force",0,0,1},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
{"custom-tag",1,0,'c'},
......@@ -3765,6 +3809,7 @@ int main_csq(int argc, char *argv[])
{
switch (c)
{
case 1 : args->force = 1; break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': args->quiet++; break;
......
bcftools (1.9-1) unstable; urgency=medium
* Team upload.
* New upstream version
* Standards-Version: 4.2.1
* Fix Perl interpreter PATH
-- Andreas Tille <tille@debian.org> Tue, 04 Sep 2018 16:37:32 +0200
bcftools (1.8-1) unstable; urgency=medium
* Team upload.
......
Source: bcftools
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders:
Afif Elghraoui <afif@debian.org>
Uploaders: Afif Elghraoui <afif@debian.org>
Section: science
Priority: optional
Build-Depends:
debhelper (>= 11~),
Build-Depends: debhelper (>= 11~),
zlib1g-dev,
libbz2-dev,
liblzma-dev,
libhts-dev (>= 1.5-3),
libgsl-dev,
tabix <!nocheck>,
libio-pty-perl <!nocheck>,
Standards-Version: 4.1.4
libio-pty-perl <!nocheck>
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/bcftools
Vcs-Git: https://salsa.debian.org/med-team/bcftools.git
Homepage: http://samtools.github.io/bcftools/
Package: bcftools
Architecture: any
Breaks: samtools (<< 1.0)
Replaces: samtools (<< 1.0)
Depends:
${shlibs:Depends},
Depends: ${shlibs:Depends},
${misc:Depends},
${perl:Depends}
Suggests:
......@@ -32,6 +27,8 @@ Suggests:
python-numpy,
python-matplotlib,
texlive-latex-recommended
Breaks: samtools (<< 1.0)
Replaces: samtools (<< 1.0)
Description: genomic variant calling and manipulation of VCF/BCF files
BCFtools is a set of utilities that manipulate variant calls in the
Variant Call Format (VCF) and its binary counterpart BCF. All commands work
......
......@@ -18,5 +18,5 @@ Description: Skip single test causing new failures
-test_vcf_query($opts,in=>'filter.5',out=>'query.54.out',args=>q[-f'[%POS %SAMPLE %AD\\n]\\n' -i'AD[:0]+AD[:1] > 12']);
+#test_vcf_query($opts,in=>'filter.5',out=>'query.54.out',args=>q[-f'[%POS %SAMPLE %AD\\n]\\n' -i'AD[:0]+AD[:1] > 12']);
test_vcf_query($opts,in=>'query.filter.4',out=>'query.55.out',args=>q[-f'%POS\\t%REF\\t%ALT[\\t%GT]\\n' -e'TYPE!="snp" || ALT="*"']);
test_vcf_norm($opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx');
test_vcf_norm($opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-');
test_vcf_query($opts,in=>'view',out=>'query.56.out',args=>q[-f'%ID\\n' -i 'ID=@].$$opts{path}.q[/query.56.out']);
test_vcf_query($opts,in=>'query.filter.5',out=>'query.57.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\t%AD\\n]' -i'GT="het" & binom(FMT/AD)>0.01']);
......@@ -10,7 +10,7 @@ Forwarded: not-needed
Last-Update: 2015-11-09
--- a/test/test.pl
+++ b/test/test.pl
@@ -918,7 +918,7 @@ sub test_vcf_plugin
@@ -932,7 +932,7 @@ sub test_vcf_plugin
{
my ($opts,%args) = @_;
if ( !$$opts{test_plugins} ) { return; }
......
......@@ -30,3 +30,10 @@ override_dh_auto_configure:
--enable-gsl \
--with-cblas=gslcblas \
--with-bcf-plugin-dir=/usr/lib/$(DEB_HOST_MULTIARCH)/bcftools
override_dh_install:
dh_install
# Fix Perl interpreter PATH
for pscript in `grep -Rl '#!/usr/bin/env \+perl' debian/*` ; do \
sed -i '1s?#!/usr/bin/env \+perl?#!/usr/bin/perl?' $${pscript} ; \
done
......@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2018-04-03
.\" Date: 2018-07-18
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2018\-04\-03" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2018\-07\-18" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
......@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2018\-04\-03\fR and refers to bcftools git version \fB1\&.8\fR\&.
This manual page was last updated \fB2018\-07\-18\fR and refers to bcftools git version \fB1\&.9\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
......@@ -1359,6 +1359,11 @@ in
for file format details\&.
.RE
.PP
\fB\-M, \-\-missing\fR \fICHAR\fR
.RS 4
instead of skipping the missing genotypes, output the character CHAR (e\&.g\&. "?")
.RE
.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
write output to a file
......@@ -1563,7 +1568,12 @@ output VCF IDs in the second column instead of CHROM:POS_REF_ALT
.PP
\fB\-\-gvcf2vcf\fR
.RS 4
convert gVCF to VCF, expanding REF blocks into sites\&. Only sites with FILTER set to "PASS" or "\&." will be expanded\&.
convert gVCF to VCF, expanding REF blocks into sites\&. Note that the
\fB\-i\fR
and
\fB\-e\fR
options work differently with this switch\&. In this situation the filtering expressions define which sites should be expanded and which sites should be left unmodified, but all sites are printed on output\&. In order to drop sites, stream first through
\fBbcftools view\fR\&.
.RE
.PP
\fB\-f, \-\-fasta\-ref\fR \fIfile\fR
......@@ -1793,6 +1803,11 @@ is true\&. For valid expressions see
reference sequence in fasta format (required)
.RE
.PP
\fB\-\-force\fR
.RS 4
run even if some sanity checks fail\&. Currently the option allows to skip transcripts in malformatted GFFs with incorrect phase
.RE
.PP
\fB\-g, \-\-gff\-annot\fR \fIFILE\fR
.RS 4
GFF3 annotation file (required), such as
......@@ -2464,13 +2479,13 @@ Create intersection and complements of two sets saving the output in dir/*
.RE
.\}
.sp
Filter sites in A and B (but not in C) and create intersection
Filter sites in A (require INFO/MAF>=0\&.01) and B (require INFO/dbSNP) but not in C, and create an intersection, including only sites which appear in at least two of the files after filters have been applied
.sp
.if n \{\
.RS 4
.\}
.nf
bcftools isec \-e\*(AqMAF<0\&.01\*(Aq \-i\*(AqdbSNP=1\*(Aq \-e\- A\&.vcf\&.gz B\&.vcf\&.gz C\&.vcf\&.gz \-p dir
bcftools isec \-e\*(AqMAF<0\&.01\*(Aq \-i\*(AqdbSNP=1\*(Aq \-e\- A\&.vcf\&.gz B\&.vcf\&.gz C\&.vcf\&.gz \-n +2 \-p dir
.fi
.if n \{\
.RE
......@@ -2518,7 +2533,7 @@ Merge multiple VCF/BCF files from non\-overlapping sample sets to create one mul
.sp
Note that it is responsibility of the user to ensure that the sample names are unique across all files\&. If they are not, the program will exit with an error unless the option \fB\-\-force\-samples\fR is given\&. The sample names can be also given explicitly using the \fB\-\-print\-header\fR and \fB\-\-use\-header\fR options\&.
.sp
Note that only records from different files can be merged, never from the same file\&. For "vertical" merge take a look at \fBbcftools norm\fR instead\&.
Note that only records from different files can be merged, never from the same file\&. For "vertical" merge take a look at \fBbcftools concat\fR or \fBbcftools norm\fR\fB \-m\fR instead\&.
.PP
\fB\-\-force\-samples\fR
.RS 4
......@@ -4759,6 +4774,27 @@ CSQ[*] ~ "missense_variant\&.*deleterious"
.sp -1
.IP \(bu 2.3
.\}
with many samples it can be more practical to provide a file with sample names, one sample name per line
.sp
.if n \{\
.RS 4
.\}
.nf
GT[@samples\&.txt]="het" & binom(AD)<0\&.01
.fi
.if n \{\
.RE
.\}
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}
function on FORMAT tags (over samples) and INFO tags (over vector fields)
.sp
.if n \{\
......@@ -4780,7 +4816,29 @@ MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT
.sp -1
.IP \(bu 2.3
.\}
variables calculated on the fly if not present: number of alternate alleles; number of samples; count of alternate alleles; minor allele count (similar to AC but is always smaller than 0\&.5); frequency of alternate alleles (AF=AC/AN); frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes; number of samples with missing genotype; fraction of samples with missing genotype
two\-tailed binomial test\&. Note that for N=0 the test evaluates to a missing value and when FORMAT/GT is used to determine the vector indices, it evaluates to 1 for homozygous genotypes\&.
.sp
.if n \{\
.RS 4
.\}
.nf
binom(FMT/AD) \&.\&. GT can be used to determine the correct index
binom(AD[0],AD[1]) \&.\&. or the fields can be given explicitly
.fi
.if n \{\
.RE
.\}
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}
variables calculated on the fly if not present: number of alternate alleles; number of samples; count of alternate alleles; minor allele count (similar to AC but is always smaller than 0\&.5); frequency of alternate alleles (AF=AC/AN); frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes; number of samples with missing genotype; fraction of samples with missing genotype;
.sp
.if n \{\
.RS 4
......@@ -4801,6 +4859,28 @@ N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
.sp -1
.IP \(bu 2.3
.\}
the number (N_PASS) or fraction (F_PASS) of samples which pass the expression
.sp
.if n \{\
.RS 4
.\}
.nf
N_PASS(GQ>90 & GT!="mis") > 90
F_PASS(GQ>90 & GT!="mis") > 0\&.9
.fi
.if n \{\
.RE
.\}
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}
custom perl filtering\&. Note that this command is not compiled in by default, see the section
\fBOptional Compilation with Perl\fR
in the INSTALL file for help and misc/demo\-flt\&.pl for a working example\&. The demo defined the perl subroutine "severity" which can be invoked from the command line as follows:
......
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25098944"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25133856"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
......@@ -8,7 +8,7 @@ will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-04-03</strong></span> and refers to bcftools git version <span class="strong"><strong>1.8</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-07-18</strong></span> and refers to bcftools git version <span class="strong"><strong>1.9</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions &lt;= 0.1.19 to convert to VCF, which can then be read by
......@@ -781,6 +781,10 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
of <span class="strong"><strong>--regions-file</strong></span> in <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span> for file
format details.
</dd><dt><span class="term">
<span class="strong"><strong>-M, --missing</strong></span> <span class="emphasis"><em>CHAR</em></span>
</span></dt><dd>
instead of skipping the missing genotypes, output the character CHAR (e.g. "?")
</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
write output to a file
......@@ -888,8 +892,11 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
</dd></dl></div></div><div class="refsect3" title="gVCF conversion:"><a id="_gvcf_conversion"></a><h4>gVCF conversion:</h4><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>--gvcf2vcf</strong></span>
</span></dt><dd>
convert gVCF to VCF, expanding REF blocks into sites. Only sites
with FILTER set to "PASS" or "." will be expanded.
convert gVCF to VCF, expanding REF blocks into sites. Note that
the <span class="strong"><strong>-i</strong></span> and <span class="strong"><strong>-e</strong></span> options work differently with this switch. In this situation
the filtering expressions define which sites should be expanded and
which sites should be left unmodified, but all sites are printed on
output. In order to drop sites, stream first through <span class="strong"><strong><a class="link" href="#view" title="bcftools view [OPTIONS] file.vcf.gz [REGION […]]">bcftools view</a></strong></span>.
</dd><dt><span class="term">
<span class="strong"><strong>-f, --fasta-ref</strong></span> <span class="emphasis"><em>file</em></span>
</span></dt><dd>
......@@ -1034,6 +1041,11 @@ output VCF and are ignored for the prediction analysis.</p><div class="variablel
</span></dt><dd>
reference sequence in fasta format (required)
</dd><dt><span class="term">
<span class="strong"><strong>--force</strong></span>
</span></dt><dd>
run even if some sanity checks fail. Currently the option allows to skip
transcripts in malformatted GFFs with incorrect phase
</dd><dt><span class="term">
<span class="strong"><strong>-g, --gff-annot</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
GFF3 annotation file (required), such as <a class="ulink" href="ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" target="_top">ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens</a>.
......@@ -1457,7 +1469,9 @@ in the other files.</p><div class="variablelist"><dl><dt><span class="term">
</span></dt><dd>
list of input files to output given as 1-based indices. With <span class="strong"><strong>-p</strong></span> and no
<span class="strong"><strong>-w</strong></span>, all files are written.
</dd></dl></div><div class="refsect3" title="Examples:"><a id="_examples"></a><h4>Examples:</h4><p>Create intersection and complements of two sets saving the output in dir/*</p><pre class="screen"> bcftools isec -p dir A.vcf.gz B.vcf.gz</pre><p>Filter sites in A and B (but not in C) and create intersection</p><pre class="screen"> bcftools isec -e'MAF&lt;0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -p dir</pre><p>Extract and write records from A shared by both A and B using exact allele match</p><pre class="screen"> bcftools isec -p dir -n=2 -w1 A.vcf.gz B.vcf.gz</pre><p>Extract records private to A or B comparing by position only</p><pre class="screen"> bcftools isec -p dir -n-1 -c all A.vcf.gz B.vcf.gz</pre><p>Print a list of records which are present in A and B but not in C and D</p><pre class="screen"> bcftools isec -n~1100 -c all A.vcf.gz B.vcf.gz C.vcf.gz D.vcf.gz</pre></div></div><div class="refsect2" title="bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz […]"><a id="merge"></a><h3>bcftools merge [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> <span class="emphasis"><em>B.vcf.gz</em></span> […]</h3><p>Merge multiple VCF/BCF files from non-overlapping sample sets to create one
</dd></dl></div><div class="refsect3" title="Examples:"><a id="_examples"></a><h4>Examples:</h4><p>Create intersection and complements of two sets saving the output in dir/*</p><pre class="screen"> bcftools isec -p dir A.vcf.gz B.vcf.gz</pre><p>Filter sites in A (require INFO/MAF&gt;=0.01) and B (require INFO/dbSNP) but not in C,
and create an intersection, including only sites which appear in at least two of
the files after filters have been applied</p><pre class="screen"> bcftools isec -e'MAF&lt;0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -n +2 -p dir</pre><p>Extract and write records from A shared by both A and B using exact allele match</p><pre class="screen"> bcftools isec -p dir -n=2 -w1 A.vcf.gz B.vcf.gz</pre><p>Extract records private to A or B comparing by position only</p><pre class="screen"> bcftools isec -p dir -n-1 -c all A.vcf.gz B.vcf.gz</pre><p>Print a list of records which are present in A and B but not in C and D</p><pre class="screen"> bcftools isec -n~1100 -c all A.vcf.gz B.vcf.gz C.vcf.gz D.vcf.gz</pre></div></div><div class="refsect2" title="bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz […]"><a id="merge"></a><h3>bcftools merge [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> <span class="emphasis"><em>B.vcf.gz</em></span> […]</h3><p>Merge multiple VCF/BCF files from non-overlapping sample sets to create one
multi-sample file. For example, when merging file <span class="emphasis"><em>A.vcf.gz</em></span> containing
samples <span class="emphasis"><em>S1</em></span>, <span class="emphasis"><em>S2</em></span> and <span class="emphasis"><em>S3</em></span> and file <span class="emphasis"><em>B.vcf.gz</em></span> containing samples <span class="emphasis"><em>S3</em></span> and
<span class="emphasis"><em>S4</em></span>, the output file will contain four samples named <span class="emphasis"><em>S1</em></span>, <span class="emphasis"><em>S2</em></span>, <span class="emphasis"><em>S3</em></span>, <span class="emphasis"><em>2:S3</em></span>
......@@ -1465,7 +1479,7 @@ and <span class="emphasis"><em>S4</em></span>.</p><p>Note that it is responsibil
unique across all files. If they are not, the program will exit with an error
unless the option <span class="strong"><strong>--force-samples</strong></span> is given. The sample names can be
also given explicitly using the <span class="strong"><strong>--print-header</strong></span> and <span class="strong"><strong>--use-header</strong></span> options.</p><p>Note that only records from different files can be merged, never from the same file.
For "vertical" merge take a look at <span class="strong"><strong><a class="link" href="#norm" title="bcftools norm [OPTIONS] file.vcf.gz">bcftools norm</a></strong></span> instead.</p><div class="variablelist"><dl><dt><span class="term">
For "vertical" merge take a look at <span class="strong"><strong><a class="link" href="#concat" title="bcftools concat [OPTIONS] FILE1 FILE2 […]">bcftools concat</a></strong></span> or <span class="strong"><strong><a class="link" href="#norm" title="bcftools norm [OPTIONS] file.vcf.gz">bcftools norm</a> -m</strong></span> instead.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>--force-samples</strong></span>
</span></dt><dd>
if the merged files contain duplicate samples names, proceed anyway.
......@@ -2763,14 +2777,25 @@ FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) &gt; 0.3
CSQ[*] ~ "missense_variant.*deleterious"</pre></li><li class="listitem"><p class="simpara">
with many samples it can be more practical to provide a file with sample names,
one sample name per line
</p><pre class="literallayout">GT[@samples.txt]="het" &amp; binom(AD)&lt;0.01</pre></li><li class="listitem"><p class="simpara">
function on FORMAT tags (over samples) and INFO tags (over vector fields)
</p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT</pre></li><li class="listitem"><p class="simpara">
two-tailed binomial test. Note that for N=0 the test evaluates to a missing value
and when FORMAT/GT is used to determine the vector indices, it evaluates to 1 for
homozygous genotypes.
</p><pre class="literallayout">binom(FMT/AD) .. GT can be used to determine the correct index
binom(AD[0],AD[1]) .. or the fields can be given explicitly</pre></li><li class="listitem"><p class="simpara">
variables calculated on the fly if not present: number of alternate alleles;
number of samples; count of alternate alleles; minor allele count (similar to
AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN);
frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes;
number of samples with missing genotype; fraction of samples with missing genotype
number of samples with missing genotype; fraction of samples with missing genotype;
</p><pre class="literallayout">N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING</pre></li><li class="listitem"><p class="simpara">
the number (N_PASS) or fraction (F_PASS) of samples which pass the expression
</p><pre class="literallayout">N_PASS(GQ&gt;90 &amp; GT!="mis") &gt; 90
F_PASS(GQ&gt;90 &amp; GT!="mis") &gt; 0.9</pre></li><li class="listitem"><p class="simpara">
custom perl filtering. Note that this command is not compiled in by default, see
the section <span class="strong"><strong>Optional Compilation with Perl</strong></span> in the INSTALL file for help
and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
......
......@@ -790,6 +790,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
of *--regions-file* in *<<common_options,Common Options>>* for file
format details.
*-M, --missing* 'CHAR'::
instead of skipping the missing genotypes, output the character CHAR (e.g. "?")
*-o, --output* 'FILE'::
write output to a file
......@@ -898,8 +901,11 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the
==== gVCF conversion:
*--gvcf2vcf*::
convert gVCF to VCF, expanding REF blocks into sites. Only sites
with FILTER set to "PASS" or "." will be expanded.
convert gVCF to VCF, expanding REF blocks into sites. Note that
the *-i* and *-e* options work differently with this switch. In this situation
the filtering expressions define which sites should be expanded and
which sites should be left unmodified, but all sites are printed on
output. In order to drop sites, stream first through *<<view,bcftools view>>*.
*-f, --fasta-ref* 'file'::
reference sequence in fasta format. Must be indexed with samtools faidx
......@@ -1063,6 +1069,10 @@ output VCF and are ignored for the prediction analysis.
*-f, --fasta-ref* 'FILE'::
reference sequence in fasta format (required)
*--force*::
run even if some sanity checks fail. Currently the option allows to skip
transcripts in malformatted GFFs with incorrect phase
*-g, --gff-annot* 'FILE'::
GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens.
An example of a minimal working GFF file:
......@@ -1474,9 +1484,11 @@ Create intersection and complements of two sets saving the output in dir/*
bcftools isec -p dir A.vcf.gz B.vcf.gz
----
Filter sites in A and B (but not in C) and create intersection
Filter sites in A (require INFO/MAF>=0.01) and B (require INFO/dbSNP) but not in C,
and create an intersection, including only sites which appear in at least two of
the files after filters have been applied
----
bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -p dir
bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -n +2 -p dir
----
Extract and write records from A shared by both A and B using exact allele match
......@@ -1509,7 +1521,7 @@ unless the option *--force-samples* is given. The sample names can be
also given explicitly using the *--print-header* and *--use-header* options.
Note that only records from different files can be merged, never from the same file.
For "vertical" merge take a look at *<<norm,bcftools norm>>* instead.
For "vertical" merge take a look at *<<concat,bcftools concat>>* or *<<norm,bcftools norm>> -m* instead.
*--force-samples*::
......@@ -2803,18 +2815,35 @@ element of the vector, as shown in the examples below
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
CSQ[*] ~ "missense_variant.*deleterious"
* with many samples it can be more practical to provide a file with sample names,
one sample name per line
GT[@samples.txt]="het" & binom(AD)<0.01
* function on FORMAT tags (over samples) and INFO tags (over vector fields)
MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT
* two-tailed binomial test. Note that for N=0 the test evaluates to a missing value
and when FORMAT/GT is used to determine the vector indices, it evaluates to 1 for
homozygous genotypes.
binom(FMT/AD) .. GT can be used to determine the correct index
binom(AD[0],AD[1]) .. or the fields can be given explicitly
* variables calculated on the fly if not present: number of alternate alleles;
number of samples; count of alternate alleles; minor allele count (similar to
AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN);
frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes;
number of samples with missing genotype; fraction of samples with missing genotype
number of samples with missing genotype; fraction of samples with missing genotype;
N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
* the number (N_PASS) or fraction (F_PASS) of samples which pass the expression
N_PASS(GQ>90 & GT!="mis") > 90
F_PASS(GQ>90 & GT!="mis") > 0.9
* custom perl filtering. Note that this command is not compiled in by default, see
the section *Optional Compilation with Perl* in the INSTALL file for help
and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
......
This diff is collapsed.
dnl @synopsis HTS_PROG_CC_WARNINGS([ANSI])
dnl
dnl Derived from
dnl http://ac-archive.sourceforge.net/ac-archive/vl_prog_cc_warnings.html
dnl
dnl Enables a reasonable set of warnings for the C compiler.
dnl Optionally, if the first argument is nonempty, turns on flags which
dnl enforce and/or enable proper ANSI C if such are known with the
dnl compiler used.
dnl
dnl Currently this macro knows about GCC, Solaris C compiler, Digital
dnl Unix C compiler, C for AIX Compiler, HP-UX C compiler, IRIX C
dnl compiler, NEC SX-5 (Super-UX 10) C compiler, and Cray J90 (Unicos
dnl 10.0.0.8) C compiler.
dnl
dnl @category C
dnl @author Ville Laurikari <vl@iki.fi>
dnl Updated by Rob Davies <rmd@sanger.ac.uk> for HTSlib
dnl @license AllPermissive
dnl Copying and distribution of this file, with or without modification,
dnl are permitted in any medium without royalty provided the copyright notice
dnl and this notice are preserved. Users of this software should generally
dnl follow the principles of the MIT License including its disclaimer.
dnl Original Copyright (c) Ville Laurikari 2002
dnl Modifications Copyright (c) Genome Research Limited 2015,2017
AC_DEFUN([HTS_PROG_CC_WARNINGS], [
AC_ARG_ENABLE([warnings],
[AS_HELP_STRING([--disable-warnings], [turn off compiler warnings])],
[],
[enable_warnings=yes])
AS_IF([test "x$enable_warnings" != xno],[
AC_REQUIRE([AC_PROG_GREP])
ansi="$1"
AS_IF([test "x$ansi" = "x"],
[msg="for C compiler warning flags"],
[msg="for C compiler warning and ANSI conformance flags"])
AC_MSG_CHECKING($msg)
AC_CACHE_VAL(hts_cv_prog_cc_warnings, [dnl
hts_cv_prog_cc_warnings=""
AS_IF([test "x$CC" != "x"],[
cat > conftest.c <<EOF
int main(int argc, char **argv) { return 0; }
EOF
dnl Most compilers print some kind of a version string with some command
dnl line options (often "-V"). The version string should be checked
dnl before doing a test compilation run with compiler-specific flags.
dnl This is because some compilers (like the Cray compiler) only
dnl produce a warning message for unknown flags instead of returning
dnl an error, resulting in a false positive. Also, compilers may do
dnl erratic things when invoked with flags meant for a different
dnl compiler.
dnl We attempt to strip out any flags that are already on CFLAGS.
dnl If an option needs more than one word (e.g. see Cray below) then
dnl they should be separated by hash signs (#), which will be converted
dnl to spaces before comparing and possibly adding to CFLAGS.
dnl This separator will need to be changed if a new compiler ever needs
dnl an option that includes a hash sign...
# Tests for flags to enable C compiler warnings
# GCC compatible
AS_IF([test "x$GCC" = "xyes" &&
"$CC" -c -Wall conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-Wall"],
[hts_cv_prog_cc_warnings="-Wall -ansi -pedantic"])
],
# Sun Studio or Solaris C compiler
["$CC" -V 2>&1 | $GREP -i -E "WorkShop|Sun C" > /dev/null 2>&1 &&
"$CC" -c -v -Xc conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-v"],
[hts_cv_prog_cc_warnings="-v -Xc"])
],
# Digital Unix C compiler
["$CC" -V 2>&1 | $GREP -i "Digital UNIX Compiler" > /dev/null 2>&1 &&
"$CC" -c -verbose -w0 -warnprotos -std1 conftest.c > /dev/null 2>&1 &&
test -f conftest.o], [dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos"],
[hts_cv_prog_cc_warnings="-verbose -w0 -warnprotos -std1"])
],
# C for AIX Compiler
["$CC" 2>&1 | $GREP -i "C for AIX Compiler" > /dev/null 2>&1 &&
"$CC" -c -qlanglvl=ansi -qinfo=all conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd"],
[hts_cv_prog_cc_warnings="-qsrcmsg -qinfo=all:noppt:noppc:noobs:nocnd -qlanglvl=ansi"])
],
# IRIX C compiler
["$CC" -version 2>&1 | $GREP -i "MIPSpro Compilers" > /dev/null 2>&1 &&
"$CC" -c -fullwarn -ansi -ansiE conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-fullwarn"],
[hts_cv_prog_cc_warnings="-fullwarn -ansi -ansiE"])
],
# HP-UX C compiler
[what "$CC" 2>&1 | $GREP -i "HP C Compiler" > /dev/null 2>&1 &&
"$CC" -c -Aa +w1 conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="+w1"],
[hts_cv_prog_cc_warnings="+w1 -Aa"])
],
# The NEC SX series (Super-UX 10) C compiler
["$CC" -V 2>&1 | $GREP "/SX" > /dev/null 2>&1 &&
"$CC" -c -pvctl[,]fullmsg -Xc conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-pvctl[,]fullmsg"],
[hts_cv_prog_cc_warnings="-pvctl[,]fullmsg -Xc"])
],
# The Cray C compiler (Unicos)
["$CC" -V 2>&1 | $GREP -i "Cray" > /dev/null 2>&1 &&
"$CC" -c -h msglevel_2 conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
AS_IF([test "x$ansi" = "x"],
[hts_cv_prog_cc_warnings="-h#msglevel_2"],
[hts_cv_prog_cc_warnings="-h#msglevel_2,conform"])
],
# The Tiny C Compiler
["$CC" -v 2>&1 | $GREP "tcc version" > /dev/null &&
"$CC" -Wall -c conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[dnl
hts_cv_prog_cc_warnings="-Wall"
])
rm -f conftest.*
])
])
AS_IF([test "x$hts_cv_prog_cc_warnings" != "x"],[
dnl Print result, with underscores as spaces
ac_arg_result=`echo "$hts_cv_prog_cc_warnings" | tr '#' ' '`
AC_MSG_RESULT($ac_arg_result)
dnl Add options to CFLAGS only if they are not already present
ac_arg_needed=""
for ac_arg in $hts_cv_prog_cc_warnings
do
ac_arg_sp=`echo "$ac_arg" | tr '#' ' '`
AS_CASE([" $CFLAGS "],
[*" $ac_arg_sp "*], [],
[ac_arg_needed="$ac_arg_all $ac_arg_sp"])
done
CFLAGS="$ac_arg_needed $CFLAGS"],[dnl
AC_MSG_RESULT(unknown)
])
])
])dnl HTS_PROG_CC_WARNINGS
# SYNOPSIS
#
# HTS_PROG_CC_WERROR(FLAGS_VAR)
#
# Set FLAGS_VAR to the flags needed to make the C compiler treat warnings
# as errors.
AC_DEFUN([HTS_PROG_CC_WERROR], [
AC_ARG_ENABLE([werror],
[AS_HELP_STRING([--enable-werror], [change warnings into errors, where supported])],
[enable_werror=yes],
[])
AS_IF([test "x$enable_werror" = xyes],[
AC_MSG_CHECKING([for C compiler flags to error on warnings])
AC_CACHE_VAL(hts_cv_prog_cc_werror, [dnl
hts_cv_prog_cc_werror=""
AS_IF([test "x$CC" != "x"],[
cat > conftest.c <<EOF
int main(int argc, char **argv) { return 0; }
EOF
AS_IF(dnl
# Tests for flags to make the C compiler treat warnings as errors
# GCC compatible
[test "x$GCC" = "xyes" &&
"$CC" -c -Werror conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[hts_cv_prog_cc_werror="-Werror"],
# Sun Studio or Solaris C compiler
["$CC" -V 2>&1 | $GREP -i -E "WorkShop|Sun C" > /dev/null 2>&1 &&
"$CC" -c -errwarn=%all conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[hts_cv_prog_cc_werror="-errwarn=%all"],
# The Tiny C Compiler
["$CC" -v 2>&1 | $GREP "tcc version" > /dev/null &&
"$CC" -Wall -c conftest.c > /dev/null 2>&1 &&
test -f conftest.o],[hts_cv_prog_cc_werror="-Werror"]
dnl TODO: Add more compilers
)
rm -f conftest.*
])
])
AS_IF([test "x$hts_cv_prog_cc_werror" != x],[
AC_MSG_RESULT($hts_cv_prog_cc_werror)
AS_IF([test "x$1" != x],[eval AS_TR_SH([$1])="$hts_cv_prog_cc_werror"])
],[dnl
AC_MSG_RESULT(unknown)
])
])
])dnl HTS_PROG_CC_WERROR