Skip to content
Commits on Source (5)
......@@ -93,7 +93,7 @@ endif
include config.mk
PACKAGE_VERSION = 1.6
PACKAGE_VERSION = 1.7
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
......@@ -148,7 +148,7 @@ PLATFORM := $(shell uname -s)
endif
ifeq "$(PLATFORM)" "Darwin"
$(PLUGINS): | bcftools
PLUGIN_FLAGS = -bundle -bundle_loader bcftools
PLUGIN_FLAGS = -bundle -bundle_loader bcftools -Wl,-undefined,dynamic_lookup
else
PLUGIN_FLAGS = -fPIC -shared
endif
......@@ -272,7 +272,7 @@ docs: doc/bcftools.1 doc/bcftools.html
# bcftools.1 is a generated file from the asciidoc bcftools.txt file.
# Since there is no make dependency, bcftools.1 can be out-of-date and
# make docs can be run to update if asciidoc is available
install: $(PROG) $(PLUGINS)
install: $(PROGRAMS) $(PLUGINS)
$(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(man1dir) $(DESTDIR)$(plugindir)
$(INSTALL_PROGRAM) $(PROGRAMS) $(DESTDIR)$(bindir)
$(INSTALL_SCRIPT) $(MISC_SCRIPTS) $(DESTDIR)$(misc_bindir)
......@@ -280,7 +280,7 @@ install: $(PROG) $(PLUGINS)
$(INSTALL_PROGRAM) plugins/*.so $(DESTDIR)$(plugindir)
clean: testclean clean-plugins
-rm -f gmon.out *.o *~ $(PROG) version.h plugins/*.so plugins/*.P
-rm -f gmon.out *.o *~ $(PROGRAMS) version.h plugins/*.so plugins/*.P
-rm -rf *.dSYM plugins/*.dSYM test/*.dSYM
clean-plugins:
......@@ -288,7 +288,7 @@ clean-plugins:
-rm -rf plugins/*.dSYM
testclean:
-rm -f test/*.o test/*~ $(TEST_PROG)
-rm -f test/*.o test/*~ $(TEST_PROGRAMS)
distclean: clean
-rm -f config.cache config.h config.log config.mk config.status
......
## Release 1.7 (February 2018)
* `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields
and missing values. New GT=ref,alt,mis etc keywords, check the documenation
for details.
* `query`: Only matching expression are printed when both the -f and -i/-e
expressions contain genotype fields. Note that this changes the original
behavior. Previously all samples were output when one matching sample was
found. This functionality can be achieved by pre-filtering with view and then
streaming to query. Compare
bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]' -i'GT="alt"' file.bcf
and
bcftools view -i'GT="alt"' file.bcf -Ou | bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]'
* `annotate`: New -k, --keep-sites option
* `consensus`: Fix --iupac-codes output
* `csq`: Homs always considered phased and other fixes
* `norm`: Make `-c none` work and remove `query -c`
* `roh`: Fix errors in the RG output
* `stats`: Allow IUPAC ambiguity codes in the reference file; report the number of missing genotypes
* `+fill-tags`: Add ExcHet annotation
* `+setGt`: Fix bug in binom.test calculation, previously it worked only for nAlt<nRef!
* `+split`: New plugin to split a multi-sample file into single-sample files in one go
* Improve python3 compatibility in plotting scripts
## Release 1.6 (September 2017)
* New `sort` command.
......
......@@ -63,6 +63,23 @@ static inline char gt2iupac(char a, char b)
return iupac[(int)a][(int)b];
}
static inline int iupac_consistent(char iupac, char nt)
{
static const char iupac_mask[90] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2,
13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
};
if ( iupac > 89 ) return 0;
if ( nt > 90 ) nt -= 32; // lowercase
if ( nt=='A' ) nt = 1;
else if ( nt=='C' ) nt = 2;
else if ( nt=='G' ) nt = 4;
else if ( nt=='T' ) nt = 8;
return iupac_mask[(int)iupac] & nt ? 1 : 0;
}
static inline char nt_to_upper(char nt)
{
if ( nt < 97 ) return nt;
......
......@@ -77,6 +77,7 @@ typedef struct
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
int nvcf_buf, rid;
char *chr;
regidx_t *mask;
regitr_t *itr;
......@@ -121,6 +122,8 @@ static void destroy_chain(args_t *args)
free(chain->block_lengths);
free(chain);
chain = NULL;
free(args->chr);
args->chr = NULL;
}
static void print_chain(args_t *args)
......@@ -162,7 +165,7 @@ static void print_chain(args_t *args)
score += chain->block_lengths[n];
}
score += last_block_size;
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
......@@ -248,6 +251,7 @@ static void destroy_data(args_t *args)
if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
free(args->vcf_buf);
free(args->fa_buf.s);
free(args->chr);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
......@@ -276,6 +280,7 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
......@@ -380,7 +385,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
}
int i, ialt = 1;
int i, ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
......@@ -417,6 +422,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
......@@ -565,11 +571,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
static void mask_region(args_t *args, char *seq, int len)
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
while ( regitr_overlap(args->itr) )
......
/* convert.c -- functions for converting between VCF/BCF and related formats.
Copyright (C) 2013-2017 Genome Research Ltd.
Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -92,6 +92,7 @@ struct _convert_t
int ndat;
char *undef_info_tag;
int allow_undef_tags;
uint8_t **subset_samples;
};
typedef struct
......@@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
}
return ((int32_t*)bcf_array)[idx];
}
static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
{
int n = 0, ibeg = 0;
while ( src[ibeg] && ibeg<len && n < idx )
{
if ( src[ibeg]==',' ) n++;
ibeg++;
}
if ( ibeg==len ) { kputc('.', str); return; }
int iend = ibeg;
while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
if ( iend>ibeg )
kputsn(src+ibeg, iend-ibeg, str);
else
kputc('.', str);
}
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( fmt->id<0 )
......@@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
......@@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
else
kputw(ival, str);
}
else if ( fmt->fmt->type == BCF_BT_CHAR )
_copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
}
else
......@@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
}
for (js=0; js<convert->nsamples; js++)
{
// Skip samples when filtering was requested
if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// it's very very slow. Therefore in case the handler does not add
......@@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
break;
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
default:
ret = -1;
}
......
......@@ -30,7 +30,8 @@ THE SOFTWARE. */
typedef struct _convert_t convert_t;
enum convert_option
{
allow_undef_tags
allow_undef_tags,
subset_samples,
};
convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
......
/* The MIT License
Copyright (c) 2016 Genome Research Ltd.
Copyright (c) 2016-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
id = tbl->nstr++;
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
tbl->str[id] = strdup(ss);
int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
......@@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
}
}
void hap_finalize(args_t *args, hap_t *hap)
{
tscript_t *tr = hap->tr;
......@@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap)
}
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i<istack )
// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
{
if ( dlen%3 ) // frameshift
{
......@@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec)
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
if ( gt[0]==bcf_gt_missing ) continue;
if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
{
if ( args->phase==PHASE_MERGE )
{
......@@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
......@@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
int call_csq = 1;
if ( !rec->n_allele ) call_csq = 0; // no alternate allele
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc
else if ( args->filter )
{
call_csq = filter_test(args->filter, rec, NULL);
......@@ -3695,12 +3700,12 @@ static const char *usage(void)
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
" -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
" -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased GTs\n"
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" -i, --include <expr> select sites for which the expression is true\n"
......
bcftools (1.7-1) unstable; urgency=medium
* Team upload
* New upstream version
* Standards-Version: 4.1.3
* d/rules: drop useless option --parallel
-- Andreas Tille <tille@debian.org> Sat, 17 Feb 2018 15:00:09 +0100
bcftools (1.6-3) unstable; urgency=medium
* Team upload
......
......@@ -13,7 +13,7 @@ Build-Depends:
libgsl-dev,
tabix <!nocheck>,
libio-pty-perl <!nocheck>,
Standards-Version: 4.1.2
Standards-Version: 4.1.3
Homepage: http://samtools.github.io/bcftools/
Vcs-Git: https://anonscm.debian.org/git/debian-med/bcftools.git
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/bcftools.git
......
Description: Fix test_vcf_query on 32-bit architectures
Introduced by upstream commit:
https://github.com/samtools/bcftools/commit/25d042b833987d32e5de1fc3a109d357c4d0f738
Forwarded: https://github.com/samtools/bcftools/pull/701
Applied-Upstream: https://github.com/samtools/bcftools/commit/e436c3d0fb26614505fb02297a21a5ede1961723
Bug-Debian: https://bugs.debian.org/868958
Author: Graham Inggs <ginggs@debian.org>
Last-Update 2017-11-09
--- a/vcfnorm.c
+++ b/vcfnorm.c
@@ -94,7 +94,7 @@
}
static inline int has_non_acgtn(char *seq, int nseq)
{
- char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ char *end = seq + nseq;
while ( *seq && seq<end )
{
char c = toupper(*seq);
@@ -326,7 +326,7 @@
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
- if ( has_non_acgtn(line->d.allele[i],0) )
+ if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
{
if ( args->check_ref==CHECK_REF_EXIT )
error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
tests-pluginpath.patch
test-regidx-unsigned-char.patch
fix-test_vcf_query.patch
make_the_test_cases_conform_to_VCF_specification.patch
Description: Fix test-regidx argument parsing on archs with unsigned char
On architectures where char is unsigned "c >= 0" was always true.
Author: Adrian Bunk <bunk@debian.org>
Bug-Debian: https://bugs.debian.org/865060
Forwarded: https://github.com/samtools/bcftools/pull/700
Applied-Upstream: https://github.com/samtools/bcftools/commit/9d83925bcb1eed867288bcd6441e863a51349c2d
--- a/test/test-regidx.c
+++ b/test/test-regidx.c
@@ -336,7 +336,7 @@
{"seed",1,0,'s'},
{0,0,0,0}
};
- char c;
+ int c;
int seed = (int)time(NULL);
while ((c = getopt_long(argc, argv, "hvs:",loptions,NULL)) >= 0)
{
......@@ -10,7 +10,7 @@ Forwarded: not-needed
Last-Update: 2015-11-09
--- a/test/test.pl
+++ b/test/test.pl
@@ -885,7 +885,7 @@ sub test_vcf_plugin
@@ -918,7 +918,7 @@ sub test_vcf_plugin
{
my ($opts,%args) = @_;
if ( !$$opts{test_plugins} ) { return; }
......
......@@ -4,7 +4,7 @@
#include /usr/share/dpkg/default.mk
%:
dh $@ --parallel
dh $@
override_dh_installman:
dh_installman --language=C
......
......@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2017-09-28
.\" Date: 2018-02-12
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2017\-09\-28" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2018\-02\-12" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
......@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2017\-09\-28\fR and refers to bcftools git version \fB1\&.6\fR\&.
This manual page was last updated \fB2018\-02\-12\fR and refers to bcftools git version \fB1\&.7\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
......@@ -691,6 +691,15 @@ is true\&. For valid expressions see
\fBEXPRESSIONS\fR\&.
.RE
.PP
\fB\-k, \-\-keep\-sites\fR
.RS 4
keep sites wich do not pass
\fB\-i\fR
and
\fB\-e\fR
expressions instead of discarding them
.RE
.PP
\fB\-m, \-\-mark\-sites\fR \fITAG\fR
.RS 4
annotate sites which are present ("+") or absent ("\-") in the
......@@ -1141,7 +1150,7 @@ to a lower value\&. Note however, that this comes at the cost of increased false
relative contribution from BAF
.RE
.PP
\fBd, \-\-BAF\-dev\fR \fIfloat\fR[,\fIfloat\fR]
\fB\-d, \-\-BAF\-dev\fR \fIfloat\fR[,\fIfloat\fR]
.RS 4
expected BAF deviation in query and control, i\&.e\&. the noise observed in the data\&.
.RE
......@@ -1824,7 +1833,7 @@ see
.PP
\fB\-p, \-\-phase\fR \fIa\fR|\fIm\fR|\fIr\fR|\fIR\fR|\fIs\fR
.RS 4
how to construct haplotypes and how to deal with unphased data:
how to handle unphased heterozygous genotypes:
.PP
\fIa\fR
.RS 4
......@@ -1848,7 +1857,7 @@ create non\-reference haplotypes if possible (0/1 → 1|1, 1/2 → 1|2)
.PP
\fIs\fR
.RS 4
skip unphased GTs
skip unphased heterozygous GTs
.RE
.RE
.PP
......@@ -2678,6 +2687,11 @@ option\&.
\fB\-G, \-\-read\-groups\fR \fIFILE\fR
.RS 4
list of read groups to include or exclude if prefixed with "^"\&. One read group per line\&. This file can also be used to assign new sample names to read groups by giving the new sample name as a second white\-space\-separated field, like this: "read_group_id new_sample_name"\&. If the read group name is not unique, also the bam file name can be included: "read_group_id file_name sample_name"\&. If all reads from the alignment file should be treated as a single sample, the asterisk symbol can be used: "* file_name sample_name"\&. Alignments without a read group ID can be matched with "?"\&.
\fBNOTE:\fR
The meaning of
\fBbcftools mpileup \-G\fR
is the opposite of
\fBsamtools mpileup \-G\fR\&.
.RE
.sp
.if n \{\
......@@ -3526,12 +3540,6 @@ is from the interval [0,1] and larger is stricter
.sp
Extracts fields from VCF or BCF files and outputs them in user\-defined format\&.
.PP
\fB\-c, \-\-collapse\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fIsome\fR|\fInone\fR
.RS 4
see
\fBCommon Options\fR
.RE
.PP
\fB\-e, \-\-exclude\fR \fIEXPRESSION\fR
.RS 4
exclude sites for which
......@@ -3712,6 +3720,28 @@ bcftools query \-f\*(Aq%CHROM\et%POS0\et%END\et%ID\en\*(Aq file\&.bcf
.if n \{\
.RE
.\}
.sp
.if n \{\
.RS 4
.\}
.nf
# Print only samples with alternate (non\-reference) genotypes
bcftools query \-f\*(Aq[%CHROM:%POS %SAMPLE %GT\en]\*(Aq \-i\*(AqGT="alt"\*(Aq file\&.bcf
.fi
.if n \{\
.RE
.\}
.sp
.if n \{\
.RS 4
.\}
.nf
# Print all samples at sites with at least one alternate genotype
bcftools view \-i\*(AqGT="alt"\*(Aq file\&.bcf \-Ou | bcftools query \-f\*(Aq[%CHROM:%POS %SAMPLE %GT\en]\*(Aq
.fi
.if n \{\
.RE
.\}
.RE
.SS "bcftools reheader [\fIOPTIONS\fR] \fIfile\&.vcf\&.gz\fR"
.sp
......@@ -4594,12 +4624,15 @@ GT="\&.|\&.", GT="\&./\&.", GT="\&."
.sp -1
.IP \(bu 2.3
.\}
sample genotype: homozygous, heterozygous, haploid, ref\-ref hom, alt\-alt hom, ref\-alt het, alt\-alt het, haploid ref, haploid alt (case\-insensitive)
sample genotype: reference (haploid or diploid), alternate (hom or het, haploid or diploid), missing genotype, homozygous, heterozygous, haploid, ref\-ref hom, alt\-alt hom, ref\-alt het, alt\-alt het, haploid ref, haploid alt (case\-insensitive)
.sp
.if n \{\
.RS 4
.\}
.nf
GT="ref"
GT="alt"
GT="mis"
GT="hom"
GT="het"
GT="hap"
......@@ -4647,7 +4680,7 @@ TYPE!~"snp"
.sp -1
.IP \(bu 2.3
.\}
array subscripts (0\-based), "*" for any field, "\-" to indicate a range
array subscripts (0\-based), "*" for any field, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
.sp
.if n \{\
.RS 4
......@@ -4656,9 +4689,13 @@ array subscripts (0\-based), "*" for any field, "\-" to indicate a range
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
DP4[*] == 0
CSQ[*] ~ "missense_variant\&.*deleterious"
FORMAT/DP[1\-3] > 10
FORMAT/DP[1\-] < 7
FORMAT/DP[0,2\-4] > 20
FORMAT/DP[0] > 30 \&.\&. DP of the first sample bigger than 30
FORMAT/DP[1\-3] > 10 \&.\&. samples 2\-4
FORMAT/DP[1\-] < 7 \&.\&. all samples but the first
FORMAT/DP[0,2\-4] > 20 \&.\&. samples 1, 3\-5
FORMAT/AD[0:1] \&.\&. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] \&.\&. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] \&.\&. any sample, second AD field
.fi
.if n \{\
.RE
......@@ -4679,7 +4716,7 @@ function on FORMAT tags (over samples) and INFO tags (over vector fields)
.RS 4
.\}
.nf
MAX, MIN, AVG, SUM, STRLEN, ABS
MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT
.fi
.if n \{\
.RE
......@@ -4728,7 +4765,7 @@ String comparisons and regular expressions are case\-insensitive
.sp -1
.IP \(bu 2.3
.\}
If the subscript "*" is used in regular expression search, the whole field is treated as one string\&. For example, the regex STR[*]~"B,C" will be true for the string vector INFO/STR=AB,CD\&.
Variables and function names are case\-insensitive, but not tag names\&. For example, "qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but not "dp" instead of "DP"\&.
.RE
.sp
.RS 4
......@@ -4739,7 +4776,24 @@ If the subscript "*" is used in regular expression search, the whole field is tr
.sp -1
.IP \(bu 2.3
.\}
Variables and function names are case\-insensitive, but not tag names\&. For example, "qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but not "dp" instead of "DP"\&.
When querying multiple subfields, all subfields are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
.sp
.if n \{\
.RS 4
.\}
.nf
\-i \*(AqTAG[*]=1\*(Aq \&.\&. true, the record will be printed
\-i \*(AqTAG[*]!=1\*(Aq \&.\&. true
\-e \*(AqTAG[*]=1\*(Aq \&.\&. false, the record will be discarded
\-e \*(AqTAG[*]!=1\*(Aq \&.\&. false
\-i \*(AqTAG[0]=1\*(Aq \&.\&. true
\-i \*(AqTAG[0]!=1\*(Aq \&.\&. false
\-e \*(AqTAG[0]=1\*(Aq \&.\&. false
\-e \*(AqTAG[0]!=1\*(Aq \&.\&. true
.fi
.if n \{\
.RE
.\}
.RE
.sp
\fBExamples:\fR
......@@ -4798,7 +4852,7 @@ FMT/DP>10 && FMT/GQ>10 \&.\&. the conditions can be satisfied in different sampl
.RS 4
.\}
.nf
QUAL>10 | FMT/GQ>10 \&.\&. selects only GQ>10 samples
QUAL>10 | FMT/GQ>10 \&.\&. true for sites with QUAL>10 or a sample with GQ>10, but selects only samples with GQ>10
.fi
.if n \{\
.RE
......@@ -4808,7 +4862,7 @@ QUAL>10 | FMT/GQ>10 \&.\&. selects only GQ>10 samples
.RS 4
.\}
.nf
QUAL>10 || FMT/GQ>10 \&.\&. selects all samples at QUAL>10 sites
QUAL>10 || FMT/GQ>10 \&.\&. true for sites with QUAL>10 or a sample with GQ>10, plus selects all samples at such sites
.fi
.if n \{\
.RE
......@@ -4828,6 +4882,16 @@ TYPE="snp" && QUAL>=10 && (DP4[2]+DP4[3] > 2)
.RS 4
.\}
.nf
COUNT(GT="hom")=0
.fi
.if n \{\
.RE
.\}
.sp
.if n \{\
.RS 4
.\}
.nf
MIN(DP)>35 && AVG(GQ)>50
.fi
.if n \{\
......
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp53408"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp144864"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
......@@ -8,7 +8,7 @@ will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-09-28</strong></span> and refers to bcftools git version <span class="strong"><strong>1.6</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-02-12</strong></span> and refers to bcftools git version <span class="strong"><strong>1.7</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions &lt;= 0.1.19 to convert to VCF, which can then be read by
......@@ -311,6 +311,10 @@ specific commands to see if they apply.</p><div class="variablelist"><dl><dt><sp
include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
<span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
</dd><dt><span class="term">
<span class="strong"><strong>-k, --keep-sites</strong></span>
</span></dt><dd>
keep sites wich do not pass <span class="strong"><strong>-i</strong></span> and <span class="strong"><strong>-e</strong></span> expressions instead of discarding them
</dd><dt><span class="term">
<span class="strong"><strong>-m, --mark-sites</strong></span> <span class="emphasis"><em><span class="+-">TAG</span></em></span>
</span></dt><dd>
annotate sites which are present ("+") or absent ("-") in the <span class="strong"><strong>-a</strong></span> file with a new INFO/TAG flag
......@@ -610,7 +614,7 @@ loss), 0 (complete loss), 3 (single-copy gain).</p><div class="refsect3" title="
</span></dt><dd>
relative contribution from BAF
</dd><dt><span class="term">
<span class="strong"><strong>d, --BAF-dev</strong></span> <span class="emphasis"><em>float</em></span>[,<span class="emphasis"><em>float</em></span>]
<span class="strong"><strong>-d, --BAF-dev</strong></span> <span class="emphasis"><em>float</em></span>[,<span class="emphasis"><em>float</em></span>]
</span></dt><dd>
expected BAF deviation in query and control, i.e. the noise observed
in the data.
......@@ -1062,7 +1066,7 @@ output VCF and are ignored for the prediction analysis.</p><div class="variablel
</dd><dt><span class="term">
<span class="strong"><strong>-p, --phase</strong></span> <span class="emphasis"><em>a</em></span>|<span class="emphasis"><em>m</em></span>|<span class="emphasis"><em>r</em></span>|<span class="emphasis"><em>R</em></span>|<span class="emphasis"><em>s</em></span>
</span></dt><dd><p class="simpara">
how to construct haplotypes and how to deal with unphased data:
how to handle unphased heterozygous genotypes:
</p><div class="variablelist"><dl><dt><span class="term">
<span class="emphasis"><em>a</em></span>
</span></dt><dd>
......@@ -1082,7 +1086,7 @@ output VCF and are ignored for the prediction analysis.</p><div class="variablel
</dd><dt><span class="term">
<span class="emphasis"><em>s</em></span>
</span></dt><dd>
skip unphased GTs
skip unphased heterozygous GTs
</dd></dl></div></dd><dt><span class="term">
<span class="strong"><strong>-q, --quiet</strong></span>
</span></dt><dd>
......@@ -1593,7 +1597,8 @@ multiple regions and many alignment files are processed.</p><div class="refsect3
be included: "read_group_id file_name sample_name". If all
reads from the alignment file should be treated as a single sample, the
asterisk symbol can be used: "* file_name sample_name". Alignments without
a read group ID can be matched with "?".
a read group ID can be matched with "?". <span class="strong"><strong>NOTE:</strong></span> The meaning of <span class="strong"><strong>bcftools mpileup -G</strong></span>
is the opposite of <span class="strong"><strong>samtools mpileup -G</strong></span>.
</dd></dl></div><pre class="screen"> RG_ID_1
RG_ID_2 SAMPLE_A
RG_ID_3 SAMPLE_A
......@@ -2118,10 +2123,6 @@ file for help.</p><div class="refsect3" title="General options:"><a id="_general
a heuristics to filter failed fits where the expected peak symmetry is violated.
The <span class="emphasis"><em>float</em></span> is from the interval [0,1] and larger is stricter
</dd></dl></div></div></div><div class="refsect2" title="bcftools query [OPTIONS] file.vcf.gz [file.vcf.gz […]]"><a id="query"></a><h3>bcftools query [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>file.vcf.gz</em></span> [<span class="emphasis"><em>file.vcf.gz</em></span> […]]</h3><p>Extracts fields from VCF or BCF files and outputs them in user-defined format.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-c, --collapse</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>all</em></span>|<span class="emphasis"><em>some</em></span>|<span class="emphasis"><em>none</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd><dt><span class="term">
<span class="strong"><strong>-e, --exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
......@@ -2201,7 +2202,9 @@ file for help.</p><div class="refsect3" title="General options:"><a id="_general
bcftools query -f '%CHROM %POS %REF %ALT{0}\n' file.vcf.gz</pre><pre class="literallayout"># Similar to above, but use tabs instead of spaces, add sample name and genotype
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n' file.vcf.gz</pre><pre class="literallayout"># Print FORMAT/GT fields followed by FORMAT/GT fields
bcftools query -f 'GQ:[ %GQ] \t GT:[ %GT]\n' file.vcf</pre><pre class="literallayout"># Make a BED file: chr, pos (0-based), end pos (1-based), id
bcftools query -f'%CHROM\t%POS0\t%END\t%ID\n' file.bcf</pre></div></div><div class="refsect2" title="bcftools reheader [OPTIONS] file.vcf.gz"><a id="reheader"></a><h3>bcftools reheader [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>file.vcf.gz</em></span></h3><p>Modify header of VCF/BCF files, change sample names.</p><div class="variablelist"><dl><dt><span class="term">
bcftools query -f'%CHROM\t%POS0\t%END\t%ID\n' file.bcf</pre><pre class="literallayout"># Print only samples with alternate (non-reference) genotypes
bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]' -i'GT="alt"' file.bcf</pre><pre class="literallayout"># Print all samples at sites with at least one alternate genotype
bcftools view -i'GT="alt"' file.bcf -Ou | bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]'</pre></div></div><div class="refsect2" title="bcftools reheader [OPTIONS] file.vcf.gz"><a id="reheader"></a><h3>bcftools reheader [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>file.vcf.gz</em></span></h3><p>Modify header of VCF/BCF files, change sample names.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-h, --header</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
new VCF header
......@@ -2681,9 +2684,14 @@ using these expressions
missing genotypes can be matched including the phase and ploidy (".|.", "./.", ".")
using these expressions
</p><pre class="literallayout">GT=".|.", GT="./.", GT="."</pre></li><li class="listitem"><p class="simpara">
sample genotype: homozygous, heterozygous, haploid, ref-ref hom, alt-alt hom,
ref-alt het, alt-alt het, haploid ref, haploid alt (case-insensitive)
</p><pre class="literallayout">GT="hom"
sample genotype: reference (haploid or diploid), alternate (hom or het,
haploid or diploid), missing genotype, homozygous, heterozygous, haploid,
ref-ref hom, alt-alt hom, ref-alt het, alt-alt het, haploid ref, haploid alt
(case-insensitive)
</p><pre class="literallayout">GT="ref"
GT="alt"
GT="mis"
GT="hom"
GT="het"
GT="hap"
GT="RR"
......@@ -2699,15 +2707,20 @@ to require that all alleles are of the given type. Compare
TYPE~"snp"
TYPE!="snp"
TYPE!~"snp"</pre></li><li class="listitem"><p class="simpara">
array subscripts (0-based), "*" for any field, "-" to indicate a range
array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
</p><pre class="literallayout">(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) &gt; 0.3
DP4[*] == 0
CSQ[*] ~ "missense_variant.*deleterious"
FORMAT/DP[1-3] &gt; 10
FORMAT/DP[1-] &lt; 7
FORMAT/DP[0,2-4] &gt; 20</pre></li><li class="listitem"><p class="simpara">
FORMAT/DP[0] &gt; 30 .. DP of the first sample bigger than 30
FORMAT/DP[1-3] &gt; 10 .. samples 2-4
FORMAT/DP[1-] &lt; 7 .. all samples but the first
FORMAT/DP[0,2-4] &gt; 20 .. samples 1, 3-5
FORMAT/AD[0:1] .. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field</pre></li><li class="listitem"><p class="simpara">
function on FORMAT tags (over samples) and INFO tags (over vector fields)
</p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS</pre></li><li class="listitem"><p class="simpara">
</p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT</pre></li><li class="listitem"><p class="simpara">
variables calculated on the fly if not present: number of alternate alleles;
number of samples; count of alternate alleles; minor allele count (similar to
AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN);
......@@ -2716,14 +2729,20 @@ number of samples with missing genotype; fraction of samples with missing genoty
</p><pre class="literallayout">N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING</pre></li></ul></div><div class="itemizedlist" title="Notes:"><p class="title"><strong>Notes:</strong></p><ul class="itemizedlist" type="disc"><li class="listitem">
String comparisons and regular expressions are case-insensitive
</li><li class="listitem">
If the subscript "*" is used in regular expression search, the
whole field is treated as one string. For example, the regex STR[*]~"B,C" will be
true for the string vector INFO/STR=AB,CD.
</li><li class="listitem">
Variables and function names are case-insensitive, but not tag names. For example,
"qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
not "dp" instead of "DP".
</li></ul></div><p><span class="strong"><strong>Examples:</strong></span></p><pre class="literallayout">MIN(DV)&gt;5</pre><pre class="literallayout">MIN(DV/DP)&gt;0.3</pre><pre class="literallayout">MIN(DP)&gt;10 &amp; MIN(DV)&gt;3</pre><pre class="literallayout">FMT/DP&gt;10 &amp; FMT/GQ&gt;10 .. both conditions must be satisfied within one sample</pre><pre class="literallayout">FMT/DP&gt;10 &amp;&amp; FMT/GQ&gt;10 .. the conditions can be satisfied in different samples</pre><pre class="literallayout">QUAL&gt;10 | FMT/GQ&gt;10 .. selects only GQ&gt;10 samples</pre><pre class="literallayout">QUAL&gt;10 || FMT/GQ&gt;10 .. selects all samples at QUAL&gt;10 sites</pre><pre class="literallayout">TYPE="snp" &amp;&amp; QUAL&gt;=10 &amp;&amp; (DP4[2]+DP4[3] &gt; 2)</pre><pre class="literallayout">MIN(DP)&gt;35 &amp;&amp; AVG(GQ)&gt;50</pre><pre class="literallayout">ID=@file .. selects lines with ID present in the file</pre><pre class="literallayout">ID!=@~/file .. skip lines with ID present in the ~/file</pre><pre class="literallayout">MAF[0]&lt;0.05 .. select rare variants at 5% cutoff</pre><pre class="literallayout">POS&gt;=100 .. restrict your range query, e.g. 20:100-200 to strictly sites with POS in that range.</pre><p><span class="strong"><strong>Shell expansion:</strong></span></p><p>Note that expressions must often be quoted because some characters
</li><li class="listitem"><p class="simpara">
When querying multiple subfields, all subfields are tested and the OR logic is
used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
</p><pre class="literallayout">-i 'TAG[*]=1' .. true, the record will be printed
-i 'TAG[*]!=1' .. true
-e 'TAG[*]=1' .. false, the record will be discarded
-e 'TAG[*]!=1' .. false
-i 'TAG[0]=1' .. true
-i 'TAG[0]!=1' .. false
-e 'TAG[0]=1' .. false
-e 'TAG[0]!=1' .. true</pre></li></ul></div><p><span class="strong"><strong>Examples:</strong></span></p><pre class="literallayout">MIN(DV)&gt;5</pre><pre class="literallayout">MIN(DV/DP)&gt;0.3</pre><pre class="literallayout">MIN(DP)&gt;10 &amp; MIN(DV)&gt;3</pre><pre class="literallayout">FMT/DP&gt;10 &amp; FMT/GQ&gt;10 .. both conditions must be satisfied within one sample</pre><pre class="literallayout">FMT/DP&gt;10 &amp;&amp; FMT/GQ&gt;10 .. the conditions can be satisfied in different samples</pre><pre class="literallayout">QUAL&gt;10 | FMT/GQ&gt;10 .. true for sites with QUAL&gt;10 or a sample with GQ&gt;10, but selects only samples with GQ&gt;10</pre><pre class="literallayout">QUAL&gt;10 || FMT/GQ&gt;10 .. true for sites with QUAL&gt;10 or a sample with GQ&gt;10, plus selects all samples at such sites</pre><pre class="literallayout">TYPE="snp" &amp;&amp; QUAL&gt;=10 &amp;&amp; (DP4[2]+DP4[3] &gt; 2)</pre><pre class="literallayout">COUNT(GT="hom")=0</pre><pre class="literallayout">MIN(DP)&gt;35 &amp;&amp; AVG(GQ)&gt;50</pre><pre class="literallayout">ID=@file .. selects lines with ID present in the file</pre><pre class="literallayout">ID!=@~/file .. skip lines with ID present in the ~/file</pre><pre class="literallayout">MAF[0]&lt;0.05 .. select rare variants at 5% cutoff</pre><pre class="literallayout">POS&gt;=100 .. restrict your range query, e.g. 20:100-200 to strictly sites with POS in that range.</pre><p><span class="strong"><strong>Shell expansion:</strong></span></p><p>Note that expressions must often be quoted because some characters
have special meaning in the shell.
An example of expression enclosed in single quotes which cause
that the whole expression is passed to the program as intended:</p><pre class="literallayout">bcftools view -i '%ID!="." &amp; MAF[0]&lt;0.01'</pre><p>Please refer to the documentation of your shell for details.</p></div><div class="refsect1" title="SCRIPTS AND OPTIONS"><a id="_scripts_and_options"></a><h2>SCRIPTS AND OPTIONS</h2><div class="refsect2" title="plot-vcfstats [OPTIONS] file.vchk […]"><a id="plot-vcfstats"></a><h3>plot-vcfstats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>file.vchk</em></span> […]</h3><p>Script for processing output of <span class="strong"><strong><a class="link" href="#stats" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]">bcftools stats</a></strong></span>. It can merge
......
......@@ -356,6 +356,9 @@ Add or remove annotations.
include only sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.
*-k, --keep-sites*::
keep sites wich do not pass *-i* and *-e* expressions instead of discarding them
*-m, --mark-sites* [+-]'TAG'::
annotate sites which are present ("+") or absent ("-") in the *-a* file with a new INFO/TAG flag
......@@ -639,7 +642,7 @@ loss), 0 (complete loss), 3 (single-copy gain).
*-b, --BAF-weight* 'float'::
relative contribution from BAF
*d, --BAF-dev* 'float'[,'float']::
*-d, --BAF-dev* 'float'[,'float']::
expected BAF deviation in query and control, i.e. the noise observed
in the data.
......@@ -1085,7 +1088,7 @@ output VCF and are ignored for the prediction analysis.
plain text output can be printed ('t').
*-p, --phase* 'a'|'m'|'r'|'R'|'s'::
how to construct haplotypes and how to deal with unphased data:
how to handle unphased heterozygous genotypes:
'a';;
take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)
......@@ -1100,7 +1103,7 @@ output VCF and are ignored for the prediction analysis.
create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)
's';;
skip unphased GTs
skip unphased heterozygous GTs
*-q, --quiet*::
suppress warning messages
......@@ -1625,7 +1628,8 @@ multiple regions and many alignment files are processed.
be included: "read_group_id file_name sample_name". If all
reads from the alignment file should be treated as a single sample, the
asterisk symbol can be used: "* file_name sample_name". Alignments without
a read group ID can be matched with "?".
a read group ID can be matched with "?". *NOTE:* The meaning of *bcftools mpileup -G*
is the opposite of *samtools mpileup -G*.
----
RG_ID_1
RG_ID_2 SAMPLE_A
......@@ -2133,9 +2137,6 @@ file for help.
=== bcftools query ['OPTIONS'] 'file.vcf.gz' ['file.vcf.gz' [...]]
Extracts fields from VCF or BCF files and outputs them in user-defined format.
*-c, --collapse* 'snps'|'indels'|'both'|'all'|'some'|'none'::
see *<<common_options,Common Options>>*
*-e, --exclude* 'EXPRESSION'::
exclude sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.
......@@ -2218,6 +2219,12 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format.
# Make a BED file: chr, pos (0-based), end pos (1-based), id
bcftools query -f'%CHROM\t%POS0\t%END\t%ID\n' file.bcf
# Print only samples with alternate (non-reference) genotypes
bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]' -i'GT="alt"' file.bcf
# Print all samples at sites with at least one alternate genotype
bcftools view -i'GT="alt"' file.bcf -Ou | bcftools query -f'[%CHROM:%POS %SAMPLE %GT\n]'
[[reheader]]
=== bcftools reheader ['OPTIONS'] 'file.vcf.gz'
Modify header of VCF/BCF files, change sample names.
......@@ -2709,9 +2716,14 @@ using these expressions
GT=".|.", GT="./.", GT="."
* sample genotype: homozygous, heterozygous, haploid, ref-ref hom, alt-alt hom,
ref-alt het, alt-alt het, haploid ref, haploid alt (case-insensitive)
* sample genotype: reference (haploid or diploid), alternate (hom or het,
haploid or diploid), missing genotype, homozygous, heterozygous, haploid,
ref-ref hom, alt-alt hom, ref-alt het, alt-alt het, haploid ref, haploid alt
(case-insensitive)
GT="ref"
GT="alt"
GT="mis"
GT="hom"
GT="het"
GT="hap"
......@@ -2731,18 +2743,23 @@ to require that all alleles are of the given type. Compare
TYPE!="snp"
TYPE!~"snp"
* array subscripts (0-based), "*" for any field, "-" to indicate a range
* array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
DP4[*] == 0
CSQ[*] ~ "missense_variant.*deleterious"
FORMAT/DP[1-3] > 10
FORMAT/DP[1-] < 7
FORMAT/DP[0,2-4] > 20
FORMAT/DP[0] > 30 .. DP of the first sample bigger than 30
FORMAT/DP[1-3] > 10 .. samples 2-4
FORMAT/DP[1-] < 7 .. all samples but the first
FORMAT/DP[0,2-4] > 20 .. samples 1, 3-5
FORMAT/AD[0:1] .. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field
* function on FORMAT tags (over samples) and INFO tags (over vector fields)
MAX, MIN, AVG, SUM, STRLEN, ABS
MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT
* variables calculated on the fly if not present: number of alternate alleles;
number of samples; count of alternate alleles; minor allele count (similar to
......@@ -2756,12 +2773,20 @@ number of samples with missing genotype; fraction of samples with missing genoty
.Notes:
* String comparisons and regular expressions are case-insensitive
* If the subscript "\*" is used in regular expression search, the
whole field is treated as one string. For example, the regex STR[*]~"B,C" will be
true for the string vector INFO/STR=AB,CD.
* Variables and function names are case-insensitive, but not tag names. For example,
"qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
not "dp" instead of "DP".
* When querying multiple subfields, all subfields are tested and the OR logic is
used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
-i 'TAG[*]=1' .. true, the record will be printed
-i 'TAG[*]!=1' .. true
-e 'TAG[*]=1' .. false, the record will be discarded
-e 'TAG[*]!=1' .. false
-i 'TAG[0]=1' .. true
-i 'TAG[0]!=1' .. false
-e 'TAG[0]=1' .. false
-e 'TAG[0]!=1' .. true
*Examples:*
......@@ -2777,12 +2802,14 @@ not "dp" instead of "DP".
FMT/DP>10 && FMT/GQ>10 .. the conditions can be satisfied in different samples
QUAL>10 | FMT/GQ>10 .. selects only GQ>10 samples
QUAL>10 | FMT/GQ>10 .. true for sites with QUAL>10 or a sample with GQ>10, but selects only samples with GQ>10
QUAL>10 || FMT/GQ>10 .. selects all samples at QUAL>10 sites
QUAL>10 || FMT/GQ>10 .. true for sites with QUAL>10 or a sample with GQ>10, plus selects all samples at such sites
TYPE="snp" && QUAL>=10 && (DP4[2]+DP4[3] > 2)
COUNT(GT="hom")=0
MIN(DP)>35 && AVG(GQ)>50
ID=@file .. selects lines with ID present in the file
......
This diff is collapsed.
#!/usr/bin/env perl
#
# The MIT License
#
# Copyright (c) 2017 Genome Research Ltd.
#
# Author: Petr Danecek <pd3@sanger.ac.uk>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
use strict;
use warnings;
use Carp;
my $opts = parse_params();
fix($opts);
exit;
#--------------------------------
sub error
{
my (@msg) = @_;
if ( scalar @msg ) { confess @msg; }
print
"About:\n",
" Some parts of GATK throw errors like\n",
" \"java.lang.Double cannot be cast to java.lang.Integer\"\n",
" This is caused by a bug in GATK which incorrectly parses valid float number expressions, for example\n",
" it would not accept \"0\", only \"0.0\". Such unnecessary strictness violates the VCF specification\n",
" (and common sense). This script reformats floating point fields to work around this.\n",
"\n",
"Usage: fix-broken-GATK-Double-vs-Integer [OPTIONS]\n",
"Options:\n",
" -c, --check-only check for problems, do not output VCF\n",
" -h, --help this help message\n",
"\n",
"Example:\n",
" gunzip -c ori.vcf.gz | fix-broken-GATK-Double-vs-Integer | bgzip -c > new.vcf.gz\n",
"\n";
exit -1;
}
sub parse_params
{
my $opts =
{
nflt => 0,
nint => 0,
};
while (defined(my $arg=shift(@ARGV)))
{
if ( $arg eq '-c' || $arg eq '--check-only' ) { $$opts{check_only} = 1; next; }
if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
error("Unknown parameter \"$arg\". Run -h for help.\n");
}
return $opts;
}
sub fix
{
my ($opts) = @_;
while (my $line=<STDIN>)
{
if ( $line=~/^#/ )
{
$line = parse_header($opts,$line);
}
else
{
$line = fix_line($opts,$line);
}
if ( !$$opts{check_only} ) { print $line; }
}
print STDERR "Modified $$opts{nflt} float values, $$opts{nint} integer values\n";
}
sub parse_header
{
my ($opts,$line) = @_;
my $coltype = undef;
if ( $line=~/^##INFO/ ) { $coltype = 'info'; }
elsif ( $line=~/^##FORMAT/ ) { $coltype = 'fmt'; }
else { return $line; }
if ( !($line=~/ID=([^,>]+)/) ) { return $line; }
my $id = $1;
my $type = undef;
if ( $line=~/Type=Float/ ) { $type = 'float'; }
elsif ( $line=~/Type=Integer/ ) { $type = 'int'; }
else { return $line; }
$$opts{$coltype}{$id} = $type;
return $line;
}
sub fix_line
{
my ($opts,$line) = @_;
my $info = $$opts{info};
my $fmt = $$opts{fmt};
my @cols = split(/\t/,$line);
my @info = split(/;/,$cols[7]);
my @fmt = split(/:/,$cols[8]);
my $pos = "$cols[0]:$cols[1]";
chomp($cols[-1]);
for (my $i=0; $i<@info; $i++)
{
my ($key,$val) = split(/=/,$info[$i]);
if ( !defined $val or !exists($$info{$key}) ) { next; }
$val = $$info{$key} eq 'float' ? fix_float($opts,$pos,$key,$val) : fix_int($opts,$pos,$key,$val);
$info[$i] = "$key=$val";
}
$cols[7] = join(';',@info);
for (my $j=9; $j<@cols; $j++)
{
my @vals = split(/:/,$cols[$j]);
for (my $i=0; $i<@fmt; $i++)
{
my $key = $fmt[$i];
if ( !exists($$fmt{$key}) ) { next; }
if ( !exists($vals[$i]) ) { last; }
$vals[$i] = $$fmt{$key} eq 'float' ? fix_float($opts,$pos,$key,$vals[$i]) : fix_int($opts,$pos,$key,$vals[$i]);
}
$cols[$j] = join(':',@vals);
}
return join("\t",@cols)."\n";
}
sub fix_int
{
my ($opts,$pos,$key,$vals) = @_;
my @vals = split(/,/,$vals);
for (my $i=0; $i<@vals; $i++)
{
if ( $vals[$i] eq '.' ) { next; }
if ( $vals[$i] =~ /^-?[0-9]+$/ ) { next; }
if ( $$opts{check_only} ) { print "$pos\t$key\tInteger\t$vals[$i]\n"; }
error("todo: $pos\t$key\tInteger\t$vals[$i]\n");
$$opts{nint}++;
}
return $vals;
}
sub fix_float
{
my ($opts,$pos,$key,$vals) = @_;
my @vals = split(/,/,$vals);
for (my $i=0; $i<@vals; $i++)
{
if ( $vals[$i] eq '.' ) { next; }
if ( $vals[$i] =~ /\./ ) { next; }
if ( $vals[$i] =~ /[eE]/ ) { next; }
if ( $$opts{check_only} ) { print "$pos\t$key\tFloat\t$vals[$i]\n"; }
$vals[$i] .= '.';
$$opts{nflt}++;
}
return join(',',@vals);
}