Skip to content
Commits on Source (12)
......@@ -76,6 +76,21 @@ to an HTSlib source tree or installation in DIR; or --with-htslib=system
to use a system-installed HTSlib.
Optional Compilation with Perl
==============================
The '-i' and '-e' options can take external perl scripts for a more
sophisticated filtering. This option can be enabled by supplying the
--enable-perl-filters option to configure before running make:
./configure --enable-perl-filters
Note that enabling this option changes the license from MIT to GPL
because bcftools need to be built with
perl -MExtUtils::Embed -e ccopts -e ldopts
Optional Compilation with GSL
=============================
......
......@@ -93,7 +93,7 @@ endif
include config.mk
PACKAGE_VERSION = 1.7
PACKAGE_VERSION = 1.8
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
......@@ -135,7 +135,7 @@ ifdef USE_GPL
endif
bcftools: $(OBJS) $(HTSLIB)
$(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS)
$(CC) $(DYNAMIC_FLAGS) -pthread $(ALL_LDFLAGS) -o $@ $(OBJS) $(HTSLIB_LIB) -lm $(ALL_LIBS) $(GSL_LIBS) $(PERL_LIBS)
# Plugin rules
ifneq "$(PLUGINS_ENABLED)" "no"
......@@ -212,7 +212,8 @@ ccall.o: ccall.c $(htslib_kfunc_h) $(call_h) kmin.h $(prob1_h)
convert.o: convert.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(convert_h)
tsv2vcf.o: tsv2vcf.c $(tsv2vcf_h)
em.o: em.c $(htslib_vcf_h) kmin.h $(call_h)
filter.o: filter.c $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
filter.o: filter.c config.h $(htslib_khash_str2int_h) $(filter_h) $(bcftools_h) $(htslib_hts_defs_h) $(htslib_vcfutils_h)
$(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) -c -o $@ $<
gvcf.o: gvcf.c gvcf.h $(call_h)
kmin.o: kmin.c kmin.h
mcall.o: mcall.c $(htslib_kfunc_h) $(call_h)
......
## Release 1.8 (April 2018)
* `-i, -e` filtering: Support for custom perl scripts
* `+contrast`: New plugin to annotate genotype differences between groups of samples
* `+fixploidy`: New options for simpler ploidy usage
* `+setGT`: Target genotypes can be set to phased by giving `--new-gt p`
* `run-roh.pl`: Allow to pass options directly to `bcftools roh`
* Number of bug fixes
## Release 1.7 (February 2018)
* `-i, -e` filtering: Major revamp, improved filtering by FORMAT fields
......
......@@ -45,6 +45,8 @@ DYNAMIC_FLAGS = @CC_RDYNAMIC_OPT@
USE_GPL = @USE_GPL@
GSL_LIBS = @GSL_LIBS@
PERL_CFLAGS = @PERL_CCOPTS@
PERL_LIBS = @PERL_LIBS@
PLATFORM = @PLATFORM@
PLUGINS_ENABLED = @enable_bcftools_plugins@
......
......@@ -66,6 +66,12 @@ AC_ARG_WITH([bcf-plugin-path],
[with_bcf_plugin_path=$with_bcf_plugin_dir])
AC_SUBST([PLUGINPATH], $with_bcf_plugin_path)
AC_ARG_ENABLE([perl-filters],
[AS_HELP_STRING([--enable-perl-filters],
[do not build support for PERL scripts in -i/-e filtering expressions])],
[], [enable_perl_filters=no])
AC_SUBST(enable_perl_filters)
AC_ARG_ENABLE([libgsl],
[AS_HELP_STRING([--enable-libgsl],
[build options that require the GNU scientific library (changes bcftools license to GPL, see INSTALL for details)])],
......@@ -199,6 +205,19 @@ Either configure with --disable-libgsl or resolve this error to build bcftools.
AC_SUBST([GSL_LIBS])
])
AS_IF([test "$enable_perl_filters" != "no" ], [dnl
if ! perl -MExtUtils::Embed -e ccopts -e ldopts &>/dev/null ; then
AC_MSG_ERROR([Cannot `perl -MExtUtils::Embed -e ccopts -e ldopts`])
fi
AC_DEFINE([ENABLE_PERL_FILTERS], 1, [Define if BCFtools should enable for support PERL scripts in -i/-e filtering expressions.])
PERL_CCOPTS="`perl -MExtUtils::Embed -e ccopts`"
PERL_LIBS="`perl -MExtUtils::Embed -e ldopts`"
AC_SUBST([PERL_CCOPTS])
AC_SUBST([PERL_LIBS])
USE_GPL=1
AC_SUBST([USE_GPL])
])
AC_CONFIG_FILES([config.mk])
AC_OUTPUT
......
......@@ -280,6 +280,7 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
free(args->chr);
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
......
......@@ -760,12 +760,33 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( fmt_gt->n!=1 && fmt_gt->n!=2 )
error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1);
int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
ptr += fmt_gt->n;
if ( ptr[0]==2 )
if ( fmt_gt->n==1 ) // haploid genotypes
{
if ( ptr[0]==2 ) /* 0 */
{
str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==bcf_int32_missing ) /* . */
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==4 ) /* 1 */
{
str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else
{
kputw(bcf_gt_allele(ptr[0]),str); str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
}
else if ( ptr[0]==2 )
{
if ( ptr[1]==3 ) /* 0|0 */
{
......
......@@ -71,7 +71,7 @@
A .. gene line with a supported biotype
A.ID=~/^gene:/
B .. transcript line referencing A
B .. transcript line referencing A with supported biotype
B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
C .. corresponding CDS, exon, and UTR lines:
......
bcftools (1.8-1) unstable; urgency=medium
* Team upload.
* New upstream version
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
* debhelper 11
-- Andreas Tille <tille@debian.org> Fri, 20 Apr 2018 11:36:02 +0200
bcftools (1.7-2) unstable; urgency=medium
* Team upload.
......
Source: bcftools
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders:
Afif Elghraoui <afif@debian.org>
Section: science
Priority: optional
Build-Depends:
debhelper (>= 10),
debhelper (>= 11~),
zlib1g-dev,
libbz2-dev,
liblzma-dev,
......@@ -13,10 +13,10 @@ Build-Depends:
libgsl-dev,
tabix <!nocheck>,
libio-pty-perl <!nocheck>,
Standards-Version: 4.1.3
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/bcftools
Vcs-Git: https://salsa.debian.org/med-team/bcftools.git
Homepage: http://samtools.github.io/bcftools/
Vcs-Git: https://anonscm.debian.org/git/debian-med/bcftools.git
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/bcftools.git
Package: bcftools
Architecture: any
......
......@@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2018-02-12
.\" Date: 2018-04-03
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2018\-02\-12" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2018\-04\-03" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
......@@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2018\-02\-12\fR and refers to bcftools git version \fB1\&.7\fR\&.
This manual page was last updated \fB2018\-04\-03\fR and refers to bcftools git version \fB1\&.8\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
......@@ -1796,8 +1796,45 @@ reference sequence in fasta format (required)
\fB\-g, \-\-gff\-annot\fR \fIFILE\fR
.RS 4
GFF3 annotation file (required), such as
ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens/
ftp://ftp\&.ensembl\&.org/pub/current_gff3/homo_sapiens\&. An example of a minimal working GFF file:
.RE
.sp
.if n \{\
.RS 4
.\}
.nf
# The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
# looks up their parent transcript (determined from the "Parent=transcript:" attribute),
# the gene (determined from the transcript\*(Aqs "Parent=gene:" attribute), and the biotype
# (the most interesting is "protein_coding")\&.
#
# Attributes required for
# gene lines:
# \- ID=gene:<gene_id>
# \- biotype=<biotype>
# \- Name=<gene_name> [optional]
#
# transcript lines:
# \- ID=transcript:<transcript_id>
# \- Parent=gene:<gene_id>
# \- biotype=<biotype>
#
# other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
# \- Parent=transcript:<transcript_id>
#
# Supported biotypes:
# \- see the function gff_parse_biotype() in bcftools/csq\&.c
1 ignored_field gene 21 2148 \&. \- \&. ID=gene:GeneId;biotype=protein_coding;Name=GeneName
1 ignored_field transcript 21 2148 \&. \- \&. ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
1 ignored_field three_prime_UTR 21 2054 \&. \- \&. Parent=transcript:TranscriptId
1 ignored_field exon 21 2148 \&. \- \&. Parent=transcript:TranscriptId
1 ignored_field CDS 21 2148 \&. \- 1 Parent=transcript:TranscriptId
1 ignored_field five_prime_UTR 210 2148 \&. \- \&. Parent=transcript:TranscriptId
.fi
.if n \{\
.RE
.\}
.PP
\fB\-i, \-\-include\fR \fIEXPRESSION\fR
.RS 4
......@@ -2096,7 +2133,7 @@ see
.RE
.SS "bcftools gtcheck [\fIOPTIONS\fR] [\-g \fIgenotypes\&.vcf\&.gz\fR] \fIquery\&.vcf\&.gz\fR"
.sp
Checks sample identity or, without \fB\-g\fR, multi\-sample cross\-check is performed\&.
Checks sample identity\&. The program can operate in two modes\&. If the \fB\-g\fR option is given, the identity of the \fB\-s\fR sample from \fIquery\&.vcf\&.gz\fR is checked against the samples in the \fB\-g\fR file\&. Without the \fB\-g\fR option, multi\-sample cross\-check of samples in \fIquery\&.vcf\&.gz\fR is performed\&.
.PP
\fB\-a, \-\-all\-sites\fR
.RS 4
......@@ -2227,21 +2264,14 @@ is used for the unseen genotypes\&. With
can be used instead; the discordance value then gives exactly the number of differing genotypes\&.
.RE
.PP
SM, Average Discordance
.RS 4
Average discordance between sample
\fIa\fR
and all other samples\&.
.RE
.PP
SM, Average Depth
ERR, error rate
.RS 4
Average depth at evaluated sites, or 1 if FORMAT/DP field is not present\&.
Pairwise error rate calculated as number of differences divided by the total number of comparisons\&.
.RE
.PP
SM, Average Number of sites
CLUSTER, TH, DOT
.RS 4
The average number of sites used to calculate the discordance\&. In other words, the average number of non\-missing PLs/genotypes seen both samples\&.
In presence of multiple samples, related samples and outliers can be identified by clustering samples by error rate\&. A simple hierarchical clustering based on minimization of standard deviation is used\&. This is useful to detect sample swaps, for example in situations where one sample has been sequenced in multiple runs\&.
.RE
.RE
.SS "bcftools index [\fIOPTIONS\fR] \fIin\&.bcf\fR|\fIin\&.vcf\&.gz\fR"
......@@ -2986,7 +3016,7 @@ and
can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields\&.
.RE
.PP
\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIany\fR
\fB\-d, \-\-rm\-dup\fR \fIsnps\fR|\fIindels\fR|\fIboth\fR|\fIall\fR|\fInone\fR
.RS 4
If a record is present multiple times, output only the first instance, see
\fB\-\-collapse\fR
......@@ -2997,8 +3027,7 @@ in
\fB\-D, \-\-remove\-duplicates\fR
.RS 4
If a record is present in multiple files, output only the first instance\&. Alias for
\fB\-d none\fR\&. Requires
\fB\-a, \-\-allow\-overlaps\fR\&.
\fB\-d none\fR, deprecated\&.
.RE
.PP
\fB\-f, \-\-fasta\-ref\fR \fIFILE\fR
......@@ -3864,12 +3893,28 @@ nor the other
options are given, the allele frequency is estimated from AC and AN counts which are already present in the INFO field\&.
.RE
.PP
\fB\-\-exclude\fR \fIEXPRESSION\fR
.RS 4
exclude sites for which
\fIEXPRESSION\fR
is true\&. For valid expressions see
\fBEXPRESSIONS\fR\&.
.RE
.PP
\fB\-G, \-\-GTs\-only\fR \fIFLOAT\fR
.RS 4
use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL), setting PL of unseen genotypes to
\fIFLOAT\fR\&. Safe value to use is 30 to account for GT errors\&.
.RE
.PP
\fB\-\-include\fR \fIEXPRESSION\fR
.RS 4
include only sites for which
\fIEXPRESSION\fR
is true\&. For valid expressions see
\fBEXPRESSIONS\fR\&.
.RE
.PP
\fB\-I, \-\-skip\-indels\fR
.RS 4
skip indels as their genotypes are usually enriched for errors
......@@ -4680,15 +4725,17 @@ TYPE!~"snp"
.sp -1
.IP \(bu 2.3
.\}
array subscripts (0\-based), "*" for any field, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
array subscripts (0\-based), "*" for any element, "\-" to indicate a range\&. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and an element of the vector, as shown in the examples below
.sp
.if n \{\
.RS 4
.\}
.nf
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
DP4[*] == 0
CSQ[*] ~ "missense_variant\&.*deleterious"
INFO/AF[0] > 0\&.3 \&.\&. first AF value bigger than 0\&.3
FORMAT/AD[0:0] > 30 \&.\&. first AD value of the first sample bigger than 30
FORMAT/AD[0:1] \&.\&. first sample, second AD value
FORMAT/AD[1:0] \&.\&. second sample, first AD value
DP4[*] == 0 \&.\&. any DP4 value
FORMAT/DP[0] > 30 \&.\&. DP of the first sample bigger than 30
FORMAT/DP[1\-3] > 10 \&.\&. samples 2\-4
FORMAT/DP[1\-] < 7 \&.\&. all samples but the first
......@@ -4696,6 +4743,8 @@ FORMAT/DP[0,2\-4] > 20 \&.\&. samples 1, 3\-5
FORMAT/AD[0:1] \&.\&. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] \&.\&. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] \&.\&. any sample, second AD field
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0\&.3
CSQ[*] ~ "missense_variant\&.*deleterious"
.fi
.if n \{\
.RE
......@@ -4743,6 +4792,29 @@ N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
.RE
.\}
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}
custom perl filtering\&. Note that this command is not compiled in by default, see the section
\fBOptional Compilation with Perl\fR
in the INSTALL file for help and misc/demo\-flt\&.pl for a working example\&. The demo defined the perl subroutine "severity" which can be invoked from the command line as follows:
.sp
.if n \{\
.RS 4
.\}
.nf
perl:path/to/script\&.pl; perl\&.severity(INFO/CSQ) > 3
.fi
.if n \{\
.RE
.\}
.RE
.PP
\fBNotes:\fR
.sp
......@@ -4776,7 +4848,7 @@ Variables and function names are case\-insensitive, but not tag names\&. For exa
.sp -1
.IP \(bu 2.3
.\}
When querying multiple subfields, all subfields are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
When querying multiple values, all elements are tested and the OR logic is used on the result\&. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
.sp
.if n \{\
.RS 4
......
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp144864"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25098944"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
......@@ -8,7 +8,7 @@ will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-02-12</strong></span> and refers to bcftools git version <span class="strong"><strong>1.7</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2018-04-03</strong></span> and refers to bcftools git version <span class="strong"><strong>1.8</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions &lt;= 0.1.19 to convert to VCF, which can then be read by
......@@ -1036,8 +1036,36 @@ output VCF and are ignored for the prediction analysis.</p><div class="variablel
</dd><dt><span class="term">
<span class="strong"><strong>-g, --gff-annot</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
GFF3 annotation file (required), such as <a class="ulink" href="ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/" target="_top">ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/</a>
</dd><dt><span class="term">
GFF3 annotation file (required), such as <a class="ulink" href="ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" target="_top">ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens</a>.
An example of a minimal working GFF file:
</dd></dl></div><pre class="screen"> # The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
# looks up their parent transcript (determined from the "Parent=transcript:" attribute),
# the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
# (the most interesting is "protein_coding").
#
# Attributes required for
# gene lines:
# - ID=gene:&lt;gene_id&gt;
# - biotype=&lt;biotype&gt;
# - Name=&lt;gene_name&gt; [optional]
#
# transcript lines:
# - ID=transcript:&lt;transcript_id&gt;
# - Parent=gene:&lt;gene_id&gt;
# - biotype=&lt;biotype&gt;
#
# other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
# - Parent=transcript:&lt;transcript_id&gt;
#
# Supported biotypes:
# - see the function gff_parse_biotype() in bcftools/csq.c
1 ignored_field gene 21 2148 . - . ID=gene:GeneId;biotype=protein_coding;Name=GeneName
1 ignored_field transcript 21 2148 . - . ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
1 ignored_field three_prime_UTR 21 2054 . - . Parent=transcript:TranscriptId
1 ignored_field exon 21 2148 . - . Parent=transcript:TranscriptId
1 ignored_field CDS 21 2148 . - 1 Parent=transcript:TranscriptId
1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId</pre><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-i, --include</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
......@@ -1232,7 +1260,10 @@ And similarly here, the second is filtered:
<span class="strong"><strong>--threads</strong></span> <span class="emphasis"><em>INT</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd></dl></div></div><div class="refsect2" title="bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz"><a id="gtcheck"></a><h3>bcftools gtcheck [<span class="emphasis"><em>OPTIONS</em></span>] [<span class="strong"><strong>-g</strong></span> <span class="emphasis"><em>genotypes.vcf.gz</em></span>] <span class="emphasis"><em>query.vcf.gz</em></span></h3><p>Checks sample identity or, without <span class="strong"><strong>-g</strong></span>, multi-sample cross-check is performed.</p><div class="variablelist"><dl><dt><span class="term">
</dd></dl></div></div><div class="refsect2" title="bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz"><a id="gtcheck"></a><h3>bcftools gtcheck [<span class="emphasis"><em>OPTIONS</em></span>] [<span class="strong"><strong>-g</strong></span> <span class="emphasis"><em>genotypes.vcf.gz</em></span>] <span class="emphasis"><em>query.vcf.gz</em></span></h3><p>Checks sample identity. The program can operate in two modes. If the <span class="strong"><strong>-g</strong></span>
option is given, the identity of the <span class="strong"><strong>-s</strong></span> sample from <span class="emphasis"><em>query.vcf.gz</em></span>
is checked against the samples in the <span class="strong"><strong>-g</strong></span> file.
Without the <span class="strong"><strong>-g</strong></span> option, multi-sample cross-check of samples in <span class="emphasis"><em>query.vcf.gz</em></span> is performed.</p><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-a, --all-sites</strong></span>
</span></dt><dd>
output for all sites
......@@ -1304,20 +1335,18 @@ CN, Discordance
<span class="strong"><strong>-G</strong></span>, the value <span class="emphasis"><em>1</em></span> can be used instead; the discordance value then
gives exactly the number of differing genotypes.
</dd><dt><span class="term">
SM, Average Discordance
</span></dt><dd>
Average discordance between sample <span class="emphasis"><em>a</em></span> and all other samples.
</dd><dt><span class="term">
SM, Average Depth
ERR, error rate
</span></dt><dd>
Average depth at evaluated sites, or 1 if FORMAT/DP field is not
present.
Pairwise error rate calculated as number of differences divided
by the total number of comparisons.
</dd><dt><span class="term">
SM, Average Number of sites
CLUSTER, TH, DOT
</span></dt><dd>
The average number of sites used to calculate the discordance. In
other words, the average number of non-missing PLs/genotypes seen
both samples.
In presence of multiple samples, related samples and outliers can be
identified by clustering samples by error rate. A simple hierarchical
clustering based on minimization of standard deviation is used. This is
useful to detect sample swaps, for example in situations where one
sample has been sequenced in multiple runs.
</dd></dl></div></div></div><div class="refsect2" title="bcftools index [OPTIONS] in.bcf|in.vcf.gz"><a id="index"></a><h3>bcftools index [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>in.bcf</em></span>|<span class="emphasis"><em>in.vcf.gz</em></span></h3><p>Creates index for bgzip compressed VCF/BCF files for random access. CSI
(coordinate-sorted index) is created by default. The CSI format
supports indexing of chromosomes up to length 2^31. TBI (tabix index)
......@@ -1778,7 +1807,7 @@ the <span class="strong"><strong><a class="link" href="#fasta_ref">--fasta-ref</
can swap alleles and will update genotypes (GT) and AC counts,
but will not attempt to fix PL or other fields.
</dd><dt><span class="term">
<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>any</em></span>
<span class="strong"><strong>-d, --rm-dup</strong></span> <span class="emphasis"><em>snps</em></span>|<span class="emphasis"><em>indels</em></span>|<span class="emphasis"><em>both</em></span>|<span class="emphasis"><em>all</em></span>|<span class="emphasis"><em>none</em></span>
</span></dt><dd>
If a record is present multiple times, output only the first instance,
see <span class="strong"><strong>--collapse</strong></span> in <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>.
......@@ -1786,7 +1815,7 @@ the <span class="strong"><strong><a class="link" href="#fasta_ref">--fasta-ref</
<span class="strong"><strong>-D, --remove-duplicates</strong></span>
</span></dt><dd>
If a record is present in multiple files, output only the first instance.
Alias for <span class="strong"><strong>-d none</strong></span>. Requires <span class="strong"><strong>-a, --allow-overlaps</strong></span>.
Alias for <span class="strong"><strong>-d none</strong></span>, deprecated.
</dd><dt><span class="term">
<span class="strong"><strong>-f, --fasta-ref</strong></span> <span class="emphasis"><em>FILE</em></span><a id="fasta_ref"></a>
</span></dt><dd>
......@@ -2278,12 +2307,22 @@ Transition probabilities:
If neither <span class="strong"><strong>-e</strong></span> nor the other <span class="strong"><strong>--AF-…</strong></span> options are given, the allele frequency is
estimated from AC and AN counts which are already present in the INFO field.
</dd><dt><span class="term">
<span class="strong"><strong>--exclude</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
exclude sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
<span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
</dd><dt><span class="term">
<span class="strong"><strong>-G, --GTs-only</strong></span> <span class="emphasis"><em>FLOAT</em></span>
</span></dt><dd>
use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL),
setting PL of unseen genotypes to <span class="emphasis"><em>FLOAT</em></span>. Safe value to use is 30 to
account for GT errors.
</dd><dt><span class="term">
<span class="strong"><strong>--include</strong></span> <span class="emphasis"><em>EXPRESSION</em></span>
</span></dt><dd>
include only sites for which <span class="emphasis"><em>EXPRESSION</em></span> is true. For valid expressions see
<span class="strong"><strong><a class="link" href="#expressions" title="EXPRESSIONS">EXPRESSIONS</a></strong></span>.
</dd><dt><span class="term">
<span class="strong"><strong>-I, --skip-indels</strong></span>
</span></dt><dd>
skip indels as their genotypes are usually enriched for errors
......@@ -2707,18 +2746,23 @@ to require that all alleles are of the given type. Compare
TYPE~"snp"
TYPE!="snp"
TYPE!~"snp"</pre></li><li class="listitem"><p class="simpara">
array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
</p><pre class="literallayout">(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) &gt; 0.3
DP4[*] == 0
CSQ[*] ~ "missense_variant.*deleterious"
array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
for querying FORMAT vectors, the colon ":" can be used to select a sample and an
element of the vector, as shown in the examples below
</p><pre class="literallayout">INFO/AF[0] &gt; 0.3 .. first AF value bigger than 0.3
FORMAT/AD[0:0] &gt; 30 .. first AD value of the first sample bigger than 30
FORMAT/AD[0:1] .. first sample, second AD value
FORMAT/AD[1:0] .. second sample, first AD value
DP4[*] == 0 .. any DP4 value
FORMAT/DP[0] &gt; 30 .. DP of the first sample bigger than 30
FORMAT/DP[1-3] &gt; 10 .. samples 2-4
FORMAT/DP[1-] &lt; 7 .. all samples but the first
FORMAT/DP[0,2-4] &gt; 20 .. samples 1, 3-5
FORMAT/AD[0:1] .. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field</pre></li><li class="listitem"><p class="simpara">
FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) &gt; 0.3
CSQ[*] ~ "missense_variant.*deleterious"</pre></li><li class="listitem"><p class="simpara">
function on FORMAT tags (over samples) and INFO tags (over vector fields)
</p><pre class="literallayout">MAX, MIN, AVG, SUM, STRLEN, ABS, COUNT</pre></li><li class="listitem"><p class="simpara">
variables calculated on the fly if not present: number of alternate alleles;
......@@ -2726,14 +2770,19 @@ number of samples; count of alternate alleles; minor allele count (similar to
AC but is always smaller than 0.5); frequency of alternate alleles (AF=AC/AN);
frequency of minor alleles (MAF=MAC/AN); number of alleles in called genotypes;
number of samples with missing genotype; fraction of samples with missing genotype
</p><pre class="literallayout">N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING</pre></li></ul></div><div class="itemizedlist" title="Notes:"><p class="title"><strong>Notes:</strong></p><ul class="itemizedlist" type="disc"><li class="listitem">
</p><pre class="literallayout">N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING</pre></li><li class="listitem"><p class="simpara">
custom perl filtering. Note that this command is not compiled in by default, see
the section <span class="strong"><strong>Optional Compilation with Perl</strong></span> in the INSTALL file for help
and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
"severity" which can be invoked from the command line as follows:
</p><pre class="literallayout">perl:path/to/script.pl; perl.severity(INFO/CSQ) &gt; 3</pre></li></ul></div><div class="itemizedlist" title="Notes:"><p class="title"><strong>Notes:</strong></p><ul class="itemizedlist" type="disc"><li class="listitem">
String comparisons and regular expressions are case-insensitive
</li><li class="listitem">
Variables and function names are case-insensitive, but not tag names. For example,
"qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
not "dp" instead of "DP".
</li><li class="listitem"><p class="simpara">
When querying multiple subfields, all subfields are tested and the OR logic is
When querying multiple values, all elements are tested and the OR logic is
used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
</p><pre class="literallayout">-i 'TAG[*]=1' .. true, the record will be printed
-i 'TAG[*]!=1' .. true
......
......@@ -1064,7 +1064,38 @@ output VCF and are ignored for the prediction analysis.
reference sequence in fasta format (required)
*-g, --gff-annot* 'FILE'::
GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/
GFF3 annotation file (required), such as ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens.
An example of a minimal working GFF file:
----
# The program looks for "CDS", "exon", "three_prime_UTR" and "five_prime_UTR" lines,
# looks up their parent transcript (determined from the "Parent=transcript:" attribute),
# the gene (determined from the transcript's "Parent=gene:" attribute), and the biotype
# (the most interesting is "protein_coding").
#
# Attributes required for
# gene lines:
# - ID=gene:<gene_id>
# - biotype=<biotype>
# - Name=<gene_name> [optional]
#
# transcript lines:
# - ID=transcript:<transcript_id>
# - Parent=gene:<gene_id>
# - biotype=<biotype>
#
# other lines (CDS, exon, five_prime_UTR, three_prime_UTR):
# - Parent=transcript:<transcript_id>
#
# Supported biotypes:
# - see the function gff_parse_biotype() in bcftools/csq.c
1 ignored_field gene 21 2148 . - . ID=gene:GeneId;biotype=protein_coding;Name=GeneName
1 ignored_field transcript 21 2148 . - . ID=transcript:TranscriptId;Parent=gene:GeneId;biotype=protein_coding
1 ignored_field three_prime_UTR 21 2054 . - . Parent=transcript:TranscriptId
1 ignored_field exon 21 2148 . - . Parent=transcript:TranscriptId
1 ignored_field CDS 21 2148 . - 1 Parent=transcript:TranscriptId
1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId
----
*-i, --include* 'EXPRESSION'::
include only sites for which 'EXPRESSION' is true. For valid expressions see
......@@ -1255,7 +1286,10 @@ And similarly here, the second is filtered:
[[gtcheck]]
=== bcftools gtcheck ['OPTIONS'] [*-g* 'genotypes.vcf.gz'] 'query.vcf.gz'
Checks sample identity or, without *-g*, multi-sample cross-check is performed.
Checks sample identity. The program can operate in two modes. If the *-g*
option is given, the identity of the *-s* sample from 'query.vcf.gz'
is checked against the samples in the *-g* file.
Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' is performed.
*-a, --all-sites*::
output for all sites
......@@ -1319,17 +1353,16 @@ Checks sample identity or, without *-g*, multi-sample cross-check is performed.
*-G*, the value '1' can be used instead; the discordance value then
gives exactly the number of differing genotypes.
SM, Average Discordance;;
Average discordance between sample 'a' and all other samples.
SM, Average Depth;;
Average depth at evaluated sites, or 1 if FORMAT/DP field is not
present.
ERR, error rate;;
Pairwise error rate calculated as number of differences divided
by the total number of comparisons.
SM, Average Number of sites;;
The average number of sites used to calculate the discordance. In
other words, the average number of non-missing PLs/genotypes seen
both samples.
CLUSTER, TH, DOT;;
In presence of multiple samples, related samples and outliers can be
identified by clustering samples by error rate. A simple hierarchical
clustering based on minimization of standard deviation is used. This is
useful to detect sample swaps, for example in situations where one
sample has been sequenced in multiple runs.
// MD, Maximum Deviation;;
// The maximum absolute deviation from average score of the sample
......@@ -1807,13 +1840,13 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
can swap alleles and will update genotypes (GT) and AC counts,
but will not attempt to fix PL or other fields.
*-d, --rm-dup* 'snps'|'indels'|'both'|'any'::
*-d, --rm-dup* 'snps'|'indels'|'both'|'all'|'none'::
If a record is present multiple times, output only the first instance,
see *--collapse* in *<<common_options,Common Options>>*.
*-D, --remove-duplicates*::
If a record is present in multiple files, output only the first instance.
Alias for *-d none*. Requires *-a, --allow-overlaps*.
Alias for *-d none*, deprecated.
*-f, --fasta-ref* 'FILE'[[fasta_ref]]::
reference sequence. Supplying this option will turn on left-alignment
......@@ -2310,11 +2343,19 @@ Transition probabilities:
If neither *-e* nor the other *--AF-...* options are given, the allele frequency is
estimated from AC and AN counts which are already present in the INFO field.
*--exclude* 'EXPRESSION'::
exclude sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.
*-G, --GTs-only* 'FLOAT'::
use genotypes (FORMAT/GT fields) ignoring genotype likelihoods (FORMAT/PL),
setting PL of unseen genotypes to 'FLOAT'. Safe value to use is 30 to
account for GT errors.
*--include* 'EXPRESSION'::
include only sites for which 'EXPRESSION' is true. For valid expressions see
*<<expressions,EXPRESSIONS>>*.
*-I, --skip-indels*::
skip indels as their genotypes are usually enriched for errors
......@@ -2743,12 +2784,15 @@ to require that all alleles are of the given type. Compare
TYPE!="snp"
TYPE!~"snp"
* array subscripts (0-based), "*" for any field, "-" to indicate a range. Note that
for querying FORMAT vectors, the colon ":" can be used to select a sample and a subfield
* array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
for querying FORMAT vectors, the colon ":" can be used to select a sample and an
element of the vector, as shown in the examples below
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
DP4[*] == 0
CSQ[*] ~ "missense_variant.*deleterious"
INFO/AF[0] > 0.3 .. first AF value bigger than 0.3
FORMAT/AD[0:0] > 30 .. first AD value of the first sample bigger than 30
FORMAT/AD[0:1] .. first sample, second AD value
FORMAT/AD[1:0] .. second sample, first AD value
DP4[*] == 0 .. any DP4 value
FORMAT/DP[0] > 30 .. DP of the first sample bigger than 30
FORMAT/DP[1-3] > 10 .. samples 2-4
FORMAT/DP[1-] < 7 .. all samples but the first
......@@ -2756,6 +2800,8 @@ for querying FORMAT vectors, the colon ":" can be used to select a sample and a
FORMAT/AD[0:1] .. first sample, second AD field
FORMAT/AD[0:*], AD[0:] or AD[0] .. first sample, any AD field
FORMAT/AD[*:1] or AD[:1] .. any sample, second AD field
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
CSQ[*] ~ "missense_variant.*deleterious"
* function on FORMAT tags (over samples) and INFO tags (over vector fields)
......@@ -2769,6 +2815,13 @@ number of samples with missing genotype; fraction of samples with missing genoty
N_ALT, N_SAMPLES, AC, MAC, AF, MAF, AN, N_MISSING, F_MISSING
* custom perl filtering. Note that this command is not compiled in by default, see
the section *Optional Compilation with Perl* in the INSTALL file for help
and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
"severity" which can be invoked from the command line as follows:
perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3
.Notes:
......@@ -2776,7 +2829,7 @@ number of samples with missing genotype; fraction of samples with missing genoty
* Variables and function names are case-insensitive, but not tag names. For example,
"qual" can be used instead of "QUAL", "strlen()" instead of "STRLEN()" , but
not "dp" instead of "DP".
* When querying multiple subfields, all subfields are tested and the OR logic is
* When querying multiple values, all elements are tested and the OR logic is
used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluated as follows:
-i 'TAG[*]=1' .. true, the record will be printed
......
......@@ -27,18 +27,31 @@ THE SOFTWARE. */
#include <strings.h>
#include <errno.h>
#include <math.h>
#include <wordexp.h>
#include <sys/types.h>
#include <pwd.h>
#include <regex.h>
#include <htslib/khash_str2int.h>
#include "filter.h"
#include "bcftools.h"
#include <htslib/hts_defs.h>
#include <htslib/vcfutils.h>
#include "config.h"
#include "filter.h"
#include "bcftools.h"
#if ENABLE_PERL_FILTERS
# define filter_t perl_filter_t
# include <EXTERN.h>
# include <perl.h>
# undef filter_t
# define my_perl perl
#endif
#ifndef __FUNCTION__
# define __FUNCTION__ __func__
#endif
static filter_ninit = 0;
uint64_t bcf_double_missing = 0x7ff0000000000001;
uint64_t bcf_double_vector_end = 0x7ff0000000000002;
static inline void bcf_double_set(double *ptr, uint64_t value)
......@@ -63,6 +76,7 @@ typedef struct _token_t
{
// read-only values, same for all VCF lines
int tok_type; // one of the TOK_* keys below
int nargs; // used only TOK_PERLSUB, the first argument is the name of the subroutine
char *key; // set only for string constants, otherwise NULL
char *tag; // for debugging and printout only, VCF tag name
double threshold; // filtering threshold
......@@ -100,6 +114,9 @@ struct _filter_t
float *tmpf;
kstring_t tmps;
int max_unpack, mtmpi, mtmpf, nsamples;
#if ENABLE_PERL_FILTERS
PerlInterpreter *perl;
#endif
};
......@@ -130,11 +147,12 @@ struct _filter_t
#define TOK_LEN 24
#define TOK_FUNC 25
#define TOK_CNT 26
#define TOK_PERLSUB 27
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l
static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8};
#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^f"
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p
static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8};
#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcp"
static int filters_next_token(char **str, int *len)
{
......@@ -169,6 +187,7 @@ static int filters_next_token(char **str, int *len)
if ( !strncasecmp(tmp,"INFO/",5) ) tmp += 5;
if ( !strncasecmp(tmp,"FORMAT/",7) ) tmp += 7;
if ( !strncasecmp(tmp,"FMT/",4) ) tmp += 4;
if ( !strncasecmp(tmp,"PERL.",5) ) { (*str) += 5; return TOK_PERLSUB; }
if ( tmp[0]=='@' ) // file name
{
......@@ -250,6 +269,53 @@ static int filters_next_token(char **str, int *len)
return TOK_VAL;
}
/*
Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
Based on jkb's staden code with some adjustements.
https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123
*/
char *expand_path(char *path)
{
#ifdef _WIN32
return strdup(path); // windows expansion: todo
#endif
kstring_t str = {0,0,0};
int i;
if ( path[0] == '~' )
{
if ( !path[1] || path[1] == '/' )
{
// ~ or ~/path
kputs(getenv("HOME"), &str);
if ( path[1] ) kputs(path+1, &str);
}
else
{
// user name: ~pd3/path
char *end = path;
while ( *end && *end!='/' ) end++;
kputsn(path+1, end-path-1, &str);
struct passwd *pwentry = getpwnam(str.s);
str.l = 0;
if ( !pwentry ) kputsn(path, end-path, &str);
else kputs(pwentry->pw_dir, &str);
kputs(end, &str);
}
return str.s;
}
if ( path[0] == '$' )
{
char *var = getenv(path+1);
if ( var ) path = var;
}
return strdup(path);
}
static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
{
float *ptr = &line->qual;
......@@ -856,7 +922,6 @@ static void filters_set_alt_string(filter_t *flt, bcf1_t *line, token_t *tok)
kputs(line->d.allele[tok->idx + 1], &tok->str_value);
else
kputc('.', &tok->str_value);
tok->idx = 0;
}
else if ( tok->idx==-2 )
{
......@@ -1118,6 +1183,9 @@ static int func_strlen(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **sta
}
else
{
if ( !tok->str_value.s[1] && tok->str_value.s[0]=='.' )
rtok->values[0] = 0;
else
rtok->values[0] = strlen(tok->str_value.s);
rtok->nvalues = 1;
}
......@@ -1682,6 +1750,18 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
for (i=0; i<tok->nidxs; i++) if ( tok->idxs[i] ) tok->nuidxs++;
}
}
static int max_ac_an_unpack(bcf_hdr_t *hdr)
{
int hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AC");
if ( hdr_id<0 ) return BCF_UN_FMT;
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
hdr_id = bcf_hdr_id2int(hdr,BCF_DT_ID,"AN");
if ( hdr_id<0 ) return BCF_UN_FMT;
if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,hdr_id) ) return BCF_UN_FMT;
return BCF_UN_INFO;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
tok->tok_type = TOK_VAL;
......@@ -1711,13 +1791,11 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
tok->tag = (char*) calloc(len+1,sizeof(char));
memcpy(tok->tag,str,len);
tok->tag[len] = 0;
wordexp_t wexp;
wordexp(tok->tag+1, &wexp, 0);
if ( !wexp.we_wordc ) error("No such file: %s\n", tok->tag+1);
char *fname = expand_path(tok->tag);
int i, n;
char **list = hts_readlist(wexp.we_wordv[0], 1, &n);
if ( !list ) error("Could not read: %s\n", wexp.we_wordv[0]);
wordfree(&wexp);
char **list = hts_readlist(fname, 1, &n);
if ( !list ) error("Could not read: %s\n", fname);
free(fname);
tok->hash = khash_str2int_init();
for (i=0; i<n; i++)
{
......@@ -1916,6 +1994,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
else if ( !strcasecmp(tmp.s,"AN") )
{
filter->max_unpack |= BCF_UN_FMT;
tok->setter = &filters_set_an;
tok->tag = strdup("AN");
free(tmp.s);
......@@ -1923,6 +2002,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
else if ( !strcasecmp(tmp.s,"AC") )
{
filter->max_unpack |= BCF_UN_FMT;
tok->setter = &filters_set_ac;
tok->tag = strdup("AC");
free(tmp.s);
......@@ -1930,6 +2010,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
else if ( !strcasecmp(tmp.s,"MAC") )
{
filter->max_unpack |= max_ac_an_unpack(filter->hdr);
tok->setter = &filters_set_mac;
tok->tag = strdup("MAC");
free(tmp.s);
......@@ -1937,6 +2018,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
else if ( !strcasecmp(tmp.s,"AF") )
{
filter->max_unpack |= max_ac_an_unpack(filter->hdr);
tok->setter = &filters_set_af;
tok->tag = strdup("AF");
free(tmp.s);
......@@ -1944,6 +2026,7 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
}
else if ( !strcasecmp(tmp.s,"MAF") )
{
filter->max_unpack |= max_ac_an_unpack(filter->hdr);
tok->setter = &filters_set_maf;
tok->tag = strdup("MAF");
free(tmp.s);
......@@ -1994,6 +2077,123 @@ static void str_to_lower(char *str)
{
while ( *str ) { *str = tolower(*str); str++; }
}
static int perl_exec(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
{
#if ENABLE_PERL_FILTERS
assert( rtok->nargs == nstack );
PerlInterpreter *perl = flt->perl;
dSP;
ENTER;
SAVETMPS;
PUSHMARK(SP);
int i,j;
for (i=1; i<nstack; i++)
{
token_t *tok = stack[i];
if ( tok->is_str )
XPUSHs(sv_2mortal(newSVpvn(tok->str_value.s,tok->str_value.l)));
else if ( tok->nvalues==1 )
XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
else if ( tok->nvalues>1 )
{
AV *av = newAV();
for (j=0; j<tok->nvalues; j++) av_push(av, newSVnv(tok->values[j]));
SV *rv = newRV_inc((SV*)av);
XPUSHs(rv);
}
else
{
bcf_double_set_missing(tok->values[0]);
XPUSHs(sv_2mortal(newSVnv(tok->values[0])));
}
}
PUTBACK;
// A possible future todo: provide a means to select samples and indexes,
// expressions like this don't work yet
// perl.filter(FMT/AD)[1:0]
int nret = call_pv(stack[0]->str_value.s, G_ARRAY);
SPAGAIN;
rtok->nvalues = nret;
hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values);
for (i=nret; i>0; i--)
{
rtok->values[i-1] = (double) POPn;
if ( isnan(rtok->values[i-1]) ) bcf_double_set_missing(rtok->values[i-1]);
}
PUTBACK;
FREETMPS;
LEAVE;
#else
error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
#endif
return nstack;
}
static void perl_init(filter_t *filter, char **str)
{
char *beg = *str;
while ( *beg && isspace(*beg) ) beg++;
if ( !*beg ) return;
if ( strncasecmp("perl:", beg, 5) ) return;
#if ENABLE_PERL_FILTERS
beg += 5;
char *end = beg;
while ( *end && *end!=';' ) end++; // for now not escaping semicolons
*str = end+1;
if ( ++filter_ninit == 1 )
{
// must be executed only once, even for multiple filters; first time here
PERL_SYS_INIT3(0, NULL, NULL);
}
filter->perl = perl_alloc();
PerlInterpreter *perl = filter->perl;
if ( !perl ) error("perl_alloc failed\n");
perl_construct(perl);
// name of the perl script to run
char *rmme = (char*) calloc(end - beg + 1,1);
memcpy(rmme, beg, end - beg);
char *argv[] = { "", "" };
argv[1] = expand_path(rmme);
free(rmme);
PL_origalen = 1; // don't allow $0 change
int ret = perl_parse(filter->perl, NULL, 2, argv, NULL);
PL_exit_flags |= PERL_EXIT_DESTRUCT_END;
if ( ret ) error("Failed to parse: %s\n", argv[1]);
free(argv[1]);
perl_run(perl);
#else
error("\nPerl filtering requires running `configure --enable-perl-filters` at compile time.\n\n");
#endif
}
static void perl_destroy(filter_t *filter)
{
#if ENABLE_PERL_FILTERS
if ( !filter->perl ) return;
PerlInterpreter *perl = filter->perl;
perl_destruct(perl);
perl_free(perl);
if ( --filter_ninit <= 0 )
{
// similarly to PERL_SYS_INIT3, can must be executed only once? todo: test
PERL_SYS_TERM();
}
#endif
}
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
......@@ -2008,6 +2208,8 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
int nout = 0, mout = 0; // filter tokens, RPN
token_t *out = NULL;
char *tmp = filter->str;
perl_init(filter, &tmp);
int last_op = -1;
while ( *tmp )
{
......@@ -2016,7 +2218,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
if ( ret==-1 ) error("Missing quotes in: %s\n", str);
// fprintf(stderr,"token=[%c] .. [%s] %d\n", TOKEN_STRING[ret], tmp, len);
// int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
// int i; for (i=0; i<nops; i++) fprintf(stderr," .%c", TOKEN_STRING[ops[i]]); fprintf(stderr,"\n");
if ( ret==TOK_LFT ) // left bracket
{
......@@ -2050,6 +2252,52 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
tok->threshold = -1.0;
ret = TOK_MULT;
}
else if ( ret==TOK_PERLSUB )
{
tmp += len;
char *beg = tmp;
while ( *beg && ((isalnum(*beg) && !ispunct(*beg)) || *beg=='_') ) beg++;
if ( *beg!='(' ) error("Could not parse the expression: %s\n", str);
char *end = beg;
while ( *end && *end!=')' ) end++;
if ( !*end ) error("Could not parse the expression: %s\n", str);
kstring_t rmme = {0,0,0};
// the subroutine name
kputc('"', &rmme);
kputsn(tmp, beg-tmp, &rmme);
kputc('"', &rmme);
nout++;
hts_expand0(token_t, nout, mout, out);
filters_init1(filter, rmme.s, rmme.l, &out[nout-1]);
// subroutine arguments
rmme.l = 0;
kputsn(beg+1, end-beg-1, &rmme);
int i, nargs;
char **rmme_list = hts_readlist(rmme.s, 0, &nargs);
for (i=0; i<nargs; i++)
{
nout++;
hts_expand0(token_t, nout, mout, out);
filters_init1(filter, rmme_list[i], strlen(rmme_list[i]), &out[nout-1]);
free(rmme_list[i]);
}
free(rmme_list);
free(rmme.s);
nout++;
hts_expand0(token_t, nout, mout, out);
token_t *tok = &out[nout-1];
tok->tok_type = TOK_PERLSUB;
tok->nargs = nargs + 1;
tok->hdr_id = -1;
tok->pass_site = -1;
tok->threshold = -1.0;
tmp = end + 1;
continue;
}
else
{
while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] )
......@@ -2073,6 +2321,9 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
{
nout++;
hts_expand0(token_t, nout, mout, out);
if ( tmp[len-1]==',' )
filters_init1(filter, tmp, len-1, &out[nout-1]);
else
filters_init1(filter, tmp, len, &out[nout-1]);
tmp += len;
}
......@@ -2240,6 +2491,7 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
void filter_destroy(filter_t *filter)
{
perl_destroy(filter);
int i;
for (i=0; i<filter->nfilters; i++)
{
......@@ -2300,6 +2552,14 @@ int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
if ( --nargs > 0 ) nstack -= nargs;
continue;
}
else if ( filter->filters[i].tok_type == TOK_PERLSUB )
{
int nargs = filter->filters[i].nargs;
perl_exec(filter, line, &filter->filters[i], filter->flt_stack, nstack);
filter->flt_stack[nstack-nargs] = &filter->filters[i];
nstack -= nargs - 1;
continue;
}
if ( nstack<2 )
error("Error occurred while processing the filter \"%s\" (1:%d)\n", filter->str,nstack); // too few values left on the stack
......
# This demo code shows how to write a custom perl code and use it in
# the -i/-e filtering expressions.
#
# In this example we want to take variant consequences generated by `bcftools csq`,
# rank them by severity, filter by the most severe, and print the list using the
# following command:
#
# bcftools query -f '%CHROM:%POS \t %CSQ\n' -i 'perl:misc/demo-flt.pl; perl.severity(INFO/CSQ) > 10' test/perl-flt.vcf
#
# There can be multiple subroutines in the same script and they
# can be referenced from the command line by prefixing "perl.subroutine_name()"
#
sub severity
{
# Arbitrary number of arguments can be provided.
my (@args) = @_;
# Note that string arrays are passed to perl in the form of a single
# comma-separated string, but numeric arrays must be dereferenced, for
# example as shown in this example:
#
# for my $arg (@args)
# {
# if ( ref($arg) eq 'ARRAY' ) { print "array: ".join(',',@$arg)."\n"; }
# else { print "scalar: $arg\n"; }
# }
#
# In our case there should be only one parameter passed to the subroutine;
# check if this is the case
if ( scalar @args != 1 ) { error("Incorrect use, expected one argument, got ".scalar @args."!\n"); }
# Create a lookup table from consequences to an arbitrary severity score
my %severity =
(
"intergenic" => 1,
"intron" => 2,
"non_coding" => 3,
"5_prime_utr" => 4,
"3_prime_utr" => 5,
"stop_retained" => 6,
"synonymous" => 7,
"coding_sequence" => 8,
"missense" => 9,
"splice_region" => 10,
"inframe_altering" => 11,
"inframe_deletion" => 12,
"inframe_insertion" => 13,
"splice_acceptor" => 14,
"splice_donor" => 15,
"stop_lost" => 16,
"stop_gained" => 17,
"start_lost" => 18,
"frameshift" => 19,
);
# Split multiple consequences into an array
my @csq = split(/,/, $args[0]);
# Find the most severe consequence. The consequence string may look like this:
# inframe_deletion|ABC|ENST00000000002|protein_coding|-|5YV>5T|144TAC>T+148TA>T
my $max = 0;
for my $csq (@csq)
{
my @fields = split(/\|/, $csq);
my $string = $fields[0];
my $score = exists($severity{$string}) ? $severity{$string} : 0;
if ( $max < $score ) { $max = $score; }
}
return $max;
}
sub error
{
my (@msg) = @_;
print STDERR @msg;
exit;
}
#!/usr/bin/python
#!/usr/bin/env python
#
# The MIT License
#
......@@ -44,6 +44,7 @@ def usage(msg=None):
print ' +adj, --adjust <str> Set plot adjust [bottom=0.18,left=0.07,right=0.98]'
print ' +dpi, --dpi <num> Set bitmap DPI [150]'
print ' +sxt, --show-xticks Show x-ticks (genomic coordinate)'
print ' +twh, --track-wh <num,num> Set track width and height [20,1]'
print ' +xlb, --xlabel <str> Set x-label'
print ' +xli, --xlimit <num> Extend x-range by this fraction [0.05]'
else:
......@@ -51,7 +52,7 @@ def usage(msg=None):
sys.exit(1)
dir = None
regs = None
plot_regions = None
min_length = 0
min_markers = 0
min_qual = 0
......@@ -64,13 +65,15 @@ dpi = 150
xlim = 0.05
show_xticks = False
xlabel = None
track_width = 20
track_height = 1
if len(sys.argv) < 2: usage()
args = sys.argv[1:]
while len(args):
if args[0]=='-r' or args[0]=='--region':
args = args[1:]
regs = args[0]
plot_regions = args[0]
elif args[0]=='-i' or args[0]=='--interactive':
interactive = True
elif args[0]=='-l' or args[0]=='--min-length':
......@@ -99,11 +102,16 @@ while len(args):
elif args[0]=='+dpi' or args[0]=='--dpi':
args = args[1:]
dpi = float(args[0])
elif args[0]=='+sxt' or args[0]=='--show-xticks':
show_xticks = True
elif args[0]=='+twh' or args[0]=='--track-wh':
args = args[1:]
(track_width,track_height) = args[0].split(',')
track_height = -float(track_height) # will be used as if negative, no auto-magic
track_width = float(track_width)
elif args[0]=='+xlb' or args[0]=='--xlabel':
args = args[1:]
xlabel = args[0]
elif args[0]=='+sxt' or args[0]=='--show-xticks':
show_xticks = True
elif args[0]=='+xli' or args[0]=='--xlimit':
args = args[1:]
xlim = float(args[0])
......@@ -119,6 +127,11 @@ adjust = eval("wrap_hash("+adjust+")")
import matplotlib as mpl
if interactive==False:
mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as patches
else:
for gui in ['TKAgg','GTKAgg','Qt4Agg','WXAgg','MacOSX']:
try:
mpl.use(gui,warn=False, force=True)
......@@ -129,7 +142,6 @@ for gui in ['TKAgg','GTKAgg','Qt4Agg','WXAgg','MacOSX']:
continue
cols = [ '#337ab7', '#5cb85c', '#5bc0de', '#f0ad4e', '#d9534f', 'grey', 'black' ]
mpl.rcParams['axes.color_cycle'] = cols
globstr = os.path.join(dir, '*.txt.gz')
fnames = glob.glob(globstr)
......@@ -272,7 +284,7 @@ def parse_samples(fname,highlight):
if highlight==None: groups = None
return (samples,groups,smpl2y)
regs = parse_regions(regs)
plot_regions = parse_regions(plot_regions)
(samples,groups,smpl2y) = parse_samples(sample_file,highlight)
dat_gt = {}
......@@ -285,7 +297,7 @@ for fname in fnames:
if row[0]=='GT':
chr = row[1]
pos = int(row[2])
reg = region_overlap(regs,chr,pos,pos)
reg = region_overlap(plot_regions,chr,pos,pos)
if reg==None: continue
for i in range(3,len(row),2):
smpl = row[i]
......@@ -317,7 +329,8 @@ for fname in fnames:
if length < min_length: continue
if nmark < min_markers : continue
if qual < min_qual : continue
reg = region_overlap(regs,chr,beg,end)
reg = region_overlap(plot_regions,chr,beg,end)
if reg==None: continue
if chr not in dat_rg: dat_rg[chr] = {}
if smpl not in dat_rg[chr]: dat_rg[chr][smpl] = []
if reg!=None:
......@@ -352,9 +365,11 @@ for chr in chrs:
off += max_pos + off_sep
off_list.append(off)
height = len(smpl2y)
if len(smpl2y)>5: heigth = 5
wh = 20,height
wh = track_width,len(smpl2y)
if track_height < 0:
wh = track_width,-track_height*len(smpl2y)
elif len(smpl2y)>5:
wh = track_width,5
def bignum(num):
s = str(num); out = ''; slen = len(s)
......@@ -379,6 +394,7 @@ ax1.format_coord = format_coord
xtick_lbl = []
xtick_pos = []
max_x = 0
min_x = -1
for chr in dat_gt:
off = off_hash[chr]
icol = 0
......@@ -394,6 +410,7 @@ for chr in dat_gt:
rect = patches.Rectangle((rg[0]+off,3*y+0.5), rg[1]-rg[0]+1, 2, color='#d9534f')
ax1.add_patch(rect)
ax1.plot([x[0]+off for x in dat_gt[chr][smpl]],[x[1]+3*y for x in dat_gt[chr][smpl]],'.',color=cols[icol])
if min_x==-1 or min_x > dat_gt[chr][smpl][0][0]+off: min_x = dat_gt[chr][smpl][0][0]+off
if max_x < dat_gt[chr][smpl][-1][0]+off: max_x = dat_gt[chr][smpl][-1][0]+off
if max < dat_gt[chr][smpl][-1][0]: max = dat_gt[chr][smpl][-1][0]
icol += 1
......@@ -408,7 +425,8 @@ for chr in dat_gt:
ytick_pos.append(3*smpl2y[smpl]+1)
break
if xlim!=0:
ax1.set_xlim(0,max_x+xlim*max_x)
if min_x==-1: min_x = 0
ax1.set_xlim(min_x,max_x+xlim*max_x)
lbl_pos = 3*(len(smpl2y)-1)
ax1.annotate(' HomAlt ',xy=(max_x,lbl_pos-1),xycoords='data',va='center')
ax1.annotate(' Het',xy=(max_x,lbl_pos-2),xycoords='data',va='center')
......
......@@ -51,12 +51,15 @@ sub error
"Options:\n",
" -a, --af-annots <file> Allele frequency annotations [1000GP-AFs/AFs.tab.gz]\n",
" -i, --indir <dir> Input directory with VCF files\n",
" --include <expr> Select sites for which the expression is true\n",
" --exclude <expr> Exclude sites for which the epxression is true\n",
" -l, --min-length <num> Filter input regions shorter than this [1e6]\n",
" -m, --genmap <dir> Directory with genetic map in IMPUTE2 format (optional)\n",
" -M, --rec-rate <float> constant recombination rate per bp (optional)\n",
" -n, --min-markers <num> Filter input regions with fewer marker than this [100]\n",
" -o, --outdir <dir> Output directory\n",
" -q, --min-qual <num> Filter input regions with quality smaller than this [10]\n",
" --roh-args <string> Extra arguments to pass to bcftools roh\n",
" -s, --silent Quiet output, do not print commands\n",
" -h, -?, --help This help message\n",
"\n";
......@@ -71,9 +74,13 @@ sub parse_params
min_length => 1e6,
min_markers => 100,
min_qual => 10,
roh_args => '',
};
while (defined(my $arg=shift(@ARGV)))
{
if ( $arg eq '--roh-args' ) { $$opts{roh_args}=shift(@ARGV); next }
if ( $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next }
if ( $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next }
if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next }
if ( $arg eq '-l' || $arg eq '--min-length' ) { $$opts{min_length}=shift(@ARGV); next }
if ( $arg eq '-n' || $arg eq '--min-markers' ) { $$opts{min_markers}=shift(@ARGV); next }
......@@ -92,6 +99,8 @@ sub parse_params
if ( ! -e "$$opts{af_annots}.tbi" ) { error("The annotation file is not indexed: $$opts{af_annots}.tbi\n"); }
if ( ! -e "$$opts{af_annots}.hdr" ) { error("The annotation file has no header: $$opts{af_annots}.hdr\n"); }
if ( exists($$opts{genmap}) && ! -d "$$opts{genmap}" ) { error("The directory with genetic maps does not exist: $$opts{genmap}\n"); }
if ( exists($$opts{include_expr}) ) { $$opts{include_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -i '$$opts{include_expr}']; }
if ( exists($$opts{exclude_expr}) ) { $$opts{exclude_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -e '$$opts{exclude_expr}']; }
return $opts;
}
......@@ -196,10 +205,19 @@ sub run_roh
my $outfile = "$$opts{outdir}/$`.bcf";
push @files,$outfile;
if ( -e $outfile ) { next; }
cmd(
my $cmd =
"bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " .
"bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} -Ob -o $outfile.part && " .
"mv $outfile.part $outfile",%$opts);
"bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} ";
if ( exists($$opts{inc_exc}) )
{
$cmd .= " -Ou | bcftools view $$opts{inc_exc} ";
}
$cmd .= "-Ob -o $outfile.part && ";
$cmd .= "mv $outfile.part $outfile";
cmd($cmd, %$opts);
}
closedir($dh) or error("close failed: $$opts{indir}");
......@@ -209,7 +227,7 @@ sub run_roh
for my $file (@files)
{
if ( -e "$file.txt.gz" ) { next; }
my @out = cmd("bcftools roh --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
my @out = cmd("bcftools roh $$opts{roh_args} --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
for my $line (@out)
{
if ( !($line=~m{total/processed:\s+(\d+)/(\d+)}) ) { next; }
......
/* The MIT License
Copyright (c) 2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <errno.h>
#include <unistd.h> // for isatty
#include <htslib/hts.h>
#include <htslib/vcf.h>
#include <htslib/kstring.h>
#include <htslib/kseq.h>
#include <htslib/synced_bcf_reader.h>
#include "bcftools.h"
#include "filter.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2
typedef struct
{
int argc, filter_logic, regions_is_file, targets_is_file, output_type;
char **argv, *output_fname, *fname, *regions, *targets, *filter_str;
char *bg_samples_str, *novel_samples_str;
int *bg_smpl, *novel_smpl, nbg_smpl, nnovel_smpl;
filter_t *filter;
bcf_srs_t *sr;
bcf_hdr_t *hdr, *hdr_out;
htsFile *out_fh;
int32_t *gts;
int mgts;
uint32_t *bg_gts;
int nbg_gts, mbg_gts, ntotal, nskipped, ntested, nnovel_al, nnovel_gt;
kstring_t novel_als_smpl, novel_gts_smpl;
}
args_t;
args_t args;
const char *about(void)
{
return "Find novel alleles and genotypes in two groups of samples.\n";
}
static const char *usage_text(void)
{
return
"\n"
"About: Finds novel alleles and genotypes in two groups of samples. Adds\n"
" an annotation which lists samples with a novel allele (INFO/NOVELAL)\n"
" or a novel genotype (INFO/NOVELGT)\n"
"Usage: bcftools +contrast [Plugin Options]\n"
"Plugin options:\n"
" -0, --bg-samples <list> list of background samples\n"
" -1, --novel-samples <list> list of samples where novel allele or genotype are expected\n"
" -e, --exclude EXPR exclude sites and samples for which the expression is true\n"
" -i, --include EXPR include sites and samples for which the expression is true\n"
" -o, --output FILE output file name [stdout]\n"
" -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"
" -r, --regions REG restrict to comma-separated list of regions\n"
" -R, --regions-file FILE restrict to regions listed in a file\n"
" -t, --targets REG similar to -r but streams rather than index-jumps\n"
" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
"\n"
"Example:\n"
" # Test if any of the samples a,b is different from the samples c,d,e\n"
" bcftools +contrast -0 c,d,e -1 a,b file.bcf\n"
"\n";
}
static void init_data(args_t *args)
{
args->sr = bcf_sr_init();
if ( args->regions )
{
args->sr->require_index = 1;
if ( bcf_sr_set_regions(args->sr, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n",args->regions);
}
if ( args->targets && bcf_sr_set_targets(args->sr, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->targets);
if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
args->hdr = bcf_sr_get_header(args->sr,0);
args->hdr_out = bcf_hdr_dup(args->hdr);
bcf_hdr_append(args->hdr_out, "##INFO=<ID=NOVELAL,Number=.,Type=String,Description=\"List of samples with novel alleles\">");
bcf_hdr_append(args->hdr_out, "##INFO=<ID=NOVELGT,Number=.,Type=String,Description=\"List of samples with novel genotypes. Note that only samples w/o a novel allele are listed.\">");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
int i;
char **smpl = hts_readlist(args->bg_samples_str, 0, &args->nbg_smpl);
args->bg_smpl = (int*) malloc(sizeof(int)*args->nbg_smpl);
for (i=0; i<args->nbg_smpl; i++)
{
args->bg_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]);
if ( args->bg_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]);
free(smpl[i]);
}
free(smpl);
smpl = hts_readlist(args->novel_samples_str, 0, &args->nnovel_smpl);
args->novel_smpl = (int*) malloc(sizeof(int)*args->nnovel_smpl);
for (i=0; i<args->nnovel_smpl; i++)
{
args->novel_smpl[i] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, smpl[i]);
if ( args->novel_smpl[i]<0 ) error("The sample not present in the VCF: \"%s\"\n", smpl[i]);
free(smpl[i]);
}
free(smpl);
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
bcf_hdr_write(args->out_fh, args->hdr_out);
}
static void destroy_data(args_t *args)
{
bcf_hdr_destroy(args->hdr_out);
hts_close(args->out_fh);
free(args->novel_als_smpl.s);
free(args->novel_gts_smpl.s);
free(args->gts);
free(args->bg_gts);
free(args->bg_smpl);
free(args->novel_smpl);
if ( args->filter ) filter_destroy(args->filter);
bcf_sr_destroy(args->sr);
free(args);
}
static inline int binary_search(uint32_t val, uint32_t *dat, int ndat)
{
int i = -1, imin = 0, imax = ndat - 1;
while ( imin<=imax )
{
i = (imin+imax)/2;
if ( dat[i] < val ) imin = i + 1;
else if ( dat[i] > val ) imax = i - 1;
else return 1;
}
return 0;
}
static inline void binary_insert(uint32_t val, uint32_t **dat, int *ndat, int *mdat)
{
int i = -1, imin = 0, imax = *ndat - 1;
while ( imin<=imax )
{
i = (imin+imax)/2;
if ( (*dat)[i] < val ) imin = i + 1;
else if ( (*dat)[i] > val ) imax = i - 1;
else return;
}
while ( i>=0 && (*dat)[i]>val ) i--;
(*ndat)++;
hts_expand(uint32_t, (*ndat), (*mdat), (*dat));
if ( *ndat > 1 )
memmove(*dat + i + 1, *dat + i, sizeof(uint32_t)*(*ndat - i - 1));
(*dat)[i+1] = val;
}
static int process_record(args_t *args, bcf1_t *rec)
{
args->ntotal++;
static int warned = 0;
int ngts = bcf_get_genotypes(args->hdr, rec, &args->gts, &args->mgts);
ngts /= rec->n_sample;
if ( ngts>2 ) error("todo: ploidy=%d\n", ngts);
args->nbg_gts = 0;
uint32_t bg_als = 0;
int i,j;
for (i=0; i<args->nbg_smpl; i++)
{
uint32_t gt = 0;
int32_t *ptr = args->gts + args->bg_smpl[i]*ngts;
for (j=0; j<ngts; j++)
{
if ( ptr[j]==bcf_int32_vector_end ) break;
if ( bcf_gt_is_missing(ptr[j]) ) continue;
int ial = bcf_gt_allele(ptr[j]);
if ( ial > 31 )
{
if ( !warned )
{
fprintf(stderr,"Too many alleles (>32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1);
warned = 1;
}
args->nskipped++;
return -1;
}
bg_als |= 1<<ial;
gt |= 1<<ial;
}
binary_insert(gt, &args->bg_gts, &args->nbg_gts, &args->mbg_gts);
}
if ( !bg_als )
{
// all are missing
args->nskipped++;
return -1;
}
args->novel_als_smpl.l = 0;
args->novel_gts_smpl.l = 0;
int has_gt = 0;
for (i=0; i<args->nnovel_smpl; i++)
{
int novel_al = 0;
uint32_t gt = 0;
int32_t *ptr = args->gts + args->novel_smpl[i]*ngts;
for (j=0; j<ngts; j++)
{
if ( ptr[j]==bcf_int32_vector_end ) break;
if ( bcf_gt_is_missing(ptr[j]) ) continue;
int ial = bcf_gt_allele(ptr[j]);
if ( ial > 31 )
{
if ( !warned )
{
fprintf(stderr,"Too many alleles (>32) at %s:%d, skipping. (todo?)\n", bcf_seqname(args->hdr,rec),rec->pos+1);
warned = 1;
}
args->nskipped++;
return -1;
}
if ( !(bg_als & (1<<ial)) ) novel_al = 1;
gt |= 1<<ial;
}
if ( !gt ) continue;
has_gt = 1;
char *smpl = args->hdr->samples[ args->novel_smpl[i] ];
if ( novel_al )
{
if ( args->novel_als_smpl.l ) kputc(',', &args->novel_als_smpl);
kputs(smpl, &args->novel_als_smpl);
}
else if ( !binary_search(gt, args->bg_gts, args->nbg_gts) )
{
if ( args->novel_gts_smpl.l ) kputc(',', &args->novel_gts_smpl);
kputs(smpl, &args->novel_gts_smpl);
}
}
if ( !has_gt )
{
// all are missing
args->nskipped++;
return -1;
}
if ( args->novel_als_smpl.l )
{
bcf_update_info_string(args->hdr_out, rec, "NOVELAL", args->novel_als_smpl.s);
args->nnovel_al++;
}
if ( args->novel_gts_smpl.l )
{
bcf_update_info_string(args->hdr_out, rec, "NOVELGT", args->novel_gts_smpl.s);
args->nnovel_gt++;
}
args->ntested++;
return 0;
}
int run(int argc, char **argv)
{
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
args->output_fname = "-";
static struct option loptions[] =
{
{"bg-samples",required_argument,0,'0'},
{"novel-samples",required_argument,0,'1'},
{"include",required_argument,0,'i'},
{"exclude",required_argument,0,'e'},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"regions",1,0,'r'},
{"regions-file",1,0,'R'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
{NULL,0,NULL,0}
};
int c;
while ((c = getopt_long(argc, argv, "O:o:i:e:r:R:t:T:0:1:",loptions,NULL)) >= 0)
{
switch (c)
{
case '0': args->bg_samples_str = optarg; break;
case '1': args->novel_samples_str = optarg; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 't': args->targets = optarg; break;
case 'T': args->targets = optarg; args->targets_is_file = 1; break;
case 'r': args->regions = optarg; break;
case 'R': args->regions = optarg; args->regions_is_file = 1; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
case 'b': args->output_type = FT_BCF_GZ; break;
case 'u': args->output_type = FT_BCF; break;
case 'z': args->output_type = FT_VCF_GZ; break;
case 'v': args->output_type = FT_VCF; break;
default: error("The output type \"%s\" not recognised\n", optarg);
};
break;
case 'h':
case '?':
default: error("%s", usage_text()); break;
}
}
if ( optind==argc )
{
if ( !isatty(fileno((FILE *)stdin)) ) args->fname = "-"; // reading from stdin
else { error(usage_text()); }
}
else if ( optind+1!=argc ) error(usage_text());
else args->fname = argv[optind];
init_data(args);
while ( bcf_sr_next_line(args->sr) )
{
bcf1_t *rec = bcf_sr_get_line(args->sr,0);
if ( args->filter )
{
int pass = filter_test(args->filter, rec, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
if ( !pass ) continue;
}
process_record(args, rec);
bcf_write(args->out_fh, args->hdr_out, rec);
}
fprintf(stderr,"Total/processed/skipped/novel_allele/novel_gt:\t%d\t%d\t%d\t%d\t%d\n", args->ntotal, args->ntested, args->nskipped, args->nnovel_al, args->nnovel_gt);
destroy_data(args);
return 0;
}
plugins/fill-from-fasta.so: plugins/fill-from-fasta.c version.h version.c filter.h filter.c
$(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS)
$(CC) $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) $(LDFLAGS) -o $@ filter.c version.c $< $(LIBS)