Skip to content
Commits on Source (3)
......@@ -21,7 +21,7 @@ addons:
- g++
script:
- ./run_tests_travis.sh
- ./devtools/run_tests_travis.sh
notifications:
email:
......
......@@ -5,6 +5,17 @@ http://pysam.readthedocs.io/en/latest/release.html
Release notes
=============
Release 0.15.0
==============
This release wraps htslib/samtools/bcftools version 1.9.0.
* [#673] permit dash in chromosome name of region string
* [#656] Support `text` when opening a SAM file for writing
* [#658] return None in get_forward_sequence if sequence not in record
* [#683] allow lower case bases in MD tags
* Ensure that = and X CIGAR ops are treated the same as M
Release 0.14.1
==============
......
......@@ -15,7 +15,8 @@ includes an interface for tabix_.
If you are using the conda packaging manager (e.g. miniconda or anaconda),
you can install pysam from the `bioconda channel <https://bioconda.github.io/>`_::
conda config --add channels r
conda config --add channels defaults
conda config --add channels conda-forge
conda config --add channels bioconda
conda install pysam
......@@ -29,7 +30,7 @@ Pysam is available through `pypi
pip install pysam
Pysam documentation is available through https://readthedocs.org/ from
Pysam documentation is available
`here <http://pysam.readthedocs.org/en/latest/>`_
Questions and comments are very welcome and should be sent to the
......
This diff is collapsed.
BCFtools implements utilities for variant calling (in conjunction with
SAMtools) and manipulating VCF and BCF files. The program is intended
to replace the Perl-based tools from vcftools.
See INSTALL for building and installation instructions.
/* bam_sample.c -- group data by sample.
Copyright (C) 2010, 2011 Broad Institute.
Copyright (C) 2013, 2016 Genome Research Ltd.
Copyright (C) 2013, 2016-2018 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
......@@ -167,10 +167,14 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
void *bam_smpls = khash_str2int_init();
int first_smpl = -1, nskipped = 0;
const char *p = bam_hdr, *q, *r;
while ((q = strstr(p, "@RG")) != 0)
while (p != NULL && (q = strstr(p, "@RG")) != 0)
{
char *eol = strchr(q + 3, '\n');
if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
p = eol;
continue;
}
p = q + 3;
r = q = 0;
if ((q = strstr(p, "\tID:")) != 0) q += 4;
if ((r = strstr(p, "\tSM:")) != 0) r += 4;
if (r && q)
......@@ -220,7 +224,7 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
}
else
break;
p = q > r ? q : r;
p = eol;
}
int nsmpls = khash_str2int_size(bam_smpls);
khash_str2int_destroy_free(bam_smpls);
......@@ -234,6 +238,7 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
// no suitable read group is available in this bam: ignore the whole file.
free(file->fname);
if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
bsmpl->nfiles--;
return -1;
}
......
......@@ -3,7 +3,7 @@
/* bam_sample.c -- group data by sample.
Copyright (C) 2010, 2011 Broad Institute.
Copyright (C) 2013, 2016 Genome Research Ltd.
Copyright (C) 2013, 2016-2018 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
......@@ -169,10 +169,14 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
void *bam_smpls = khash_str2int_init();
int first_smpl = -1, nskipped = 0;
const char *p = bam_hdr, *q, *r;
while ((q = strstr(p, "@RG")) != 0)
while (p != NULL && (q = strstr(p, "@RG")) != 0)
{
char *eol = strchr(q + 3, '\n');
if (q > bam_hdr && *(q - 1) != '\n') { // @RG must be at start of line
p = eol;
continue;
}
p = q + 3;
r = q = 0;
if ((q = strstr(p, "\tID:")) != 0) q += 4;
if ((r = strstr(p, "\tSM:")) != 0) r += 4;
if (r && q)
......@@ -222,7 +226,7 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
}
else
break;
p = q > r ? q : r;
p = eol;
}
int nsmpls = khash_str2int_size(bam_smpls);
khash_str2int_destroy_free(bam_smpls);
......@@ -236,6 +240,7 @@ int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
{
// no suitable read group is available in this bam: ignore the whole file.
free(file->fname);
if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
bsmpl->nfiles--;
return -1;
}
......
......@@ -39,7 +39,7 @@ THE SOFTWARE. */
#define FT_STDIN (1<<3)
char *bcftools_version(void);
void error(const char *format, ...) HTS_NORETURN;
void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
const char *hts_bcf_wmode(int file_type);
......
......@@ -54,6 +54,12 @@ void bcftools_unset_stdout(void)
bcftools_stdout_fileno = STDOUT_FILENO;
}
int bcftools_puts(const char *s)
{
if (fputs(s, bcftools_stdout) == EOF) return EOF;
return putc('\n', bcftools_stdout);
}
void bcftools_set_optind(int val)
{
// setting this in cython via
......
#ifndef BCFTOOLS_PYSAM_H
#define BCFTOOLS_PYSAM_H
#ifndef PYSAM_H
#define PYSAM_H
#include "stdio.h"
......@@ -38,6 +38,8 @@ void bcftools_unset_stderr(void);
*/
void bcftools_unset_stdout(void);
int bcftools_puts(const char *s);
int bcftools_dispatch(int argc, char *argv[]);
void bcftools_set_optind(int);
......
......@@ -48,9 +48,9 @@ bin_t *bin_init(const char *list_def, float min, float max)
{
char *tmp;
bin->bins[i] = strtod(list[i],&tmp);
if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
if ( *tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
error("Expected values from the interval [%f,%f], found %s\n", list[i]);
error("Expected values from the interval [%f,%f], found %s\n", min, max, list[i]);
free(list[i]);
}
free(list);
......
......@@ -50,9 +50,9 @@ bin_t *bin_init(const char *list_def, float min, float max)
{
char *tmp;
bin->bins[i] = strtod(list[i],&tmp);
if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
if ( *tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
error("Expected values from the interval [%f,%f], found %s\n", list[i]);
error("Expected values from the interval [%f,%f], found %s\n", min, max, list[i]);
free(list[i]);
}
free(list);
......
......@@ -36,6 +36,7 @@
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
#include <htslib/bgzf.h>
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
......@@ -73,6 +74,8 @@ typedef struct
int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
int fa_case; // output upper case or lower case?
int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
int prev_base_pos; // the position of prev_base
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
......@@ -96,7 +99,7 @@ typedef struct
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
}
args_t;
......@@ -237,7 +240,7 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = stdout;
if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n");
if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
}
......@@ -264,7 +267,7 @@ static void init_region(args_t *args, char *line)
char *ss, *se = line;
while ( *se && !isspace(*se) && *se!=':' ) se++;
int from = 0, to = 0;
char tmp, *tmp_ptr = NULL;
char tmp = 0, *tmp_ptr = NULL;
if ( *se )
{
tmp = *se; *se = 0; tmp_ptr = se;
......@@ -280,10 +283,12 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
free(args->chr);
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
args->prev_base_pos = -1;
args->fa_buf.l = 0;
args->fa_length = 0;
args->fa_end_pos = to;
args->fa_ori_pos = from;
......@@ -370,13 +375,10 @@ static void flush_fa_buffer(args_t *args, int len)
}
static void apply_variant(args_t *args, bcf1_t *rec)
{
if ( rec->n_allele==1 ) return;
static int warned_haplotype = 0;
if ( rec->n_allele==1 && !args->missing_allele ) return;
if ( rec->pos <= args->fa_frz_pos )
{
fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
if ( args->mask )
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
......@@ -395,35 +397,73 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( fmt->type!=BCF_BT_INT8 )
error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
uint8_t *ptr = fmt->p + fmt->size*args->isample;
if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
if ( args->haplotype > fmt->n )
{
if ( bcf_gt_is_missing(ptr[fmt->n-1]) || bcf_gt_is_missing(ptr[0]) )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
{
if ( !warned_haplotype )
{
fprintf(stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
warned_haplotype = 1;
}
return;
}
}
else
{
ialt = (int8_t)ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int8_vector_end )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
ialt = bcf_gt_allele(ialt);
}
}
else if ( args->output_iupac )
{
ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
ialt = bcf_gt_allele(ialt);
int jalt;
if ( fmt->n>1 )
{
jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
if ( bcf_gt_is_missing(jalt) )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
else
jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
if ( ialt>=0 )
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
}
else
......@@ -431,8 +471,13 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
if ( ptr[i]==bcf_int32_vector_end ) break;
if ( bcf_gt_is_missing(ptr[i]) )
{
if ( !args->missing_allele ) return; // ignore missing or half-missing genotypes
ialt = -1;
break;
}
if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
ialt = bcf_gt_allele(ptr[i]);
if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
}
......@@ -441,7 +486,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int prev_len = 0, jalt;
for (i=0; i<fmt->n; i++)
{
if ( ptr[i]==bcf_int32_vector_end ) break;
if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
jalt = bcf_gt_allele(ptr[i]);
if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( args->allele & (PICK_LONG|PICK_SHORT) )
......@@ -474,6 +519,25 @@ static void apply_variant(args_t *args, bcf1_t *rec)
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
if ( ialt==-1 )
{
char alleles[4];
alleles[0] = rec->d.allele[0][0];
alleles[1] = ',';
alleles[2] = args->missing_allele;
alleles[3] = 0;
bcf_update_alleles_str(args->hdr, rec, alleles);
ialt = 1;
}
// Overlapping variant? Can be still OK iff this is an insertion
if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) )
{
fprintf(stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
if ( idx<0 )
......@@ -492,7 +556,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
}
if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
if ( rec->d.allele[ialt][0]=='<' )
......@@ -506,21 +570,37 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
{
// fprintf(stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
char tmp = 0;
if ( args->fa_buf.l - idx > rec->rlen )
{
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
// This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA),
// the reference base in fa_buf is lost and the check fails. We do not keep a buffer
// with the original sequence as it should not be necessary, we should encounter max
// one base overlap
int fail = 1;
if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
{
if ( rec->rlen==1 ) fail = 0;
else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
}
if ( fail )
{
char tmp = 0;
if ( args->fa_buf.l - idx > rec->rlen )
{
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
}
error(
"The fasta sequence does not match the REF allele at %s:%d:\n"
" .vcf: [%s]\n"
" .vcf: [%s] <- (ALT)\n"
" .fa: [%s]%c%s\n",
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
error(
"The fasta sequence does not match the REF allele at %s:%d:\n"
" .vcf: [%s]\n"
" .vcf: [%s] <- (ALT)\n"
" .fa: [%s]%c%s\n",
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
alen = strlen(rec->d.allele[ialt]);
len_diff = alen - rec->rlen;
}
else
{
......@@ -539,7 +619,11 @@ static void apply_variant(args_t *args, bcf1_t *rec)
for (i=0; i<alen; i++)
args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
if ( len_diff )
{
args->prev_base = rec->d.allele[0][rec->rlen - 1];
args->prev_base_pos = rec->pos + rec->rlen - 1;
memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
}
}
else
{
......@@ -589,10 +673,10 @@ static void mask_region(args_t *args, char *seq, int len)
static void consensus(args_t *args)
{
htsFile *fasta = hts_open(args->ref_fname, "rb");
BGZF *fasta = bgzf_open(args->ref_fname, "r");
if ( !fasta ) error("Error reading %s\n", args->ref_fname);
kstring_t str = {0,0,0};
while ( hts_getline(fasta, KS_SEP_LINE, &str) > 0 )
while ( bgzf_getline(fasta, '\n', &str) > 0 )
{
if ( str.s[0]=='>' )
{
......@@ -669,7 +753,7 @@ static void consensus(args_t *args)
destroy_chain(args);
}
flush_fa_buffer(args, 0);
hts_close(fasta);
bgzf_close(fasta);
free(str.s);
}
......@@ -681,7 +765,7 @@ static void usage(args_t *args)
fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
......@@ -697,6 +781,7 @@ static void usage(args_t *args)
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -m, --mask <file> replace regions with N\n");
fprintf(stderr, " -M, --missing <char> output <char> instead of skipping the missing genotypes\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
......@@ -722,11 +807,12 @@ int main_consensus(int argc, char *argv[])
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
{"mask",1,0,'m'},
{"missing",1,0,'M'},
{"chain",1,0,'c'},
{0,0,0,0}
};
int c;
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0)
{
switch (c)
{
......@@ -737,6 +823,10 @@ int main_consensus(int argc, char *argv[])
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'M':
args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
break;
case 'c': args->chain_fname = optarg; break;
case 'H':
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
......
......@@ -38,6 +38,7 @@
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
#include <htslib/bgzf.h>
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
......@@ -75,6 +76,8 @@ typedef struct
int fa_length; // region's length in the original sequence (in case end_pos not provided in the FASTA header)
int fa_case; // output upper case or lower case?
int fa_src_pos; // last genomic coordinate read from the input fasta (0-based)
char prev_base; // this is only to validate the REF allele in the VCF - the modified fa_buf cannot be used for inserts following deletions, see 600#issuecomment-383186778
int prev_base_pos; // the position of prev_base
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
......@@ -98,7 +101,7 @@ typedef struct
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele;
}
args_t;
......@@ -239,7 +242,7 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = bcftools_stdout;
if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records\n");
if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
}
......@@ -266,7 +269,7 @@ static void init_region(args_t *args, char *line)
char *ss, *se = line;
while ( *se && !isspace(*se) && *se!=':' ) se++;
int from = 0, to = 0;
char tmp, *tmp_ptr = NULL;
char tmp = 0, *tmp_ptr = NULL;
if ( *se )
{
tmp = *se; *se = 0; tmp_ptr = se;
......@@ -282,10 +285,12 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
free(args->chr);
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
args->prev_base_pos = -1;
args->fa_buf.l = 0;
args->fa_length = 0;
args->fa_end_pos = to;
args->fa_ori_pos = from;
......@@ -372,13 +377,10 @@ static void flush_fa_buffer(args_t *args, int len)
}
static void apply_variant(args_t *args, bcf1_t *rec)
{
if ( rec->n_allele==1 ) return;
static int warned_haplotype = 0;
if ( rec->n_allele==1 && !args->missing_allele ) return;
if ( rec->pos <= args->fa_frz_pos )
{
fprintf(bcftools_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
if ( args->mask )
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
......@@ -397,35 +399,73 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( fmt->type!=BCF_BT_INT8 )
error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
uint8_t *ptr = fmt->p + fmt->size*args->isample;
if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
if ( args->haplotype > fmt->n )
{
if ( bcf_gt_is_missing(ptr[fmt->n-1]) || bcf_gt_is_missing(ptr[0]) )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
{
if ( !warned_haplotype )
{
fprintf(bcftools_stderr, "Can't apply %d-th haplotype at %s:%d. (This warning is printed only once.)\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
warned_haplotype = 1;
}
return;
}
}
else
{
ialt = (int8_t)ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int8_vector_end )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
ialt = bcf_gt_allele(ialt);
}
}
else if ( args->output_iupac )
{
ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else
ialt = bcf_gt_allele(ialt);
int jalt;
if ( fmt->n>1 )
{
jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
if ( bcf_gt_is_missing(jalt) )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
else if ( jalt==bcf_int32_vector_end ) jalt = ialt;
else
jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
if ( ialt>=0 )
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
}
else
......@@ -433,8 +473,13 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
if ( ptr[i]==bcf_int32_vector_end ) break;
if ( bcf_gt_is_missing(ptr[i]) )
{
if ( !args->missing_allele ) return; // ignore missing or half-missing genotypes
ialt = -1;
break;
}
if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
ialt = bcf_gt_allele(ptr[i]);
if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
}
......@@ -443,7 +488,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int prev_len = 0, jalt;
for (i=0; i<fmt->n; i++)
{
if ( ptr[i]==bcf_int32_vector_end ) break;
if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
jalt = bcf_gt_allele(ptr[i]);
if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( args->allele & (PICK_LONG|PICK_SHORT) )
......@@ -476,6 +521,25 @@ static void apply_variant(args_t *args, bcf1_t *rec)
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
if ( rec->n_allele==1 && ialt!=-1 ) return; // non-missing reference
if ( ialt==-1 )
{
char alleles[4];
alleles[0] = rec->d.allele[0][0];
alleles[1] = ',';
alleles[2] = args->missing_allele;
alleles[3] = 0;
bcf_update_alleles_str(args->hdr, rec, alleles);
ialt = 1;
}
// Overlapping variant? Can be still OK iff this is an insertion
if ( rec->pos <= args->fa_frz_pos && (rec->pos!=args->fa_frz_pos || rec->d.allele[0][0]!=rec->d.allele[ialt][0]) )
{
fprintf(bcftools_stderr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
return;
}
int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
if ( idx<0 )
......@@ -494,7 +558,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
}
if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
if ( rec->d.allele[ialt][0]=='<' )
......@@ -508,21 +572,37 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
{
// fprintf(bcftools_stderr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
char tmp = 0;
if ( args->fa_buf.l - idx > rec->rlen )
{
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
// This is hacky, handle a special case: if insert follows a deletion (AAC>A, C>CAA),
// the reference base in fa_buf is lost and the check fails. We do not keep a buffer
// with the original sequence as it should not be necessary, we should encounter max
// one base overlap
int fail = 1;
if ( args->prev_base_pos==rec->pos && toupper(rec->d.allele[0][0])==toupper(args->prev_base) )
{
if ( rec->rlen==1 ) fail = 0;
else if ( !strncasecmp(rec->d.allele[0]+1,args->fa_buf.s+idx+1,rec->rlen-1) ) fail = 0;
}
if ( fail )
{
char tmp = 0;
if ( args->fa_buf.l - idx > rec->rlen )
{
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
}
error(
"The fasta sequence does not match the REF allele at %s:%d:\n"
" .vcf: [%s]\n"
" .vcf: [%s] <- (ALT)\n"
" .fa: [%s]%c%s\n",
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
}
error(
"The fasta sequence does not match the REF allele at %s:%d:\n"
" .vcf: [%s]\n"
" .vcf: [%s] <- (ALT)\n"
" .fa: [%s]%c%s\n",
bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx,
tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
);
alen = strlen(rec->d.allele[ialt]);
len_diff = alen - rec->rlen;
}
else
{
......@@ -541,7 +621,11 @@ static void apply_variant(args_t *args, bcf1_t *rec)
for (i=0; i<alen; i++)
args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
if ( len_diff )
{
args->prev_base = rec->d.allele[0][rec->rlen - 1];
args->prev_base_pos = rec->pos + rec->rlen - 1;
memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
}
}
else
{
......@@ -591,10 +675,10 @@ static void mask_region(args_t *args, char *seq, int len)
static void consensus(args_t *args)
{
htsFile *fasta = hts_open(args->ref_fname, "rb");
BGZF *fasta = bgzf_open(args->ref_fname, "r");
if ( !fasta ) error("Error reading %s\n", args->ref_fname);
kstring_t str = {0,0,0};
while ( hts_getline(fasta, KS_SEP_LINE, &str) > 0 )
while ( bgzf_getline(fasta, '\n', &str) > 0 )
{
if ( str.s[0]=='>' )
{
......@@ -671,7 +755,7 @@ static void consensus(args_t *args)
destroy_chain(args);
}
flush_fa_buffer(args, 0);
hts_close(fasta);
bgzf_close(fasta);
free(str.s);
}
......@@ -683,7 +767,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(bcftools_stderr, "Options:\n");
fprintf(bcftools_stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(bcftools_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
......@@ -699,6 +783,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(bcftools_stderr, " -m, --mask <file> replace regions with N\n");
fprintf(bcftools_stderr, " -M, --missing <char> output <char> instead of skipping the missing genotypes\n");
fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(bcftools_stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(bcftools_stderr, "Examples:\n");
......@@ -724,11 +809,12 @@ int main_consensus(int argc, char *argv[])
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
{"mask",1,0,'m'},
{"missing",1,0,'M'},
{"chain",1,0,'c'},
{0,0,0,0}
};
int c;
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:",loptions,NULL)) >= 0)
{
switch (c)
{
......@@ -739,6 +825,10 @@ int main_consensus(int argc, char *argv[])
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'M':
args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
break;
case 'c': args->chain_fname = optarg; break;
case 'H':
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
......
......@@ -30,6 +30,7 @@ THE SOFTWARE. */
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <inttypes.h>
#include <math.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
......@@ -756,16 +757,37 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( line->n_allele > 100 )
error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8));
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( fmt_gt->n!=1 && fmt_gt->n!=2 )
error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1);
int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
ptr += fmt_gt->n;
if ( ptr[0]==2 )
if ( fmt_gt->n==1 ) // haploid genotypes
{
if ( ptr[0]==2 ) /* 0 */
{
str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==bcf_int8_missing ) /* . */
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==4 ) /* 1 */
{
str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else
{
kputw(bcf_gt_allele(ptr[0]),str); str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
}
else if ( ptr[0]==2 )
{
if ( ptr[1]==3 ) /* 0|0 */
{
......@@ -889,7 +911,7 @@ static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( line->n_allele > 100 )
error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8));
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
......
......@@ -32,6 +32,7 @@ THE SOFTWARE. */
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <inttypes.h>
#include <math.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
......@@ -758,16 +759,37 @@ static void process_gt_to_hap(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( line->n_allele > 100 )
error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8));
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( fmt_gt->n!=1 && fmt_gt->n!=2 )
error("Uh, ploidy of %d not supported, see %s:%d\n", fmt_gt->n, bcf_seqname(convert->header, line), line->pos+1);
int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
ptr += fmt_gt->n;
if ( ptr[0]==2 )
if ( fmt_gt->n==1 ) // haploid genotypes
{
if ( ptr[0]==2 ) /* 0 */
{
str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==bcf_int8_missing ) /* . */
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
else if ( ptr[0]==4 ) /* 1 */
{
str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
else
{
kputw(bcf_gt_allele(ptr[0]),str); str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
}
else if ( ptr[0]==2 )
{
if ( ptr[1]==3 ) /* 0|0 */
{
......@@ -891,7 +913,7 @@ static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int
if ( line->n_allele > 100 )
error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
error("Could not alloc %"PRIu64" bytes\n", (uint64_t)(str->l + convert->nsamples*8));
if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
......
......@@ -71,7 +71,7 @@
A .. gene line with a supported biotype
A.ID=~/^gene:/
B .. transcript line referencing A
B .. transcript line referencing A with supported biotype
B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
C .. corresponding CDS, exon, and UTR lines:
......@@ -595,6 +595,7 @@ typedef struct _args_t
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
faidx_t *fai;
kstring_t str, str2;
......@@ -1111,15 +1112,26 @@ void tscript_init_cds(args_t *args)
tr->cds[0]->len -= tr->cds[0]->phase;
tr->cds[0]->phase = 0;
// sanity check phase
// sanity check phase; the phase number in gff tells us how many bases to skip in this
// feature to reach the first base of the next codon
int tscript_ok = 1;
for (i=0; i<tr->ncds; i++)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
assert( phase == len%3 );
{
if ( args->force )
{
if ( args->quiet < 2 )
fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
else
{
......@@ -1140,13 +1152,24 @@ void tscript_init_cds(args_t *args)
tr->cds[i]->phase = 0;
// sanity check phase
int tscript_ok = 1;
for (i=tr->ncds-1; i>=0; i--)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
{
if ( args->force )
{
if ( args->quiet < 2 )
fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
// set len. At the same check that CDS within a transcript do not overlap
......@@ -1876,7 +1899,7 @@ fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
{
splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
return SPLICE_OVERLAP;
}
}
......@@ -2074,7 +2097,6 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
child->var = str.s;
child->type = HAP_SSS;
child->csq = splice.csq;
child->prev = parent->type==HAP_SSS ? parent->prev : parent;
child->rec = rec;
return 0;
}
......@@ -2092,7 +2114,7 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n
assert( dbeg <= splice.kalt.l );
}
if ( parent->type==HAP_SSS ) parent = parent->prev;
assert( parent->type!=HAP_SSS );
if ( parent->type==HAP_CDS )
{
i = parent->icds;
......@@ -2402,12 +2424,12 @@ fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
#endif
khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
int i;
for (i=0; i<vbuf->n; i++)
if ( vbuf->vrec[i]->line==rec ) break;
if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
vrec_t *vrec = vbuf->vrec[i];
// if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
......@@ -2769,26 +2791,20 @@ void hap_finalize(args_t *args, hap_t *hap)
hap->upstream_stop = 0;
int i = 1, dlen = 0, ibeg, indel = 0;
while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
hap->sbeg = hap->stack[i].node->sbeg;
assert( hap->stack[istack].node->type != HAP_SSS );
if ( tr->strand==STRAND_FWD )
{
i = 0, ibeg = -1;
while ( ++i <= istack )
{
if ( hap->stack[i].node->type == HAP_SSS )
{
// start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
hap_add_csq(args,hap,node,0,i,i,0,0);
continue;
}
assert( hap->stack[i].node->type != HAP_SSS );
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
// This condition extends compound variants.
if ( i<istack )
{
if ( dlen%3 ) // frameshift
{
......@@ -2839,14 +2855,10 @@ void hap_finalize(args_t *args, hap_t *hap)
i = istack + 1, ibeg = -1;
while ( --i > 0 )
{
if ( hap->stack[i].node->type == HAP_SSS )
{
hap_add_csq(args,hap,node,0,i,i,0,0);
continue;
}
assert ( hap->stack[i].node->type != HAP_SSS );
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
if ( i>1 )
{
if ( dlen%3 )
{
......@@ -3352,7 +3364,8 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
hap_ret = hap_init(args, parent, child, cds, rec, 1);
if ( hap_ret!=0 )
{
// overlapping or intron variant, cannot apply
if ( hap_ret==1 )
......@@ -3363,7 +3376,22 @@ int test_cds(args_t *args, bcf1_t *rec)
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
free(child);
hap_destroy(child);
continue;
}
if ( child->type==HAP_SSS )
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
hap_destroy(child);
ret = 1;
continue;
}
parent->nend--;
......@@ -3434,7 +3462,8 @@ int test_cds(args_t *args, bcf1_t *rec)
}
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
hap_ret = hap_init(args, parent, child, cds, rec, ial);
if ( hap_ret!=0 )
{
// overlapping or intron variant, cannot apply
if ( hap_ret==1 )
......@@ -3446,10 +3475,23 @@ int test_cds(args_t *args, bcf1_t *rec)
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
}
free(child);
hap_destroy(child);
continue;
}
if ( child->type==HAP_SSS )
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
hap_destroy(child);
continue;
}
if ( parent->cur_rec!=rec )
{
hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
......@@ -3708,6 +3750,7 @@ static const char *usage(void)
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" --force run even if some sanity checks fail\n"
" -i, --include <expr> select sites for which the expression is true\n"
" -o, --output <file> write output to a file [standard output]\n"
" -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
......@@ -3739,6 +3782,7 @@ int main_csq(int argc, char *argv[])
static struct option loptions[] =
{
{"force",0,0,1},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
{"custom-tag",1,0,'c'},
......@@ -3765,6 +3809,7 @@ int main_csq(int argc, char *argv[])
{
switch (c)
{
case 1 : args->force = 1; break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': args->quiet++; break;
......
......@@ -73,7 +73,7 @@
A .. gene line with a supported biotype
A.ID=~/^gene:/
B .. transcript line referencing A
B .. transcript line referencing A with supported biotype
B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
C .. corresponding CDS, exon, and UTR lines:
......@@ -597,6 +597,7 @@ typedef struct _args_t
csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
int ncsq_buf, mcsq_buf;
id_tbl_t tscript_ids; // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
faidx_t *fai;
kstring_t str, str2;
......@@ -1113,15 +1114,26 @@ void tscript_init_cds(args_t *args)
tr->cds[0]->len -= tr->cds[0]->phase;
tr->cds[0]->phase = 0;
// sanity check phase
// sanity check phase; the phase number in gff tells us how many bases to skip in this
// feature to reach the first base of the next codon
int tscript_ok = 1;
for (i=0; i<tr->ncds; i++)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
assert( phase == len%3 );
{
if ( args->force )
{
if ( args->quiet < 2 )
fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
else
{
......@@ -1142,13 +1154,24 @@ void tscript_init_cds(args_t *args)
tr->cds[i]->phase = 0;
// sanity check phase
int tscript_ok = 1;
for (i=tr->ncds-1; i>=0; i--)
{
int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
if ( phase!=len%3)
error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
{
if ( args->force )
{
if ( args->quiet < 2 )
fprintf(bcftools_stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
tscript_ok = 0;
break;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
// set len. At the same check that CDS within a transcript do not overlap
......@@ -1878,7 +1901,7 @@ fprintf(bcftools_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
{
splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
return SPLICE_OVERLAP;
}
}
......@@ -2076,7 +2099,6 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c
child->var = str.s;
child->type = HAP_SSS;
child->csq = splice.csq;
child->prev = parent->type==HAP_SSS ? parent->prev : parent;
child->rec = rec;
return 0;
}
......@@ -2094,7 +2116,7 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c
assert( dbeg <= splice.kalt.l );
}
if ( parent->type==HAP_SSS ) parent = parent->prev;
assert( parent->type!=HAP_SSS );
if ( parent->type==HAP_CDS )
{
i = parent->icds;
......@@ -2404,12 +2426,12 @@ fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
#endif
khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
int i;
for (i=0; i<vbuf->n; i++)
if ( vbuf->vrec[i]->line==rec ) break;
if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
vrec_t *vrec = vbuf->vrec[i];
// if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
......@@ -2771,26 +2793,20 @@ void hap_finalize(args_t *args, hap_t *hap)
hap->upstream_stop = 0;
int i = 1, dlen = 0, ibeg, indel = 0;
while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
hap->sbeg = hap->stack[i].node->sbeg;
assert( hap->stack[istack].node->type != HAP_SSS );
if ( tr->strand==STRAND_FWD )
{
i = 0, ibeg = -1;
while ( ++i <= istack )
{
if ( hap->stack[i].node->type == HAP_SSS )
{
// start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
hap_add_csq(args,hap,node,0,i,i,0,0);
continue;
}
assert( hap->stack[i].node->type != HAP_SSS );
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
// This condition extends compound variants.
if ( i<istack )
{
if ( dlen%3 ) // frameshift
{
......@@ -2841,14 +2857,10 @@ void hap_finalize(args_t *args, hap_t *hap)
i = istack + 1, ibeg = -1;
while ( --i > 0 )
{
if ( hap->stack[i].node->type == HAP_SSS )
{
hap_add_csq(args,hap,node,0,i,i,0,0);
continue;
}
assert ( hap->stack[i].node->type != HAP_SSS );
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
if ( i>1 )
{
if ( dlen%3 )
{
......@@ -3354,7 +3366,8 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
hap_ret = hap_init(args, parent, child, cds, rec, 1);
if ( hap_ret!=0 )
{
// overlapping or intron variant, cannot apply
if ( hap_ret==1 )
......@@ -3365,7 +3378,22 @@ int test_cds(args_t *args, bcf1_t *rec)
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
free(child);
hap_destroy(child);
continue;
}
if ( child->type==HAP_SSS )
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
hap_destroy(child);
ret = 1;
continue;
}
parent->nend--;
......@@ -3436,7 +3464,8 @@ int test_cds(args_t *args, bcf1_t *rec)
}
hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
hap_ret = hap_init(args, parent, child, cds, rec, ial);
if ( hap_ret!=0 )
{
// overlapping or intron variant, cannot apply
if ( hap_ret==1 )
......@@ -3448,10 +3477,23 @@ int test_cds(args_t *args, bcf1_t *rec)
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
}
free(child);
hap_destroy(child);
continue;
}
if ( child->type==HAP_SSS )
{
csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
csq.type.strand = tr->strand;
csq.type.trid = tr->id;
csq.type.gene = tr->gene->name;
csq.type.type = child->csq;
csq_stage(args, &csq, rec);
hap_destroy(child);
continue;
}
if ( parent->cur_rec!=rec )
{
hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
......@@ -3710,6 +3752,7 @@ static const char *usage(void)
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" --force run even if some sanity checks fail\n"
" -i, --include <expr> select sites for which the expression is true\n"
" -o, --output <file> write output to a file [standard output]\n"
" -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
......@@ -3741,6 +3784,7 @@ int main_csq(int argc, char *argv[])
static struct option loptions[] =
{
{"force",0,0,1},
{"help",0,0,'h'},
{"ncsq",1,0,'n'},
{"custom-tag",1,0,'c'},
......@@ -3767,6 +3811,7 @@ int main_csq(int argc, char *argv[])
{
switch (c)
{
case 1 : args->force = 1; break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': args->quiet++; break;
......
This diff is collapsed.
This diff is collapsed.