Skip to content
Commits on Source (4)
......@@ -5,6 +5,18 @@ http://pysam.readthedocs.io/en/latest/release.html
Release notes
=============
Release 0.14.1
==============
This is mostly a bugfix release, though bcftools has now also been
upgraded to 1.7.0.
* [#621] Add a warning to count_coverage when an alignment has an
empty QUAL field
* [#635] Speed-up of AlignedSegment.find_intro()
* treat border case of all bases in pileup column below quality score
* [#634] Fix access to pileup reference_sequence
Release 0.14.0
==============
......
......@@ -63,6 +63,23 @@ static inline char gt2iupac(char a, char b)
return iupac[(int)a][(int)b];
}
static inline int iupac_consistent(char iupac, char nt)
{
static const char iupac_mask[90] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,14,2,
13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
};
if ( iupac > 89 ) return 0;
if ( nt > 90 ) nt -= 32; // lowercase
if ( nt=='A' ) nt = 1;
else if ( nt=='C' ) nt = 2;
else if ( nt=='G' ) nt = 4;
else if ( nt=='T' ) nt = 8;
return iupac_mask[(int)iupac] & nt ? 1 : 0;
}
static inline char nt_to_upper(char nt)
{
if ( nt < 97 ) return nt;
......
......@@ -77,6 +77,7 @@ typedef struct
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
int nvcf_buf, rid;
char *chr;
regidx_t *mask;
regitr_t *itr;
......@@ -121,6 +122,8 @@ static void destroy_chain(args_t *args)
free(chain->block_lengths);
free(chain);
chain = NULL;
free(args->chr);
args->chr = NULL;
}
static void print_chain(args_t *args)
......@@ -162,7 +165,7 @@ static void print_chain(args_t *args)
score += chain->block_lengths[n];
}
score += last_block_size;
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
......@@ -248,6 +251,7 @@ static void destroy_data(args_t *args)
if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
free(args->vcf_buf);
free(args->fa_buf.s);
free(args->chr);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
......@@ -276,6 +280,7 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
......@@ -380,7 +385,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
}
int i, ialt = 1;
int i, ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
......@@ -417,6 +422,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
......@@ -565,11 +571,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
static void mask_region(args_t *args, char *seq, int len)
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
while ( regitr_overlap(args->itr) )
......
......@@ -79,6 +79,7 @@ typedef struct
rbuf_t vcf_rbuf;
bcf1_t **vcf_buf;
int nvcf_buf, rid;
char *chr;
regidx_t *mask;
regitr_t *itr;
......@@ -123,6 +124,8 @@ static void destroy_chain(args_t *args)
free(chain->block_lengths);
free(chain);
chain = NULL;
free(args->chr);
args->chr = NULL;
}
static void print_chain(args_t *args)
......@@ -164,7 +167,7 @@ static void print_chain(args_t *args)
score += chain->block_lengths[n];
}
score += last_block_size;
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, bcf_hdr_id2name(args->hdr,args->rid), ref_end_pos, chain->ori_pos, ref_end_pos, bcf_hdr_id2name(args->hdr,args->rid), alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
fprintf(args->fp_chain, "chain %d %s %d + %d %d %s %d + %d %d %d\n", score, args->chr, ref_end_pos, chain->ori_pos, ref_end_pos, args->chr, alt_end_pos, chain->ori_pos, alt_end_pos, ++args->chain_id);
for (n=0; n<chain->num; n++) {
fprintf(args->fp_chain, "%d %d %d\n", chain->block_lengths[n], chain->ref_gaps[n], chain->alt_gaps[n]);
}
......@@ -250,6 +253,7 @@ static void destroy_data(args_t *args)
if ( args->vcf_buf[i] ) bcf_destroy1(args->vcf_buf[i]);
free(args->vcf_buf);
free(args->fa_buf.s);
free(args->chr);
if ( args->mask ) regidx_destroy(args->mask);
if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
......@@ -278,6 +282,7 @@ static void init_region(args_t *args, char *line)
else to--;
}
}
args->chr = strdup(line);
args->rid = bcf_hdr_name2id(args->hdr,line);
if ( args->rid<0 ) fprintf(bcftools_stderr,"Warning: Sequence \"%s\" not in %s\n", line,args->fname);
args->fa_buf.l = 0;
......@@ -382,7 +387,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
}
int i, ialt = 1;
int i, ialt = 1; // the alternate allele
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
......@@ -419,6 +424,7 @@ static void apply_variant(args_t *args, bcf1_t *rec)
{
char ial = rec->d.allele[ialt][0];
char jal = rec->d.allele[jalt][0];
if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored
rec->d.allele[ialt][0] = gt2iupac(ial,jal);
}
}
......@@ -567,11 +573,10 @@ static void apply_variant(args_t *args, bcf1_t *rec)
static void mask_region(args_t *args, char *seq, int len)
{
char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
while ( regitr_overlap(args->itr) )
......
/* convert.c -- functions for converting between VCF/BCF and related formats.
Copyright (C) 2013-2017 Genome Research Ltd.
Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -92,6 +92,7 @@ struct _convert_t
int ndat;
char *undef_info_tag;
int allow_undef_tags;
uint8_t **subset_samples;
};
typedef struct
......@@ -174,6 +175,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
}
return ((int32_t*)bcf_array)[idx];
}
static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
{
int n = 0, ibeg = 0;
while ( src[ibeg] && ibeg<len && n < idx )
{
if ( src[ibeg]==',' ) n++;
ibeg++;
}
if ( ibeg==len ) { kputc('.', str); return; }
int iend = ibeg;
while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
if ( iend>ibeg )
kputsn(src+ibeg, iend-ibeg, str);
else
kputc('.', str);
}
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( fmt->id<0 )
......@@ -232,6 +251,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
......@@ -288,6 +308,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
else
kputw(ival, str);
}
else if ( fmt->fmt->type == BCF_BT_CHAR )
_copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
}
else
......@@ -1312,6 +1334,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
}
for (js=0; js<convert->nsamples; js++)
{
// Skip samples when filtering was requested
if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// it's very very slow. Therefore in case the handler does not add
......@@ -1362,6 +1387,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
break;
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
default:
ret = -1;
}
......
......@@ -2,7 +2,7 @@
/* convert.c -- functions for converting between VCF/BCF and related formats.
Copyright (C) 2013-2017 Genome Research Ltd.
Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -94,6 +94,7 @@ struct _convert_t
int ndat;
char *undef_info_tag;
int allow_undef_tags;
uint8_t **subset_samples;
};
typedef struct
......@@ -176,6 +177,24 @@ static inline int32_t bcf_array_ivalue(void *bcf_array, int type, int idx)
}
return ((int32_t*)bcf_array)[idx];
}
static inline void _copy_field(char *src, uint32_t len, int idx, kstring_t *str)
{
int n = 0, ibeg = 0;
while ( src[ibeg] && ibeg<len && n < idx )
{
if ( src[ibeg]==',' ) n++;
ibeg++;
}
if ( ibeg==len ) { kputc('.', str); return; }
int iend = ibeg;
while ( src[iend] && src[iend]!=',' && iend<len ) iend++;
if ( iend>ibeg )
kputsn(src+ibeg, iend-ibeg, str);
else
kputc('.', str);
}
static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( fmt->id<0 )
......@@ -234,6 +253,7 @@ static void process_info(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isamp
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
case BCF_BT_CHAR: _copy_field((char*)info->vptr, info->vptr_len, fmt->subscript, str); break;
default: fprintf(bcftools_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
......@@ -290,6 +310,8 @@ static void process_format(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isa
else
kputw(ival, str);
}
else if ( fmt->fmt->type == BCF_BT_CHAR )
_copy_field((char*)(fmt->fmt->p + isample*fmt->fmt->size), fmt->fmt->size, fmt->subscript, str);
else error("TODO: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt->fmt->type);
}
else
......@@ -1314,6 +1336,9 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
}
for (js=0; js<convert->nsamples; js++)
{
// Skip samples when filtering was requested
if ( *convert->subset_samples && !(*convert->subset_samples)[js] ) continue;
// Here comes a hack designed for TBCSQ. When running on large files,
// such as 1000GP, there are too many empty fields in the output and
// it's very very slow. Therefore in case the handler does not add
......@@ -1364,6 +1389,9 @@ int convert_set_option(convert_t *convert, enum convert_option opt, ...)
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
break;
case subset_samples:
convert->subset_samples = va_arg(args, uint8_t**);
break;
default:
ret = -1;
}
......
......@@ -30,7 +30,8 @@ THE SOFTWARE. */
typedef struct _convert_t convert_t;
enum convert_option
{
allow_undef_tags
allow_undef_tags,
subset_samples,
};
convert_t *convert_init(bcf_hdr_t *hdr, int *samples, int nsamples, const char *str);
......
/* The MIT License
Copyright (c) 2016 Genome Research Ltd.
Copyright (c) 2016-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -726,7 +726,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
id = tbl->nstr++;
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
tbl->str[id] = strdup(ss);
int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
......@@ -2713,6 +2713,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
}
}
void hap_finalize(args_t *args, hap_t *hap)
{
tscript_t *tr = hap->tr;
......@@ -2784,7 +2785,10 @@ void hap_finalize(args_t *args, hap_t *hap)
}
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i<istack )
// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
{
if ( dlen%3 ) // frameshift
{
......@@ -3388,7 +3392,7 @@ int test_cds(args_t *args, bcf1_t *rec)
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
if ( gt[0]==bcf_gt_missing ) continue;
if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
{
if ( args->phase==PHASE_MERGE )
{
......@@ -3397,7 +3401,7 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
......@@ -3648,6 +3652,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
int call_csq = 1;
if ( !rec->n_allele ) call_csq = 0; // no alternate allele
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc
else if ( args->filter )
{
call_csq = filter_test(args->filter, rec, NULL);
......@@ -3695,12 +3700,12 @@ static const char *usage(void)
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
" -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
" -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased GTs\n"
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" -i, --include <expr> select sites for which the expression is true\n"
......
......@@ -2,7 +2,7 @@
/* The MIT License
Copyright (c) 2016 Genome Research Ltd.
Copyright (c) 2016-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -728,7 +728,7 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char
id = tbl->nstr++;
hts_expand(char*, tbl->nstr, tbl->mstr, tbl->str);
tbl->str[id] = strdup(ss);
int ret = khash_str2int_set(tbl->str2id, tbl->str[id], id);
khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
......@@ -2715,6 +2715,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg,
}
}
void hap_finalize(args_t *args, hap_t *hap)
{
tscript_t *tr = hap->tr;
......@@ -2786,7 +2787,10 @@ void hap_finalize(args_t *args, hap_t *hap)
}
dlen += hap->stack[i].node->dlen;
if ( hap->stack[i].node->dlen ) indel = 1;
if ( i<istack )
// This condition extends compound variants. Note that s/s/s sites are forced out to always break
// a compound block. See ENST00000271583/splice-acceptor.vcf for motivation.
if ( i<istack && hap->stack[i+1].node->type != HAP_SSS )
{
if ( dlen%3 ) // frameshift
{
......@@ -3390,7 +3394,7 @@ int test_cds(args_t *args, bcf1_t *rec)
int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
if ( gt[0]==bcf_gt_missing ) continue;
if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
if ( ngts>1 && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end && bcf_gt_allele(gt[0])!=bcf_gt_allele(gt[1]) )
{
if ( args->phase==PHASE_MERGE )
{
......@@ -3399,7 +3403,7 @@ int test_cds(args_t *args, bcf1_t *rec)
if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
{
if ( args->phase==PHASE_REQUIRE )
error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
error("Unphased heterozygous genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
if ( args->phase==PHASE_SKIP )
continue;
if ( args->phase==PHASE_NON_REF )
......@@ -3650,6 +3654,7 @@ void process(args_t *args, bcf1_t **rec_ptr)
int call_csq = 1;
if ( !rec->n_allele ) call_csq = 0; // no alternate allele
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
else if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][0]!='*') call_csq = 0; // a symbolic allele, not ready for CNVs etc
else if ( args->filter )
{
call_csq = filter_test(args->filter, rec, NULL);
......@@ -3697,12 +3702,12 @@ static const char *usage(void)
" -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
" -l, --local-csq localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
" -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
" -p, --phase <a|m|r|R|s> how to handle unphased heterozygous genotypes: [r]\n"
" a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
" m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
" r: require phased GTs, throw an error on unphased het GTs\n"
" R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
" s: skip unphased GTs\n"
" s: skip unphased hets\n"
"Options:\n"
" -e, --exclude <expr> exclude sites for which the expression is true\n"
" -i, --include <expr> select sites for which the expression is true\n"
......
This diff is collapsed.
This diff is collapsed.
......@@ -49,7 +49,7 @@ int main_vcfcall(int argc, char *argv[]);
int main_vcfannotate(int argc, char *argv[]);
int main_vcfroh(int argc, char *argv[]);
int main_vcfconcat(int argc, char *argv[]);
int main_bcftools_reheader(int argc, char *argv[]);
int main_reheader(int argc, char *argv[]);
int main_vcfconvert(int argc, char *argv[]);
int main_vcfcnv(int argc, char *argv[]);
#if USE_GPL
......@@ -125,7 +125,7 @@ static cmd_t cmds[] =
.alias = "query",
.help = "transform VCF/BCF into user-defined formats"
},
{ .func = main_bcftools_reheader,
{ .func = main_reheader,
.alias = "reheader",
.help = "modify VCF/BCF header, change sample names"
},
......
/*
Copyright (C) 2014-2016 Genome Research Ltd.
Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -189,6 +189,35 @@ int regidx_insert(regidx_t *idx, char *line)
return 0;
}
regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
if ( !idx ) return NULL;
idx->free = free_f;
idx->parse = parser ? parser : regidx_parse_tab;
idx->usr = usr_dat;
idx->seq2regs = khash_str2int_init();
idx->payload_size = payload_size;
if ( payload_size ) idx->payload = malloc(payload_size);
kstring_t tmp = {0,0,0};
const char *ss = str;
while ( *ss )
{
while ( *ss && isspace(*ss) ) ss++;
const char *se = ss;
while ( *se && *se!='\r' && *se!='\n' ) se++;
tmp.l = 0;
kputsn(ss, se-ss, &tmp);
regidx_insert(idx,tmp.s);
while ( *se && isspace(*se) ) se++;
ss = se;
}
free(tmp.s);
return idx;
}
regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
if ( !parser )
......@@ -595,4 +624,13 @@ int regitr_loop(regitr_t *regitr)
}
void regitr_copy(regitr_t *dst, regitr_t *src)
{
_itr_t *dst_itr = (_itr_t*) dst->itr;
_itr_t *src_itr = (_itr_t*) src->itr;
*dst_itr = *src_itr;
*dst = *src;
dst->itr = dst_itr;
}
#include "bcftools.pysam.h"
/*
Copyright (C) 2014-2016 Genome Research Ltd.
Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -191,6 +191,35 @@ int regidx_insert(regidx_t *idx, char *line)
return 0;
}
regidx_t *regidx_init_string(const char *str, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
if ( !idx ) return NULL;
idx->free = free_f;
idx->parse = parser ? parser : regidx_parse_tab;
idx->usr = usr_dat;
idx->seq2regs = khash_str2int_init();
idx->payload_size = payload_size;
if ( payload_size ) idx->payload = malloc(payload_size);
kstring_t tmp = {0,0,0};
const char *ss = str;
while ( *ss )
{
while ( *ss && isspace(*ss) ) ss++;
const char *se = ss;
while ( *se && *se!='\r' && *se!='\n' ) se++;
tmp.l = 0;
kputsn(ss, se-ss, &tmp);
regidx_insert(idx,tmp.s);
while ( *se && isspace(*se) ) se++;
ss = se;
}
free(tmp.s);
return idx;
}
regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
{
if ( !parser )
......@@ -597,4 +626,13 @@ int regitr_loop(regitr_t *regitr)
}
void regitr_copy(regitr_t *dst, regitr_t *src)
{
_itr_t *dst_itr = (_itr_t*) dst->itr;
_itr_t *src_itr = (_itr_t*) src->itr;
*dst_itr = *src_itr;
*dst = *src;
dst->itr = dst_itr;
}
......@@ -109,6 +109,8 @@ int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
/*
* regidx_init() - creates new index
* regidx_init_string() - creates new index, from a string rather than from a file
*
* @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
* @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
* the format will be autodected, currently either regidx_parse_tab (the default) or
......@@ -121,6 +123,7 @@ int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*);
* Returns index on success or NULL on error.
*/
regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
/*
* regidx_destroy() - free memory allocated by regidx_init
......@@ -184,6 +187,11 @@ int regitr_overlap(regitr_t *itr);
*/
int regitr_loop(regitr_t *itr);
/*
* regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
*/
void regitr_copy(regitr_t *dst, regitr_t *src);
#ifdef __cplusplus
}
#endif
......
/* reheader.c -- reheader subcommand.
Copyright (C) 2014,2016 Genome Research Ltd.
Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -467,7 +467,7 @@ static void usage(args_t *args)
exit(1);
}
int main_bcftools_reheader(int argc, char *argv[])
int main_reheader(int argc, char *argv[])
{
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
......
......@@ -2,7 +2,7 @@
/* reheader.c -- reheader subcommand.
Copyright (C) 2014,2016 Genome Research Ltd.
Copyright (C) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -469,7 +469,7 @@ static void usage(args_t *args)
exit(1);
}
int main_bcftools_reheader(int argc, char *argv[])
int main_reheader(int argc, char *argv[])
{
int c;
args_t *args = (args_t*) calloc(1,sizeof(args_t));
......
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
Copyright (C) 2013-2016 Genome Research Ltd.
Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -95,6 +95,7 @@ typedef struct _args_t
filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
int keep_sites;
rm_tag_t *rm; // tags scheduled for removal
int nrm;
......@@ -263,7 +264,7 @@ static void init_remove_annots(args_t *args)
tag->key = strdup(str.s);
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
}
else
{
......@@ -293,32 +294,35 @@ static void init_remove_annots(args_t *args)
tag->key = strdup(str.s);
if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
bcf_hdr_remove(args->hdr_out,type,tag->key);
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
}
}
else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
else if ( !strcasecmp("FILTER",str.s) )
{
tag->handler = remove_filter;
remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
}
else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
else if ( !strcasecmp("INFO",str.s) )
{
tag->handler = remove_info;
remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
}
else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
{
tag->handler = remove_format;
remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
}
else if ( str.l )
{
if ( !args->keep_sites )
{
if ( str.s[0]=='#' && str.s[1]=='#' )
bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
else
bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
}
args->nrm--;
}
......@@ -354,7 +358,7 @@ static void init_remove_annots(args_t *args)
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
}
tag->key = strdup(hrec->vals[k]);
bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
}
}
khash_str2int_destroy_free(keep);
......@@ -1870,6 +1874,7 @@ static void usage(args_t *args)
fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
fprintf(stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
fprintf(stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
......@@ -1879,7 +1884,7 @@ static void usage(args_t *args)
fprintf(stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
fprintf(stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
fprintf(stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
fprintf(stderr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
fprintf(stderr, " -x, --remove <list> list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(stderr, "\n");
exit(1);
......@@ -1901,6 +1906,7 @@ int main_vcfannotate(int argc, char *argv[])
static struct option loptions[] =
{
{"keep-sites",required_argument,NULL,'k'},
{"mark-sites",required_argument,NULL,'m'},
{"set-id",required_argument,NULL,'I'},
{"output",required_argument,NULL,'o'},
......@@ -1921,9 +1927,10 @@ int main_vcfannotate(int argc, char *argv[])
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0)
{
switch (c) {
case 'k': args->keep_sites = 1; break;
case 'm':
args->mark_sites_logic = MARK_LISTED;
if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
......@@ -2008,7 +2015,11 @@ int main_vcfannotate(int argc, char *argv[])
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
if ( !pass ) continue;
if ( !pass )
{
if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line);
continue;
}
}
annotate(args, line);
bcf_write1(args->out_fh, args->hdr_out, line);
......
......@@ -2,7 +2,7 @@
/* vcfannotate.c -- Annotate and edit VCF/BCF files.
Copyright (C) 2013-2016 Genome Research Ltd.
Copyright (C) 2013-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -97,6 +97,7 @@ typedef struct _args_t
filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
int keep_sites;
rm_tag_t *rm; // tags scheduled for removal
int nrm;
......@@ -265,7 +266,7 @@ static void init_remove_annots(args_t *args)
tag->key = strdup(str.s);
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, tag->key);
if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,tag->hdr_id) ) error("Cannot remove %s, not defined in the header.\n", str.s);
bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,BCF_HL_FLT,tag->key);
}
else
{
......@@ -295,32 +296,35 @@ static void init_remove_annots(args_t *args)
tag->key = strdup(str.s);
if ( type==BCF_HL_INFO ) tag->handler = remove_info_tag;
else if ( type==BCF_HL_FMT ) tag->handler = remove_format_tag;
bcf_hdr_remove(args->hdr_out,type,tag->key);
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,type,tag->key);
}
}
else if ( !strcasecmp("ID",str.s) ) tag->handler = remove_id;
else if ( !strcasecmp("FILTER",str.s) )
{
tag->handler = remove_filter;
remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FLT);
}
else if ( !strcasecmp("QUAL",str.s) ) tag->handler = remove_qual;
else if ( !strcasecmp("INFO",str.s) )
{
tag->handler = remove_info;
remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_INFO);
}
else if ( !strcasecmp("FMT",str.s) || !strcasecmp("FORMAT",str.s) )
{
tag->handler = remove_format;
remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
if ( !args->keep_sites ) remove_hdr_lines(args->hdr_out,BCF_HL_FMT);
}
else if ( str.l )
{
if ( !args->keep_sites )
{
if ( str.s[0]=='#' && str.s[1]=='#' )
bcf_hdr_remove(args->hdr_out,BCF_HL_GEN,str.s+2);
else
bcf_hdr_remove(args->hdr_out,BCF_HL_STR,str.s);
}
args->nrm--;
}
......@@ -356,7 +360,7 @@ static void init_remove_annots(args_t *args)
tag->hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, hrec->vals[k]);
}
tag->key = strdup(hrec->vals[k]);
bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
if ( !args->keep_sites ) bcf_hdr_remove(args->hdr_out,hrec->type,tag->key);
}
}
khash_str2int_destroy_free(keep);
......@@ -1872,6 +1876,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
fprintf(bcftools_stderr, " -I, --set-id [+]<format> set ID column, see man page for details\n");
fprintf(bcftools_stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(bcftools_stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n");
fprintf(bcftools_stderr, " -m, --mark-sites [+-]<tag> add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n");
fprintf(bcftools_stderr, " --no-version do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output <file> write output to a file [standard output]\n");
......@@ -1881,7 +1886,7 @@ static void usage(args_t *args)
fprintf(bcftools_stderr, " --rename-chrs <file> rename sequences according to map file: from\\tto\n");
fprintf(bcftools_stderr, " -s, --samples [^]<list> comma separated list of samples to annotate (or exclude with \"^\" prefix)\n");
fprintf(bcftools_stderr, " -S, --samples-file [^]<file> file of samples to annotate (or exclude with \"^\" prefix)\n");
fprintf(bcftools_stderr, " -x, --remove <list> list of annotations to remove (e.g. ID,INFO/DP,FORMAT/DP,FILTER). See man page for details\n");
fprintf(bcftools_stderr, " -x, --remove <list> list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n");
fprintf(bcftools_stderr, " --threads <int> number of extra output compression threads [0]\n");
fprintf(bcftools_stderr, "\n");
exit(1);
......@@ -1903,6 +1908,7 @@ int main_vcfannotate(int argc, char *argv[])
static struct option loptions[] =
{
{"keep-sites",required_argument,NULL,'k'},
{"mark-sites",required_argument,NULL,'m'},
{"set-id",required_argument,NULL,'I'},
{"output",required_argument,NULL,'o'},
......@@ -1923,9 +1929,10 @@ int main_vcfannotate(int argc, char *argv[])
{"no-version",no_argument,NULL,8},
{NULL,0,NULL,0}
};
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:k",loptions,NULL)) >= 0)
{
switch (c) {
case 'k': args->keep_sites = 1; break;
case 'm':
args->mark_sites_logic = MARK_LISTED;
if ( optarg[0]=='+' ) args->mark_sites = optarg+1;
......@@ -2010,7 +2017,11 @@ int main_vcfannotate(int argc, char *argv[])
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
if ( !pass ) continue;
if ( !pass )
{
if ( args->keep_sites ) bcf_write1(args->out_fh, args->hdr_out, line);
continue;
}
}
annotate(args, line);
bcf_write1(args->out_fh, args->hdr_out, line);
......
/* The MIT License
Copyright (c) 2014-2015 Genome Research Ltd.
Copyright (c) 2014-2018 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
......@@ -338,7 +338,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
"csv.register_dialect('tab', delimiter='\\t', quoting=csv.QUOTE_NONE)\n"
"\n"
"dat = {}\n"
"with open('%s', 'rb') as f:\n"
"with open('%s', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
......@@ -347,7 +347,7 @@ static void plot_sample(args_t *args, sample_t *smpl)
" dat[chr].append([row[1], float(row[2]), float(row[3])])\n"
"\n"
"cnv = {}\n"
"with open('%s', 'rb') as f:\n"
"with open('%s', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
......@@ -429,7 +429,7 @@ static void create_plots(args_t *args)
"\n"
"def chroms_to_plot(th):\n"
" dat = {}\n"
" with open('%s/summary.tab', 'rb') as f:\n"
" with open('%s/summary.tab', 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" if row[0]!='RG': continue\n"
......@@ -451,14 +451,14 @@ static void create_plots(args_t *args)
" plot_chroms = chroms_to_plot(args.plot_threshold)\n"
"\n"
"def read_dat(file,dat,plot_chr):\n"
" with open(file, 'rb') as f:\n"
" with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
" if chr != plot_chr: continue\n"
" dat.append([row[1], float(row[2]), float(row[3])])\n"
"def read_cnv(file,cnv,plot_chr):\n"
" with open(file, 'rb') as f:\n"
" with open(file, 'r') as f:\n"
" reader = csv.reader(f, 'tab')\n"
" for row in reader:\n"
" chr = row[0]\n"
......