Commit 47d88078 authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 0.0.20141212

parents
*~
.Rhistory
.*swp
.nfs*
*.o
BedReader.cpp
Fasta.cpp
Fasta.h
Makefile.bad
Multinomial.cpp
Multinomial.h
Pilot1
Pilot2
VCF.h
VariantFilter.h
b.vcf
bugs/
callgrind.out.7143
f
freebayes.chr20.integrated.nogeno.20101123.vcf
glorder
glorder.cpp
glorder.py
glorder.pyc
gmon.out
multimaptest.cpp
pooled.sqlite
pooled.sqlite3
shunt
shunt.c
t.bed
test.db
test.vcf
test.vcf.gz
test.vcf.gz.tbi
test/
vcf2tsv
vcfaddinfo
vcfaddtag.cpp
vcfafpath
vcfallelicprimitives
vcfaltcount
vcfannotate
vcfannotategenotypes
vcfbreakmulti
vcfcheck
vcfclassify
vcfcleancomplex
vcfcommonsamples
vcfcountalleles
vcfcreatemulti
vcfdistance
vcfecho
vcfentropy
vcffilter
vcffixup
vcffixup.cpp.bak
vcfflatten
vcfgeno2haplo
vcfgenotypecompare
vcfgenotypes
vcfglxgt
vcfhaplotyecompare.cpp
vcfhetcount
vcfhethomratio
vcfintersect
vcfkeepfields
vcfkeepgeno
vcfkeepinfo
vcfkeepsamples
vcflength
vcfmultiwaywwwindexfilter
vcfnogeno
vcfnogeno.cpp
vcfnumalt
vcfoverlay
vcfparallel
vcfparsealts
vcfphylo.cpp
vcfplotaltdiscrepancy.r.loess
vcfplottstv.r
vcfprimers
vcfrandom
vcfrandomsample
vcfremap
vcfremoveaberrantgenotypes
vcfremovesamples
vcfroc
vcfsamplediff
vcfsamplenames
vcfsitesummarize
vcfsom
vcfsplit.cpp
vcfstats
vcfstreamsort
vcfuniqalleles
#vcfcountalleles.cpp#
.vcfplotaltdiscrepancy.r.swo
.vcfstats.cpp.swn
.vcfstats.cpp.swo
a.out
vcfuniq
vcfcat
vcfevenregions
vcfgenosummarize
vcfgenosamplenames
vcf2fasta
bin/vcf2dag
bin/vcfcombine
bin/vcfgeno2alleles
bin/vcfglbound
bin/vcfindex
bin/vcfinfo2qual
bin/vcfinfosummarize
bin/vcfleftalign
bin/vcfqual2info
bin/vcfsample2info
libvcflib.a
[submodule "tabixpp"]
path = tabixpp
url = https://github.com/ekg/tabixpp.git
[submodule "smithwaterman"]
path = smithwaterman
url = https://github.com/ekg/smithwaterman.git
[submodule "multichoose"]
path = multichoose
url = https://github.com/ekg/multichoose.git
[submodule "fastahack"]
path = fastahack
url = https://github.com/ekg/fastahack.git
[submodule "intervaltree"]
path = intervaltree
url = https://github.com/ekg/intervaltree.git
[submodule "fsom"]
path = fsom
url = https://github.com/ekg/fsom.git
[submodule "filevercmp"]
path = filevercmp
url = https://github.com/ekg/filevercmp.git
Copyright (c) 2012 Erik Garrison
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
#OBJ_DIR = ./
HEADERS = src/Variant.h \
src/split.h \
src/join.h
SOURCES = src/Variant.cpp \
src/split.cpp
OBJECTS= $(SOURCES:.cpp=.o)
# TODO
#vcfstats.cpp
BIN_SOURCES = src/vcfecho.cpp \
src/vcfaltcount.cpp \
src/vcfhetcount.cpp \
src/vcfhethomratio.cpp \
src/vcffilter.cpp \
src/vcf2tsv.cpp \
src/vcfgenotypes.cpp \
src/vcfannotategenotypes.cpp \
src/vcfcommonsamples.cpp \
src/vcfremovesamples.cpp \
src/vcfkeepsamples.cpp \
src/vcfsamplenames.cpp \
src/vcfgenotypecompare.cpp \
src/vcffixup.cpp \
src/vcfclassify.cpp \
src/vcfsamplediff.cpp \
src/vcfremoveaberrantgenotypes.cpp \
src/vcfrandom.cpp \
src/vcfparsealts.cpp \
src/vcfstats.cpp \
src/vcfflatten.cpp \
src/vcfprimers.cpp \
src/vcfnumalt.cpp \
src/vcfcleancomplex.cpp \
src/vcfintersect.cpp \
src/vcfannotate.cpp \
src/vcfallelicprimitives.cpp \
src/vcfoverlay.cpp \
src/vcfaddinfo.cpp \
src/vcfkeepinfo.cpp \
src/vcfkeepgeno.cpp \
src/vcfafpath.cpp \
src/vcfcountalleles.cpp \
src/vcflength.cpp \
src/vcfdistance.cpp \
src/vcfrandomsample.cpp \
src/vcfentropy.cpp \
src/vcfglxgt.cpp \
src/vcfroc.cpp \
src/vcfcheck.cpp \
src/vcfstreamsort.cpp \
src/vcfuniq.cpp \
src/vcfuniqalleles.cpp \
src/vcfremap.cpp \
src/vcf2fasta.cpp \
src/vcfsitesummarize.cpp \
src/vcfbreakmulti.cpp \
src/vcfcreatemulti.cpp \
src/vcfevenregions.cpp \
src/vcfcat.cpp \
src/vcfgenosummarize.cpp \
src/vcfgenosamplenames.cpp \
src/vcfgeno2haplo.cpp \
src/vcfleftalign.cpp \
src/vcfcombine.cpp \
src/vcfgeno2alleles.cpp \
src/vcfindex.cpp \
src/vcf2dag.cpp \
src/vcfsample2info.cpp \
src/vcfqual2info.cpp \
src/vcfinfo2qual.cpp \
src/vcfglbound.cpp \
src/vcfinfosummarize.cpp
# when we can figure out how to build on mac
# src/vcfsom.cpp
#BINS = $(BIN_SOURCES:.cpp=)
BINS = $(addprefix bin/,$(notdir $(BIN_SOURCES:.cpp=)))
SHORTBINS = $(notdir $(BIN_SOURCES:.cpp=))
TABIX = tabixpp/tabix.o
FASTAHACK = fastahack/Fasta.o
SMITHWATERMAN = smithwaterman/SmithWatermanGotoh.o
REPEATS = smithwaterman/Repeats.o
INDELALLELE = smithwaterman/IndelAllele.o
DISORDER = smithwaterman/disorder.o
LEFTALIGN = smithwaterman/LeftAlign.o
FSOM = fsom/fsom.o
FILEVERCMP = filevercmp/filevercmp.o
INCLUDES = -I. -L. -Ltabixpp/
LDFLAGS = -lvcflib -ltabix -lz -lm
all: $(OBJECTS) $(BINS)
CXX = g++
CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64
#CXXFLAGS = -O2
#CXXFLAGS = -pedantic -Wall -Wshadow -Wpointer-arith -Wcast-qual
SSW = src/ssw.o src/ssw_cpp.o
ssw.o: src/ssw.h
ssw_cpp.o:src/ssw_cpp.h
openmp:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -fopenmp -D HAS_OPENMP"
profiling:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -g" all
gprof:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -pg" all
$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX)
$(CXX) -c -o $@ src/$(*F).cpp $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
$(TABIX):
cd tabixpp && $(MAKE)
$(SMITHWATERMAN):
cd smithwaterman && $(MAKE)
$(DISORDER): $(SMITHWATERMAN)
$(REPEATS): $(SMITHWATERMAN)
$(LEFTALIGN): $(SMITHWATERMAN)
$(INDELALLELE): $(SMITHWATERMAN)
$(FASTAHACK):
cd fastahack && $(MAKE)
#$(FSOM):
# cd fsom && $(CXX) $(CXXFLAGS) -c fsom.c -lm
$(FILEVERCMP):
cd filevercmp && make
$(SHORTBINS):
$(MAKE) bin/$@
$(BINS): $(BIN_SOURCES) libvcflib.a $(OBJECTS) $(SMITHWATERMAN) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP)
$(CXX) src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
libvcflib.a: $(OBJECTS) $(SMITHWATERMAN) $(REPEATS) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP) $(TABIX)
ar rs libvcflib.a $(OBJECTS) smithwaterman/sw.o $(FASTAHACK) $(SSW) $(FILEVERCMP) $(TABIX) tabixpp/bgzf.o tabixpp/index.o tabixpp/knetfile.o tabixpp/kstring.o
test: $(BINS)
@prove -Itests/lib -w tests/*.t
clean:
rm -f $(BINS) $(OBJECTS)
rm -f ssw_cpp.o ssw.o
rm -f libvcflib.a
cd tabixpp && make clean
cd smithwaterman && make clean
cd fastahack && make clean
.PHONY: clean all test
This diff is collapsed.
#!/usr/bin/perl
while (<STDIN>) {
$_ =~ /^(.+?)\s(.+?)\s(.+)\s*/;
$chrom = $1;
$pos = $2;
$end = $3;
print $chrom . ":" . $pos . "-" . $end . "\n";
}
#!/usr/bin/Rscript
require(plyr)
require(ggplot2)
require(pracma)
require(grid)
argv <- commandArgs(trailingOnly = TRUE)
prefix <- gsub("\\s","", argv[1])
print(prefix)
truthset <- argv[2]
print(truthset)
results <- argv[3]
print(results)
xmin <- as.numeric(argv[4])
xmax <- as.numeric(argv[5])
ymin <- as.numeric(argv[6])
ymax <- as.numeric(argv[7])
roc <- read.delim(results)
bests <- ddply(roc, .(set), function(x) { data.frame(best_snps=with(x, min(false_negative_snps + false_positive_snps)), best_snp_threshold=min(subset(x, (false_negative_snps + false_positive_snps) == with(x, min(false_negative_snps + false_positive_snps)))$threshold ), best_indels=with(x, min(false_negative_indels + false_positive_indels)), best_indel_threshold=min(subset(x, (false_negative_indels + false_positive_indels) == with(x, min(false_negative_indels + false_positive_indels)))$threshold )) })
write.table(bests, paste(prefix, ".bests.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
#abs(trapz(c(1, roc$complexfpr), c(1, roc$complextpr)))
true_snps <- with(subset(roc, set==truthset), max(num_snps))
true_indels <- with(subset(roc, set==truthset), max(num_indels))
# get ROC AUC
auc <- ddply(roc, .(set),
function(x) {
data.frame(
snp_auc=ifelse(true_snps>0,
with(x,
abs(trapz(c(1,
false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps))),
c(max(1- false_negative_snps/true_snps),
1- false_negative_snps/true_snps)))),
0),
indel_auc=ifelse(true_indels>0,
with(x,
abs(trapz(c(1,
false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels))),
c(max(1- false_negative_indels/true_indels),
1- false_negative_indels/true_indels)))),
0)
)
}
)
write.table(auc, paste(prefix, ".auc.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
rocsnps <- ddply(roc, .(set),
function(x) {
data.frame(
FPR=
with(x,
c(1,
false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps)))),
TPR=
with(x,
c(max(1- false_negative_snps/true_snps),
1- false_negative_snps/true_snps)),
type=as.factor("snps")
)
}
)
rocindels <- ddply(roc, .(set),
function(x) {
data.frame(
FPR=
with(x,
c(1,
false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels)))),
TPR=
with(x,
c(max(1- false_negative_indels/true_indels),
1- false_negative_indels/true_indels)),
type=as.factor("indels")
)
}
)
if (FALSE) {
if (true_snps>0) {
ggplot(subset(roc, set != truthset),
aes(false_positive_snps/(false_positive_snps+with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps))),
1- false_negative_snps/with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps)),
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+ coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
}
if (true_indels>0) {
ggplot(subset(roc, set != truthset),
aes(false_positive_indels/(false_positive_indels+with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels))),
1- false_negative_indels/with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels)),
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+ coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
}
}
# new versions
if (true_snps>0) {
ggplot(subset(rocsnps, set != truthset),
aes(FPR,
TPR,
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
}
if (true_indels>0) {
ggplot(subset(rocindels, set != truthset),
aes(FPR,
TPR,
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
}
if (true_indels>0 && true_snps>0) {
(
ggplot(subset(rbind(rocsnps,rocindels), set != truthset),
aes(FPR,
TPR,
group=set,
color=set))
+ scale_x_continuous("false positive rate")
+ scale_y_continuous("true positive rate")
+ geom_path()
+ theme_bw()
+ coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ facet_grid(type ~ .)
+ theme(panel.margin = unit(1, "lines"))
)
ggsave(paste(prefix, ".both.png", sep=""), height=5, width=5)
}
#!/usr/bin/python
import sys
for line in sys.stdin:
if line.startswith('#'):
continue
fields = line.strip().split()
# VCF is 1-based, BED is 0-based half open
# print out chrom, start, end,
chrom = fields[0]
start = int(fields[1]) - 1
span = len(fields[3]) # handle multi-base alleles
end = start + span
name = fields[2]
print "\t".join(map(str, [chrom, start, end, name]))
#!/usr/bin/python
import sys
import re
import sqlite3
if len(sys.argv) < 2:
print "usage", sys.argv[0], " [dbname]"
print "reads VCF on stdin, and writes output to a sqlite3 db [dbname]"
exit(1)
dbname = sys.argv[1]
# parse the header
# into a mapping from tag -> type
infotypes = {}
infonumbers = {}
for line in sys.stdin:
if line.startswith('##INFO'):
#<ID=XRS,Number=1,Type=Float,
i = re.search("ID=(.*?),", line)
n = re.search("Number=(.*?),", line)
t = re.search("Type=(.*?),", line)
if i and n and t:
id = i.groups()[0]
number = n.groups()[0]
if number == "A":
number = -1
elif number == "G" or int(number) > 1:
# unclear how to deal with these
continue
else:
number = int(number)
typestr = t.groups()[0]
infotypes[id] = typestr
infonumbers[id] = number
else:
continue
elif line.startswith('##'):
continue
else:
break # header line, sample names etc.
# write the table schema
infotype_to_sqltype = {}
infotype_to_sqltype["Flag"] = "boolean"
infotype_to_sqltype["Integer"] = "integer"
infotype_to_sqltype["Float"] = "real"
infotype_to_sqltype["String"] = "text"
tablecmd = """create table alleles"""
specs = ["CHROM text",
"POS integer",
"ID text",
"REF text",
"ALT text",
"QUAL real",
"FILTER text"]
sorted_fields = sorted(infotypes.keys())
for field in sorted_fields:
infotype = infotypes[field]
sqltype = infotype_to_sqltype[infotype]
field = field.replace(".", "_") # escape periods, which are not allowed
specs.append(field + " " + sqltype)
tablecmd += " (" + ", ".join(specs) + ")"
conn = sqlite3.connect(dbname)
conn.execute(tablecmd)
# for each record
# parse the record
# for each allele
for line in sys.stdin:
fields = line.split('\t')
chrom, pos, id, ref, alts, qual, filter, info = fields[:8]
alts = alts.split(",")
altindex = 0
chrom = "\'" + chrom + "\'"
id = "\'" + id + "\'"
ref = "\'" + ref + "\'"
filter = "\'" + filter + "\'"
for alt in alts:
alt = "\'" + alt + "\'"
info_values = {}
for pair in info.split(";"):
if pair.find("=") is not -1:
pair = pair.split("=")
key = pair[0]
value = pair[1]
if not infonumbers.has_key(key):
continue
if infonumbers[key] == -1:
values = value.split(",")
value = values[altindex]
info_values[key] = value
else:
# boolean flag
info_values[pair] = "1"
ordered_insertion = []
for field in sorted_fields:
value = "null"
if info_values.has_key(field):
value = info_values[field]
if infotypes[field] == "String":
value = "\'" + value + "\'"
else:
# missing flag means "false" for that flag
if infotypes[field] == "Flag":
value = "0"
ordered_insertion.append(value)
cmd = "insert into alleles values (" \
+ ", ".join([chrom, pos, id, ref, alt, qual, filter]) \
+ ", " \
+ ", ".join(ordered_insertion) + ")"
conn.execute(cmd)
altindex += 1
conn.commit()
# TODO ignoring samples (for now)
# add indexes everywhere?
conn.close()
#!/usr/bin/perl
my $seen_non_header = 0;
while (<STDIN>) {
if (! $seen_non_header) {
if (/^#/) {
} else {
$seen_non_header = 1;
}
print;
} else {
if (! /^#/) {
print;
}
}
}
#!/usr/bin/perl
#
while (<STDIN>) {
if ($_ =~ /^#/) {
print;
} else {
$_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
$chrom = $1;
$pos = $2;
$tag = $3;
$ref = $4;
$alt = $5;
if ($alt =~ /,/) {
# remove anything which isn't biallelic
} else {
print;
}
}
}
#!/usr/bin/python
#
import sys
for line in sys.stdin:
if line.startswith("#"