Commit 564aed27 authored by Andreas Tille's avatar Andreas Tille

New upstream version 1.0.0~rc2+dfsg

parent dd96c448
......@@ -19,3 +19,9 @@
[submodule "filevercmp"]
path = filevercmp
url = https://github.com/ekg/filevercmp.git
[submodule "googletest"]
path = googletest
url = https://github.com/google/googletest.git
[submodule "libVCFH"]
path = libVCFH
url = https://github.com/edawson/libVCFH.git
language: cpp
sudo: required
compiler:
- gcc
- gcc
before_install:
- sudo apt-get install g++
- sudo apt-get install build-essential
- git submodule update --init --recursive
- sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
- ls /etc/apt/sources.list.d
- sudo apt-get update
- sudo apt-get install -qq gcc-4.9 g++-4.9
- sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-4.9
- gcc --version && g++ --version
- sudo apt-get install build-essential
- git submodule update --init --recursive
os:
- linux
script: make
\ No newline at end of file
- linux
script: make
......@@ -25,8 +25,10 @@ OBJ_DIR:=obj
#vcfstats.cpp
BIN_SOURCES = src/vcfecho.cpp \
src/vcfnormalizesvs.cpp \
src/dumpContigsFromHeader.cpp \
src/bFst.cpp \
src/pVst.cpp \
src/hapLrt.cpp \
src/popStats.cpp \
src/wcFst.cpp \
......@@ -37,7 +39,7 @@ BIN_SOURCES = src/vcfecho.cpp \
src/sequenceDiversity.cpp \
src/pFst.cpp \
src/smoother.cpp \
src/LD.cpp \
src/vcfld.cpp \
src/plotHaps.cpp \
src/abba-baba.cpp \
src/permuteGPAT++.cpp \
......@@ -105,6 +107,8 @@ BIN_SOURCES = src/vcfecho.cpp \
src/vcfqual2info.cpp \
src/vcfinfo2qual.cpp \
src/vcfglbound.cpp \
src/vcfunphase.cpp \
src/vcfnull2ref.cpp \
src/vcfinfosummarize.cpp
# when we can figure out how to build on mac
......@@ -125,23 +129,24 @@ FSOM = fsom/fsom.o
FILEVERCMP = filevercmp/filevercmp.o
INCLUDES = -Itabixpp/htslib -I$(INC_DIR) -L. -Ltabixpp/htslib
LDFLAGS = -L$(LIB_DIR) -lvcflib -lhts -lpthread -lz -lm -fopenmp
LDFLAGS = -L$(LIB_DIR) -lvcflib -lhts -lpthread -lz -lm -llzma -lbz2
all: $(OBJECTS) $(BINS)
all: $(OBJECTS) $(BINS) scriptToBin
scriptToBin: $(BINS)
cp scripts/* bin
GIT_VERSION := $(shell git describe --abbrev=4 --dirty --always)
CXX = g++
CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64 -std=c++0x -DVERSION=\"$(GIT_VERSION)\"
CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64 -std=c++0x
#CXXFLAGS = -O2
#CXXFLAGS = -pedantic -Wall -Wshadow -Wpointer-arith -Wcast-qual
SSW = src/ssw.o src/ssw_cpp.o
ssw.o: src/ssw.h
ssw_cpp.o:src/ssw_cpp.h
ssw.o: src/ssw.hpp
ssw_cpp.o:src/ssw_cpp.hpp
openmp:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -fopenmp -D HAS_OPENMP"
......@@ -152,7 +157,7 @@ profiling:
gprof:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -pg" all
$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX) multichoose pre $(SMITHWATERMAN) $(FILEVERCMP)
$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX) multichoose pre $(SMITHWATERMAN) $(FILEVERCMP) $(FASTAHACK)
$(CXX) -c -o $@ src/$(*F).cpp $(INCLUDES) $(LDFLAGS) $(CXXFLAGS) && cp src/*.h* $(VCF_LIB_LOCAL)/$(INC_DIR)/
multichoose: pre
......@@ -188,7 +193,7 @@ $(SHORTBINS): pre
$(MAKE) bin/$@
$(BINS): $(BIN_SOURCES) libvcflib.a $(OBJECTS) $(SMITHWATERMAN) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP) pre intervaltree
$(CXX) src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
$(CXX) src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS) -DVERSION=\"$(GIT_VERSION)\"
libvcflib.a: $(OBJECTS) $(SMITHWATERMAN) $(REPEATS) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP) $(TABIX) pre
ar rs libvcflib.a $(OBJECTS) smithwaterman/sw.o $(FASTAHACK) $(SSW) $(FILEVERCMP) $(TABIX)
......
#OBJ_DIR = ./
HEADERS = src/Variant.h \
src/split.h \
src/join.h
SOURCES = src/Variant.cpp \
src/split.cpp
OBJECTS= $(SOURCES:.cpp=.o)
# TODO
#vcfstats.cpp
BIN_SOURCES = src/vcfecho.cpp \
src/vcfaltcount.cpp \
src/vcfhetcount.cpp \
src/vcfhethomratio.cpp \
src/vcffilter.cpp \
src/vcf2tsv.cpp \
src/vcfgenotypes.cpp \
src/vcfannotategenotypes.cpp \
src/vcfcommonsamples.cpp \
src/vcfremovesamples.cpp \
src/vcfkeepsamples.cpp \
src/vcfsamplenames.cpp \
src/vcfgenotypecompare.cpp \
src/vcffixup.cpp \
src/vcfclassify.cpp \
src/vcfsamplediff.cpp \
src/vcfremoveaberrantgenotypes.cpp \
src/vcfrandom.cpp \
src/vcfparsealts.cpp \
src/vcfstats.cpp \
src/vcfflatten.cpp \
src/vcfprimers.cpp \
src/vcfnumalt.cpp \
src/vcfcleancomplex.cpp \
src/vcfintersect.cpp \
src/vcfannotate.cpp \
src/vcfallelicprimitives.cpp \
src/vcfoverlay.cpp \
src/vcfaddinfo.cpp \
src/vcfkeepinfo.cpp \
src/vcfkeepgeno.cpp \
src/vcfafpath.cpp \
src/vcfcountalleles.cpp \
src/vcflength.cpp \
src/vcfdistance.cpp \
src/vcfrandomsample.cpp \
src/vcfentropy.cpp \
src/vcfglxgt.cpp \
src/vcfroc.cpp \
src/vcfcheck.cpp \
src/vcfstreamsort.cpp \
src/vcfuniq.cpp \
src/vcfuniqalleles.cpp \
src/vcfremap.cpp \
src/vcf2fasta.cpp \
src/vcfsitesummarize.cpp \
src/vcfbreakmulti.cpp \
src/vcfcreatemulti.cpp \
src/vcfevenregions.cpp \
src/vcfcat.cpp \
src/vcfgenosummarize.cpp \
src/vcfgenosamplenames.cpp \
src/vcfgeno2haplo.cpp \
src/vcfleftalign.cpp \
src/vcfcombine.cpp \
src/vcfgeno2alleles.cpp \
src/vcfindex.cpp \
src/vcf2dag.cpp \
src/vcfsample2info.cpp \
src/vcfqual2info.cpp \
src/vcfinfo2qual.cpp \
src/vcfglbound.cpp \
src/vcfinfosummarize.cpp
# when we can figure out how to build on mac
# src/vcfsom.cpp
#BINS = $(BIN_SOURCES:.cpp=)
BINS = $(addprefix bin/,$(notdir $(BIN_SOURCES:.cpp=)))
SHORTBINS = $(notdir $(BIN_SOURCES:.cpp=))
TABIX = tabixpp/tabix.o
FASTAHACK = fastahack/Fasta.o
SMITHWATERMAN = smithwaterman/SmithWatermanGotoh.o
REPEATS = smithwaterman/Repeats.o
INDELALLELE = smithwaterman/IndelAllele.o
DISORDER = smithwaterman/disorder.o
LEFTALIGN = smithwaterman/LeftAlign.o
FSOM = fsom/fsom.o
FILEVERCMP = filevercmp/filevercmp.o
INCLUDES = -I. -Itabixpp/htslib/ -L. -Ltabixpp/ -Ltabixpp/htslib/
LDFLAGS = -lvcflib -lhts -lpthread -lz -lm
all: $(OBJECTS) $(BINS)
CXX = g++
CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64
#CXXFLAGS = -O2
#CXXFLAGS = -pedantic -Wall -Wshadow -Wpointer-arith -Wcast-qual
SSW = src/ssw.o src/ssw_cpp.o
ssw.o: src/ssw.h
ssw_cpp.o:src/ssw_cpp.h
openmp:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -fopenmp -D HAS_OPENMP"
profiling:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -g" all
gprof:
$(MAKE) CXXFLAGS="$(CXXFLAGS) -pg" all
$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX)
$(CXX) -c -o $@ src/$(*F).cpp $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
$(TABIX):
cd tabixpp && $(MAKE)
$(SMITHWATERMAN):
cd smithwaterman && $(MAKE)
$(DISORDER): $(SMITHWATERMAN)
$(REPEATS): $(SMITHWATERMAN)
$(LEFTALIGN): $(SMITHWATERMAN)
$(INDELALLELE): $(SMITHWATERMAN)
$(FASTAHACK):
cd fastahack && $(MAKE)
#$(FSOM):
# cd fsom && $(CXX) $(CXXFLAGS) -c fsom.c -lm
$(FILEVERCMP):
cd filevercmp && make
$(SHORTBINS):
$(MAKE) bin/$@
$(BINS): $(BIN_SOURCES) libvcflib.a $(OBJECTS) $(SMITHWATERMAN) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP)
$(CXX) src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)
libvcflib.a: $(OBJECTS) $(SMITHWATERMAN) $(REPEATS) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FILEVERCMP) $(TABIX)
ar rs libvcflib.a $(OBJECTS) smithwaterman/sw.o $(FASTAHACK) $(SSW) $(FILEVERCMP) $(TABIX)
test: $(BINS)
@prove -Itests/lib -w tests/*.t
clean:
rm -f $(BINS) $(OBJECTS)
rm -f ssw_cpp.o ssw.o
rm -f libvcflib.a
cd tabixpp && make clean
cd smithwaterman && make clean
cd fastahack && make clean
.PHONY: clean all test
......@@ -5,7 +5,7 @@
#### license: MIT
[![Gitter](https://badges.gitter.im/Join Chat.svg)](https://gitter.im/ekg/vcflib?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Build Status](https://travis-ci.org/vcflib/vcflib.svg?branch=master)](https://travis-ci.org/vcflib/vcflib)
[![Gitter](https://badges.gitter.im/ekg/vcflib.svg)](https://gitter.im/ekg/vcflib?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![Build Status](https://travis-ci.org/vcflib/vcflib.svg?branch=master)](https://travis-ci.org/vcflib/vcflib)
---
## overview
......@@ -25,6 +25,19 @@ It is both:
The API itself provides a quick and extremely permissive method to read and write VCF files.
Extensions and applications of the library provided in the included utilities (*.cpp) comprise the vast bulk of the library's utility for most users.
## download and install
1. Under the repository name, click to copy the clone URL for the repository. ![](https://help.github.com/assets/images/help/repository/clone-repo-clone-url-button.png)
2. Go to the location where you want the cloned directory to be made: `cd <PathWhereIWantToCloneVcflib>`
3. Type `git clone --recursive`, and then paste the URL you copied in Step 1.
4. Enter the cloned directory and type `make` to compile the programs. If you want to use threading type `make openmp` instead of `make`. Only a few VCFLIB tools are threaded.
5. Once make is finished the executables are ready in the folder `<PathWhereIWantToCloneVcflib>/vcflib/bin/`. Set this path as an environment variable in the .bashrc file to access executables form everywhere on your proile OR call the executables from the path where they are.
## usage
vcflib provides a variety of functions for VCF manipulation:
......
#!/usr/bin/env perl
while (<STDIN>) {
$_ =~ /^(.+?)\s(.+?)\s(.+)\s*/;
$chrom = $1;
$pos = $2;
$end = $3;
print $chrom . ":" . $pos . "-" . $end . "\n";
}
#!/usr/bin/env bash
if [ $# -ne 1 ];
then
echo usage: $0 '[file name]'
echo runs bgzip on the input and tabix indexes the result
echo "== bgzip >[file] && tabix -fp vcf [file]"
exit
fi
file=$1
bgzip >$file && tabix -fp vcf $file
#!/usr/bin/env Rscript
require(plyr)
require(ggplot2)
require(pracma)
require(grid)
argv <- commandArgs(trailingOnly = TRUE)
prefix <- gsub("\\s","", argv[1])
print(prefix)
truthset <- argv[2]
print(truthset)
results <- argv[3]
print(results)
xmin <- as.numeric(argv[4])
xmax <- as.numeric(argv[5])
ymin <- as.numeric(argv[6])
ymax <- as.numeric(argv[7])
roc <- read.delim(results)
bests <- ddply(roc, .(set), function(x) { data.frame(best_snps=with(x, min(false_negative_snps + false_positive_snps)), best_snp_threshold=min(subset(x, (false_negative_snps + false_positive_snps) == with(x, min(false_negative_snps + false_positive_snps)))$threshold ), best_indels=with(x, min(false_negative_indels + false_positive_indels)), best_indel_threshold=min(subset(x, (false_negative_indels + false_positive_indels) == with(x, min(false_negative_indels + false_positive_indels)))$threshold )) })
write.table(bests, paste(prefix, ".bests.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
#abs(trapz(c(1, roc$complexfpr), c(1, roc$complextpr)))
true_snps <- with(subset(roc, set==truthset), max(num_snps))
true_indels <- with(subset(roc, set==truthset), max(num_indels))
# get ROC AUC
auc <- ddply(roc, .(set),
function(x) {
data.frame(
snp_auc=ifelse(true_snps>0,
with(x,
abs(trapz(c(1,
false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps))),
c(max(1- false_negative_snps/true_snps),
1- false_negative_snps/true_snps)))),
0),
indel_auc=ifelse(true_indels>0,
with(x,
abs(trapz(c(1,
false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels))),
c(max(1- false_negative_indels/true_indels),
1- false_negative_indels/true_indels)))),
0)
)
}
)
write.table(auc, paste(prefix, ".auc.tsv", sep=""), row.names=FALSE, quote=FALSE, sep="\t")
rocsnps <- ddply(roc, .(set),
function(x) {
data.frame(
FPR=
with(x,
c(1,
false_positive_snps/(false_positive_snps+ max(false_negative_snps + num_snps - false_positive_snps)))),
TPR=
with(x,
c(max(1- false_negative_snps/true_snps),
1- false_negative_snps/true_snps)),
type=as.factor("snps")
)
}
)
rocindels <- ddply(roc, .(set),
function(x) {
data.frame(
FPR=
with(x,
c(1,
false_positive_indels/(false_positive_indels+ max(false_negative_indels + num_indels - false_positive_indels)))),
TPR=
with(x,
c(max(1- false_negative_indels/true_indels),
1- false_negative_indels/true_indels)),
type=as.factor("indels")
)
}
)
if (FALSE) {
if (true_snps>0) {
ggplot(subset(roc, set != truthset),
aes(false_positive_snps/(false_positive_snps+with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps))),
1- false_negative_snps/with(subset(roc, set==set), max(false_negative_snps + num_snps - false_positive_snps)),
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+ coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
}
if (true_indels>0) {
ggplot(subset(roc, set != truthset),
aes(false_positive_indels/(false_positive_indels+with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels))),
1- false_negative_indels/with(subset(roc, set==set), max(false_negative_indels + num_indels - false_positive_indels)),
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw()
+ coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
}
}
# new versions
if (true_snps>0) {
ggplot(subset(rocsnps, set != truthset),
aes(FPR,
TPR,
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".snps.png", sep=""), height=6, width=9)
}
if (true_indels>0) {
ggplot(subset(rocindels, set != truthset),
aes(FPR,
TPR,
group=set,
color=set)) + scale_x_continuous("false positive rate") + scale_y_continuous("true positive rate") + geom_path() + theme_bw() + coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
ggsave(paste(prefix, ".indels.png", sep=""), height=6, width=9)
}
if (true_indels>0 && true_snps>0) {
(
ggplot(subset(rbind(rocsnps,rocindels), set != truthset),
aes(FPR,
TPR,
group=set,
color=set))
+ scale_x_continuous("false positive rate")
+ scale_y_continuous("true positive rate")
+ geom_path()
+ theme_bw()
+ coord_cartesian(xlim=c(xmin,xmax), ylim=c(ymin,ymax))
+ facet_grid(type ~ .)
+ theme(panel.margin = unit(1, "lines"))
)
ggsave(paste(prefix, ".both.png", sep=""), height=5, width=5)
}
#!/usr/bin/env python
import sys
for line in sys.stdin:
if line.startswith('#'):
continue
fields = line.strip().split()
# VCF is 1-based, BED is 0-based half open
# print out chrom, start, end,
chrom = fields[0]
start = int(fields[1]) - 1
span = len(fields[3]) # handle multi-base alleles
end = start + span
name = fields[2]
print "\t".join(map(str, [chrom, start, end, name]))
#!/usr/bin/env python
import sys
import re
import sqlite3
if len(sys.argv) < 2:
print "usage", sys.argv[0], " [dbname]"
print "reads VCF on stdin, and writes output to a sqlite3 db [dbname]"
exit(1)
dbname = sys.argv[1]
# parse the header
# into a mapping from tag -> type
infotypes = {}
infonumbers = {}
for line in sys.stdin:
if line.startswith('##INFO'):
#<ID=XRS,Number=1,Type=Float,
i = re.search("ID=(.*?),", line)
n = re.search("Number=(.*?),", line)
t = re.search("Type=(.*?),", line)
if i and n and t:
id = i.groups()[0]
number = n.groups()[0]
if number == "A":
number = -1
elif number == "G" or int(number) > 1:
# unclear how to deal with these
continue
else:
number = int(number)
typestr = t.groups()[0]
infotypes[id] = typestr
infonumbers[id] = number
else:
continue
elif line.startswith('##'):
continue
else:
break # header line, sample names etc.
# write the table schema
infotype_to_sqltype = {}
infotype_to_sqltype["Flag"] = "boolean"
infotype_to_sqltype["Integer"] = "integer"
infotype_to_sqltype["Float"] = "real"
infotype_to_sqltype["String"] = "text"
tablecmd = """create table alleles"""
specs = ["CHROM text",
"POS integer",
"ID text",
"REF text",
"ALT text",
"QUAL real",
"FILTER text"]
sorted_fields = sorted(infotypes.keys())
for field in sorted_fields:
infotype = infotypes[field]
sqltype = infotype_to_sqltype[infotype]
field = field.replace(".", "_") # escape periods, which are not allowed
specs.append(field + " " + sqltype)
tablecmd += " (" + ", ".join(specs) + ")"
conn = sqlite3.connect(dbname)
conn.execute(tablecmd)
# for each record
# parse the record
# for each allele
for line in sys.stdin:
fields = line.split('\t')
chrom, pos, id, ref, alts, qual, filter, info = fields[:8]
alts = alts.split(",")
altindex = 0
chrom = "\'" + chrom + "\'"
id = "\'" + id + "\'"
ref = "\'" + ref + "\'"
filter = "\'" + filter + "\'"
for alt in alts:
alt = "\'" + alt + "\'"
info_values = {}
for pair in info.split(";"):
if pair.find("=") is not -1:
pair = pair.split("=")
key = pair[0]
value = pair[1]
if not infonumbers.has_key(key):
continue
if infonumbers[key] == -1:
values = value.split(",")
value = values[altindex]
info_values[key] = value
else:
# boolean flag
info_values[pair] = "1"
ordered_insertion = []
for field in sorted_fields:
value = "null"
if info_values.has_key(field):
value = info_values[field]
if infotypes[field] == "String":
value = "\'" + value + "\'"
else:
# missing flag means "false" for that flag
if infotypes[field] == "Flag":
value = "0"
ordered_insertion.append(value)
cmd = "insert into alleles values (" \
+ ", ".join([chrom, pos, id, ref, alt, qual, filter]) \
+ ", " \
+ ", ".join(ordered_insertion) + ")"
conn.execute(cmd)
altindex += 1
conn.commit()
# TODO ignoring samples (for now)
# add indexes everywhere?
conn.close()
#!/usr/bin/env perl
my $seen_non_header = 0;
while (<STDIN>) {
if (! $seen_non_header) {
if (/^#/) {
} else {
$seen_non_header = 1;
}
print;
} else {
if (! /^#/) {
print;
}
}
}
#!/usr/bin/env perl
#
while (<STDIN>) {
if ($_ =~ /^#/) {
print;
} else {
$_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/;
$chrom = $1;
$pos = $2;
$tag = $3;
$ref = $4;
$alt = $5;
if ($alt =~ /,/) {
# remove anything which isn't biallelic
} else {
print;
}
}
}
#!/usr/bin/env python
#
import sys
for line in sys.stdin:
if line.startswith("#"):
print line.strip()
else:
fields = line.strip().split("\t")
fields[2] = "."
print "\t".join(fields)
#!/usr/bin/env python
#
import sys