Skip to content
Commits on Source (4)
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>Sniffles</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
</buildSpec>
<natures>
</natures>
</projectDescription>
......@@ -7,9 +7,9 @@ option(STATIC "Build static binary" OFF)
set( SNIF_VERSION_MINOR 0 )
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Building in debug mode!")
set( SNIF_VERSION_BUILD 8-debug )
set( SNIF_VERSION_BUILD 10-debug )
else()
set( SNIF_VERSION_BUILD 8 )
set( SNIF_VERSION_BUILD 10 )
ENDIF()
......
......@@ -4,6 +4,24 @@ Sniffles is a structural variation caller using third generation sequencing (Pac
Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki)
# How to build Sniffles
<pre>wget https://github.com/fritzsedlazeck/Sniffles/archive/master.tar.gz -O Sniffles.tar.gz
tar xzvf Sniffles.tar.gz
cd Sniffles-master/
mkdir -p build/
cd build/
cmake ..
make
cd ../bin/sniffles*
./sniffles</pre>
Note Mac users often have to provide parameters to the cmake command:
<pre>cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
</pre>
**************************************
## NGMLR
Sniffles performs best with the mappings of NGMLR our novel long read mapping method.
......@@ -12,8 +30,8 @@ https://github.com/philres/ngmlr
****************************************
## Citation:
Please see and cite our preprint:
http://www.biorxiv.org/content/early/2017/07/28/169557
Please see and cite our paper:
https://www.nature.com/articles/s41592-018-0001-7
**************************************
## Poster & Talks:
......
sniffles (1.0.10+ds-1) unstable; urgency=low
* Team upload.
* New upstream version.
-- Steffen Moeller <moeller@debian.org> Mon, 15 Oct 2018 19:10:58 +0200
sniffles (1.0.8+ds-2) unstable; urgency=low
* Use dh_installman rather than debian/manpages (Closes: #903192)
......
......@@ -10,9 +10,9 @@ Build-Depends:
libtclap-dev,
zlib1g-dev,
pandoc <!nodoc>,
Standards-Version: 4.1.1
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/sniffles.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/sniffles.git
Standards-Version: 4.2.1
Vcs-Browser: https://salsa.debian.org/med-team/sniffles
Vcs-Git: https://salsa.debian.org/med-team/sniffles.git
Homepage: http://fritzsedlazeck.github.io/Sniffles
Package: sniffles
......@@ -23,7 +23,7 @@ Depends:
Suggests:
bwa,
Description: structural variation caller using third-generation sequencing
Sniffles is a structural variation caller using third-generation sequencing
data such as those from Pacific Biosciences or Oxford Nanopore platforms.
It detects all types of SVs using evidence from split-read alignments,
high-mismatch regions, and coverage analysis.
Sniffles is a structural variation (SV) caller using third-generation
sequencing data such as those from Pacific Biosciences or Oxford
Nanopore platforms. It detects all types of SVs using evidence from
split-read alignments, high-mismatch regions, and coverage analysis.
......@@ -19,6 +19,7 @@ override_dh_auto_configure:
override_dh_auto_clean:
dh_auto_clean
find -name "*.mk" -delete
rm -rf bin
$(RM) \
makefile \
debian/sniffles.1 \
......
......@@ -136,6 +136,7 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
if (al->CigarData[0].Type == 'S') {
read_pos += al->CigarData[0].Length;
}
for (size_t i = 0; i < al->CigarData.size(); i++) {
if (al->CigarData[i].Type == 'M') {
pos += al->CigarData[i].Length;
......@@ -176,15 +177,16 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
}
}
/*if (flag) {
//exit(0);
if (flag) {
std::cout << "FIRST:" << std::endl;
for (size_t i = 0; i < events.size(); i++) {
if (abs(events[i].type) > 200) {
// if (abs(events[i].type) > 200) {
cout << events[i].position << " " << events[i].type << endl;
}
// }
}
cout << endl;
}*/
}
//set ref length requ. later on:
this->ref_len = pos - getPosition(); //TODO compare to get_length!
......@@ -676,6 +678,7 @@ bool Alignment::overlapping_segments(vector<aln_str> entries) {
return (entries.size() == 2 && abs(entries[0].pos - entries[1].pos) < 100);
}
vector<aln_str> Alignment::getSA(RefVector ref) {
string sa;
vector<aln_str> entries;
if (al->GetTag("SA", sa) && !sa.empty()) {
......@@ -690,7 +693,7 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
uint32_t sv;
al->GetTag("SV", sv);
tmp.cross_N = ((sv & Ns_CLIPPED));
bool flag = strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0;
bool flag = strcmp(getName().c_str(),"0bac61ef-7819-462b-ae3d-32c68fe580c0")==0; //Parameter::Instance()->read_name.c_str()) == 0;
get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop);
if (flag) {
......@@ -791,8 +794,7 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
double Alignment::get_scrore_ratio() {
uint score = -1;
uint subscore = -1;
if (al->GetTag("AS", score)) {
al->GetTag("XS", subscore);
if (al->GetTag("AS", score) && al->GetTag("XS", subscore)) {
if (subscore == 0) {
subscore = 1;
}
......@@ -804,6 +806,7 @@ bool Alignment::get_is_save() {
string sa;
double score = get_scrore_ratio(); //TODO should I use this again for bwa?
// cout<<score<<endl;
return !((al->GetTag("XA", sa) && !sa.empty()) || (al->GetTag("XT", sa) && !sa.empty())) && (score == -1 || score > Parameter::Instance()->score_treshold); //|| //TODO: 7.5
}
......@@ -923,6 +926,7 @@ std::string Alignment::get_md() {
return md;
} else {
std::cerr << "No MD string detected! Check bam file! Otherwise generate using e.g. samtools." << std::endl;
cout<<"MD: TEST" << this->getName()<<endl;
exit(0);
}
return md;
......@@ -1056,6 +1060,8 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
//computeAlignment();
//cout<<alignment.first<<endl;
//cout<<alignment.second<<endl;
avg_del = 0;
avg_ins = 0;
vector<int> mis_per_window;
std::vector<indel_str> dels;
vector<differences_str> event_aln = summarizeAlignment(dels);
......@@ -1070,6 +1076,7 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
double del = 0;
double ins = 0;
double mis = 0;
if (event_aln.size() > 1) {
double length = event_aln[event_aln.size() - 1].position - event_aln[0].position;
for (size_t i = 0; i < event_aln.size(); i++) {
if (i != 0) {
......@@ -1092,12 +1099,14 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
avg_ins += event_aln[i].type * -1;
}
}
//cout << "len: " << length << endl;
avg_ins = avg_ins / length;
avg_del = avg_del / length;
dist = dist / (double) event_aln.size();
}
plane->finalyze();
return mis_per_window; //total_num /num;
}
......@@ -1108,12 +1117,12 @@ vector<str_event> Alignment::get_events_Aln() {
//clock_t comp_aln = clock();
std::vector<indel_str> dels;
vector<differences_str> event_aln;
if (Parameter::Instance()->cs_string) {
/* if (Parameter::Instance()->cs_string) {
cout<<"run cs check "<<std::endl;
event_aln = summarize_csstring(dels);
} else {
} else {*/
event_aln = summarizeAlignment(dels);
}
// }
//double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
vector<str_event> events;
......
......@@ -11,8 +11,7 @@ using std::cerr;
using std::cout;
using std::endl;
class ArgParseOutput : public TCLAP::StdOutput
{
class ArgParseOutput: public TCLAP::StdOutput {
private:
std::string usageStr;
......@@ -36,6 +35,8 @@ public:
cerr << endl;
cerr << "Short usage:" << endl;
cerr << " sniffles [options] -m <sorted.bam> -v <output.vcf> " << endl;
cerr << "Version: " << Parameter::Instance()->version << std::endl;
cerr << "Contact: fritz.sedlazeck@gmail.com" << std::endl;
cerr << endl;
cerr << "For complete USAGE and HELP type:" << endl;
cerr << " sniffles --help" << endl;
......
......@@ -13,25 +13,32 @@ std::string Genotyper::assess_genotype(int ref, int support) {
if (allele < Parameter::Instance()->min_allelel_frequency) {
return "";
}
if((support + ref)==0){
allele=0;
}
std::stringstream ss;
ss << ";AF=";
ss << allele;
ss << "\tGT:DR:DV\t";
if (allele > Parameter::Instance()->homfreq) {
ss << "1/1:";
if(ref==0 && support==0){
ss << "./."; //we cannot define it.
}else if (allele > Parameter::Instance()->homfreq) {
ss << "1/1";
} else if (allele > Parameter::Instance()->hetfreq) {
ss << "0/1:";
ss << "0/1";
} else {
ss << "0/0:";
ss << "0/0";
}
ss<<":";
ss << ref;
ss << ":";
ss << support;
return ss.str();
}
std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
std::string Genotyper::mod_breakpoint_vcf(string buffer, std::pair<int, int> ref_strand) {
int ref=ref_strand.first+ref_strand.second;
//find last of\t
//parse #reads supporting
//print #ref
......@@ -41,6 +48,12 @@ std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
pos = buffer.find_last_of("GT");
//tab
entry = buffer.substr(0, pos - 2);
std::stringstream ss;
ss << ";REF_strand=";
ss << ref_strand.first;
ss << ",";
ss << ref_strand.second;
entry +=ss.str();
buffer = buffer.substr(pos + 1); // the right part is only needed:
pos = buffer.find_last_of(':');
......@@ -54,8 +67,9 @@ std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
}
std::string Genotyper::mod_breakpoint_bedpe(string buffer, int ref) {
std::string Genotyper::mod_breakpoint_bedpe(string buffer, std::pair<int,int> ref_strand) {
int ref=ref_strand.first+ref_strand.second;
std::string tmp = buffer;
std::string entry = tmp;
entry += '\t';
......@@ -198,11 +212,24 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
tmp = get_breakpoint_bedpe(buffer);
}
int ref = max(tree.get_ref(node, tmp.chr, tmp.pos), tree.get_ref(node, tmp.chr2, tmp.pos2));
std::pair<int,int> first_node=tree.get_ref(node, tmp.chr, tmp.pos);
std::pair<int,int> second_node=tree.get_ref(node, tmp.chr2, tmp.pos2);
std::pair<int,int> final_ref;
if(first_node.first+first_node.second>second_node.first+second_node.second){
final_ref=first_node;
}else{
final_ref=second_node;
}
if(final_ref.first==-1){
std::cerr<<"Error in GT: Tree node not found. Exiting."<<std::endl;
exit(0);
}
if (is_vcf) {
to_print = mod_breakpoint_vcf(buffer, ref);
to_print = mod_breakpoint_vcf(buffer,final_ref);
} else {
to_print = mod_breakpoint_bedpe(buffer, ref);
to_print = mod_breakpoint_bedpe(buffer,final_ref);
}
if (!to_print.empty()) {
fprintf(file, "%s", to_print.c_str());
......@@ -266,7 +293,7 @@ std::vector<std::string> Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_
tree.insert(node, tmp.chr, tmp.pos, true); //true: start;
tree.insert(node, tmp.chr2, tmp.pos2, false); //false: stop;//
num_sv++;
if (num_sv % 1000 == 0) {
if (num_sv % 5000 == 0) {
cout << "\t\tRead in SV: " << num_sv << endl;
}
} else if (buffer[2] == 'c' && buffer[3] == 'o') { //##contig=<ID=chr1,length=699930>
......@@ -300,7 +327,11 @@ void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node, std
cout << "\t\tScanning CHR " << ref_dict[tmp.chr_id] << endl;
prev_id = tmp.chr_id;
}
tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node);
if (tmp.strand == 1) { //strand of read
tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node, true);
} else {
tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node, false);
}
nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
fclose(ref_allel_reads);
......
......@@ -25,8 +25,8 @@ private:
void update_file(Breakpoint_Tree & tree,breakpoint_node *& node);
variant_str get_breakpoint_vcf(string buffer);
variant_str get_breakpoint_bedpe(string buffer);
std::string mod_breakpoint_vcf(string buffer, int ref);
std::string mod_breakpoint_bedpe(string buffer, int ref);
std::string mod_breakpoint_vcf(string buffer, std::pair<int,int> ref);
std::string mod_breakpoint_bedpe(string buffer, std::pair<int,int> ref);
void parse_pos(char * buffer, int & pos, std::string & chr);
......
......@@ -25,7 +25,7 @@ class Parameter {
private:
Parameter() {
window_thresh=10;//TODO check!
version="1.0.8";
version="1.0.10";
huge_ins = 999;//TODO check??
}
~Parameter() {
......@@ -84,6 +84,7 @@ public:
bool change_coords; //indicating for --Ivcf
bool skip_parameter_estimation;
bool cs_string;
bool read_strand;
void set_regions(std::string reg) {
size_t i = 0;
......
......@@ -14,6 +14,7 @@ struct str_read{
uint chr_id;
uint start;
ushort length;
ushort strand; //1=forward, 2=backward;
};
class Parser {
......
......@@ -2,16 +2,16 @@
// Name : Sniffles.cpp
// Author : Fritz Sedlazeck
// Version :
// Copyright : Your copyright notice
// Description : Hello World in C++, Ansi-style
// Copyright : MIT License
// Description : Detection of SVs for long read data.
//============================================================================
// phil: cd ~/hetero/philipp/pacbio/example-svs/reads
//For mac: cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
#include <iostream>
#include "Paramer.h"
#include <tclap/CmdLine.h>
#include <unistd.h>
#include <omp.h>
#include "Genotyper/Genotyper.h"
#include "realign/Realign.h"
#include "sub/Detect_Breakpoints.h"
......@@ -25,7 +25,9 @@
#include "ArgParseOutput.h"
#include "force_calling/Force_calling.h"
//cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
//cmake -D CMAKE_C_COMPILER=/usr/local/bin/gcc-8 -D CMAKE_CXX_COMPILER=/usr/local/bin/g++-8 ..
//TODO:
//check strand headers.
......@@ -86,12 +88,12 @@ void read_parameters(int argc, char *argv[]) {
TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. ", cmd, false);
TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. ", cmd, false);
TCLAP::SwitchArg arg_bnd("", "report_BND", "Dont report BND instead use Tra in vcf output. ", cmd, true);
TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) ", cmd, false);
TCLAP::SwitchArg arg_coords("", "change_coords", "Adopt coordinates for force calling if finding evidence. ", cmd, false);
TCLAP::SwitchArg arg_parameter("", "skip_parameter_estimation", "Enables the scan if only very few reads are present. ", cmd, false);
TCLAP::SwitchArg arg_cs_string("", "cs_string", "Enables the scan of CS string instead of Cigar and MD. ", cmd, false);
TCLAP::SwitchArg arg_read_strand("", "report_read_strands", "Enables the report of the strand categories per read. (Beta) ", cmd, false);
TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). ", false, 0.0, "float", cmd);
TCLAP::ValueArg<float> arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). ", false, 0.3, "float", cmd);
......@@ -102,9 +104,11 @@ void read_parameters(int argc, char *argv[]) {
std::stringstream usage;
usage << "" << std::endl;
usage << "Usage: sniffles [options] -m <sorted.bam> -v <output.vcf> " << std::endl;
usage << "" << std::endl;
usage << "Version: "<<Parameter::Instance()->version << std::endl;
usage << "Contact: fritz.sedlazeck@gmail.com" << std::endl;
usage << std::endl;
usage << "Input/Output:" << std::endl;
printParameter<std::string>(usage, arg_bamfile);
printParameter<std::string>(usage, arg_vcf);
printParameter<std::string>(usage, arg_bedpe);
......@@ -138,6 +142,7 @@ void read_parameters(int argc, char *argv[]) {
printParameter(usage, arg_bnd);
printParameter(usage, arg_seq);
printParameter(usage, arg_std);
printParameter(usage,arg_read_strand);
usage << "" << std::endl;
usage << "Parameter estimation:" << std::endl;
......@@ -174,7 +179,7 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->change_coords = arg_coords.getValue();
Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
Parameter::Instance()->read_name = " "; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
Parameter::Instance()->read_name = "0bac61ef-7819-462b-ae3d-32c68fe580c0"; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue());
Parameter::Instance()->min_mq = arg_mq.getValue();
Parameter::Instance()->output_vcf = arg_vcf.getValue();
......@@ -200,6 +205,7 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->hetfreq = arg_hetfreq.getValue();
Parameter::Instance()->skip_parameter_estimation = arg_parameter.getValue();
Parameter::Instance()->cs_string = arg_cs_string.getValue();
Parameter::Instance()->read_strand=arg_read_strand.getValue();
if (Parameter::Instance()->skip_parameter_estimation) {
cout<<"\tSkip parameter estimation."<<endl;
......
......@@ -43,22 +43,26 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
for (size_t i = 0; i < entries.size(); i++) {
if (entries[i].type != -1) {
position_str svs;
//cout<<"start: "<<entries[i].start.chr << " stop "<<entries[i].stop.chr<<endl;
svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
read_str read;
if(ref_lens.find(entries[i].start.chr)==ref_lens.end()){
cerr<<"Warning undefined CHR in VCF vs. BAM header: "<<entries[i].start.chr<<endl;
if (ref_lens.find(entries[i].start.chr) == ref_lens.end()) { // check why this is not called!
cerr << "Error undefined CHR in VCF vs. BAM header: " << entries[i].start.chr << endl;
exit(0);
}
if (ref_lens.find(entries[i].stop.chr) == ref_lens.end()) {
cerr<<"Warning undefined CHR in VCF vs. BAM header: "<<entries[i].stop.chr<<endl;
cerr << "Error undefined CHR in VCF vs. BAM header: " << entries[i].stop.chr << endl;
exit(0);
}
svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
read_str read;
read.coordinates.first = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
read.coordinates.second = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
if (entries[i].type == 4) { //ins?
if (entries[i].sv_len == Parameter::Instance()->huge_ins) {
entries[i].sv_len++; // bad hack!
}
svs.stop.max_pos += (long) entries[i].sv_len;
read.coordinates.second += (long) entries[i].sv_len;
// cout << "Parse: " << entries[i].start.pos << " " << entries[i].stop.pos << " " << svs.start.min_pos <<" "<<svs.stop.max_pos << endl;
......@@ -69,17 +73,18 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
read.type = 2; //called
read.length = entries[i].sv_len; //svs.stop.max_pos-svs.start.min_pos;//try
svs.support["input"] = read;
// cout<<"Submit: "<<entries[i].type <<endl;
Breakpoint * br = new Breakpoint(svs, (long) entries[i].sv_len, read.SV);
final.insert(br, root_final);
} else {
invalid_svs++;
}
}
cerr << "Invalid types found skipping " << invalid_svs <<" entries." << endl;
cerr << "\tInvalid types found skipping " << invalid_svs << " entries." << endl;
//std::cout << "Print:" << std::endl;
//final.print(root_final);
entries.clear();
//exit(0);
}
void force_calling(std::string bam_file, IPrinter *& printer) {
......@@ -210,5 +215,5 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
points[i]->predict_SV();
printer->printSV(points[i]); //redo! Ignore min support + STD etc.
}
std::cout << "Fin!" << std::endl;
}
......@@ -431,7 +431,6 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
}
if ((strcmp(tmp.start.chr.c_str(), tmp.stop.chr.c_str()) != 0 || (tmp.sv_len >= min_svs))) { // || tmp.type==4
if (tmp.type == 5) { //BND
if (strcmp(tmp.stop.chr.c_str(), tmp.start.chr.c_str()) == 0) {
tmp.type = 2;
......
......@@ -8,7 +8,7 @@
#include "BedpePrinter.h"
void BedpePrinter::print_header() {
fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\tFILTER\n");
fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_supporting_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\tFILTER\n");
}
void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
......@@ -103,8 +103,6 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", "\tIMPRECISE");
}
//fprintf(file, "%c", '\t');
//fprintf(file, "%i", SV->get_support());
fprintf(file, "%c", '\n');
}
}
......
......@@ -8,7 +8,7 @@
#include "VCFPrinter.h"
void VCFPrinter::print_header() {
fprintf(file, "%s", "##fileformat=VCFv4.2\n");
fprintf(file, "%s", "##fileformat=VCFv4.3\n");
fprintf(file, "%s", "##source=Sniffles\n");
string time = currentDateTime();
fprintf(file, "%s", "##fileDate=");
......@@ -37,18 +37,28 @@ void VCFPrinter::print_header() {
fprintf(file, "%s", "##INFO=<ID=RE,Number=1,Type=Integer,Description=\"read support\">\n");
fprintf(file, "%s", "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\n");
fprintf(file, "%s", "##INFO=<ID=PRECISE,Number=0,Type=Flag,Description=\"Precise structural variation\">\n");
fprintf(file, "%s", "##INFO=<ID=UNRESOLVED,Number=0,Type=Flag,Description=\"An insertion that is longer than the read and thus we cannot predict the full size.\">\n");
fprintf(file, "%s", "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">\n");
fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"Length of the SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Type of approach used to detect SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) {
fprintf(file, "%s", "##INFO=<ID=RNAMES,Number=1,Type=String,Description=\"Names of reads supporting SVs (comma seperated)\">\n");
fprintf(file, "%s", "##INFO=<ID=RNAMES,Number=1,Type=String,Description=\"Names of reads supporting SVs (comma separated)\">\n");
}
if (Parameter::Instance()->print_seq) {
fprintf(file, "%s", "##INFO=<ID=SEQ,Number=1,Type=String,Description=\"Extracted sequence from the best representative read.\">\n");
}
if (Parameter::Instance()->read_strand) {
fprintf(file, "%s", "##INFO=<ID=STRANDS2,Number=4,Type=Integer,Description=\"alt reads first + ,alt reads first -,alt reads second + ,alt reads second -.\">\n");
fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"plus strand ref, minus strand ref.\">\n");
}
fprintf(file, "%s", "##INFO=<ID=STD_quant_start,Number=A,Type=Integer,Description=\"STD of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=STD_quant_stop,Number=A,Type=Integer,Description=\"STD of the stop breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Integer,Description=\"Kurtosis value of the start breakpoints accross the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Integer,Description=\"Kurtosis value of the stop breakpoints accross the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Integer,Description=\"Kurtosis value of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Integer,Description=\"Kurtosis value of the stop breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN,NR)\">\n");
fprintf(file, "%s", "##INFO=<ID=STRANDS,Number=A,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
fprintf(file, "%s", "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency.\">\n");
fprintf(file, "%s", "##INFO=<ID=ZMW,Number=A,Type=Integer,Description=\"Number of ZMWs (Pacbio) supporting SV.\">\n");
......@@ -127,7 +137,6 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", "\t.\tPASS\t");
}
if (std_quant.first < 10 && std_quant.second < 10) {
fprintf(file, "%s", "PRECISE");
} else {
......@@ -177,13 +186,47 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) { //!
fprintf(file, "%i", 999999999);
}else if (SV->get_SVtype() & TRA){
fprintf(file, "%i",0);
}else if (SV->get_SVtype() & DEL){
fprintf(file, "%i", SV->get_length()*-1);
}else {
fprintf(file, "%i", SV->get_length());
}
// }
fprintf(file, "%s", ";STRANDS=");
fprintf(file, "%s", strands.c_str());
if (!SV->get_sequence().empty()) {
if (Parameter::Instance()->read_strand) {
fprintf(file, "%s", ";STRANDS2=");
std::map<std::string, read_str> support = SV->get_coordinates().support;
pair<int, int> tmp_start;
pair<int, int> tmp_stop;
tmp_start.first = 0;
tmp_start.second = 0;
tmp_stop.first = 0;
tmp_stop.second = 0;
for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
if ((*i).second.read_strand.first) {
tmp_start.first++;
} else {
tmp_start.second++;
}
if ((*i).second.read_strand.second) {
tmp_stop.first++;
} else {
tmp_stop.second++;
}
}
fprintf(file, "%i", tmp_start.first);
fprintf(file, "%s", ",");
fprintf(file, "%i", tmp_start.second);
fprintf(file, "%s", ",");
fprintf(file, "%i", tmp_stop.first);
fprintf(file, "%s", ",");
fprintf(file, "%i", tmp_stop.second);
}
if (Parameter::Instance()->print_seq && !SV->get_sequence().empty()) {
fprintf(file, "%s", ";SEQ=");
fprintf(file, "%s", SV->get_sequence().c_str());
}
......@@ -204,6 +247,13 @@ void VCFPrinter::print_body_recall(Breakpoint * &SV, RefVector ref) {
if (Parameter::Instance()->phase) {
store_readnames(SV->get_read_ids(), id);
}
pair<double, double> kurtosis;
pair<double, double> std_quant;
double std_length = 0;
int zmws = 0;
bool ok_to_print = to_print(SV, std_quant, kurtosis, std_length, zmws);
std::string chr;
int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
fprintf(file, "%s", chr.c_str());
......
......@@ -441,7 +441,6 @@ void Breakpoint::predict_SV() {
this->indel_sequence = "";
//find the stop coordinate:
maxim = 0;
coord = 0;
mean = 0;
......@@ -457,6 +456,7 @@ void Breakpoint::predict_SV() {
} else {
this->positions.stop.most_support = coord;
}
//find the length:
if (!(this->get_SVtype() & INS)) { //all types but Insertions:
this->length = this->positions.stop.most_support - this->positions.start.most_support;
......@@ -483,6 +483,7 @@ void Breakpoint::predict_SV() {
}
starts.clear();
stops.clear();
//strand information:
for (size_t i = 0; i < strands.size(); i++) {
maxim = 0;
std::string id;
......@@ -499,19 +500,40 @@ void Breakpoint::predict_SV() {
}
}
strands.clear();
}
if (!scan_reads) {
// std::cout<<"Test strand"<<std::endl;
//if forcecalling we need to define the strands:
std::map<std::string, read_str>::iterator it = positions.support.find("input");
//no output!??
/*if ((*it).second.strand.first) {
std::cout << "s1: true" << endl;
}
if ((*it).second.strand.second) {
std::cout << "s2: true" << endl;
}*/
if (it != positions.support.end()) {
this->strand.push_back(translate_strand((*it).second.strand));
}
}
std::map<std::string, read_str>::iterator tmp = positions.support.begin();
int start_prev_dist = 1000;
int stop_prev_dist = 1000;
while (tmp != positions.support.end()) {
int start_dist = abs((*tmp).second.coordinates.first - this->positions.start.most_support);
int stop_dist = abs((*tmp).second.coordinates.second - this->positions.stop.most_support);
if (((*tmp).second.SV & this->sv_type) && ((start_dist < start_prev_dist && stop_dist < stop_prev_dist) && (strcmp((*tmp).second.sequence.c_str(), "NA") != 0 && !(*tmp).second.sequence.empty()))) {
this->indel_sequence = (*tmp).second.sequence;
stop_prev_dist = stop_dist; //new to test!
start_prev_dist = start_dist;
}
tmp++;
}
}
//}
this->supporting_types = "";
if (positions.support.find("input") != positions.support.end()) {
if (num == 0) {
......
......@@ -283,6 +283,11 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
tmp.chr_id = tmp_aln->getRefID(); //check string in binary???
tmp.start = tmp_aln->getPosition();
tmp.length = tmp_aln->getRefLength();
if (tmp_aln->getStrand()) {
tmp.strand = 1;
} else {
tmp.strand = 2;
}
fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
......@@ -361,6 +366,7 @@ void add_events(Alignment *& tmp, std::vector<str_event> events, short type, lon
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
for (size_t i = 0; i < events.size(); i++) {
// if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
position_str svs;
read_str read;
if (events[i].is_noise) {
......@@ -434,6 +440,7 @@ void add_events(Alignment *& tmp, std::vector<str_event> events, short type, lon
//std::cout<<"Print:"<<std::endl;
//bst.print(root);
}
// }
}
void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree& bst, TNode *&root, long read_id, bool add) {
......@@ -452,6 +459,7 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
}
for (size_t i = 1; i < events.size(); i++) {
// if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
position_str svs;
//position_str stop;
read_str read;
......@@ -654,6 +662,7 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
// bst.print(root);
}
}
//}
}
void estimate_parameters(std::string read_filename) {
......@@ -692,9 +701,10 @@ void estimate_parameters(std::string read_filename) {
double avg_del = 0;
double avg_ins = 0;
vector<int> tmp = tmp_aln->get_avg_diff(dist, avg_del, avg_ins);
// std::cout<<"Debug:\t"<<avg_del<<" "<<avg_ins<<endl;
tot_avg_ins += avg_ins;
tot_avg_del += avg_del;
//std::cout<<"Debug:\t"<<dist<<"\t";
//
avg_dist += dist;
double avg_mis = 0;
for (size_t i = 0; i < tmp.size(); i++) {
......@@ -749,6 +759,8 @@ void estimate_parameters(std::string read_filename) {
pos = nums.size() * 0.05; //the lowest 5% cuttoff
Parameter::Instance()->score_treshold = 2; //nums[pos]; //prev=2
//cout<<"test: "<<tot_avg_ins<<" "<<num<<endl;
//cout<<"test2: "<<tot_avg_del<<" "<<num<<endl;
Parameter::Instance()->avg_del = tot_avg_del / num;
Parameter::Instance()->avg_ins = tot_avg_ins / num;
......
......@@ -25,28 +25,36 @@ void Breakpoint_Tree::find(int position, std::string chr, breakpoint_node *par,
}
}
void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par) {
void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par, bool ref_strand) {
//start + stop: read coordinates.
if (par == NULL) { //not found
return;
}
if (par->direction) { //start
if ((par->position - 100 > start && par->position + 100 < stop) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
par->ref_support++;
if (ref_strand) {
par->ref_support.first++;
} else {
par->ref_support.second++;
}
// std::cout<<"start: "<<start<<" "<<stop<<std::endl;
}
} else { //stop coordinate
if ((par->position > start + 100 && par->position < stop - 100) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
par->ref_support++;
if (ref_strand) {
par->ref_support.first++;
} else {
par->ref_support.second++;
}
// std::cout<<"stop: "<< start<<" "<<stop<<std::endl;
}
}
//search goes on:
if (start < par->position) {
overalps(start, stop, chr, par->left);
overalps(start, stop, chr, par->left, ref_strand);
} else {
overalps(start, stop, chr, par->right);
overalps(start, stop, chr, par->right, ref_strand);
}
}
......@@ -57,7 +65,8 @@ void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int positi
if (tree == NULL) {
tree = new breakpoint_node;
tree->position = position;
tree->ref_support = 0;
tree->ref_support.first = 0;
tree->ref_support.second = 0;
tree->chr = chr;
tree->direction = direction;
tree->left = NULL;
......@@ -74,15 +83,19 @@ void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int positi
}
}
int Breakpoint_Tree::get_ref(breakpoint_node *&tree, std::string chr, int position) {
std::pair<int,int> Breakpoint_Tree::get_ref(breakpoint_node *&tree, std::string chr, int position ) {
if (tree == NULL) {
return -1;
std::pair<int,int> tmp;
tmp.first=-1;
tmp.second=-1;
return tmp;
}
if (tree->position > position) {
return get_ref(tree->left, chr, position);
} else if (tree->position < position) {
return get_ref(tree->right, chr, position);
} else if (strcmp(chr.c_str(), tree->chr.c_str()) == 0) { // found element
return tree->ref_support;
} else {
return get_ref(tree->left, chr, position); //just in case.
......@@ -231,7 +244,7 @@ void Breakpoint_Tree::inorder(breakpoint_node *ptr) {
}
if (ptr != NULL) {
inorder(ptr->left);
std::cout << ptr->chr << " " << ptr->position << " " << ptr->ref_support << std::endl;
std::cout << ptr->chr << " " << ptr->position << " " << ptr->ref_support.first <<" "<< ptr->ref_support.second<< std::endl;
inorder(ptr->right);
}
}
......