Skip to content
Commits on Source (5)
cmake_minimum_required(VERSION 2.8)
project(Sniffles)
option(STATIC "Build static binary" OFF)
set( SNIF_VERSION_MAJOR 1 )
set( SNIF_VERSION_MINOR 0 )
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Building in debug mode!")
set( SNIF_VERSION_BUILD 7-debug )
set( SNIF_VERSION_BUILD 8-debug )
else()
set( SNIF_VERSION_BUILD 7 )
set( SNIF_VERSION_BUILD 8 )
ENDIF()
......
# Sniffles
Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs (10bp+) using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGM-LR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck@gmail.com
Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs (10bp+) using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGMLR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck@gmail.com
Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki)
**************************************
## NextGenMap-LR: (NGM-LR)
Sniffles performs best with the mappings of NGM-LR our novel long read mapping method.
## NGMLR
Sniffles performs best with the mappings of NGMLR our novel long read mapping method.
Please see:
https://github.com/philres/nextgenmap-lr
https://github.com/philres/ngmlr
****************************************
## Citation:
......@@ -24,7 +21,7 @@ http://www.biorxiv.org/content/early/2017/07/28/169557
[Accurate and fast detection of complex and nested structural variations using long read technologies](http://schatzlab.cshl.edu/presentations/2016/2016.10.28.BIODATA.PacBioSV.pdf)
Biological Data Science, Cold Spring Harbor Laboratory, Cold Spring Harbor, NY, 26 - 29.10.2016
[NextGenMap-LR: Highly accurate read mapping of third generation sequencing reads for improved structural variation analysis](http://www.cibiv.at/~philipp_/files/gi2016_poster_phr.pdf)
[NGMLR: Highly accurate read mapping of third generation sequencing reads for improved structural variation analysis](http://www.cibiv.at/~philipp_/files/gi2016_poster_phr.pdf)
Genome Informatics 2016, Wellcome Genome Campus Conference Centre, Hinxton, Cambridge, UK, 19.09.-2.09.2016
**************************************
......
sniffles (1.0.8+ds-1) unstable; urgency=medium
* New upstream version
* Run tests
* Refresh patches
-- Afif Elghraoui <afif@debian.org> Sat, 24 Mar 2018 16:20:10 -0400
sniffles (1.0.7+ds-1) unstable; urgency=medium
* New upstream version
......
......@@ -5,7 +5,7 @@ Last-Update: 2016-06-18
--- sniffles.orig/CMakeLists.txt
+++ sniffles/CMakeLists.txt
@@ -30,7 +30,5 @@
@@ -32,7 +32,5 @@
endif()
......@@ -15,16 +15,16 @@ Last-Update: 2016-06-18
add_subdirectory(src)
--- sniffles.orig/src/CMakeLists.txt
+++ sniffles/src/CMakeLists.txt
@@ -1,6 +1,8 @@
cmake_minimum_required(VERSION 2.8)
@@ -2,6 +2,8 @@
project(Sniffles)
+include_directories(${CMAKE_INCLUDE_PATH})
+
include_directories (../lib/bamtools-2.3.0/src)
include_directories(../lib/tclap-1.2.1/include)
@@ -32,8 +34,8 @@
@@ -42,8 +44,8 @@
)
#target_link_libraries(ngm-core pthread)
......
......@@ -6,7 +6,7 @@ Forwarded: not-needed
Last-Update: 2016-06-18
--- sniffles.orig/src/CMakeLists.txt
+++ sniffles/src/CMakeLists.txt
@@ -36,35 +36,4 @@
@@ -46,35 +46,4 @@
#target_link_libraries(ngm-core pthread)
TARGET_LINK_LIBRARIES(sniffles bamtools)
TARGET_LINK_LIBRARIES(sniffles z)
......
......@@ -19,8 +19,13 @@ override_dh_auto_configure:
override_dh_auto_clean:
dh_auto_clean
find -name "*.mk" -delete
$(RM) makefile
$(RM) debian/sniffles.1
$(RM) \
makefile \
debian/sniffles.1 \
test.vcf \
override_dh_auto_test:
bin/sniffles-*/sniffles -m test_set/reads_region.bam -v test.vcf
ifeq ($(filter nodoc,$(DEB_BUILD_PROFILES)),)
build: debian/sniffles.1
......
Test-Command:
sniffles -m test_set/reads_region.bam -v ${AUTOPKGTEST_TMP}/test.vcf;
......@@ -51,6 +51,81 @@ void add_event(int pos, size_t & i, vector<differences_str> & events) {
events.insert(events.begin() + i, ev);
}
/*
* :6-ata:10+gtc:4*at:3,
* -ata: del
* +gtc: ins
* *at a->t
* :10 match bp
* cs:Z::5+g:7+t:4+c:7-a:6*tg:15*tg:2+g:6+c:1+c:1+c:4+a:15+g:4+c:12*gt*ta:2+c:7+t:8+t:17+t:8+g:5+g:10-g:4+g*tc:5+t*ag:10+a:4-ttt
*/
vector<differences_str> Alignment::summarize_csstring(std::vector<indel_str> &dels) {
string cs = this->get_cs();
int pos = this->getPosition();
int corr = 0;
int ref_pos = 0;
size_t pos_events = 0;
int max_size = (this->getRefLength() * 0.9) + getPosition();
// comp_aln = clock();
indel_str del;
del.sequence = "";
del.pos = -1;
vector<differences_str> events;
differences_str ev;
std::cout<<"CS: "<<cs<<std::endl;
for (size_t i = 0; i < cs.size() && pos < max_size; i++) {
ev.position = pos;
if (cs[i] == ':') { //match (can be numbers or bp!)
i++;
int num = atoi(&cs[i]); //is 0 if letter!
bool is_long = ((num == 0 && cs[i] != '0'));
while ((i < cs.size() && cs[i] != '+') && (cs[i] != '-' && cs[i] != '+')) {
i++;
if (is_long) {
num++;
}
}
cout<<"\tMatch: "<<num<<std::endl;
pos += num;
} else if (cs[i] == '*') { //mismatch (ref,alt) pairs
//only every second char counts!
add_event(pos, pos_events, events);
pos++;
i += 2;
ev.type = 0; //mismatch
cout<<"\tMiss: "<<1<<std::endl;
} else if (cs[i] == '-') { //del
//collet del seq in dels!
indel_str del;
del.sequence = "";
del.pos = pos;
while ((i < cs.size() && cs[i] != '+') && (cs[i] != '-' && cs[i] != '+')) {
del.sequence += cs[i];
}
dels.push_back(del);
pos += del.sequence.size();
ev.type = del.sequence.size();
cout<<"\tDEL: "<<del.sequence.size()<<std::endl;
} else if (cs[i] == '+') { //ins
int num=0;
while ((i < cs.size() && cs[i] != '+') && (cs[i] != '-' && cs[i] != '+')) {
i++;
num++;
}
ev.type =num*-1;
cout<<"\tINS: "<<num<<std::endl;
}
events.push_back(ev);
}
std::cout<<"end CS: "<<cs<<std::endl;
return events;
}
vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &dels) {
// clock_t comp_aln = clock();
vector<differences_str> events;
......@@ -69,16 +144,19 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
ev.position = pos;
ev.type = al->CigarData[i].Length; //deletion
ev.readposition = read_pos;
ev.resolved = true;
events.push_back(ev);
pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'I') {
ev.position = pos;
ev.resolved = true;
ev.readposition = read_pos;
ev.type = al->CigarData[i].Length * -1; //insertion
events.push_back(ev);
read_pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'N') {
pos += al->CigarData[i].Length;
ev.resolved = true;
read_pos += al->CigarData[i].Length;
} else if (al->CigarData[i].Type == 'S' && al->CigarData[i].Length > Parameter::Instance()->huge_ins) { /// Used for reads ranging into an inser
string sa;
......@@ -91,6 +169,7 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
} else {
ev.readposition = read_pos;
}
ev.resolved = false;
ev.type = Parameter::Instance()->huge_ins * -1; //insertion: WE have to fix the length since we cannot estimate it!]
events.push_back(ev);
}
......@@ -842,9 +921,24 @@ std::string Alignment::get_md() {
std::string md;
if (al->GetTag("MD", md)) {
return md;
}else{
std::cerr<<"No MD string detected! Check bam file! Otherwise generate using e.g. samtools."<<std::endl;
exit(0);
}
return md;
}
std::string Alignment::get_cs() {
std::string cs;
if (al->GetTag("cs", cs)) {
return cs;
}else{
std::cerr<<"No CS string detected! Check bam file!"<<std::endl;
exit(0);
}
return cs;
}
vector<str_event> Alignment::get_events_MD(int min_mis) {
vector<str_event> events;
/*std::string md;
......@@ -1013,7 +1107,13 @@ vector<str_event> Alignment::get_events_Aln() {
//clock_t comp_aln = clock();
std::vector<indel_str> dels;
vector<differences_str> event_aln = summarizeAlignment(dels);
vector<differences_str> event_aln;
if (Parameter::Instance()->cs_string) {
cout<<"run cs check "<<std::endl;
event_aln = summarize_csstring(dels);
} else {
event_aln = summarizeAlignment(dels);
}
//double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
vector<str_event> events;
......@@ -1032,7 +1132,7 @@ vector<str_event> Alignment::get_events_Aln() {
for (size_t i = 0; i < event_aln.size(); i++) {
pair_str tmp;
tmp.position = -1;
if (event_aln[i].type == 0) {
if (event_aln[i].type == 0) { //substitutions.
tmp = plane->add_mut(event_aln[i].position, 1, Parameter::Instance()->window_thresh);
} else {
tmp = plane->add_mut(event_aln[i].position, 1, Parameter::Instance()->window_thresh); // abs(event_aln[i].type)
......@@ -1049,7 +1149,6 @@ vector<str_event> Alignment::get_events_Aln() {
int stop = 0;
size_t start = 0;
for (size_t i = 0; i < profile.size() && stop < event_aln.size(); i++) {
if (profile[i].position >= event_aln[stop].position) {
//find the postion:
size_t pos = 0;
......@@ -1072,8 +1171,10 @@ vector<str_event> Alignment::get_events_Aln() {
}
prev += prev_type;
}
start++; //we are running one too far!
if (start + 1 < event_aln.size()) { //TODO do some testing!
start++; //we are running one too far!
}
//run forward to identify the stop:
prev = event_aln[pos].position;
stop = pos;
......@@ -1091,8 +1192,11 @@ vector<str_event> Alignment::get_events_Aln() {
if (stop > 0) {
stop--;
}
// cout<<start<<" events: "<<event_aln[start].type <<" pos "<<event_aln[start].readposition<<endl;
int insert_max_pos = 0;
int insert_max = 0;
if (event_aln[start].type < 0) {
insert_max_pos = event_aln[start].position;
insert_max = abs(event_aln[start].type);
......@@ -1101,6 +1205,13 @@ vector<str_event> Alignment::get_events_Aln() {
int del_max = 0;
int del_max_pos = 0;
if (event_aln[start].type > 0) {
// cout<<"HIT"<<endl;
del_max_pos = event_aln[start].position;
del_max = event_aln[start].type;
}
double insert = 0;
double del = 0;
double mismatch = 0;
......@@ -1122,6 +1233,8 @@ vector<str_event> Alignment::get_events_Aln() {
}
}
}
// cout << "DELMAX: " << del_max << " " << Parameter::Instance()->avg_del << endl;
str_event tmp;
tmp.pos = event_aln[start].position;
......@@ -1148,9 +1261,6 @@ vector<str_event> Alignment::get_events_Aln() {
}
tmp.read_pos = event_aln[start].readposition;
if (Parameter::Instance()->print_seq) {
//if (tmp.read_pos + tmp.length > this->getAlignment()->QueryBases.size() || tmp.read_pos<0) {
// cerr << "BUG! ALN event INS: " << this->getName() << " " << tmp.read_pos << " " << tmp.length << " " << this->getAlignment()->QueryBases.size() << endl;
// }
if (flag) {
std::cout << "Seq+:" << this->getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length) << std::endl;
......
......@@ -40,6 +40,7 @@ struct differences_str{
int position;
int readposition;
short type;
bool resolved;
};
struct indel_str{
int pos;
......@@ -81,6 +82,7 @@ private:
size_t get_length(std::vector<CigarOp> CigarData);
int get_id(RefVector ref, std::string chr);
vector<differences_str> summarizeAlignment(std::vector<indel_str> &dels);
vector<differences_str> summarize_csstring (std::vector<indel_str> &dels) ;
void sort_insert(aln_str tmp, vector<aln_str> &entries);
void sort_insert_ref(aln_str tmp, vector<aln_str> &entries);
......@@ -141,6 +143,7 @@ public:
double get_num_mismatches(std::string md);
double get_scrore_ratio();
std::string get_md();
std::string get_cs();
double get_avg_indel_length_Cigar();
vector<int> get_avg_diff(double & dist,double & avg_del, double & avg_len);
......
/**
* Contact: philipp.rescheneder@gmail.com
*/
#include <iostream>
#include <string>
#include <tclap/CmdLine.h>
using std::cerr;
using std::cout;
using std::endl;
class ArgParseOutput : public TCLAP::StdOutput
{
private:
std::string usageStr;
std::string versionStr;
public:
ArgParseOutput(std::string usage, std::string version) {
usageStr = usage;
versionStr = version;
}
virtual ~ArgParseOutput() {
}
virtual void failure(TCLAP::CmdLineInterface& c, TCLAP::ArgException& e) {
cerr << "Error:" << endl;
cerr << " " << e.error() << endl;
cerr << endl;
cerr << "Short usage:" << endl;
cerr << " sniffles [options] -m <sorted.bam> -v <output.vcf> " << endl;
cerr << endl;
cerr << "For complete USAGE and HELP type:" << endl;
cerr << " sniffles --help" << endl;
cerr << endl;
exit(1);
}
virtual void usage(TCLAP::CmdLineInterface& c) {
cerr << usageStr << std::endl;
}
virtual void version(TCLAP::CmdLineInterface& c) {
}
};
cmake_minimum_required(VERSION 2.8)
project(Sniffles)
include_directories (../lib/bamtools-2.3.0/src)
include_directories(../lib/tclap-1.2.1/include)
configure_file( Version.h.in ${CMAKE_SOURCE_DIR}/src/Version.h )
IF(STATIC)
SET(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
SET(BUILD_SHARED_LIBRARIES OFF)
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
ENDIF()
add_executable(sniffles
tree/Breakpoint_Tree.cpp
Genotyper/Genotyper.cpp
......
......@@ -36,7 +36,6 @@ std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
//parse #reads supporting
//print #ref
string entry;
int pos = 0;
pos = buffer.find_last_of("GT");
......@@ -46,7 +45,11 @@ std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
buffer = buffer.substr(pos + 1); // the right part is only needed:
pos = buffer.find_last_of(':');
int support = atoi(buffer.substr(pos + 1).c_str());
entry += assess_genotype(ref, support);
string msg = assess_genotype(ref, support);
if (msg.empty()) {
return "";
}
entry += msg;
return entry;
}
......@@ -315,7 +318,7 @@ void Genotyper::update_SVs() {
cout << "\tCleaning tmp files" << endl;
string del = "rm ";
del += Parameter::Instance()->tmp_genotyp;
//system(del.c_str());
system(del.c_str());
}
void Genotyper::update_SVs(std::vector<Breakpoint *> & svs, long ref_space) { //refspace for the ref reads!!
......
......@@ -25,8 +25,8 @@ class Parameter {
private:
Parameter() {
window_thresh=10;//TODO check!
version="1.0.7";
huge_ins = 2000;//TODO check??
version="1.0.8";
huge_ins = 999;//TODO check??
}
~Parameter() {
......@@ -81,6 +81,9 @@ public:
bool ignore_std;
bool reportBND;
bool print_seq;
bool change_coords; //indicating for --Ivcf
bool skip_parameter_estimation;
bool cs_string;
void set_regions(std::string reg) {
size_t i = 0;
......
......@@ -22,11 +22,13 @@
#include "Ignore_Regions.h"
#include "plane-sweep/PlaneSweep_slim.h"
#include "print/BedpePrinter.h"
#include "ArgParseOutput.h"
#include "force_calling/Force_calling.h"
//cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
//TODO:
//check strand headers.
// strand bias??
// I think you could make your performance on PacBio reads even better with a few modifications:
//b. In pbsv, I use a simply mononucleotide consistency check to determine whether to cluster insertions from different reads as supporting the "same" events. In addition to looking at the similarity of length and breakpoints,
......@@ -35,36 +37,119 @@
//[min(A1,A2)+min(C1,C2)+min(G1,G2)+min(T1,T2)[/[max...]/
Parameter* Parameter::m_pInstance = NULL;
template<typename T>
void printParameter(std::stringstream & usage, TCLAP::ValueArg<T> & arg) {
usage << " " << arg.longID() << std::endl;
usage << " " << arg.getDescription();
if (!arg.isRequired()) {
usage << " [" << arg.getValue() << "]";
}
usage << std::endl;
}
void printParameter(std::stringstream & usage, TCLAP::SwitchArg & arg) {
usage << " " << arg.longID() << std::endl;
usage << " " << arg.getDescription();
if (!arg.isRequired()) {
usage << " [" << (arg.getValue() ? "true" : "false") << "]";
}
usage << std::endl;
}
void read_parameters(int argc, char *argv[]) {
// TCLAP::CmdLine cmd("", ' ', "", true);
TCLAP::CmdLine cmd("Sniffles version ", ' ', Parameter::Instance()->version);
TCLAP::ValueArg<std::string> arg_bamfile("m", "mapped_reads", "Sorted bam File", true, "", "string");
TCLAP::ValueArg<std::string> arg_vcf("v", "vcf", "VCF output file name", false, "", "string");
TCLAP::ValueArg<std::string> arg_input_vcf("", "Ivcf", "Input VCF file name. Enable force calling", false, "", "string");
TCLAP::ValueArg<std::string> arg_bedpe("b", "bedpe", " bedpe output file name", false, "", "string");
TCLAP::ValueArg<std::string> arg_bamfile("m", "mapped_reads", "Sorted bam File", true, "", "string", cmd);
TCLAP::ValueArg<std::string> arg_vcf("v", "vcf", "VCF output file name", false, "", "string", cmd);
TCLAP::ValueArg<std::string> arg_input_vcf("", "Ivcf", "Input VCF file name. Enable force calling", false, "", "string", cmd);
TCLAP::ValueArg<std::string> arg_bedpe("b", "bedpe", " bedpe output file name", false, "", "string", cmd);
TCLAP::ValueArg<std::string> arg_tmp_file("", "tmp_file", "path to temporary file otherwise Sniffles will use the current directory.", false, "", "string", cmd);
//TCLAP::ValueArg<std::string> arg_chrs("c", "chrs", " comma seperated list of chrs to scan", false, "", "string");
TCLAP::ValueArg<int> arg_support("s", "min_support", "Minimum number of reads that support a SV. Default: 10", false, 10, "int");
TCLAP::ValueArg<int> arg_splits("", "max_num_splits", "Maximum number of splits per read to be still taken into account. Default: 7", false, 7, "int");
TCLAP::ValueArg<int> arg_dist("d", "max_distance", "Maximum distance to group SV together. Default: 1kb", false, 1000, "int");
TCLAP::ValueArg<int> arg_threads("t", "threads", "Number of threads to use. Default: 3", false, 3, "int");
TCLAP::ValueArg<int> arg_minlength("l", "min_length", "Minimum length of SV to be reported. Default: 30", false, 30, "int");
TCLAP::ValueArg<int> arg_mq("q", "minmapping_qual", "Minimum Mapping Quality. Default: 20", false, 20, "int");
TCLAP::ValueArg<int> arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV in the vcf file. -1: report all. Default: 0", false, 0, "int");
TCLAP::ValueArg<int> arg_segsize("r", "min_seq_size", "Discard read if non of its segment is larger then this. Default: 2kb", false, 2000, "int");
TCLAP::ValueArg<int> arg_zmw("z", "min_zmw", "Discard SV that are not supported by at least x zmws. This applies only for PacBio recognizable reads. Default: 0", false, 0, "int");
TCLAP::ValueArg<std::string> arg_tmp_file("", "tmp_file", "path to temporary file otherwise Sniffles will use the current directory.", false, "", "string");
TCLAP::ValueArg<int> arg_support("s", "min_support", "Minimum number of reads that support a SV.", false, 10, "int", cmd);
TCLAP::ValueArg<int> arg_splits("", "max_num_splits", "Maximum number of splits per read to be still taken into account.", false, 7, "int", cmd);
TCLAP::ValueArg<int> arg_dist("d", "max_distance", "Maximum distance to group SV together.", false, 1000, "int", cmd);
TCLAP::ValueArg<int> arg_threads("t", "threads", "Number of threads to use.", false, 3, "int", cmd);
TCLAP::ValueArg<int> arg_minlength("l", "min_length", "Minimum length of SV to be reported.", false, 30, "int", cmd);
TCLAP::ValueArg<int> arg_mq("q", "minmapping_qual", "Minimum Mapping Quality.", false, 20, "int", cmd);
TCLAP::ValueArg<int> arg_numreads("n", "num_reads_report", "Report up to N reads that support the SV in the vcf file. -1: report all.", false, 0, "int", cmd);
TCLAP::ValueArg<int> arg_segsize("r", "min_seq_size", "Discard read if non of its segment is larger then this.", false, 2000, "int", cmd);
TCLAP::ValueArg<int> arg_zmw("z", "min_zmw", "Discard SV that are not supported by at least x zmws. This applies only for PacBio recognizable reads.", false, 0, "int", cmd);
TCLAP::ValueArg<int> arg_cluster_supp("", "cluster_support", "Minimum number of reads supporting clustering of SV.", false, 1, "int", cmd);
TCLAP::ValueArg<int> arg_parameter_maxdist("", "max_dist_aln_events", "Maximum distance between alignment (indel) events.", false, 4, "int", cmd);
TCLAP::ValueArg<int> arg_parameter_maxdiff("", "max_diff_per_window", "Maximum differences per 100bp.", false, 50, "int", cmd);
TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. Default: false", cmd, false);
TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. Default: false", cmd, false);
TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) Default: false", cmd, false);
TCLAP::ValueArg<int> arg_cluster_supp("", "cluster_support", "Minimum number of reads supporting clustering of SV. Default: 1", false, 1, "int");
TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). Default=0.0", false, 0.0, "float");
TCLAP::ValueArg<float> arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). Default=0.0", false, 0.3, "float");
TCLAP::ValueArg<float> arg_homofreq("", "min_homo_af", "Threshold on allele frequency (0-1). Default=0.0", false, 0.8, "float");
cmd.add(arg_homofreq);
TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. ", cmd, false);
TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. ", cmd, false);
TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) ", cmd, false);
TCLAP::SwitchArg arg_coords("", "change_coords", "Adopt coordinates for force calling if finding evidence. ", cmd, false);
TCLAP::SwitchArg arg_parameter("", "skip_parameter_estimation", "Enables the scan if only very few reads are present. ", cmd, false);
TCLAP::SwitchArg arg_cs_string("", "cs_string", "Enables the scan of CS string instead of Cigar and MD. ", cmd, false);
TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). ", false, 0.0, "float", cmd);
TCLAP::ValueArg<float> arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). ", false, 0.3, "float", cmd);
TCLAP::ValueArg<float> arg_homofreq("", "min_homo_af", "Threshold on allele frequency (0-1). ", false, 0.8, "float", cmd);
TCLAP::ValueArg<float> arg_delratio("", "del_ratio", "Estimated ration of deletions per read (0-1). ", false, 0.0458369, "float", cmd);
TCLAP::ValueArg<float> arg_insratio("", "ins_ratio", "Estimated ratio of insertions per read (0-1). ", false, 0.049379, "float", cmd);
std::stringstream usage;
usage << "" << std::endl;
usage << "Usage: sniffles [options] -m <sorted.bam> -v <output.vcf> " << std::endl;
usage << "" << std::endl;
usage << "Input/Output:" << std::endl;
printParameter<std::string>(usage, arg_bamfile);
printParameter<std::string>(usage, arg_vcf);
printParameter<std::string>(usage, arg_bedpe);
printParameter<std::string>(usage, arg_input_vcf);
printParameter<std::string>(usage, arg_tmp_file);
usage << "" << std::endl;
usage << "General:" << std::endl;
printParameter<int>(usage, arg_support);
printParameter<int>(usage, arg_splits);
printParameter<int>(usage, arg_dist);
printParameter<int>(usage, arg_threads);
printParameter<int>(usage, arg_minlength);
printParameter<int>(usage, arg_mq);
printParameter<int>(usage, arg_numreads);
printParameter<int>(usage, arg_segsize);
printParameter<int>(usage, arg_zmw);
printParameter(usage,arg_cs_string);
usage << "" << std::endl;
usage << "Clustering/phasing and genotyping:" << std::endl;
printParameter(usage, arg_genotype);
printParameter(usage, arg_cluster);
printParameter<int>(usage, arg_cluster_supp);
printParameter<float>(usage, arg_allelefreq);
printParameter<float>(usage, arg_homofreq);
printParameter<float>(usage, arg_hetfreq);
usage << "" << std::endl;
usage << "Advanced:" << std::endl;
printParameter(usage, arg_bnd);
printParameter(usage, arg_seq);
printParameter(usage, arg_std);
usage << "" << std::endl;
usage << "Parameter estimation:" << std::endl;
printParameter(usage, arg_parameter);
printParameter<float>(usage, arg_delratio);
printParameter<float>(usage, arg_insratio);
printParameter<int>(usage, arg_parameter_maxdiff);
printParameter<int>(usage, arg_parameter_maxdist);
cmd.setOutput(new ArgParseOutput(usage.str(), ""));
/* cmd.add(arg_homofreq);
cmd.add(arg_hetfreq);
cmd.add(arg_input_vcf);
cmd.add(arg_cluster_supp);
......@@ -82,10 +167,11 @@ void read_parameters(int argc, char *argv[]) {
cmd.add(arg_allelefreq);
cmd.add(arg_support);
cmd.add(arg_bamfile);
// cmd.add(arg_chrs);
// cmd.add(arg_chrs);*/
//parse cmd:
cmd.parse(argc, argv);
Parameter::Instance()->change_coords = arg_coords.getValue();
Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
Parameter::Instance()->read_name = " "; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
......@@ -112,6 +198,17 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->min_zmw = arg_zmw.getValue();
Parameter::Instance()->homfreq = arg_homofreq.getValue();
Parameter::Instance()->hetfreq = arg_hetfreq.getValue();
Parameter::Instance()->skip_parameter_estimation = arg_parameter.getValue();
Parameter::Instance()->cs_string = arg_cs_string.getValue();
if (Parameter::Instance()->skip_parameter_estimation) {
cout<<"\tSkip parameter estimation."<<endl;
Parameter::Instance()->score_treshold = 2;
Parameter::Instance()->window_thresh = arg_parameter_maxdiff.getValue();
Parameter::Instance()->max_dist_alns = arg_parameter_maxdist.getValue();
Parameter::Instance()->avg_del =arg_delratio.getValue();
Parameter::Instance()->avg_ins = arg_insratio.getValue();
}
//Parse IDS:
/*std::string buffer = arg_chrs.getValue();
......
......@@ -151,6 +151,6 @@ void Cluster_SVS::update_SVs() {
std::cout << "\tCleaning tmp files" << std::endl;
std::string del = "rm ";
del += Parameter::Instance()->tmp_phasing;
// system(del.c_str());
system(del.c_str());
}
......@@ -31,30 +31,54 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
long length = 0;
for (size_t i = 0; i < ref.size(); i++) {
ref_lens[ref[i].RefName.c_str()] = length;
length += ref[i].RefLength + Parameter::Instance()->max_dist;
length += (long) ref[i].RefLength + (long) Parameter::Instance()->max_dist;
}
//sometimes the stop coordinates are off especially for smaller chrs!??
//parse VCF file
std::vector<strvcfentry> entries = parse_vcf(Parameter::Instance()->input_vcf, 0);
std::cout << "\t\t" << entries.size() << " SVs found in input." << std::endl;
int invalid_svs=0;
for (size_t i = 0; i < entries.size(); i++) {
if (entries[i].type != -1) {
position_str svs;
//cout<<"start: "<<entries[i].start.chr << " stop "<<entries[i].stop.chr<<endl;
svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
read_str read;
if(ref_lens.find(entries[i].start.chr)==ref_lens.end()){
cerr<<"Warning undefined CHR in VCF vs. BAM header: "<<entries[i].start.chr<<endl;
}
if(ref_lens.find(entries[i].stop.chr)==ref_lens.end()){
cerr<<"Warning undefined CHR in VCF vs. BAM header: "<<entries[i].stop.chr<<endl;
}
read.coordinates.first = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
read.coordinates.second = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
if (entries[i].type == 4) { //ins?
if(entries[i].sv_len== Parameter::Instance()->huge_ins){
entries[i].sv_len++; // bad hack!
}
svs.stop.max_pos += (long) entries[i].sv_len;
read.coordinates.second+=(long) entries[i].sv_len;
// cout << "Parse: " << entries[i].start.pos << " " << entries[i].stop.pos << " " << svs.start.min_pos <<" "<<svs.stop.max_pos << endl;
}
read.SV = assign_type(entries[i].type);
read.strand = entries[i].strands;
read.type = 2; //called
read.length = entries[i].sv_len; //svs.stop.max_pos-svs.start.min_pos;//try
svs.support["input"] = read;
Breakpoint * br = new Breakpoint(svs, (long) entries[i].sv_len);
Breakpoint * br = new Breakpoint(svs, (long) entries[i].sv_len, read.SV);
final.insert(br, root_final);
//std::cout << "Print:" << std::endl;
//final.print(root_final);
} else {
cerr << "Invalid type found skipping" << endl;
invalid_svs++;
}
}
cerr << "Invalid types found skipping " << invalid_svs <<" entries." << endl;
//std::cout << "Print:" << std::endl;
//final.print(root_final);
entries.clear();
}
......@@ -81,9 +105,8 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
std::map<std::string, long> ref_lens;
fill_tree(final, root_final, ref, ref_lens);
std::cout << "Start parsing..." << std::endl;
int current_RefID = 0;
std::cout << "Start parsing: Chr "<<ref[current_RefID].RefName << std::endl;
//FILE * alt_allel_reads;
FILE * ref_allel_reads;
......@@ -91,6 +114,7 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
}
Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
long ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName];
long num_reads = 0;
while (!tmp_aln->getQueryBases().empty()) {
......@@ -166,7 +190,7 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
}
}
std::cout << "Print:" << std::endl;
//std::cout << "Print:" << std::endl;
//final.print(root_final);
//filter and copy results:
......@@ -186,4 +210,5 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
points[i]->predict_SV();
printer->printSV(points[i]); //redo! Ignore min support + STD etc.
}
std::cout << "Fin!" << std::endl;
}
......@@ -96,6 +96,7 @@ short get_type(std::string type) {
} else if (strncmp(type.c_str(), "BND", 3) == 0) { //can be inv/inter/tra
return 5;
}
//std::cout << "type:" << type[0] << type[1] << type[2] << std::endl;
return -1;
}
......@@ -276,8 +277,8 @@ std::string get_most_effect(std::string alt, int ref) {
}
std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
size_t buffer_size = 200000000;
char*buffer = new char[buffer_size];
//size_t buffer_size = 200000000;
std::string buffer; //= new char[buffer_size];
std::ifstream myfile;
myfile.open(filename.c_str(), std::ifstream::in);
if (!myfile.good()) {
......@@ -286,8 +287,8 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
}
std::vector<strvcfentry> calls;
myfile.getline(buffer, buffer_size);
//myfile.getline(buffer, buffer_size);
getline(myfile, buffer);
int num = 0;
while (!myfile.eof()) {
if (buffer[0] != '#') {
......@@ -306,7 +307,7 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
tmp.num_reads.first = 0;
tmp.num_reads.second = 0;
tmp.sv_len = -1;
for (size_t i = 0; i < buffer_size && buffer[i] != '\0' && buffer[i] != '\n'; i++) {
for (size_t i = 0; i < buffer.size() && buffer[i] != '\0' && buffer[i] != '\n'; i++) {
if (count == 0 && buffer[i] != '\t') {
tmp.start.chr += buffer[i];
......@@ -328,7 +329,7 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
tmp.stop = parse_stop(&buffer[i]);
//std::cout<<"Stop:"<<tmp.stop.pos<<std::endl;
}
if (count == 7 && strncmp(&buffer[i], "SVTYPE=", 7) == 0) {
if (count == 7 && (tmp.type == -1 && strncmp(&buffer[i], "SVTYPE=", 7) == 0)) {
tmp.type = get_type(std::string(&buffer[i + 7]));
}
if (count == 7 && strncmp(&buffer[i], ";SU=", 4) == 0) { //for lumpy!
......@@ -376,18 +377,13 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
tmp.num_reads = parse_delly(&buffer[i]);
}
if (count == 4 && buffer[i - 1] == '<') {
if (count == 4 && (tmp.type == -1 && buffer[i - 1] == '<')) {
tmp.type = get_type(std::string(&buffer[i]));
}
if (tmp.stop.pos == -1 && (count == 4 && (buffer[i - 1] == '[' || buffer[i - 1] == ']'))) {
tmp.stop = parse_pos(&buffer[i - 1]);
}
if (count == 9 && buffer[i - 1] == '\t') {
tmp.calls[filename] = std::string(&buffer[i]);
break;
}
if (count < 9) {
tmp.header += buffer[i];
}
......@@ -434,14 +430,7 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
tmp.sv_len = abs(tmp.start.pos - tmp.stop.pos);
}
if ((strcmp(tmp.start.chr.c_str(), tmp.stop.chr.c_str()) != 0 || (tmp.sv_len >= min_svs))) { // || tmp.type==4
std::size_t found = tmp.stop.chr.find("chr");
if (found != std::string::npos) {
tmp.stop.chr.erase(tmp.stop.chr.begin() + found, tmp.stop.chr.begin() + found + 3);
}
found = tmp.start.chr.find("chr");
if (found != std::string::npos) {
tmp.start.chr.erase(tmp.start.chr.begin() + found, tmp.start.chr.begin() + found + 3);
}
if (tmp.type == 5) { //BND
if (strcmp(tmp.stop.chr.c_str(), tmp.start.chr.c_str()) == 0) {
......@@ -452,11 +441,12 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
}
calls.push_back(tmp);
}
tmp.calls.clear();
} else {
}
myfile.getline(buffer, buffer_size);
getline(myfile, buffer);
}
myfile.close();
//std::cout << calls.size() << std::endl;
......
......@@ -8,7 +8,7 @@
#include "BedpePrinter.h"
void BedpePrinter::print_header() {
fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\n");
fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\tFILTER\n");
}
void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
......@@ -29,18 +29,35 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
std::string chr;
std::string strands = SV->get_strand(2);
int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr);
//start coordinates:
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
fprintf(file, "%i", pos);
fprintf(file, "%c", '\t');
fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().start.max_pos, ref, chr));
fprintf(file, "%c", '\t');
pos = IPrinter::calc_pos(SV->get_coordinates().stop.min_pos, ref, chr);
//stop coordinates
string chr_start;
int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr_start);
long end_coord = SV->get_coordinates().stop.min_pos;
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
end_coord = std::max((SV->get_coordinates().stop.min_pos -(long)SV->get_length()), (long)start);
}
pos = IPrinter::calc_pos(end_coord, ref, chr);
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
fprintf(file, "%i", pos);
fprintf(file, "%c", '\t');
fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().stop.max_pos, ref, chr));
end_coord = SV->get_coordinates().stop.max_pos;
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
end_coord = std::max((SV->get_coordinates().stop.max_pos - (long) SV->get_length()), (long)start);
}
fprintf(file, "%i", IPrinter::calc_pos(end_coord, ref, chr));
fprintf(file, "%c", '\t');
fprintf(file, "%i", id);
id++;
......@@ -55,21 +72,37 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%c", '\t');
fprintf(file, "%i", SV->get_support());
fprintf(file, "%c", '\t');
pos = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
fprintf(file, "%s", chr.c_str());
fprintf(file, "%s", chr_start.c_str());
fprintf(file, "%c", '\t');
fprintf(file, "%i", pos);
fprintf(file, "%i", start);
fprintf(file, "%c", '\t');
pos = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
end_coord = SV->get_coordinates().stop.most_support;
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
end_coord = std::max((SV->get_coordinates().stop.most_support - (long) SV->get_length()), (long)start);
}
pos = IPrinter::calc_pos(end_coord, ref, chr);
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
fprintf(file, "%i", pos);
fprintf(file, "%c", '\t');
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) {
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {//!
fprintf(file, "%s", "NA");
} else {
fprintf(file, "%i", SV->get_length());
}
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
fprintf(file, "%s", "\tUNRESOLVED");
} else if (std_quant.first < 10 && std_quant.second < 10) {
fprintf(file, "%s", "\tPRECISE");
} else {
fprintf(file, "%s", "\tIMPRECISE");
}
//fprintf(file, "%c", '\t');
//fprintf(file, "%i", SV->get_support());
fprintf(file, "%c", '\n');
......
......@@ -40,15 +40,18 @@ void VCFPrinter::print_header() {
fprintf(file, "%s", "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Type of approach used to detect SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
fprintf(file, "%s", "##INFO=<ID=STD_quant_start,Number=.,Type=Integer,Description=\"STD of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=STD_quant_stop,Number=.,Type=Integer,Description=\"STD of the stop breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=.,Type=Integer,Description=\"Kurtosis value of the start breakpoints accross the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=.,Type=Integer,Description=\"Kurtosis value of the stop breakpoints accross the reads.\">\n");
if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) {
fprintf(file, "%s", "##INFO=<ID=RNAMES,Number=1,Type=String,Description=\"Names of reads supporting SVs (comma seperated)\">\n");
}
fprintf(file, "%s", "##INFO=<ID=STD_quant_start,Number=A,Type=Integer,Description=\"STD of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=STD_quant_stop,Number=A,Type=Integer,Description=\"STD of the stop breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Integer,Description=\"Kurtosis value of the start breakpoints accross the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Integer,Description=\"Kurtosis value of the stop breakpoints accross the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
fprintf(file, "%s", "##INFO=<ID=STRANDS,Number=.,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
fprintf(file, "%s", "##INFO=<ID=AF,Number=.,Type=Integer,Description=\"Allele Frequency.\">\n");
fprintf(file, "%s", "##INFO=<ID=ZMW,Number=.,Type=Integer,Description=\"Number of ZMWs (Pacbio) supporting SV.\">\n");
fprintf(file, "%s", "##INFO=<ID=STRANDS,Number=A,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
fprintf(file, "%s", "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency.\">\n");
fprintf(file, "%s", "##INFO=<ID=ZMW,Number=A,Type=Integer,Description=\"Number of ZMWs (Pacbio) supporting SV.\">\n");
fprintf(file, "%s", "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
fprintf(file, "%s", "##FORMAT=<ID=DR,Number=1,Type=Integer,Description=\"# high-quality reference reads\">\n");
fprintf(file, "%s", "##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality variant reads\">\n");
......@@ -86,7 +89,12 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%i", id);
id++;
int end = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr);
long end_coord=SV->get_coordinates().stop.most_support;
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
end_coord=std::max((SV->get_coordinates().stop.most_support - (long)SV->get_length()),(long) start);
}
int end = IPrinter::calc_pos(end_coord, ref, chr);
std::string strands = SV->get_strand(1);
fprintf(file, "%s", "\tN\t");
if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
......@@ -112,7 +120,14 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", IPrinter::get_type(SV->get_SVtype()).c_str());
fprintf(file, "%c", '>');
}
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
fprintf(file, "%s", "\t.\tUNRESOLVED\t");
}else{
fprintf(file, "%s", "\t.\tPASS\t");
}
if (std_quant.first < 10 && std_quant.second < 10) {
fprintf(file, "%s", "PRECISE");
} else {
......@@ -126,11 +141,11 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", chr.c_str());
fprintf(file, "%s", ";END=");
if (SV->get_SVtype() & INS) {
fprintf(file, "%i", std::max((int) (end - SV->get_length()), start));
} else {
//if (SV->get_SVtype() & INS) {
// fprintf(file, "%i", std::max((int) (end - SV->get_length()), start));
//} else {
fprintf(file, "%i", end);
}
//}
}
if (zmws != 0) {
fprintf(file, "%s", ";ZMW=");
......@@ -160,8 +175,8 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", SV->get_supporting_types().c_str());
fprintf(file, "%s", ";SVLEN=");
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && !SV->get_types().is_SR) {
fprintf(file, "%s", "NA");
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {//!
fprintf(file, "%i", 999999999);
} else {
fprintf(file, "%i", SV->get_length());
}
......
......@@ -291,12 +291,11 @@ void Breakpoint::calc_support() {
this->sv_type = eval_type(sv);
if (get_SVtype() & TRA) { // we cannot make assumptions abut support yet.
set_valid((bool) (get_support() > 1)); // this is needed as we take each chr independently and just look at the primary alignment
set_valid((bool) (get_support() >= 1)); // this is needed as we take each chr independently and just look at the primary alignment
} else if (get_support() >= Parameter::Instance()->min_support) {
predict_SV();
set_valid((bool) (get_length() > Parameter::Instance()->min_length));
}
}
char Breakpoint::eval_type(ushort* SV) {
......@@ -335,11 +334,15 @@ char Breakpoint::eval_type(ushort* SV) {
}
long get_median(std::vector<long> corrds) {
sort(corrds.begin(), corrds.end());
if (corrds.size() % 2 == 0) {
return (corrds[corrds.size() / 2 - 1] + corrds[corrds.size() / 2]) / 2;
}
return corrds[corrds.size() / 2];
if (corrds.size() == 1) {
return corrds[0];
}
return corrds[(int) corrds.size() / 2];
}
void Breakpoint::predict_SV() {
......@@ -355,10 +358,14 @@ void Breakpoint::predict_SV() {
std::vector<long> stops2;
std::vector<long> lengths2;
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end(); i++) {
bool scan_reads=true;
if(positions.support.find("input") != positions.support.end() && !Parameter::Instance()->change_coords){
scan_reads=false;
}
if (((*i).second.SV & this->sv_type) && strncmp((*i).first.c_str(), "input", 5) != 0) { // && !((*i).second.SV & INS && (*i).second.length==Parameter::Instance()->huge_ins)) { ///check type
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end() && scan_reads; i++) {
if (((*i).second.SV & this->sv_type) && strncmp((*i).first.c_str(), "input", 5) != 0) { // && !((*i).second.SV & INS && (*i).second.length==Parameter::Instance()->huge_ins)) { ///check type
if ((*i).second.coordinates.first != -1) {
if ((*i).second.length != Parameter::Instance()->huge_ins) {
if (starts.find((*i).second.coordinates.first) == starts.end()) {
......@@ -419,7 +426,7 @@ void Breakpoint::predict_SV() {
int maxim = 0;
long coord = 0;
if (num > 0) {
//find the start coordinate:
for (map<long, int>::iterator i = starts.begin(); i != starts.end(); i++) {
if ((*i).second > maxim) {
coord = (*i).first;
......@@ -433,6 +440,7 @@ void Breakpoint::predict_SV() {
}
this->indel_sequence = "";
//find the stop coordinate:
maxim = 0;
coord = 0;
......@@ -449,7 +457,7 @@ void Breakpoint::predict_SV() {
} else {
this->positions.stop.most_support = coord;
}
//find the length:
if (!(this->get_SVtype() & INS)) { //all types but Insertions:
this->length = this->positions.stop.most_support - this->positions.start.most_support;
......@@ -463,20 +471,18 @@ void Breakpoint::predict_SV() {
coord = (*i).first;
maxim = (*i).second;
}
}
if (maxim < 3) {
if (maxim == 0) { //for forcecalling
this->length = this->positions.stop.most_support - this->positions.start.most_support;
} else if (maxim < 3) {
this->length = get_median(lengths2);
} else {
this->length = coord;
}
// if(del)
// cout << "third len: " << this->length << endl; // problem!
}
starts.clear();
stops.clear();
for (size_t i = 0; i < strands.size(); i++) {
maxim = 0;
std::string id;
......@@ -494,7 +500,6 @@ void Breakpoint::predict_SV() {
}
strands.clear();
std::map<std::string, read_str>::iterator tmp = positions.support.begin();
int start_prev_dist = 1000;
int stop_prev_dist = 1000;
......@@ -507,14 +512,17 @@ void Breakpoint::predict_SV() {
tmp++;
}
}
if (num == 0 && positions.support.find("input") != positions.support.end()) {
this->positions.stop.most_support = this->positions.stop.max_pos;
this->positions.start.most_support = this->positions.start.min_pos;
this->length = this->positions.stop.max_pos - this->positions.start.min_pos;
this->supporting_types = "";
if (positions.support.find("input") != positions.support.end()) {
if (num == 0) {
//cout << "Force called: "<< this->positions.support["input"].length << endl;
this->positions.start.most_support = this->positions.support["input"].coordinates.first;
this->positions.stop.most_support = this->positions.support["input"].coordinates.second;
//this->length = this->positions.support["input"].length;
}
this->supporting_types += "input";
}
this->supporting_types = "";
if (aln) {
this->type.is_ALN = true;
this->supporting_types += "AL";
......@@ -613,7 +621,7 @@ std::string Breakpoint::get_strand(int num_best) {
#include "Detect_Breakpoints.h"
std::string Breakpoint::to_string() {
std::stringstream ss;
if (positions.support.size() > 1) {
if (positions.support.size() >= 1) {
ss << "\t\tTREE: ";
ss << TRANS_type(this->get_SVtype());
ss << " ";
......