Skip to content
Commits on Source (2)
......@@ -12,12 +12,12 @@ HTSLD_LIB=
CXX=g++ -std=c++0x
#COMPILER FLAGS
CXXFLAG_REL=-O2
CXXFLAG_REL=-O3
CXXFLAG_DBG=-g
CXXFLAG_WRN=-Wall -Wextra -Wno-sign-compare -Wno-unused-local-typedefs -Wno-deprecated -Wno-unused-parameter
#BASE LIBRARIES
LIB_FLAGS=-Wl,-Bstatic -lz -lgsl -lblas -lbz2 -Wl,-Bdynamic -lm -lpthread
LIB_FLAGS=-lz -lgsl -lblas -lbz2 -llzma -lgslcblas -lm -lpthread
#FILE LISTS
BFILE=bin/QTLtools
......@@ -27,16 +27,35 @@ CFILE=$(shell find src -name *.cpp)
OFILE=$(shell for file in `find src -name *.cpp`; do echo obj/$$(basename $$file .cpp).o; done)
VPATH=$(shell for file in `find src -name *.cpp`; do echo $$(dirname $$file); done)
#DEFAULT VERSION (I.E. UNIGE DESKTOP RELEASE VERSION)
all: desktop
#DEFAULT VERSION (SET UP THE VARIABLES IN THE BEGINING OF THE MAKEFILE)
all: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
all: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
all: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
all: LDFLAG=$(CXXFLAG_REL)
all: $(BFILE)
#DEFAULT DEBUG VERSION (SET UP THE VARIABLES IN THE BEGINING OF THE MAKEFILE)
all-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
all-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
all-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
all-dbg: LDFLAG=$(CXXFLAG_DBG)
all-dbg: $(BFILE)
#DEFAULT VERSION (SET UP THE VARIABLES IN THE BEGINING OF THE MAKEFILE)
all-static: LIB_FLAGS=-Wl,-Bstatic -lz -lgsl -lblas -lbz2 -llzma -lgslcblas -Wl,-Bdynamic -lm -lpthread
all-static: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
all-static: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
all-static: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
all-static: LDFLAG=$(CXXFLAG_REL)
all-static: $(BFILE)
#UNIGE DESKTOP RELEASE VERSION
desktop: RMATH_INC=$(HOME)/Tools/R-3.2.2/src/include
desktop: RMATH_LIB=$(HOME)/Tools/R-3.2.2/src/nmath/standalone
desktop: HTSLD_INC=$(HOME)/Tools/htslib-1.3.1
desktop: HTSLD_LIB=$(HOME)/Tools/htslib-1.3.1
desktop: HTSLD_INC=$(HOME)/Tools/htslib-1.3
desktop: HTSLD_LIB=$(HOME)/Tools/htslib-1.3
desktop: BOOST_INC=/usr/include
desktop: BOOST_LIB=/usr/lib
desktop: BOOST_LIB=/usr/lib/x86_64-linux-gnu
desktop: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
desktop: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
desktop: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
......@@ -44,15 +63,24 @@ desktop: LDFLAG=$(CXXFLAG_REL)
desktop: $(BFILE)
#UNIGE DESKTOP DEBUG VERSION
desktop-dbg: desktop
desktop-dbg: RMATH_INC=$(HOME)/Tools/R-3.2.2/src/include
desktop-dbg: RMATH_LIB=$(HOME)/Tools/R-3.2.2/src/nmath/standalone
desktop-dbg: HTSLD_INC=$(HOME)/Tools/htslib-1.3
desktop-dbg: HTSLD_LIB=$(HOME)/Tools/htslib-1.3
desktop-dbg: BOOST_INC=/usr/include
desktop-dbg: BOOST_LIB=/usr/lib/x86_64-linux-gnu
desktop-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
desktop-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
desktop-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
desktop-dbg: LDFLAG=$(CXXFLAG_DBG)
desktop-dbg: $(BFILE)
#VITAL-IT RELEASE VERSION
cluster: RMATH_INC=/software/R/3.1.1/include
cluster: RMATH_LIB=/software/R/3.1.1/lib64
cluster: HTSLD_INC=/software/UHTS/Analysis/samtools/1.2/include
cluster: HTSLD_LIB=/software/UHTS/Analysis/samtools/1.2/lib64
cluster: LIB_FLAGS=-lz -lgsl -lblas -lbz2 -lm -lpthread -lgslcblas -llzma
cluster: RMATH_INC=/software/R/3.4.2/include
cluster: RMATH_LIB=/software/R/3.4.2/lib64
cluster: HTSLD_INC=/software/UHTS/Analysis/samtools/1.4/include
cluster: HTSLD_LIB=/software/UHTS/Analysis/samtools/1.4/lib64
cluster: BOOST_INC=/software/include
cluster: BOOST_LIB=/software/lib64
cluster: CXXFLAG=$(CXXFLAG_REL) $(CXXFLAG_WRN)
......@@ -62,9 +90,18 @@ cluster: LDFLAG=$(CXXFLAG_REL)
cluster: $(BFILE)
#VITAL-IT DEBUG VERSION
cluster-dbg: cluster
cluster-dbg: LIB_FLAGS=-lz -lgsl -lblas -lbz2 -lm -lpthread -lgslcblas -llzma
cluster-dbg: RMATH_INC=/software/R/3.4.2/include
cluster-dbg: RMATH_LIB=/software/R/3.4.2/lib64
cluster-dbg: HTSLD_INC=/software/UHTS/Analysis/samtools/1.4/include
cluster-dbg: HTSLD_LIB=/software/UHTS/Analysis/samtools/1.4/lib64
cluster-dbg: BOOST_INC=/software/include
cluster-dbg: BOOST_LIB=/software/lib64
cluster-dbg: CXXFLAG=$(CXXFLAG_DBG) $(CXXFLAG_WRN)
cluster-dbg: LDFLAG=$(CXXFLAG_DBG)
cluster-dbg: IFLAG=-Ilib/OTools -Ilib -I$(RMATH_INC) -I$(HTSLD_INC) -I$(BOOST_INC)
cluster-dbg: LIB_FILES=$(RMATH_LIB)/libRmath.a $(HTSLD_LIB)/libhts.a $(BOOST_LIB)/libboost_iostreams.a $(BOOST_LIB)/libboost_program_options.a
cluster-dbg: $(BFILE)
#MAC RELEASE VERSION
mac: RMATH_INC=$(HOME)/Libraries/R-3.2.2/src/include
......@@ -132,7 +169,7 @@ obj/union_%.o: union_%.cpp union_data.h src/common/data.h src/common/filter.h $(
obj/extract_%.o: extract_%.cpp extract_data.h src/common/data.h src/common/filter.h $(TFILE)
$(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
obj/quan_%.o: quan_%.cpp quan_data.h src/common/data.h src/common/filter.h $(TFILE)
obj/quan_%.o: quan_%.cpp quan_data.h quan_xxhash.h src/common/data.h src/common/filter.h $(TFILE)
$(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
obj/ase_%.o: ase_%.cpp ase_data.h src/common/data.h src/common/filter.h $(TFILE)
......@@ -144,6 +181,12 @@ obj/bamstat_%.o: bamstat_%.cpp bamstat_data.h src/common/data.h src/common/filte
obj/fdensity_%.o: fdensity_%.cpp fdensity_data.h src/common/data.h src/common/filter.h $(TFILE)
$(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
obj/rep_%.o: rep_%.cpp rep_data.h src/common/data.h src/common/filter.h $(TFILE)
$(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
obj/gwas_%.o: gwas_%.cpp gwas_data.h src/common/data.h src/common/filter.h $(TFILE)
$(CXX) -o $@ -c $< $(CXXFLAG) $(IFLAG)
clean:
rm -f obj/*.o $(BFILE)
......@@ -189,4 +232,10 @@ clean-bamstat:
clean-fdensity:
rm -f obj/fdensity_*.o $(BFILE)
clean-rep:
rm -f obj/rep_*.o $(BFILE)
clean-gwas:
rm -f obj/gwas_*.o $(BFILE)
\ No newline at end of file
......@@ -77,7 +77,7 @@ public:
for (set < unsigned int >::iterator itNM = i_nonmissing.begin(); itNM != i_nonmissing.end() ; ++itNM) {
float value;
std::istringstream in(covariate[*itNM]);
if (!(in >> value)) {
if (!(in >> value) || in.rdbuf()->in_avail() != 0) {
factors.insert(covariate[*itNM]);
isAlphabetic = true;
} else isNumeric = true;
......@@ -88,12 +88,12 @@ public:
vector < vector < float > > additional_hcov;
if (factors.size() == 0) {
additional_hcov = vector < vector < float > > (1, vector < float > (n_samples, 0.0));
for (int i = 0 ; i < covariate.size() ; i++) additional_hcov[0][i] = std::stof(covariate[i]);
for (int i = 0 ; i < covariate.size() ; i++) if (i_nonmissing.count(i)) additional_hcov[0][i] = std::stof(covariate[i]);
} else if (factors.size() > 1) {
factors.erase(factors.begin());
for (set < string > ::iterator itF = factors.begin(); itF != factors.end() ; itF++) {
additional_hcov.push_back(vector < float > (n_samples, 0.0));
for (int i = 0 ; i < n_samples ; i++) additional_hcov.back()[i] = (covariate[i] == (*itF));
for (int i = 0 ; i < n_samples ; i++) if (i_nonmissing.count(i)) additional_hcov.back()[i] = (covariate[i] == (*itF));
}
} else return COV_DROP;
......
......@@ -49,7 +49,7 @@ public:
float n;
std::istringstream in(str);
if (!(in >> n)) return false;
return true;
return in.rdbuf()->in_avail() == 0;
}
template < class T >
......
......@@ -27,6 +27,8 @@
#include "mode_union/union_data.h"
#include "mode_bamstat/bamstat_data.h"
#include "mode_fdensity/fdensity_data.h"
#include "mode_rep/rep_data.h"
#include "mode_gwas/gwas_data.h"
void printModes(){
vrb.ctitle("Usage:");
......@@ -47,6 +49,8 @@ void printModes(){
vrb.print(" extract Data extraction mode");
vrb.print(" quan Quantification mode");
vrb.print(" ase Measure allelic imbalance at every het genotype");
vrb.print(" rep Replicate QTL associations into independent data set");
vrb.print(" gwas GWAS tests");
}
int main(int argc, char ** argv) {
......@@ -69,7 +73,7 @@ int main(int argc, char ** argv) {
vrb.bullet("Webpage : https://qtltools.github.io/qtltools/");
vrb.bullet("Version : " + string(QTLTOOLS_VERSION));
vrb.bullet("Date : " + running_timer.date());
if (!match_mode) vrb.bullet("Citation: A complete tool set for molecular QTL discovery and analysis, https://doi.org/10.1101/068635");
if (!match_mode) vrb.bullet("Citation: A complete tool set for molecular QTL discovery and analysis, https://doi.org/10.1038/ncomms15452");
else vrb.bullet("Citation: MBV; a method to solve sample mislabeling and detect technical bias in large combined genotype and sequencing assay data sets");
//4. Switch mode
......@@ -111,7 +115,7 @@ int main(int argc, char ** argv) {
else if (strcmp(argv[1], "rtc-union") == 0) union_main(args);
//5.11. QUANTIFICATION mode
else if (strcmp(argv[1], "quan") == 0) quan_main(args);
else if (strcmp(argv[1], "quan") == 0) quan2_main(args);
//5.12. ASE mode
else if (strcmp(argv[1], "ase") == 0) ase_main(args);
......@@ -122,7 +126,13 @@ int main(int argc, char ** argv) {
//5.14. FDENSITY mode
else if (strcmp(argv[1], "fdensity") == 0) fdensity_main(args);
//5.15. UNRECOGNIZED mode
//5.15. REPLICATION mode
else if (strcmp(argv[1], "rep") == 0) rep_main(args);
//5.16. GWAS mode
else if (strcmp(argv[1], "gwas") == 0) gwas_main(args);
//5.17. UNRECOGNIZED mode
else if (strcmp(argv[1], "--help") == 0) {
printModes();
exit(EXIT_SUCCESS);
......
......@@ -19,7 +19,7 @@
#include "otools.h"
#include "filter.h"
#define QTLTOOLS_VERSION "1.1"
#define QTLTOOLS_VERSION "1.2"
class data {
public:
......
......@@ -79,6 +79,10 @@ public:
inclusion_map.insert(value);
}
void addInclusion(vector < string > & values){
for (int e = 0 ; e < values.size() ; e ++) inclusion_map.insert(values[e]);
}
void addExclusion(string value){
exclusion_map.insert(value);
}
......
......@@ -20,6 +20,7 @@
#define CIS_PERM 1
#define CIS_NOMI 2
#define CIS_COND 3
#define CIS_EXTR 4
//AGGREGATION MODES
#define GRP_NONE 0
......
......@@ -39,7 +39,7 @@ void cis_main(vector < string > & argv) {
boost::program_options::options_description opt_modes ("\x1B[32mAnalysis type\33[0m");
opt_modes.add_options()
("permute", boost::program_options::value< int >(), "MODE1: PERMUTATION PASS.")
("nominal", boost::program_options::value< double >(), "MODE2: NOMINAL PASS.")
("nominal", boost::program_options::value< string >(), "MODE2: NOMINAL PASS.")
("mapping", boost::program_options::value< string >(), "MODE3: MAPPING PASS.");
boost::program_options::options_description opt_aggr ("\x1B[32mPhenotype aggregation methods\33[0m");
......@@ -102,9 +102,12 @@ void cis_main(vector < string > & argv) {
//ONLY MODE2: NOMINAL PASS
if (D.options.count("nominal")) {
D.mode = CIS_NOMI;
if (D.options["nominal"].as < double >() <= 0 || D.options["nominal"].as < double >() > 1.0) vrb.error("Significance threshold is outside of the range ]0,1]");
else vrb.bullet("TASK: Report all nominal associations with p <= " + stb.str(D.options["nominal"].as < double >()));
D.threshold = D.options["nominal"].as < double >();
string argument = D.options["nominal"].as < string >();
if (stb.numeric(argument)) {
D.threshold = atof(argument.c_str());
if (D.threshold <= 0 || D.threshold> 1.0) vrb.error("Significance threshold is outside of the range ]0,1]");
else vrb.bullet("TASK: Report all nominal associations with p <= " + stb.str(D.threshold));
} else vrb.bullet("TASK: Report all nominal associations with p below thresholds in [" + argument +"]");
}
//ONLY MODE3: MAPPING PASS
if (D.options.count("mapping")) {
......@@ -183,7 +186,6 @@ void cis_main(vector < string > & argv) {
//---------------
D.processBasicOptions();
D.readSampleFromBED(D.options["bed"].as < string > ()); //Read samples in BED
htsFile * fp = hts_open(D.options["vcf"].as < string > ().c_str(),"r");
if (fp->format.format == sam) D.readSampleFromBED(D.options["vcf"].as < string > ());
else D.readSampleFromVCF(D.options["vcf"].as < string > ());
......@@ -195,6 +197,10 @@ void cis_main(vector < string > & argv) {
D.readGenotypes(D.options["vcf"].as < string > ()); //Read data in VCF
if (D.options.count("cov")) D.readCovariates(D.options["cov"].as < string > ());
if (D.options.count("mapping")) D.readThresholds(D.options["mapping"].as < string > ());
if (D.options.count("nominal")) {
string argument = D.options["nominal"].as < string >();
if (!stb.numeric(argument)) D.readThresholds(argument);
}
//------------------------
// 10. INITIALIZE ANALYSIS
......
......@@ -104,7 +104,10 @@ void cis_data::runNominalPass(string fout) {
//STEP10: PRINT RESULTS IN FILE
for (unsigned int p = 0 ; p < group_idx[i_group].size() ; p ++) {
for (unsigned int v = 0 ; v < variant_indexes.size() ; v ++) {
if (pval_nom[v * group_idx[i_group].size() + p] <= threshold) {
bool toBeWritten = false;
if ((phenotype_threshold.size() == 0) && (pval_nom[v * group_idx[i_group].size() + p] <= threshold)) toBeWritten = true;
if ((phenotype_threshold.size() != 0) && (pval_nom[v * group_idx[i_group].size() + p] <= phenotype_threshold[group_idx[i_group][p]])) toBeWritten = true;
if (toBeWritten) {
if (grp_mode == GRP_NONE) fdo << phenotype_id[group_idx[i_group][p]];
else fdo << phenotype_grp[group_idx[i_group][p]];
fdo << " " << phenotype_chr[group_idx[i_group][p]];
......
......@@ -34,7 +34,7 @@ public:
int covariate_count; //covariate number
vector < vector < string > > covariate_val; //covariate values
vector < string > covariate_id; //covariate ids
vector < string > covariate_target;
vector < vector < string > > covariate_target;
residualizer * covariate_engine; //covariate engine machinery
//QTL COVARIATES
......
......@@ -44,8 +44,9 @@ void correct_data::processBED(string fin, string fout) {
if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
for (int t = 6 ; t < tokens.size() ; t ++) {
mappingS.push_back(findSample(tokens[t]));
if (mappingS.back() >= 0) fdo << "\t" << tokens[t];
//if (mappingS.back() >= 0) fdo << "\t" << tokens[t];
}
for (int i = 0 ; i < sample_count ; i ++) fdo << "\t" << sample_id[i];
fdo << endl;
//Read phenotypes
......@@ -59,11 +60,22 @@ void correct_data::processBED(string fin, string fout) {
if (residualize) {
if (covariate_target.size() > 0) {
covariate_engine = new residualizer (sample_count);
int all_covariates = 0;
int qtl_covariates = 0;
for (int c = 0 ; c < covariate_count ; c ++) {
if (covariate_target[c] == "ALL") covariate_engine->push(covariate_val[c]);
if (covariate_target[c] == tokens[4]) covariate_engine->push(covariate_val[c]);
for (int g = 0 ; g < covariate_target[c].size() ; g ++) {
if (covariate_target[c][g] == "ALL") {
covariate_engine->push(covariate_val[c]);
all_covariates ++;
}
if (covariate_target[c][g] == tokens[3]) {
covariate_engine->push(covariate_val[c]);
qtl_covariates ++;
}
}
}
covariate_engine->build();
vrb.bullet (tokens[3] + " residualized for #cov_common = " + stb.str(all_covariates) + "\t#cov_qtl=" + stb.str(qtl_covariates));
}
covariate_engine->residualize(values);
if (covariate_target.size() > 0) {
......
......@@ -56,20 +56,35 @@ void correct_data::readQTLCovariates(string fqtl, string fvcf) {
string buffer; vector < string > tokens;
//File qtl
map < string, string > map_qtl;
map < string, vector < string > > map_qtl;
map < string, vector < string > > :: iterator it_map_qtl;
vrb.title("Reading QTLs in [" + fqtl + "]");
input_file fd (fqtl);
if (fd.fail()) vrb.error("Cannot open file!");
int n_variant_covariates = 0;
int n_variant_duplicates = 0;
while (getline(fd, buffer)) {
stb.split(buffer, tokens);
if (tokens.size() != 2) vrb.error("Wrong Incorrect number of columns, expected 2!");
map_qtl.insert(pair < string, string > (tokens[1], tokens[0]));
string variant = tokens[1];
string gene = tokens[0];
it_map_qtl = map_qtl.find(variant);
if (it_map_qtl == map_qtl.end()) {
vector < string > value = vector < string >(1, gene);
map_qtl.insert(pair < string, vector < string > > (tokens[1], value));
n_variant_covariates ++;
} else {
it_map_qtl->second.push_back(gene);
n_variant_duplicates ++;
}
vrb.bullet(stb.str(map_qtl.size()) + " QTL(s) read");
}
vrb.bullet(stb.str(n_variant_covariates) + " unique variants used as covariates");
vrb.bullet(stb.str(n_variant_duplicates) + " unique variants act on multiple genes");
fd.close();
//
for (int c = 0 ; c < covariate_count ; c ++) covariate_target.push_back("ALL");
for (int c = 0 ; c < covariate_count ; c ++) covariate_target.push_back(vector < string > (1, "ALL"));
//File VCF
vector < int > mappingS;
......@@ -103,9 +118,9 @@ void correct_data::readQTLCovariates(string fqtl, string fvcf) {
bcf_unpack(line, BCF_UN_STR);
string sid = string(line->d.id);
map < string, string > :: iterator itM = map_qtl.find(sid);
it_map_qtl = map_qtl.find(sid);
if (itM != map_qtl.end() && filter_covariate.check(sid)) {
if (it_map_qtl != map_qtl.end() && filter_covariate.check(sid)) {
covariate_val.push_back(vector < string > (sample_count, "0"));
for(int i = 0 ; i < n_samples ; i ++) {
if (mappingS[i] >= 0) {
......@@ -116,7 +131,8 @@ void correct_data::readQTLCovariates(string fqtl, string fvcf) {
}
}
}
covariate_target.push_back(itM->second);
covariate_target.push_back(vector < string >());
for (int g = 0 ; g < it_map_qtl->second.size() ; g ++) covariate_target.back().push_back(it_map_qtl->second[g]);
covariate_count++;
n_includedG++;
} else n_excludedG ++;
......
......@@ -30,7 +30,7 @@ void fdensity_main(vector < string > & argv) {
boost::program_options::options_description opt_parameters ("\x1B[32mParameters\33[0m");
opt_parameters.add_options()
("window", boost::program_options::value< int >()->default_value(1000000), "Window size arround TSS in bp.")
("window", boost::program_options::value< int >()->default_value(1000000), "Window size around TSS in bp.")
("bin", boost::program_options::value< int >()->default_value(1000), "Bin size in bp.");
D.option_descriptions.add(opt_files).add(opt_parameters);
......
......@@ -102,7 +102,7 @@ void fdensity_data::runDensityCalculation(string fout) {
for (int t = 0 ; t < tss_count ; t ++) {
vector < Interval < bool > > ann_in_bin;
Itree[t].findOverlapping(wfrom, wto, ann_in_bin);
n_annotation += ann_in_bin.size();
n_annotation += (ann_in_bin.size() > 0);
}
fdo << wfrom << " " << wto << " " << n_annotation << endl;
......
......@@ -26,13 +26,6 @@ void genrich_data::overlapGWASandQTL(string fout) {
vrb.bullet("#qtls=" + stb.str(qtl_idx.size()));
vrb.bullet("#gwas=" + stb.str(gwas_idx.size()));
vector < bool > overlap_qtl = vector < bool > (qtl_idx.size() , false);
for (int q = 0 ; q < qtl_idx.size() ; q ++) for (int g = 0 ; g < gwas_idx.size() && !overlap_qtl[q] ; g ++) overlap_qtl[q] = isSameSignal(gwas_idx[g], qtl_idx[q]);
unsigned int n_obs_overlap = 0;
for (int q = 0 ; q < qtl_idx.size() ; q ++) if (overlap_qtl[q]) n_obs_overlap ++;
vrb.bullet("#observed overlap=" + stb.str(n_obs_overlap) + " (=" + stb.str(n_obs_overlap * 100.0 / qtl_idx.size(), 2) + " %)");
vrb.title("Classifying null variants");
vector < vector < unsigned int > > null_sets = vector < vector < unsigned int > > (bin_min_maf.size());
for (int v = 0 ; v < genotype_pos.size() ; v ++) if (!genotype_qtl[v] && genotype_bin[v] >= 0) null_sets[genotype_bin[v]].push_back(v);
......@@ -43,6 +36,29 @@ void genrich_data::overlapGWASandQTL(string fout) {
}
vrb.bullet("#null variants per bin = " + stb.str(bs_null_count.mean(),3) + " +/-" + stb.str(bs_null_count.sd(), 3));
vrb.title("Discarding poorly populated bins");
int n_discarded = 0, n_removed_qtl = 0;
for (int b = 0 ; b < null_sets.size() ; b ++) if (null_sets[b].size() < 3) {
for (int q = 0 ; q < qtl_idx.size() ; q ++) if (genotype_bin[qtl_idx[q]] == b) {
genotype_bin[qtl_idx[q]] = -1;
n_removed_qtl ++;
}
n_discarded ++;
}
vrb.bullet("#bin discarded=" + stb.str(n_discarded) + " out of " + stb.str(null_sets.size()));
vrb.bullet("#qtl discarded=" + stb.str(n_removed_qtl) + " out of " + stb.str(qtl_idx.size()));
vrb.title("Calculating overlap between OBSERVED set of variants and GWAS hits");
vector < bool > overlap_qtl = vector < bool > (qtl_idx.size() , false);
for (int q = 0 ; q < qtl_idx.size() ; q ++) {
if (genotype_bin[qtl_idx[q]] >= 0) {
for (int g = 0 ; g < gwas_idx.size() && !overlap_qtl[q] ; g ++) overlap_qtl[q] = isSameSignal(gwas_idx[g], qtl_idx[q]);
}
}
unsigned int n_obs_overlap = 0;
for (int q = 0 ; q < qtl_idx.size() ; q ++) if (overlap_qtl[q]) n_obs_overlap ++;
vrb.bullet("#observed overlap=" + stb.str(n_obs_overlap) + " (=" + stb.str(n_obs_overlap * 100.0 / qtl_idx.size(), 2) + " %)");
vrb.title("Calculating overlap between NULL sets of variants and GWAS hits");
vrb.bullet("#permutations=" + stb.str(n_permutations));
basic_stats bs_null_overlap;
......@@ -53,17 +69,23 @@ void genrich_data::overlapGWASandQTL(string fout) {
//cerr << "1. Permutation " << p << endl;
vector < int > seq_null_qtl;
for (int q = 0 ; q < qtl_idx.size() ; q ++) {
if (genotype_bin[qtl_idx[q]] >= 0) {
unsigned int idx_bin = genotype_bin[qtl_idx[q]];
unsigned int idx_rnd = rng.getInt(null_sets[idx_bin].size());
//cerr << q << " " << idx_bin << " " << idx_rnd << " " << null_sets[idx_bin].size() << endl;
seq_null_qtl.push_back(null_sets[idx_bin][idx_rnd]);
} else seq_null_qtl.push_back(-1);
}
//cerr << "1. Size = " << seq_null_qtl.size() << endl;
//step2: work out overlap
//cerr << "2. Overlap " << p << endl;
overlap_qtl = vector < bool > (qtl_idx.size() , false);
for (int q = 0 ; q < qtl_idx.size() ; q ++) for (int g = 0 ; g < gwas_idx.size() && !overlap_qtl[q] ; g ++) overlap_qtl[q] = isSameSignal(gwas_idx[g], seq_null_qtl[q]);
for (int q = 0 ; q < qtl_idx.size() ; q ++) {
if (seq_null_qtl[q] >= 0) {
for (int g = 0 ; g < gwas_idx.size() && !overlap_qtl[q] ; g ++) overlap_qtl[q] = isSameSignal(gwas_idx[g], seq_null_qtl[q]);
}
}
//step3: count overlaps
//cerr << "3. Count overlaps " << p << endl;
......
......@@ -34,7 +34,7 @@ void genrich_data::readPhenotypes(string fbed) {
while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
stb.split(string(str.s), tokens);
if (str.l && str.s[0] != '#') {
if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
if (tokens.size() < 6) vrb.error("Incorrect number of columns!");
if (filter_phenotype.check(tokens[3])) {
int chr_idx = findCHR(tokens[0]);
if (chr_idx < 0) {
......
......@@ -40,7 +40,7 @@ void genrich_data::readReferenceGenotypes(string fvcf) {
vrb.bullet("#samples = " + stb.str(included_sample));
//Variant processing
unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0;
unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0;
int ngt, ngt_arr = 0, *gt_arr = NULL;
bcf1_t * line;
while(bcf_sr_next_line (sr)) {
......@@ -66,6 +66,7 @@ void genrich_data::readReferenceGenotypes(string fvcf) {
if (maf > 0.5) maf = 1.0 - maf;
if (maf >= threshold_maf) {
int dist_tss = getDistance(chr_idx, pos);
if (dist_tss < 1e6) {
string tmp_id = chr + "_" + stb.str(pos);
genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
genotype_chr.push_back(chr_idx);
......@@ -79,6 +80,7 @@ void genrich_data::readReferenceGenotypes(string fvcf) {
genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]);
}
}
} else n_excludedV_toofar++;
} else n_excludedV_rare ++;
} else n_excludedV_void ++;
} else n_excludedV_uchr ++;
......@@ -98,4 +100,5 @@ void genrich_data::readReferenceGenotypes(string fvcf) {
if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants");
}
/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.*/
#include "gwas_data.h"
void gwas_data::runGwasPassOnVCF(string fvcf, string fout) {
vrb.title("Sweep through genotype data in [" + fvcf + "]");
bcf_sweep_t * sw = bcf_sweep_init(fvcf.c_str());
if (!sw) vrb.error("Cannot open file for reading [" + fvcf + "]");
bcf_hdr_t * hdr = bcf_sweep_hdr(sw);
if (!hdr) vrb.error("Cannot read header!");
//Specify samples to be read
string concat_id=sample_id[0];
for (int i = 1; i < sample_id.size() ; i ++) concat_id+=","+sample_id[i];
bcf_hdr_set_samples(hdr, concat_id.c_str(), 0);
vrb.bullet("#samples=" + stb.str(sample_count));
unsigned long n_variants = 0, n_curr_tests = 0, n_prev_tests = 0;
int mDS = 0;
float * vDS = NULL;
bcf1_t * rec;
timer testing_speed_timer;
testing_speed_timer.clock();
output_file fdhits (fout);
if (fdhits.fail()) vrb.error("Cannot open file for writing [" + fout + "]");
while ( (rec = bcf_sweep_fwd(sw)) ) {
bcf_unpack(rec, BCF_UN_STR);
string vid = string(rec->d.id);
string chr = string(bcf_hdr_id2name(hdr, rec->rid));
int pos = rec->pos + 1;
if (rec->n_allele == 2) {
int n_captured = bcf_get_format_float(hdr, rec, "DS", &vDS, &mDS);
if (n_captured == sample_count) {
imputeGenotypes(vDS);
normalize(vDS);
for (int p = 0 ; p < phenotype_count ; p ++) {
double rcorr = fastCorrelation(phenotype_val[p], vDS);
double npval = getNominalPvalue(abs(rcorr), sample_count - 2);
fdhits << phenotype_id[p] << " " << chr << " " << pos << " " << vid << " " << npval << " " << rcorr << endl;
}
if ((n_curr_tests/phenotype_count) % 1000 == 0) {
unsigned int elapsed_msecs = testing_speed_timer.rel_time();
vrb.bullet("#variants=" + stb.str(n_variants) + "\t#tests=" + stb.str(n_curr_tests) + "\t" + stb.str(n_curr_tests) + "\tspeed=" + stb.str((n_curr_tests - n_prev_tests) * 1.0 / (elapsed_msecs * 1000), 2) + "MT/s");
testing_speed_timer.clock();
n_prev_tests = n_curr_tests;
}
n_curr_tests += phenotype_count;
}
}
n_variants ++;
}
unsigned int elapsed_msecs = testing_speed_timer.rel_time();
vrb.bullet("#variants=" + stb.str(n_variants) + "\t#tests=" + stb.str(n_curr_tests) + "\t" + stb.str(n_curr_tests) + "\tspeed=" + stb.str((n_curr_tests - n_prev_tests) * 1.0 / (elapsed_msecs * 1000), 2) + "MT/s");
n_prev_tests = n_curr_tests;
bcf_sweep_destroy(sw);
fdhits.close();
}
void gwas_data::runGwasPassOnBED(string fbed, string fout) {
vrb.title("Sweep through BED file [" + fbed + "]");
htsFile *fp = hts_open(fbed.c_str(),"r");
if (!fp) vrb.error("Cannot open file");
tbx_t *tbx = tbx_index_load(fbed.c_str());
if (!tbx) vrb.error("Cannot open index file");
kstring_t str = {0,0,0};
if (hts_getline(fp, KS_SEP_LINE, &str) <= 0 || !str.l || str.s[0] != tbx->conf.meta_char ) vrb.error("Cannot read header line!");
//Process sample names
vector < string > tokens;
vector < int > mappingS;
stb.split(string(str.s), tokens);
if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
for (int t = 6 ; t < tokens.size() ; t ++) mappingS.push_back(findSample(tokens[t]));
vrb.bullet("#samples=" + stb.str(sample_count));
//Read phenotypes
int n_tested_variants = 0, n_ntested_variants = 0;
vector < float > values = vector < float > (sample_count, 0.0);
output_file fdhits (fout);
if (fdhits.fail()) vrb.error("Cannot open file for writing [" + fout + "]");
while (hts_getline(fp, KS_SEP_LINE, &str) >= 0) {
stb.split(string(str.s), tokens);
if (str.l && str.s[0] != tbx->conf.meta_char) {
if (tokens.size() < 7) vrb.error("Incorrect number of columns!");
string vid = tokens[3];
string chr = tokens[0];
string sta = tokens[1];
string sto = tokens[2];
if (filter_genotype.check(vid)) {
for (int t = 6 ; t < tokens.size() ; t ++) {
if (mappingS[t-6] >= 0) {
if (tokens[t] == "NA") values[mappingS[t-6]] = bcf_float_missing;
else values[mappingS[t-6]] = stof(tokens[t]);
}
}
imputeValues(values);
normalize(values);
for (int p = 0 ; p < phenotype_count ; p ++) {
double rcorr = fastCorrelation(phenotype_val[p], values);
double npval = getNominalPvalue(abs(rcorr), sample_count - 2);
fdhits << phenotype_id[p] << " " << chr << " " << sta << " " << sto << " " << vid << " " << npval << " " << rcorr << endl;
}
n_tested_variants ++;
} else n_ntested_variants ++;
}
}
//Finalize & verbose
tbx_destroy(tbx);
if (hts_close(fp)) vrb.error("Cannot properly close file");
vrb.bullet(stb.str(n_tested_variants) + " variables tested");
if (n_ntested_variants > 0) vrb.bullet(stb.str(n_ntested_variants) + " variables not tested");
fdhits.close();
}
/*Copyright (C) 2015 Olivier Delaneau, Halit Ongen, Emmanouil T. Dermitzakis
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.*/
#ifndef _GWAS_DATA_H
#define _GWAS_DATA_H
//INCLUDES
#include "../common/data.h"
class gwas_data : public data {
public:
//PHENOTYPES
int phenotype_count; //phenotype number
vector < vector < float > > phenotype_val; //phenotype values
vector < string > phenotype_id; //phenotype ids
//COVARIATES
int covariate_count; //covariate number
vector < vector < string > > covariate_val; //covariate values
vector < string > covariate_id; //covariate ids
//CONSTRUCTOR / DESTRUCTOR
gwas_data();
~gwas_data();
void clear();
//READ OR GENERATE DATA
void readPhenotypes(string);
void readCovariates(string);
//GENOTYPE & PHENOTYPE MANAGEMENT
void computeDosages(int *, float *);
void imputeGenotypes(float *);
void imputePhenotypes();
void imputeValues(vector < float > &);
void normalizePhenotypes();
void normalize(float *);
void normalize(vector < float > &);
void residualizePhenotypes();
void normalTranformPhenotypes();
//COMPUTATION METHODS [ALL INLINES FOR SPEED]
double fastCorrelation(vector < float > &, float *);
double fastCorrelation(vector < float > &, vector < float > &);
double slowCorrelation(vector < float > &, float *);
double getNominalPvalue(double, double);
//ANALYSIS
void runGwasPassOnVCF(string, string);
void runGwasPassOnBED(string, string);
};
//***************************************************************//
//******************** DECLARE FUNCTIONS ************************//
//***************************************************************//
void gwas_main(vector < string > &);
//***************************************************************//
//******************** INLINE FUNCTIONS *************************//
//***************************************************************//
/*
* Fast implementation of inner_product optimized for 64 bytes cache lines.
*/
inline double gwas_data::fastCorrelation(vector < float > & P, float * G) {
int i = 0;
int repeat = (sample_count / 4);
int left = (sample_count % 4);
double sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
while (repeat --) {
sum0 += P[i] * G[i];
sum1 += P[i+1] * G[i+1];
sum2 += P[i+2] * G[i+2];
sum3 += P[i+3] * G[i+3];
i += 4;
}
switch (left) {
case 3: sum0 += P[i+2] * G[i+2];
case 2: sum0 += P[i+1] * G[i+1];
case 1: sum0 += P[i+0] * G[i+0];
case 0: ;
}
return sum0 + sum1 + sum2 + sum3;
}
inline double gwas_data::fastCorrelation(vector < float > & P, vector < float > & G) {
int i = 0;
int repeat = (sample_count / 4);
int left = (sample_count % 4);
double sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
while (repeat --) {
sum0 += P[i] * G[i];
sum1 += P[i+1] * G[i+1];
sum2 += P[i+2] * G[i+2];
sum3 += P[i+3] * G[i+3];
i += 4;
}
switch (left) {
case 3: sum0 += P[i+2] * G[i+2];
case 2: sum0 += P[i+1] * G[i+1];
case 1: sum0 += P[i+0] * G[i+0];
case 0: ;
}
return sum0 + sum1 + sum2 + sum3;
}
inline double gwas_data::slowCorrelation(vector < float > & P, float * G) {
double sum = 0.0;
for (int e = 0 ; e < sample_count ; e ++) sum += P[e] * G[e];
return sum;
}
inline double gwas_data::getNominalPvalue(double corr, double df) {
double pval = pf(df * corr * corr / (1 - corr * corr), 1, df, 0, 0);
if (pval <= std::numeric_limits<double>::min()) pval =std::numeric_limits<double>::min();
return pval;
}
#endif
......@@ -13,29 +13,29 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.*/
#include "quan_data.h"
void quan_data::read_Sample_Names(vector <string> &names){
if (names.size() == 0){
samples = bams;
}else if (names.size()==1){
input_file fd(names[0]);
if (fd.fail()){
//Assuming a single name
fd.close();
samples.push_back(names[0]);
}else{
//Assuming a file with names
string buffer;
vector < string > str;
while(getline(fd, buffer)) {
stb.split(buffer, str);
if (str.size()!=1) vrb.error("Expecting a single sample name per line in [" + names[0] + "] but got: " + buffer);
samples.push_back(str[0]);
#include "gwas_data.h"
gwas_data::gwas_data() {
sample_count = 0;
phenotype_count = 0;
covariate_count = 0;
}
fd.close();
void gwas_data::clear() {
sample_count = 0;
sample_id.clear();
phenotype_count = 0;
phenotype_val.clear();
phenotype_id.clear();
covariate_count = 0;
covariate_val.clear();
covariate_id.clear();
sample_count = 0;
phenotype_count = 0;
covariate_count = 0;
}
}else samples = names;
if (samples.size() != bams.size()) vrb.error("Sample names does not match with BAM files!");
gwas_data::~gwas_data() {
clear();
}