Skip to content
Commits on Source (4)
No preview for this file type
project(DiscoSNP++)
cmake_minimum_required(VERSION 2.6)
#add_compile_options(
# "-Wall" "-Wpedantic" "-Wextra" "-fexceptions"
# "$<$<CONFIG:DEBUG>:-O0;-g3;-ggdb>"
#)
################################################################################
# The version number.
################################################################################
SET (gatb-tool_VERSION_MAJOR 2)
SET (gatb-tool_VERSION_MINOR 2)
SET (gatb-tool_VERSION_PATCH 10)
SET (gatb-tool_VERSION_MINOR 4)
SET (gatb-tool_VERSION_PATCH 2)
IF (DEFINED MAJOR)
SET (gatb-tool_VERSION_MAJOR ${MAJOR})
......
......@@ -17,6 +17,8 @@ Uricaru R., Rizk G., Lacroix V., Quillery E., Plantard O., Chikhi R., Lemaitre C
Peterlongo, P., Riou, C., Drezen, E., Lemaitre, C. (2017). [DiscoSnp ++ : de novo detection of small variants from raw unassembled read set(s).](http://doi.org/https://doi.org/10.1101/209965) BioRxiv.
Gauthier, J., Mouden, C., Suchan, T., Alvarez, N., Arrigo, N., Riou, C., Lemaitre, C., Peterlongo, P. (2017). [DiscoSnp-RAD: de novo detection of small variants for population genomics](https://www.biorxiv.org/content/early/2017/11/09/216747). BioRxiv
## DiscoSnp++ or DiscoSnpRad
We propose a DiscoSnp++ adaptation for RAD-Seq data. A script, called `run_discoSnpRad.sh`, is adapted to this kind of data. See below for more details.
......
discosnp (2.4.2-1) UNRELEASED; urgency=medium
* Team upload.
* New upstream version
* Reported fixed typos upstream
- https://github.com/GATB/DiscoSnp/pull/15/
- https://github.com/GATB/DiscoSnp/pull/14/
* FTBFS:
-- UNABLE TO FIND A DIRECTORY FOR gatb-core...
Call Stack (most recent call first):
/usr/lib/x86_64-linux-gnu/cmake/GatbCore.cmake:36 (LOOKUP_PATH)
CMakeLists.txt:47 (include)
-- Configuring incomplete, errors occurred!
-- Steffen Moeller <moeller@debian.org> Tue, 17 Dec 2019 14:33:46 +0100
discosnp (2.3.0-3) unstable; urgency=medium
* debhelper-compat 12
......
......@@ -2,8 +2,10 @@ Author: Andreas Tille <tille@debian.org>
Last-Update: Mon, 21 Jan 2019 09:01:19 +0100
Description: Result of 2to3
--- a/scripts/filter_out_using_MAF.py
+++ b/scripts/filter_out_using_MAF.py
Index: discosnp/scripts/filter_out_using_MAF.py
===================================================================
--- discosnp.orig/scripts/filter_out_using_MAF.py
+++ discosnp/scripts/filter_out_using_MAF.py
@@ -2,8 +2,8 @@ import sys
import gzip
......@@ -24,76 +26,11 @@ Description: Result of 2to3
--- a/scripts_RAD/1SNP_by_cluster.py
+++ b/scripts_RAD/1SNP_by_cluster.py
@@ -88,5 +88,5 @@ vcf_file=sys.argv[1]
dict, nb_SNP_before, nb_SNP_after = store_info(vcf_file)
output_newvcf(vcf_file, dict)
-print("nb SNP before : " + str(nb_SNP_before))
-print("nb SNP after : " + str(nb_SNP_after))
+print(("nb SNP before : " + str(nb_SNP_before)))
+print(("nb SNP after : " + str(nb_SNP_after)))
--- a/scripts_RAD/filter_paralogs.py
+++ b/scripts_RAD/filter_paralogs.py
@@ -100,4 +100,4 @@ dict, nb_cluster_tot = store_info(vcf_fi
clusters_to_keep, nb_clusters_kept = discard_clusters(dict, y)
output_newvcf(vcf_file, clusters_to_keep)
-print(str(nb_clusters_kept) + " on " + str(nb_cluster_tot) + " clusters had less than " + str(y*100) + "% of SNP with less than " + str(x*100) + "% heterygous genotypes")
+print((str(nb_clusters_kept) + " on " + str(nb_cluster_tot) + " clusters had less than " + str(y*100) + "% of SNP with less than " + str(x*100) + "% heterygous genotypes"))
--- a/scripts_RAD/filter_vcf_by_indiv_cov_and_max_missing.py
+++ b/scripts_RAD/filter_vcf_by_indiv_cov_and_max_missing.py
@@ -17,25 +17,25 @@ import getopt
def usage():
'''Usage'''
- print "-----------------------------------------------------------------------------"
- print sys.argv[0]," : vcf filter"
- print "-----------------------------------------------------------------------------"
- print "usage: ",sys.argv[0]," -i vcf_file -o output_file [-c min_cov -m max_missing -s]"
- print " -i: input vcf file"
- print " -o: output vcf file"
- print " -m: max missing genotype proportion to keep a variant (between 0 and 1, def = 1)"
- print " -c: min coverage to call a genotype (int, def=0)"
- print " -s: snp only (def= all variants)"
- print "-----------------------------------------------------------------------------"
+ print("-----------------------------------------------------------------------------")
+ print(sys.argv[0]," : vcf filter")
+ print("-----------------------------------------------------------------------------")
+ print("usage: ",sys.argv[0]," -i vcf_file -o output_file [-c min_cov -m max_missing -s]")
+ print(" -i: input vcf file")
+ print(" -o: output vcf file")
+ print(" -m: max missing genotype proportion to keep a variant (between 0 and 1, def = 1)")
+ print(" -c: min coverage to call a genotype (int, def=0)")
+ print(" -s: snp only (def= all variants)")
+ print("-----------------------------------------------------------------------------")
sys.exit(2)
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "i:o:c:m:s", ["in=", "out=","cov=","miss=","snp-only"])
- except getopt.GetoptError, err:
+ except getopt.GetoptError as err:
# print help information and exit:
- print str(err) # will print something like "option -a not recognized"
+ print(str(err)) # will print something like "option -a not recognized"
usage()
sys.exit(2)
@@ -60,7 +60,7 @@ def main():
assert False, "unhandled option"
if input_file == 0 or output_file == 0:
- print "Missing arguments"
+ print("Missing arguments")
usage()
sys.exit(2)
else:
--- a/scripts/ClassVCF_creator.py
+++ b/scripts/ClassVCF_creator.py
@@ -177,7 +177,7 @@ class VARIANT():
Index: discosnp/scripts/ClassVCF_creator.py
===================================================================
--- discosnp.orig/scripts/ClassVCF_creator.py
+++ discosnp/scripts/ClassVCF_creator.py
@@ -218,7 +218,7 @@ class VARIANT():
#---------------------------------------------------------------------------------------------------------------------------
def RetrievePolymorphismFromHeader(self):
'''Gets from the dicoAllele all the positions, and the nucleotides for each variant '''
......@@ -102,32 +39,7 @@ Description: Result of 2to3
self.upper_path.listPosForward.append(int(posD)+1)
self.lower_path.listPosForward.append(int(posD)+1)
self.upper_path.listPosReverse.append(len(self.upper_path.seq)-int(posD))
@@ -390,12 +390,12 @@ class VARIANT():
if IDVariantUp != IDVariantLow:
if bitwiseFlag & 2048 : #Checks if it's a supplementary alignment
print("Supplementary alignment:")
- print(self.upper_path.listSam)
+ print((self.upper_path.listSam))
return (2)
else :
print("WARNING two consecutive lines do not store the same variant id: ")
- print(self.upper_path.listSam)
- print(self.lower_path.listSam)
+ print((self.upper_path.listSam))
+ print((self.lower_path.listSam))
return (1)
else:
@@ -438,7 +438,7 @@ class PATH():
def RetrieveXA(self,VCFObject):
VCFObject.XA=""
- for position,(NM,cigarcode) in self.dicoMappingPos.items():
+ for position,(NM,cigarcode) in list(self.dicoMappingPos.items()):
if cigarcode!="":
VCFObject.XA+=str(position)+","
if VCFObject.XA:
@@ -852,7 +852,7 @@ class INDEL(VARIANT):
@@ -920,7 +920,7 @@ class INDEL(VARIANT):
else:
self.longestSequenceForward=self.ReverseComplement(self.upper_path.seq)
self.longestSequenceReverse=self.upper_path.seq
......@@ -136,7 +48,7 @@ Description: Result of 2to3
#In case of forward strand mapped
#we return the disco indel + the lefmost nucleotide before the indel (by taking into account the ambiguity
self.upper_path.listPosForward.append(int(posD)-int(amb))
@@ -1230,8 +1230,8 @@ class VCFFIELD():
@@ -1302,8 +1302,8 @@ class VCFFIELD():
error+=1
if error>0:
print(" !!! Line where the error occured !!!")
......@@ -147,8 +59,10 @@ Description: Result of 2to3
else : return (error)
--- a/scripts/discoSnp++_to_csv.py
+++ b/scripts/discoSnp++_to_csv.py
Index: discosnp/scripts/discoSnp++_to_csv.py
===================================================================
--- discosnp.orig/scripts/discoSnp++_to_csv.py
+++ discosnp/scripts/discoSnp++_to_csv.py
@@ -64,7 +64,7 @@ while 1:
i+=1
sys.stdout.write( com2_tab[i][:-1]+",")
......@@ -158,8 +72,10 @@ Description: Result of 2to3
--- a/scripts/filterOnBestDP_multiple_variant_at_same_pos.py
+++ b/scripts/filterOnBestDP_multiple_variant_at_same_pos.py
Index: discosnp/scripts/filterOnBestDP_multiple_variant_at_same_pos.py
===================================================================
--- discosnp.orig/scripts/filterOnBestDP_multiple_variant_at_same_pos.py
+++ discosnp/scripts/filterOnBestDP_multiple_variant_at_same_pos.py
@@ -25,7 +25,7 @@ while True:
line = vcf_for_igv.readline()
if not line: break
......@@ -181,8 +97,10 @@ Description: Result of 2to3
if out:break
vcf_for_igv.close()
--- a/scripts/filter_out_using_ratio_of_covered_files.py
+++ b/scripts/filter_out_using_ratio_of_covered_files.py
Index: discosnp/scripts/filter_out_using_ratio_of_covered_files.py
===================================================================
--- discosnp.orig/scripts/filter_out_using_ratio_of_covered_files.py
+++ discosnp/scripts/filter_out_using_ratio_of_covered_files.py
@@ -53,7 +53,7 @@ while True:
if coverage_high[i]>=minimal_coverage or coverage_low[i]>=minimal_coverage: number_of_covered_sets+=1
......@@ -192,8 +110,10 @@ Description: Result of 2to3
--- a/scripts/functionObjectVCF_creator.py
+++ b/scripts/functionObjectVCF_creator.py
Index: discosnp/scripts/functionObjectVCF_creator.py
===================================================================
--- discosnp.orig/scripts/functionObjectVCF_creator.py
+++ discosnp/scripts/functionObjectVCF_creator.py
@@ -191,25 +191,25 @@ def CheckAtDistanceXBestHits(upper_path,
best_up=1024
if int(upper_path.mappingPosition)==0 and int(lower_path.mappingPosition)==0:#Checks if paths are unmappped
......@@ -224,20 +144,11 @@ Description: Result of 2to3
if nbMismatch == best_low:
position_set.add(position)
if len(position_set) > 1:
--- a/scripts/one2zeroBased_vcf.py
+++ b/scripts/one2zeroBased_vcf.py
@@ -10,7 +10,7 @@ VCFfile=open("VCFone2zeroBAsed.vcf","w")
for line in file:
if "#" == line[0]:
- print (line,)
+ print((line,))
continue
line=line.rstrip('\n')
#SNP_higher_path_3 117 3 C G . . Ty=SNP;Rk=1.00000;UL=86;UR=261;CL=.;CR=.;C1=124,0;C2=0,134 GT:DP:PL 0/0:124:10,378,2484 1/1:134:2684,408,10
--- a/scripts/VCF_creator.py
+++ b/scripts/VCF_creator.py
@@ -75,7 +75,7 @@ for opt, arg in opts :
Index: discosnp/scripts/VCF_creator.py
===================================================================
--- discosnp.orig/scripts/VCF_creator.py
+++ discosnp/scripts/VCF_creator.py
@@ -78,7 +78,7 @@ def main():
listName[1]=listName[1].replace("_", " ")
stream_file=open(fileName,'r')
else :
......@@ -246,7 +157,7 @@ Description: Result of 2to3
sys.exit(2)
elif opt in ("-o","--output"):
if arg!=None:
@@ -91,7 +91,7 @@ for opt, arg in opts :
@@ -94,7 +94,7 @@ def main():
print("!! No filtered sam output !!")
sys.exit(2)
else:
......
......@@ -2,24 +2,28 @@ Author: Andreas Tille <tille@debian.org>
Last-Update: Mon, 21 Jan 2019 09:01:19 +0100
Description: Adapt test scripts to Debian PATH
--- a/test/simple_test.sh
+++ b/test/simple_test.sh
Index: discosnp/test/simple_test.sh
===================================================================
--- discosnp.orig/test/simple_test.sh
+++ discosnp/test/simple_test.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-../run_discoSnp++.sh -r fof.txt -T
+run_discoSnp++.sh -r fof.txt -T
diff discoRes_k_31_c_auto_D_100_P_1_b_0_coherent.fa ref_discoRes_k_31_c_auto_D_100_P_1_b_0_coherent.fa
diff discoRes_k_31_c_3_D_100_P_3_b_0_coherent.fa ref_discoRes_k_31_c_3_D_100_P_3_b_0_coherent.fa
if [ $? -ne 0 ] ; then
--- a/test/large_test/local_large_test.sh
+++ b/test/large_test/local_large_test.sh
Index: discosnp/test/large_test/local_large_test.sh
===================================================================
--- discosnp.orig/test/large_test/local_large_test.sh
+++ discosnp/test/large_test/local_large_test.sh
@@ -2,7 +2,7 @@
#####################
# Default option run:
#####################
-../../run_discoSnp++.sh -r fof.txt -T
+run_discoSnp++.sh -r fof.txt -T
-../../run_discoSnp++.sh -r fof.txt -T -P 1
+run_discoSnp++.sh -r fof.txt -T -P 1
if [ $? -ne 0 ] ; then
echo "*** Default option FAILURE:"
echo "*** discoSnp failure"
......@@ -6,63 +6,21 @@ Description: There is no point in delivering read_file_names in /usr/bin
.
Some other issues of these scripts are fixed as well.
--- a/run_discoSnpRad.sh
+++ b/run_discoSnpRad.sh
@@ -65,7 +65,7 @@ genotyping="-genotype"
remove=1
verbose=1
short_read_connector_path=""
-EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+EDIR=/usr
if [ -d "$EDIR/build/" ] ; then # VERSION SOURCE COMPILED
read_file_names_bin=$EDIR/build/bin/read_file_names
@@ -73,14 +73,14 @@ if [ -d "$EDIR/build/" ] ; then # VERSIO
kissnp2_bin=$EDIR/build/bin/kissnp2
kissreads2_bin=$EDIR/build/bin/kissreads2
else # VERSION BINARY
- read_file_names_bin=$EDIR/bin/read_file_names
+ read_file_names_bin=/usr/lib/discosnp/bin/read_file_names
dbgh5_bin=$EDIR/bin/dbgh5
kissnp2_bin=$EDIR/bin/kissnp2
kissreads2_bin=$EDIR/bin/kissreads2
fi
-chmod +x $EDIR/scripts/*.sh $EDIR/scripts_RAD/*.sh $EDIR/run_discoSnpRad.sh 2>/dev/null # Usefull for binary distributions
+# chmod +x $EDIR/scripts/*.sh $EDIR/scripts_RAD/*.sh $EDIR/run_discoSnpRad.sh 2>/dev/null # Usefull for binary distributions
useref=""
genome=""
@@ -463,14 +463,14 @@ echo -e "\t#############################
T="$(date +%s)"
if [ -f "$src_file" ]; then
- cmd="$EDIR/scripts_RAD/discoRAD_finalization.sh ${kissprefix}_coherent.fa $short_read_connector_path"
+ cmd="/usr/share/discosnp/scripts_RAD/discoRAD_finalization.sh ${kissprefix}_coherent.fa $short_read_connector_path"
echo $cmd
$cmd
T="$(($(date +%s)-T))"
echo "RAD clustering per locus time in seconds: ${T}"
else
echo "IF YOU WANT TO CLUSTERIZE RESULTS, RUN: "
- echo " $EDIR/scripts_RAD/discoRAD_finalization.sh ${kissprefix}_coherent.fa short_read_connector_path"
+ echo " /usr/share/discosnp/scripts_RAD/discoRAD_finalization.sh ${kissprefix}_coherent.fa short_read_connector_path"
echo " With short_read_connector_path indicating the directory containing short_read_connector.sh command "
fi
--- a/run_discoSnp++.sh
+++ b/run_discoSnp++.sh
@@ -54,7 +54,7 @@ paired=""
remove=1
verbose=1
Index: discosnp/run_discoSnp++.sh
===================================================================
--- discosnp.orig/run_discoSnp++.sh
+++ discosnp/run_discoSnp++.sh
@@ -65,7 +65,8 @@ verbose=1
stop_after_kissnp=0
e=""
-EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+EDIR=/usr
#EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
-EDIR=$( python -c "import os.path; print(os.path.dirname(os.path.realpath(\"${BASH_SOURCE[0]}\")))" ) # as suggested by Philippe Bordron
+#EDIR=$( python -c "import os.path; print(os.path.dirname(os.path.realpath(\"${BASH_SOURCE[0]}\")))" ) # as suggested by Philippe Bordron
+EDIR="/usr"
if [ -d "$EDIR/build/" ] ; then # VERSION SOURCE COMPILED
read_file_names_bin=$EDIR/build/bin/read_file_names
@@ -62,13 +62,13 @@ if [ -d "$EDIR/build/" ] ; then # VERSIO
@@ -74,13 +75,13 @@ if [ -d "$EDIR/build/" ] ; then # VERSIO
kissnp2_bin=$EDIR/build/bin/kissnp2
kissreads2_bin=$EDIR/build/bin/kissreads2
else # VERSION BINARY
......@@ -77,33 +35,29 @@ Description: There is no point in delivering read_file_names in /usr/bin
+#chmod +x $EDIR/scripts/*.sh $EDIR/run_discoSnpRad.sh 2>/dev/null # Usefull for binary distributions
useref=""
genome=""
@@ -491,7 +491,7 @@ echo -e "\t#################### CREATE V
echo -e "\t###############################################################"
wraith="false"
@@ -667,7 +668,7 @@ echo -e " #################### CREATE VC
echo -e " ############################################################### $reset"
if [ -z "$genome" ]; then # NO reference genome use, vcf creator mode 1
- vcfCreatorCmd="$EDIR/scripts/run_VCF_creator.sh -p ${kissprefix}_coherent.fa -o ${kissprefix}_coherent.vcf"
+ vcfCreatorCmd="/usr/share/discosnp/scripts/run_VCF_creator.sh -p ${kissprefix}_coherent.fa -o ${kissprefix}_coherent.vcf"
echo $vcfCreatorCmd
echo $green$vcfCreatorCmd$cyan
if [[ "$wraith" == "false" ]]; then
$vcfCreatorCmd
if [ $? -ne 0 ]
@@ -500,7 +500,7 @@ if [ -z "$genome" ]; then # NO referenc
@@ -678,7 +679,7 @@ if [ -z "$genome" ]; then # NO referenc
exit 1
fi
else # A Reference genome is provided, vcf creator mode 2
- vcfCreatorCmd="$EDIR/scripts/run_VCF_creator.sh $bwa_path_option -G $genome $bwa_path_option -p ${kissprefix}_coherent.fa -o ${kissprefix}_coherent.vcf -I $option_cores_post_analysis $e"
+ vcfCreatorCmd="/usr/share/discosnp/scripts/run_VCF_creator.sh $bwa_path_option -G $genome $bwa_path_option -p ${kissprefix}_coherent.fa -o ${kissprefix}_coherent.vcf -I $option_cores_post_analysis $e"
echo $vcfCreatorCmd
echo $green$vcfCreatorCmd$cyan
if [[ "$wraith" == "false" ]]; then
$vcfCreatorCmd
--- a/scripts_RAD/discoRAD_finalization.sh
+++ b/scripts_RAD/discoRAD_finalization.sh
@@ -61,7 +61,7 @@ cmd="${short_read_connector_directory}/s
echo -e "\t\t$cmd"
$cmd
# Compute the clustering
-cmd="${EDIR}/../build/bin/quick_hierarchical_clustering ${discofile}.txt > ${discofile}.cluster"
+cmd="/usr/lib/discosnp/bin/quick_hierarchical_clustering ${discofile}.txt > ${discofile}.cluster"
echo -e "\t\t$cmd"
$cmd > ${discofile}.cluster
# Generate a .fa file with clustering information
@@ -711,4 +712,4 @@ if [[ "$wraith" == "false" ]]; then
fi
echo -e " Thanks for using discoSnp++ - http://colibread.inria.fr/discoSnp/ - Forum: http://www.biostars.org/t/discoSnp/"
echo -e "################################################################################################################$reset"
-fi
\ No newline at end of file
+fi
......@@ -2,6 +2,6 @@ use_debian_packaged_gatb-core.patch
no_install_to_wrong_dir.patch
path_to_readme_file_names.patch
spelling.patch
fix_scripts.patch
#fix_scripts.patch
adapt_test_scripts.patch
2to3.patch
......@@ -2,8 +2,10 @@ Author: Andreas Tille <tille@debian.org>
Last-Update: Mon, 21 Jan 2019 09:01:19 +0100
Description: Fix spelling
--- a/tools/kissnp2/src/Kissnp2.cpp
+++ b/tools/kissnp2/src/Kissnp2.cpp
Index: discosnp/tools/kissnp2/src/Kissnp2.cpp
===================================================================
--- discosnp.orig/tools/kissnp2/src/Kissnp2.cpp
+++ discosnp/tools/kissnp2/src/Kissnp2.cpp
@@ -53,8 +53,8 @@ Kissnp2::Kissnp2 () : Tool ("Kissnp2")
getParser()->push_front (new OptionNoParam (STR_DISCOSNP_LOW_COMPLEXITY, "conserve low complexity SNPs", false));
getParser()->push_front (new OptionOneParam (STR_MAX_AMBIGOUS_INDELS, "Maximal size of ambiguity of INDELs. INDELS whose ambiguity is higher than this value are not output", false, "20"));
......@@ -15,17 +17,28 @@ Description: Fix spelling
"\t\t2: No limitation on branching (low precision, high recall)", false, "1"));
getParser()->push_front (new OptionOneParam (STR_MAX_SYMMETRICAL_CROSSROADS,"In b2 mode only: maximal number of symmetrical croasroads traversed while trying to close a bubble. Default: no limit", false, "-1"));
@@ -63,7 +63,7 @@ Kissnp2::Kissnp2 () : Tool ("Kissnp2")
getParser()->push_front (new OptionOneParam (STR_URI_OUTPUT, "output name", true));
@@ -64,7 +64,7 @@ Kissnp2::Kissnp2 () : Tool ("Kissnp2")
getParser()->push_front (new OptionOneParam (STR_KISSNP2_COVERAGE_FILE_NAME, "File (.h5) generated by kissnp2, containing the coverage threshold per read set", false, "_removemeplease"));
getParser()->push_front (new OptionOneParam (STR_URI_INPUT, "input file (likely a hdf5 file)", true));
- getParser()->push_front (new OptionNoParam (STR_KISSNP2_DONT_OUTPUT_FIRST_COV, "Don't output the first coverage threshold. Use this option whent the refernece file is used for finding the variants", false));
+ getParser()->push_front (new OptionNoParam (STR_KISSNP2_DONT_OUTPUT_FIRST_COV, "Don't output the first coverage threshold. Use this option whent the reference file is used for finding the variants", false));
+ getParser()->push_front (new OptionNoParam (STR_KISSNP2_DONT_OUTPUT_FIRST_COV, "Don't output the first coverage threshold. Use this option when the reference file is used for finding the variants", false));
getParser()->push_front (new OptionOneParam (STR_MAX_INDEL_SIZE, "maximal size of a predicted indel", false, "0"));
--- a/tools/kissreads2/src/Kissreads2.cpp
+++ b/tools/kissreads2/src/Kissreads2.cpp
@@ -75,7 +75,7 @@ Kissnp2::Kissnp2 () : Tool ("Kissnp2")
getParser()->push_back (new OptionOneParam (BubbleFinder::STR_BFS_MAX_BREADTH, "maximum breadth for BFS", false, "20"));
getParser()->push_front (new OptionNoParam (STR_RADSEQ, "keep truncated bubbles, that have no successors on the 2 paths at the same position", false));
- getParser()->push_back (new OptionOneParam (STR_MAX_TRUNCATED_PATH_LENGTH_DIFFERENCE, "Longest accepted difference length between two paths of a truncated bubbleS", false, "0"));
+ getParser()->push_back (new OptionOneParam (STR_MAX_TRUNCATED_PATH_LENGTH_DIFFERENCE, "Longest accepted difference length between two paths of a truncated bubble", false, "0"));
}
Index: discosnp/tools/kissreads2/src/Kissreads2.cpp
===================================================================
--- discosnp.orig/tools/kissreads2/src/Kissreads2.cpp
+++ discosnp/tools/kissreads2/src/Kissreads2.cpp
@@ -41,7 +41,7 @@ Kissreads2::Kissreads2 () : Tool ("Kissr
/** We add options known by kissnp2. */
......
# DiscoSnpRAD: small variant discovery and genotyping for RAD-seq data
DiscoSnpRAD is a pipeline based on discoSnp++ to discover small variants in RAD-like sequencing data. The differences with respect to using directly discoSnp++ lies in three main features:
* an enhanced bubble model to deal with RAD-like sequences
* using specific discoSnp++ parameters and filters, adapted to RAD-like data
* clustering the called variants into loci
**Reference:**
Gauthier, J., Mouden, C., Suchan, T., Alvarez, N., Arrigo, N., Riou, C., Lemaitre, C., Peterlongo, P. (2017). [DiscoSnp-RAD: de novo detection of small variants for population genomics](https://www.biorxiv.org/content/early/2017/11/09/216747). BioRxiv
## Installation
* discoSnp++
* `short_read_connector` must have been downloaded and installed (clustering task). [https://github.com/GATB/short_read_connector](https://github.com/GATB/short_read_connector)
## Usage
```
./run_discoSnpRad.sh --fof read_file_of_files --src_path <directory> [discoSnp++ OPTIONS]
```
Clustering
```
-S|--src_path <directory>
**absolute** path to short_read_connector directory, containing the "short_read_connector.sh" file.
-Note1: short read connector must be compiled.
-Note2: with this option, discoSnpRad provide a vcf file containing SNPs and INDELS, clustered by locus
```
All other options are described in [discoSnp++ README](../README.md). Note that many discoSNP++ parameters have here default values, specifically adapted to RAD-seq data.
To see all options:
```
./run_discoSnpRad.sh -h
```
## Output
* a log file reminds all filtering steps applied and the name of the output .vcf file
* a vcf file containing results of filtering and clustering
## Content of this directory
Additionnally to the main script of discoSnpRAD, this directory contains two sub-directories :
* [clustering_scripts](clustering_scripts/) : it contains the scripts used by the main script of discoSnpRAD for clustering and formatting the variants.
* [post-processing_scripts](post-processing_scripts/) : it contains several scripts that can be usefull to post-process the results of discoSnpRAD, ie. filtering results according to various criteria, changing format, preparing data for Structure, etc.
#!/bin/sh
# REQUIRES:
## Python 3
## short_read_connector: installed and compiled: https://github.com/GATB/short_read_connector
# echo "WARNING: short_read_connector must have been compiled"
red=`tput setaf 1`
green=`tput setaf 2`
yellow=`tput setaf 3`
cyan=`tput setaf 6`
bold=`tput bold`
reset=`tput sgr0`
function help {
echo $reset
echo "====================================================="
echo "Filtering, clustering per locus and vcf formatting of $rawdiscofile"
echo "====================================================="
echo "this script manages bubble clustering from a discofile.fa file, and the integration of cluster informations in a vcf file"
echo " 1/ Remove variants with more than 95% missing genotypes and low rank (<0.4)"
echo " 2/ Cluster variants per locus"
echo " 3/ Format the variants in a vcf file with cluster information"
echo "Usage: ./discoRAD_clustering.sh -f discofile -s SRC_directory/ -o output_file.vcf"
# echo "nb: all options are MANDATORY\n"
echo "OPTIONS:"
echo "\t -f: DiscoSnp fasta output containing coherent predictions"
echo "\t -s: Path to Short Read Connector"
echo "\t -o: output file path (vcf)"
echo "\t -w: Wraith mode: only show all discoSnpRad commands without running them"
}
wraith="false"
# if [ "$#" -lt 7 ]; then
# help
# exit
# fi
while getopts "f:s:o:hw" opt; do
case $opt in
f)
rawdiscofile=$OPTARG
;;
s)
short_read_connector_directory=$OPTARG
;;
o)
output_file=$OPTARG
;;
w)
wraith="true"
;;
h)
help
exit
;;
esac
done
if [[ -z "${rawdiscofile}" ]]; then
echo "${red}-f is mandatory$reset" >&2
exit
fi
if [[ -z "${short_read_connector_directory}" ]]; then
echo "${red}-s is mandatory$reset" >&2
exit
fi
if [[ -z "${output_file}" ]]; then
echo "${red}-o is mandatory$reset" >&2
exit
fi
# Detect the directory path
EDIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
if [ -d "$EDIR/../../build/" ] ; then # VERSION SOURCE COMPILED
BINDIR=$EDIR"/../../build/bin"
else # VERSION BINARY
BINDIR=$EDIR"/../../bin"
fi
rawdiscofile_base=$( basename "${rawdiscofile}" .fa)
#################### PARAMETERS VALUES #######################
#Get k value (for clustering purpose)
originalk=$( echo $rawdiscofile | awk -F k_ '{ print $2 }' | cut -d "_" -f 1)
usedk=$((originalk-1))
## Short Read Connector is limited to kmers of size at most 31 (u_int64).
if [ ${usedk} -gt 31 ]
then
usedk=31
fi
# rank filter parameter
min_rank=0.4
# max cluster size parameter
max_cluster_size=150
percent_missing=0.95
echo "${yellow}############################################################"
echo "######### MISSING DATA AND LOW RANK FILTERING #############"
echo "############################################################"
echo "${yellow}#Filtering variants with more than ${min_rank} missing data and rank<${min_rank} ...${reset}"
disco_filtered=${rawdiscofile_base}_filtered
cmdFilter="python3 ${EDIR}/fasta_and_cluster_to_filtered_vcf.py -i ${rawdiscofile} -f -o ${disco_filtered}.fa -m ${percent_missing} -r ${min_rank} 2>&1 "
echo $green$cmdFilter$cyan
if [[ "$wraith" == "false" ]]; then
eval $cmdFilter
if [ ! -s ${disco_filtered}.fa ]; then
echo "${red}No variant pass the filters, exit$reset"
exit 0
fi
fi
######################### Clustering ###########################
echo "${yellow}############################################################"
echo "###################### CLUSTERING ##########################"
echo "############################################################"
echo "#Clustering variants (sharing at least a ${usedk}-mers)...${reset}"
# Simplify headers (for dsk purposes)
disco_simpler=${disco_filtered}_simpler
cmdCat="cat ${disco_filtered}.fa | cut -d \"|\" -f 1 | sed -e \"s/^ *//g\""
echo $green $cmdCat "> ${disco_simpler}.fa$cyan"
if [[ "$wraith" == "false" ]]; then
eval $cmdCat > ${disco_simpler}.fa
fi
#cat ${disco_filtered}.fa | cut -d "|" -f 1 | sed -e "s/^ *//g" > ${disco_simpler}.fa
cmdLs="ls ${disco_simpler}.fa"
echo $green$cmdLs "> ${disco_simpler}.fof$cyan"
if [[ "$wraith" == "false" ]]; then
eval $cmdLs > ${disco_simpler}.fof
fi
#ls ${disco_simpler}.fa > ${disco_simpler}.fof
# Compute sequence similarities
cmdSRC="${short_read_connector_directory}/short_read_connector.sh -b ${disco_simpler}.fa -q ${disco_simpler}.fof -s 0 -k ${usedk} -a 1 -l -p ${disco_simpler} 1>&2 "
echo $green$cmdSRC$cyan
if [[ "$wraith" == "false" ]]; then
eval $cmdSRC
fi
echo $reset
if [ $? -ne 0 ]
then
echo "${red}there was a problem with Short Read Connector, exit$reset"
exit 1
fi
# Format one line per edge
cmd="python3 ${EDIR}/from_SRC_to_edges.py ${disco_simpler}.txt"
echo $green$cmd "> ${disco_simpler}_edges.txt$cyan"
if [[ "$wraith" == "false" ]]; then
eval $cmd "> ${disco_simpler}_edges.txt"
fi
# Compute the clustering
cmdqhc="${BINDIR}/quick_hierarchical_clustering ${disco_simpler}_edges.txt"
echo $green$cmdqhc " > ${disco_simpler}.cluster$cyan"
if [[ "$wraith" == "false" ]]; then
eval $cmdqhc "> ${disco_simpler}.cluster"
fi
if [ $? -ne 0 ]
then
echo "${red}there was a problem with quick_hierarchical_clustering, exit$reset"
exit 1
fi
echo $reset
######################### VCF generation with cluster information FROM ORIGINAL FASTA ###########################
echo "${yellow}############################################################"
echo "###################### OUTPUT VCF ##########################"
echo "############################################################$reset"
cmdVCF="python3 ${EDIR}/fasta_and_cluster_to_filtered_vcf.py -i ${disco_filtered}.fa -o ${output_file} -c ${disco_simpler}.cluster -s ${max_cluster_size} 2>&1 "
echo $green$cmdVCF$cyan
if [[ "$wraith" == "false" ]]; then
eval $cmdVCF
fi
if [ $? -ne 0 ]
then
echo "${red}there was a problem with vcf creation, exit$reset"
exit 1
fi
echo "${yellow}#######################################################################"
echo "######################### CLEANING ##################################"
echo "#######################################################################$reset"
rm -f ${disco_simpler}*
#rm -f ${disco_filtered}.fa
echo "${yellow}============================"
echo " DISCORAD clustering DONE "
echo "============================"
echo " Results in ${output_file}$reset"
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
''' ***********************************************
Script to filter and format discoSnp raw output file (.fa) into a vcf format file (.vcf)
Author - Claire Lemaitre
Usage:
python3 fasta_and_cluster_to_filtered_vcf.py -i disco_bubbles_coherent.fa [-o disco_bubbles_coherent.vcf -m 0.95 -r 0.4]
*********************************************** '''
import sys
import getopt
import random
import re #regular expressions
import time
import os
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../../scripts/"))
from vcf_formatting_functions import *
''' Usages in discoSnp pipeline scripts (RAD/clustering_scripts/discoRAD_clustering.sh):
If no clustering:
python3 fasta_and_cluster_to_filtered_vcf.py -i disco_bubbles_coherent.fa -o disco_bubbles_coherent.vcf -m 0.95 -r 0.4
end of the pipeline
If clustering:
python3 fasta_and_cluster_to_filtered_vcf.py -i disco_bubbles_coherent.fa -f -o disco_bubbles_coherent_filtered.fa_removemeplease -m 0.95 -r 0.4
# then clustering with file disco_bubbles_coherent_filtered.fa_removemeplease
python3 fasta_and_cluster_to_filtered_vcf.py -i disco_bubbles_coherent_filtered.fa_removemeplease -c disco_bubbles_coherent_filtered_simpler.cluster -o disco_bubbles_coherent_clustered.vcf -s 150
rm -f disco_bubbles_coherent_filtered.fa_removemeplease
end of pipeline
'''
def store_clusters(cluster_file):
if cluster_file==None: return None, None
clusters=open(cluster_file,"r")
read_id_to_cluster_id={}
cluster_id_to_cluster_size={}
cluster_id=-1
for cluster in clusters:
# a line is "70166 70345 70409 70222 70406 70167 70223 69786 70407 69787 70408 70611 70610 70344 "
cluster_id+=1
cluster_id_to_cluster_size[cluster_id]=int(len(cluster.rstrip().split())/2)
for read_id in cluster.rstrip().split():
read_id_to_cluster_id[int(read_id.split('-')[0])]=cluster_id # A line can be formated as 70166 70345-info_about_similarity
clusters.close()
return read_id_to_cluster_id, cluster_id_to_cluster_size
def get_cluster_id_and_size(sequence_id, read_id_to_cluster_id, cluster_id_to_cluster_size):
if not read_id_to_cluster_id: return ".", "."
if sequence_id not in read_id_to_cluster_id:
print("Warning, sequence id "+str(sequence_id)+" not in clusters",file=sys.stderr)
return ".", "."
return read_id_to_cluster_id[sequence_id], cluster_id_to_cluster_size[read_id_to_cluster_id[sequence_id]]
def usage():
'''Usage'''
print("-----------------------------------------------------------------------------")
print(sys.argv[0]," : discoSnp output filtering and formatting in vcf")
print("-----------------------------------------------------------------------------")
print("usage: ",sys.argv[0]," -i disco_bubbles.fa")
print(" -r: min rank value filter (default = 0)")
print(" -m: max missing value filter (default = 1)")
print(" -s: max cluster size filter (default = 0, ie no size limit)")
print(" -o: output vcf file path (default = stdout)")
print(" -f: output a filtered fasta file instead of a vcf file")
print(" -c: considers a cluster input file. In this situation, can filter on cluster size and prints the cluster_id and cluster_size in the INFO field of each variant")
print(" -h: help")
print("-----------------------------------------------------------------------------")
sys.exit(2)
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "hi:r:m:s:o:fc:", ["help", "in=", "rank=", "miss=", "size=", "out=", "fastaout", "cluster"])
except getopt.GetoptError as err:
# print help information and exit:
print(str(err)) # will print something like "option -a not recognized"
usage()
sys.exit(2)
# Default parameters
fasta_file = 0
fasta_only = 0
min_rank = 0
max_miss = 1
max_cluster_size = 0
k = 31
out_file = None
cluster_file = None
with_cluster = False
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-i", "--in"):
fasta_file = arg
elif opt in ("-r", "--rank"):
min_rank = float(arg)
elif opt in ("-m", "--miss"):
max_miss = float(arg)
elif opt in ("-s", "--size"):
max_cluster_size = int(arg)
elif opt in ("-o", "--out"):
out_file = arg
elif opt in ("-f", "--fastaout"):
fasta_only = 1
elif opt in ("-c"):
cluster_file = arg
with_cluster = True
else:
assert False, "unhandled option"
if fasta_file == 0:
print("option -i (--in) is mandatory")
usage()
sys.exit(2)
else:
today = time.localtime()
date = str(today.tm_year) + str(today.tm_mon) + str(today.tm_mday)
source = sys.argv[0]
# First identifying what kind of fasta we have with the first line
tig_type = 0 # 0 : no extension, 1 unitig, 2 contig (ie. unitig and contig length are output)
nb_samples = 0
nb_fixed_fields = 0 #nb fields before C1_X|C2_Y|... depends if unitig and/or contig lengths have been output
## LOAD clusters
read_id_to_cluster_id, cluster_id_to_cluster_size = store_clusters(cluster_file)
with open(fasta_file, 'r') as filin:
for line in filin:
splitted_1 = line.split("|")
#nb_samples:
tig_type = len(re.findall("left_\w+_length",line))
nb_fixed_fields = 4 + 2*tig_type
nb_samples = (len(splitted_1) - (nb_fixed_fields + 1))/3
if nb_samples % 1 != 0:
print(f"Warning: could not detect the correct nb of samples : {nb_samples}")
sys.exit(2)
nb_samples = int(nb_samples)
break
sys.stdout.close = lambda: None #make stdout unclosable, to use with and handle both `with open(…)` and `sys.stdout` nicely. cf. https://stackoverflow.com/questions/17602878/how-to-handle-both-with-open-and-sys-stdout-nicely
# Now going through all lines
with open(fasta_file, 'r') as filin, (open(out_file,'w') if out_file else sys.stdout) as filout:
if not fasta_only:
# Write vcf comment lines
filout.write(vcf_header(source,date,fasta_file,nb_samples))
nb_kept_variants = 0
nb_analyzed_variants = 0
line_count = 0
sequence_id=-1
fasta_4lines = "" ## Remembering 4 consecutive lines for kept variants to write in a fasta file if fasta_only mode
keep_variant = False
cluster_id=-1
cluster_size=0
for line in filin:
if line_count%2 == 0: sequence_id+=1 # first sequence is 1, second (lower path of first variant) is 2, ...
if line_count ==0:
fasta_4lines="" #back to empty for incoming variant
cluster_id,cluster_size = get_cluster_id_and_size(sequence_id, read_id_to_cluster_id, cluster_id_to_cluster_size)
line_count += 1
fasta_4lines += line
if line_count == 1:
keep_variant = False
nb_analyzed_variants += 1
# Header higher path
line = line.strip()
splitted_1 = line.split("|")
#fasta_4lines = splitted_1[0] + "\n" #simplified headers for fasta_only and src
## FILTERING
#filter cluster size
if max_cluster_size >0 and cluster_id != ".":
if cluster_size > max_cluster_size:
continue
#filter rank
rank = float(splitted_1[-1].split("rank_")[1])
if rank < min_rank:
continue
# filter missing genotype ratio
if max_miss !=1:
nb_missing = len(re.findall(r"G\d+_\./\.",line))
missing_ratio = nb_missing / nb_samples
if missing_ratio >= max_miss:
continue
keep_variant = True # for fasta_only mode
nb_kept_variants += 1
if keep_variant and line_count == 3:
#Header lower path
line = line.strip()
splitted_2 = line.split("|")
#fasta_4lines += splitted_2[0] + "\n" #simplified headers for fasta_only and src
if line_count == 4:
line_count = 0
if keep_variant:
if fasta_only:
#fasta_4lines += line #simplified headers for fasta_only and src
filout.write(fasta_4lines) # TODO: do we writte fasta variants if not in a cluster and a cluster file is provided?
else:
#now format in vcf format
line = line.strip()
filout.write(format_vcf(splitted_1, splitted_2, nb_samples, rank, line, cluster_id, cluster_size, tig_type))
#print(f"{nb_lost_variants} variant bubbles filtered out")
#print(f"{nb_kept_variants} variant bubbles output out of {nb_tot_variants} ({nb_analyzed_variants} analyzed)")
sys.stderr.write(f"{nb_kept_variants} variant bubbles output out of {nb_analyzed_variants}\n")
if __name__ == "__main__":
main()
import sys
if len(sys.argv)>1:
file = open(sys.argv[1])
else:
file=sys.stdin
#IN= "9976:4242 9976 9604 "
#OUT=
# "9976 4242"
# "9976 9604"
def printLine(line):
if line[0]=="#":
print (line)
return
id_source=line.split(":")[0]
ids_children=line.split(":")[1].split(" ")
for id_child in ids_children:
if id_child!=id_source:
print (id_source,id_child)
for line in file:
printLine(line.strip().rstrip())
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' ***********************************************
Script to select only one variant per locus (cluster) in a discoSnp vcf output file (.vcf)
Author - Claire Lemaitre, Pierre Peterlongo, Inria
Usage:
python3 1SNP_per_cluster.py -i vcf_file [-o new_vcf_file]
Details:
The selected variant is not chosen at random, for a given cluster, it is the one with the less missing genotypes (if ties, it is the first read in the input file)
*********************************************** '''
import sys
import getopt
def usage():
'''Usage'''
print("-----------------------------------------------------------------------------")
print(sys.argv[0]+" : selects one variant per cluster")
print("-----------------------------------------------------------------------------")
print("usage: "+sys.argv[0]+" -i vcf_file [-o output_file]")
print(" -i: vcf file [mandatory]")
print(" -o: output vcf file (default = stdout)")
print(" -h: help")
print("-----------------------------------------------------------------------------")
sys.exit(2)
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "hi:o:")
except getopt.GetoptError as err:
# print help information and exit:
print(str(err))
usage()
sys.exit(2)
# Default parameters
vcf_file = None
out_file = None
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-i"):
vcf_file = arg
elif opt in ("-o", "--out"):
out_file = arg
else:
assert False, "unhandled option"
if vcf_file==None:
print ("Error: option -i is mandatory")
usage()
sys.exit(2)
format_ok = check_format(vcf_file)
if not format_ok:
print("Error: the format of the input vcf is not correct, it must contain clustering information")
sys.exit(2)
dict_, nb_SNP_before, nb_SNP_after = store_info(vcf_file)
output_newvcf(vcf_file, out_file, dict_)
def check_format(vcf_file):
''' Checks if the vcf has the correct format, ie : the INFO field must contain clustering information, such as:
Ty=SNP;Rk=1;UL=1;UR=2;CL=.;CR=.;Genome=.;Sd=.;Cluster=79466;ClSize=12
'''
filin = open(vcf_file, 'r')
checked = False
while not checked:
line = filin.readline()
if line.startswith("#"): continue
INFO_split = line.split("\t")[7].split(";")
checked = True
if len(INFO_split) < 10: return False
tmp_cluster = INFO_split[8].split("Cluster=")
if len(tmp_cluster) < 2: return False
if tmp_cluster[1] == ".": return False
try:
cl_id = int(tmp_cluster[1])
except ValueError:
return False
return True
filin.close()
def store_info(vcf_file):
dict_ = {} ## {num_cluster : [num_SNP ayant le moins de géno manquants, nb_missgeno]}
filin = open(vcf_file, 'r')
nb_SNP_tot = 0
while True:
"""#SNP_higher_path_14643 30 14643 C T . . Ty=SNP;Rk=0.55424;UL=0;UR=0;CL=0;CR=0;Genome=.;Sd=.;Cluster=1285;ClSize=4 GT:DP:PL:AD:HQ 0/1:38:554,48,75:7,31:71,71 0/1:20:63,23,263:15,5:71,71"""
line = filin.readline()
if not line: break
if line.startswith("#"): continue
nb_SNP_tot += 1
num_cluster = int(line.split("Cluster=")[1].split(";")[0])
if num_cluster == -1: continue
id_SNP = line.split("\t")[2]
if num_cluster not in dict_:
dict_[num_cluster] = ["x", sys.maxsize] # ghost best variant for this new cluster
genotypes = [i.split(":")[0] for i in line.split("\t")[9:]]
nb_missing = 0
for ind in genotypes:
if ind[0] == "." :
nb_missing += 1
if nb_missing >= dict_[num_cluster][1]: continue
dict_[num_cluster] = [id_SNP, nb_missing]
filin.close()
return dict_, nb_SNP_tot, len(dict_)
def output_newvcf(vcf_file, out_file, dict_) :
filin = open(vcf_file, 'r')
if out_file:
filout=open(out_file,'w')
else:
filout = sys.stdout
while True:
line = filin.readline() #SNP_higher_path_14643 30 14643 C T . . Ty=SNP;Rk=0.55424;UL=0;UR=0;CL=0;CR=0;Genome=.;Sd=.;Cluster=1285;ClSize=4 GT:DP:PL:AD:HQ 0/1:38:554,48,75:7,31:71,71 0/1:20:63,23,263:15,5:71,71
if not line: break
if line.startswith("#"):
filout.write(line)
continue
try:
cluster = int(line.split("Cluster=")[1].split(";")[0])
except ValueError:
print ("No cluster size information stored in the vcf, exit")
sys.exit(1)
if cluster == -1 : continue
id_SNP = line.split("\t")[2]
if id_SNP != dict_[cluster][0]: continue
filout.write(line)
filin.close()
filout.close()
if __name__ == "__main__":
main()
# Directory containing some scripts to post-process and filter discoSnpRad results
## Filtering scripts
1. **script** `filter_by_cluster_size_and_rank.py`:
* removes variants belonging to a cluster (locus) whose size (nb of variants) is outside the given size range (options `-m` and `-M`)
* removes variants with rank lower than a given threshold given by option `-r`
* Usage :
`python filter_by_cluster_size_and_rank.py -i vcf_file [-o new_vcf_file -m 0 -M 150 -r 0.4]`
3. **script** `filter_vcf_by_indiv_cov_max_missing_and_maf.py`:
* replaces individual genotypes that have DP less than the value given by option `-c` by missing genotype `./.`
* removes variants (vcf lines) that have a fraction of missing genotypes greater than the value given by option `-m`
* removes variants (vcf lines) that have a minor allele frequency smaller than the value given by option `-f`
* outputs only SNP variants if option `-s`.
* Usage :
`python filter_vcf_by_indiv_cov_max_missing_and_maf.py -i vcf_file -o new_vcf_file [-c min_cov -m max_missing -f maf -s] `
3. **script** `filter_paralogs.py`:
* identifies variants (vcf lines) that have a fraction of heterozygous genotypes greater than `x` (not counting missing genotypes)
* removes variants (vcf lines) that belong to a cluster having a fraction of such variants greater than `y`
* Example : `x=0.1` and `y= 0.5` and if we consider a cluster to represent a locus. This filter removes loci that have more than 50% of the SNPs that have each more than 10% of heterozygous genotypes.
* Usage :
`python filter_paralogs.py -i vcf_file -o new_vcf_file [-x 0.1 -y 0.5]`
## Scripts for STRUCTURE analyses :
4. **script** `1SNP_by_cluster.py`
* selects one SNP per cluster (the one with less missing genotypes)
* Usage :
`python 1SNP_per_cluster.py -i vcf_file -o new_vcf_file`
5. **script** `vcf2structure.sh`
* changes the vcf format to a Structure format (input of the software Structure)
* Usage:
`vcf2structure.sh file.vcf > fle.str`
## Mapping to a reference genome, and keeping the cluster information :
When a reference genome is available, even if variants have been called in a reference-free manner, it could be useful to get the positions of variants on this reference genome.
To get such information, two programs are necessary in the case of a discoSnpRAD result:
* `VCF_creator` (in dir `[DISCO_DIR]/scripts/`).
Note : it uses the mapper bwa, which must to be in the PATH env variable.
* **script** `add_cluster_info_to_mapped_vcf.py` in this current directory, to append the clustering information (and some minimal filtering on cluster size) in the vcf output by VCF_creator.
Here is the full pipeline to map the result of discoSnpRAD with name prefix `myDiscoSnpRADResult` to a reference genome `myReferenceGenome.fa`:
```
# Running VCF_creator with a given reference genome ()
sh [DISCO_DIR]/scripts/run_VCF_creator.sh -G dm6_masked.fa -p myDiscoSnpRADResult_raw_filtered.fa -e -o temp.vcf
# Adding clustering information (and minimal filtering on cluster size)
python add_cluster_info_to_mapped_vcf.py -m temp.vcf -u myDiscoSnpRADResult_clustered.vcf -o myDiscoSnpRADResult_mapped.vcf
# final vcf is myDiscoSnpRADResult_mapped.vcf
```
Additionnally, in a validation context, if one wants to compare variant positions between two such vcf files, the following command will output recall and precision metrics:
```
python [DISCO_DIR]/scripts/validation_scripts/compare_vcf_disco_pos_allele_only.py truth.vcf myDiscoSnpRADResult_mapped.vcf
```
<!---
== Clustering full process ==
-----------------------------
`ls ${discofile}.fa > ${discofile}.fof `
`short_read_connector.sh -b ${discofile}.fa -q ${discofile}.fof -s 0 -k ${k} -a
1 -l -p ${discofile}.txt`
`./quick_hierarchical_clustering ${discofile}.txt > ${discofile}.cluster`
`python3 clusters_and_fasta_to_fasta.py ${discofile}.fa ${discofile}.cluster >
${discofile}_with_clusters.fa`
`run_VCF_creator.sh -p ${discofile}_with_clusters.fa
-o ${discofile}_with_clusters.vcf`
**NB**: Then the `${discofile}_with_clusters.vcf` can be simply sorted to put
together variants from each cluster.
**Sources**:
`short_read_connector.sh` from <https://github.com/GATB/short_read_connector>
`run_VCF_creator.sh` from <https://github.com/GATB/DiscoSnp> (scripts directory)
== Old scripts ==
* `filter_missgeno.py" : filtering out variants with too many missing genotypes
* From a .fa file generated by discoSnp-Rad, remove variants with too many missing genotypes
* Usage:
`filter_missgeno.py file max_missing` (with max_missing in percent)
-->
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import getopt
def usage():
'''Usage'''
print("-----------------------------------------------------------------------------")
print(sys.argv[0]," : merge 2 vcf files, add cluster info present in the unmmaped vcf in the mapped vcf (ie. with mapping info on a ref genome)")
print("-----------------------------------------------------------------------------")
print("usage: ",sys.argv[0])
print(" -u unmapped vcf file [mandatory]")
print(" -m mapped vcf file [mandatory]")
print(" -o: output vcf file path (default = stdout)")
print(" -h: help")
print("-----------------------------------------------------------------------------")
sys.exit(2)
def output_newvcf(unmapped_file, mapped_file, out_file):
filin = open(unmapped_file, 'r')
## store cluster_info for each variant (identified by its unique id, 3rd column of the vcf line)
id_to_cluster_info={}
for line in filin.readlines():
line=line.strip()
if line[0]=="#": continue
#SNP_higher_path_3 199 3 C G . . Ty=SNP;Rk=1.0;UL=86;UR=261;CL=169;CR=764;Genome=.;Sd=.;Cluster=0;ClSize=3 ...
splitted = line.split("\t")
cluster_info = ";".join(splitted[7].split(";")[8:10])
id = splitted[2]
id_to_cluster_info[id] = cluster_info
#print(f"{id}-{cluster_info};")
filin.close()
## Print vcf
if out_file:
filout=open(out_file,'w')
else:
filout = sys.stdout
filin = open(mapped_file,"r")
for line in filin.readlines():
if line[0]=="#":
filout.write (line)
else:
##chr3R 26778135 3 C G . PASS Ty=SNP;Rk=1.0;UL=86;UR=261;CL=169;CR=764;Genome=.;Sd=1 ...
splitted = line.split("\t")
id = splitted[2]
if id not in id_to_cluster_info: # if no cluster info, we do not print the variant at all because we assume that if absent means that it has been filtered out for a good reason (typically : too large cluster)
continue
cluster_info = id_to_cluster_info[id]
INFO = splitted[7] + ";" + cluster_info
tojoin = splitted[:7] + [INFO] + splitted[8:]
filout.write ("\t".join(tojoin))
filin.close()
filout.close()
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "hu:m:o:")
except getopt.GetoptError as err:
# print help information and exit:
usage()
sys.exit(2)
# Default parameters
unmapped_file = None
mapped_file = None
out_file = None
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-u"):
unmapped_file = arg
elif opt in ("-m"):
mapped_file = arg
elif opt in ("-o", "--out"):
out_file = arg
else:
assert False, "unhandled option"
if mapped_file==None:
print ("-m missing")
usage()
sys.exit(2)
if unmapped_file==None:
print ("-u missing")
usage()
sys.exit(2)
output_newvcf(unmapped_file, mapped_file, out_file)
if __name__ == "__main__":
main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' ***********************************************
Script to filter out variants in a discoSnp vcf output file (.vcf), according to cluster size and/or rank value
Author - Claire Lemaitre, Pierre Peterlongo, Inria
Usage:
python3 filter_by_cluster_size_and_rank.py -i vcf_file [-o output_file -m 0 -M 150 -r 0.4]
Details:
filter a vcf file by keeping only variants such that :
- the cluster (locus) they belong to contains x variants with x in [m,M]
- with a rank value >= r
outputs a vcf
*********************************************** '''
import sys
import getopt
def usage():
'''Usage'''
print("-----------------------------------------------------------------------------")
print(sys.argv[0]+" : filter vcf variants based on their cluster size and/or rank value")
print("-----------------------------------------------------------------------------")
print("usage: "+sys.argv[0]+" -i vcf_file [-o output_file -m 0 -M 150 -r 0.4]")
print(" -i: input vcf file [mandatory]")
print(" -m: min cluster size (included)")
print(" -M: max cluster size (included)")
print(" -r; min rank (included)")
print(" -o: output vcf file (default = stdout)")
print(" -h: help")
print("-----------------------------------------------------------------------------")
sys.exit(2)
def output_newvcf(in_file, out_file, min_cluster_size, max_cluster_size, rank_min):
filin = open(in_file, 'r')
if out_file:
filout=open(out_file,'w')
else:
filout = sys.stdout
for line in filin.readlines():
line=line.strip()
if line[0]=='#':
filout.write(line+"\n")
continue
#SNP_higher_path_3 199 3 C G . . Ty=SNP;Rk=1.0;UL=86;UR=261;CL=169;CR=764;Genome=.;Sd=.;Cluster=0;ClSize=3 ...
if min_cluster_size>0 or max_cluster_size<sys.maxsize: # make the split only if necessary
try:
cluster_size = int(line.split("ClSize=")[1].split()[0])
except ValueError:
print ("No cluster size information stored in the vcf, exit")
sys.exit(1)
if cluster_size < min_cluster_size or cluster_size > max_cluster_size:
continue # does not respect the cluster size filtering criteria
if rank_min > 0: # make the split only if necessary
rank = float(line.split()[7].split(';')[1].split('=')[-1])
if rank < rank_min:
continue # does not respect the rank min filtering criteria
filout.write(line+"\n") # all filters passed
filin.close()
filout.close()
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "hi:m:M:o:r:")
except getopt.GetoptError as err:
# print help information and exit:
print(str(err))
usage()
sys.exit(2)
# Default parameters
min_cluster_size = 0
max_cluster_size = sys.maxsize
min_rank=0
in_file = None
out_file = None
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-i"):
in_file = arg
elif opt in ("-m"):
min_cluster_size = float(arg)
elif opt in ("-M"):
max_cluster_size = float(arg)
elif opt in ("-r"):
min_rank = float(arg)
elif opt in ("-o", "--out"):
out_file = arg
else:
assert False, "unhandled option"
if in_file==None:
print("Error: option -i is mandatory")
usage()
sys.exit(2)
format_ok = check_format(in_file)
if not format_ok:
print("Error: the format of the input vcf is not correct, it must contain clustering information")
sys.exit(2)
output_newvcf(in_file, out_file, min_cluster_size, max_cluster_size, min_rank)
def check_format(vcf_file):
''' Checks if the vcf has the correct format, ie : the INFO field must contain clustering information, such as:
Ty=SNP;Rk=1;UL=1;UR=2;CL=.;CR=.;Genome=.;Sd=.;Cluster=79466;ClSize=12
'''
filin = open(vcf_file, 'r')
checked = False
while not checked:
line = filin.readline()
if line.startswith("#"): continue
INFO_split = line.split("\t")[7].split(";")
checked = True
if len(INFO_split) < 10: return False
tmp_cluster = INFO_split[8].split("Cluster=")
if len(tmp_cluster) < 2: return False
if tmp_cluster[1] == ".": return False
try:
cl_id = int(tmp_cluster[1])
except ValueError:
return False
return True
filin.close()
if __name__ == "__main__":
main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' ***********************************************
Script to filter out variants in a discoSnp vcf output file (.vcf), according to the fraction of heterozygous genotypes per locus
Author - Claire Lemaitre, Pierre Peterlongo, Inria
Usage:
python3 filter_paralogs.py -i vcf_file -o new_vcf_file [-x 0.1 -y 0.5]
Details:
variants with a proportion (not considering missing genotypes) of heterozygous genotypes greater than x are considered as "bad" variants
all variants (vcf lines) belonging to clusters (loci) with a proportion of "bad" variants greater than y are filtered out
*********************************************** '''
import sys
import getopt
def usage():
'''Usage'''
print("-----------------------------------------------------------------------------")
print(sys.argv[0]+" : discoSnp output filtering according to the fraction of heterozygous genotypes per locus")
print("-----------------------------------------------------------------------------")
print("usage: "+sys.argv[0]+" -i vcf_file -o new_vcf_file [-x 0.1 -y 0.5]")
print(" -i: input vcf file [mandatory]")
print(" -o: output vcf file [mandatory]")
print(" -x: max fraction of heterozygous genotypes per variant (default = 0.1)")
print(" -y: max fraction of bad variants per locus (default = 0.5)")
print(" -h: help")
print("-----------------------------------------------------------------------------")
sys.exit(2)
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "hi:x:y:o:", ["help", "in=", "x=", "y=", "out="])
except getopt.GetoptError as err:
# print help information and exit:
print(str(err)) # will print something like "option -a not recognized"
usage()
sys.exit(2)
# Default parameters
vcf_file = 0
x = 0.1
y = 0.5
k = 31
out_file = 0
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-i", "--in"):
vcf_file = arg
elif opt in ("-x", "--x"):
x = float(arg)
elif opt in ("-y", "--y"):
y = float(arg)
elif opt in ("-o", "--out"):
out_file = arg
else:
assert False, "unhandled option"
if vcf_file == 0 or out_file == 0:
print("Error: options -i and -o are mandatory")
usage()
sys.exit(2)
else:
format_ok = check_format(vcf_file)
if not format_ok:
print("Error: the format of the input vcf is not correct, it must contain clustering information")
sys.exit(2)
dict, nb_cluster_tot = store_info(vcf_file, x)
clusters_to_keep, nb_clusters_kept = discard_clusters(dict, y)
output_newvcf(vcf_file, out_file, clusters_to_keep)
#
#input: disco.vcf
#output : new disco.vcf without SNP located in clusters with more than y% SNP heterozygous in more than x% individuals
#
#Usage : python filter_paralogs.py disco.vcf x y
#
print(str(nb_clusters_kept) + " on " + str(nb_cluster_tot) + " clusters had less than " + str(y*100) + "% of SNP with less than " + str(x*100) + "% heterygous genotypes")
def check_format(vcf_file):
''' Checks if the vcf has the correct format, ie : the INFO field must contain clustering information, such as:
Ty=SNP;Rk=1;UL=1;UR=2;CL=.;CR=.;Genome=.;Sd=.;Cluster=79466;ClSize=12
'''
filin = open(vcf_file, 'r')
checked = False
while not checked:
line = filin.readline()
if line.startswith("#"): continue
INFO_split = line.split("\t")[7].split(";")
checked = True
if len(INFO_split) < 10: return False
tmp_cluster = INFO_split[8].split("Cluster=")
if len(tmp_cluster) < 2: return False
if tmp_cluster[1] == ".": return False
cl_id = int(tmp_cluster[1])
return True
filin.close()
def store_info(vcf_file, x):
dict = {} ## {num_cluster : [nb SNP avec 0/1 > x%, nb SNP total]}
......@@ -20,7 +112,7 @@ def store_info(vcf_file, x):
filin = open(vcf_file, 'r')
while True:
"""cluster_4651_size_14_SNP_higher_path_826058 31 826058 A G . . Ty=SNP;Rk=1;UL=1;UR=2;CL=.;CR=.;Genome=.;Sd=. GT:DP:PL:AD:HQ 1/1:7:144,25,5:0,7:0,72 1/1:14:284,46,5:0,14
"""SNP_higher_path_9999 31 9999_1 A G . . Ty=SNP;Rk=1;UL=1;UR=2;CL=.;CR=.;Genome=.;Sd=.;Cluster=79466;ClSize=12 GT:DP:PL:AD:HQ 1/1:7:144,25,5:0,7:0,72 1/1:14:284,46,5:0,14
:0,71 1/1:8:164,28,5:0,8:0,73 1/1:59:1184,182,7:0,59:0,70 1/1:144:2884,438,11:0,144:0,72 1/1:30:604,95,6:0,30:0,70 1/1:37:744,116,6:0,37:0,72 ./.:1:.,.,.:0,1:0,71 1/1:31:624,98,6:0,31
:0,69 1/1:10:204,34,5:0,10:0,72 1/1:45:904,140,6:0,45:0,72 1/1:31:624,98,6:0,31:0,73 1/1:11:224,37,5:0,11:0,66 1/1:133:2664,405,10:0,133:0,72 1/1:26:524,83,5:0,26:0,68 1/1:
43:864,134,6:0,43:0,72 1/1:27:544,86,5:0,27:0,72 1/1:6:124,22,5:0,6:0,70 1/1:32:644,101,6:0,32:0,73 1/1:55:1104,170,7:0,55:0,71 1/1:20:404,64,5:0,20:0,73 ./.:0:.,.,.:0,0:0,0 ./.:
......@@ -30,9 +122,8 @@ def store_info(vcf_file, x):
line = filin.readline()
if not line: break
if line.startswith("#"): continue
if not line.startswith("cluster"): continue
num_cluster = int(line.split("\t")[0].split("_")[1])
num_cluster = int(line.split("\t")[7].split(";")[8].split("Cluster=")[1])
if num_cluster not in dict :
dict[num_cluster] = [0,0]
......@@ -71,10 +162,10 @@ def discard_clusters(dict, y):
return clusters_to_keep, len(clusters_to_keep)
def output_newvcf(vcf_file, clusters_to_keep) :
def output_newvcf(vcf_file, out_file, clusters_to_keep) :
filin = open(vcf_file, 'r')
new_vcf = open("para_" + str(x) + "_" + str(y) + "_" + str(vcf_file), 'w')
new_vcf = open(out_file, 'w')
while True:
......@@ -84,20 +175,13 @@ def output_newvcf(vcf_file, clusters_to_keep) :
new_vcf.write(line)
continue
if not line.startswith("cluster"): continue
cluster = int(line.split("\t")[0].split("cluster_")[1].split("_size")[0])
cluster = int(line.split("\t")[7].split(";")[8].split("Cluster=")[1])
if cluster not in clusters_to_keep: continue
new_vcf.write(line)
filin.close()
new_vcf.close()
if __name__ == "__main__":
main()
vcf_file=sys.argv[1]
x = float(sys.argv[2])
y = float(sys.argv[3])
dict, nb_cluster_tot = store_info(vcf_file, x)
clusters_to_keep, nb_clusters_kept = discard_clusters(dict, y)
output_newvcf(vcf_file, clusters_to_keep)
print(str(nb_clusters_kept) + " on " + str(nb_cluster_tot) + " clusters had less than " + str(y*100) + "% of SNP with less than " + str(x*100) + "% heterygous genotypes")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
import sys
import getopt
''' ***********************************************
Script to filter out variants in a discoSnp vcf output file (.vcf), according to various features
Author - Claire Lemaitre, Pierre Peterlongo, Inria
#
# filter a vcf file by keeping only variants such that :
# - each individual with DP < min_cov is replaced by a missing genotype
# - the proportion of missing genotype is < max_missing
# outputs a vcf
Usage:
python3 filter_vcf_by_indiv_cov_max_missing_and_maf.py -i vcf_file -o new_vcf_file [-x 0.1 -y 0.5]
Details:
filter a vcf file by keeping only variants such that :
- each individual genotype with DP < min_cov is replaced by a missing genotype
- the proportion of missing genotype is < max_missing
- the minor allele freq is >= min_maf
outputs a vcf
Note : NA are indicated as "./." genotypes.
*********************************************** '''
# Note : NA are indicated as "./." genotypes.
# Last modified : Oct. 2019
import sys
import getopt
def usage():
'''Usage'''
print "-----------------------------------------------------------------------------"
print sys.argv[0]," : vcf filter"
print "-----------------------------------------------------------------------------"
print "usage: ",sys.argv[0]," -i vcf_file -o output_file [-c min_cov -m max_missing -s]"
print " -i: input vcf file"
print " -o: output vcf file"
print " -m: max missing genotype proportion to keep a variant (between 0 and 1, def = 1)"
print " -c: min coverage to call a genotype (int, def=0)"
print " -s: snp only (def= all variants)"
print "-----------------------------------------------------------------------------"
print("-----------------------------------------------------------------------------")
print(sys.argv[0]+" : vcf filter")
print("-----------------------------------------------------------------------------")
print("usage: "+sys.argv[0]," -i vcf_file -o output_file [-c min_cov -m max_missing -f min_maf -s]")
print(" -i: input vcf file [mandatory]")
print(" -o: output vcf file [mandatory]")
print(" -m: max missing genotype proportion to keep a variant (between 0 and 1, def = 1)")
print(" -c: min coverage to call a genotype (int, def=0)")
print(" -f: min minor allele frequency (maf) (between 0 and 1, def=0)")
print(" -s: snp only (def= all variants)")
print("-----------------------------------------------------------------------------")
sys.exit(2)
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "i:o:c:m:s", ["in=", "out=","cov=","miss=","snp-only"])
except getopt.GetoptError, err:
opts, args = getopt.getopt(sys.argv[1:], "i:o:c:m:f:s", ["in=", "out=","cov=","miss=", "freq=","snp-only"])
except getopt.GetoptError as err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
print(str(err)) # will print something like "option -a not recognized"
usage()
sys.exit(2)
......@@ -45,6 +56,7 @@ def main():
min_cov = 0
max_missing_prop = 1
snp_only = 0
min_maf = 0
for opt, arg in opts:
if opt in ("-i", "--in"):
input_file = arg
......@@ -54,13 +66,15 @@ def main():
min_cov = int(arg)
elif opt in ("-m", "--miss"):
max_missing_prop = float(arg)
elif opt in ("-f", "--freq"):
min_maf = float(arg)
elif opt in ("-s", "--snp-only"):
snp_only=1
else:
assert False, "unhandled option"
if input_file == 0 or output_file == 0:
print "Missing arguments"
print("Error: options -i and -o are mandatory")
usage()
sys.exit(2)
else:
......@@ -101,6 +115,9 @@ def main():
line_towrite = "\t".join(splitted[:start_geno])
missing_count = 0
ref_count = 0
alt_count = 0
for geno in splitted[start_geno:]:
geno_info = geno.split(":")
genotype = geno_info[0]
......@@ -113,12 +130,29 @@ def main():
genotype = "./."
missing_count += 1
new_na_count += 1
else:
#count ref and alt alleles
allele1 = genotype[0]
allele2 = genotype[2]
if allele1 == "0":
ref_count += 1
else:
alt_count += 1
if allele2 == "0":
ref_count += 1
else:
alt_count += 1
#rewrite geno info
geno_new = genotype+":"+":".join(geno_info[1:])
line_towrite += "\t"+geno_new
## Filter on missing count
if missing_count <= max_missing:
maf = 0
# Should we also remove variants that are no longer variable if min_maf = 0, ie alt_count == 0 and ref_count ==0 ??
if ref_count+alt_count>0:
maf = (float) (min(ref_count,alt_count))/(ref_count+alt_count)
## Filter on missing count and maf
if missing_count <= max_missing and maf >= min_maf:
filout.write(line_towrite)
var_written += 1
selected_na_count += missing_count
......@@ -131,7 +165,7 @@ def main():
print("# filter parameters : indiv_DP>="+str(min_cov)+", missing<="+str(max_missing)+" (prop*n_geno = "+str(max_missing_prop)+" * "+str(n_geno)+"), snp-only="+str(snp_only))
print("# "+str(var_count)+" seen variants, "+str(var_written)+" variants after filtering")
print("# initial missing count (on snps only if snp-only=1) = "+str(na_count)+", genotypes changed to NA = "+str(new_na_count)+", final missing count in new vcf = "+str(selected_na_count))
print("# initial missing percent (on snps only if snp-only=1) = {0:.2f} %, final missing percent in new vcf = {1:.2f} %".format(100*float(na_count)/float(var_count),100*float(selected_na_count)/float(var_written)))
print("# initial missing percent (on snps only if snp-only=1) = {0:.2f} %, final missing percent in new vcf = {1:.2f} %".format(100*float(na_count)/float(var_count*n_geno),100*float(selected_na_count)/float(var_written*n_geno)))
......
#! /bin/bash
out_name=`echo $1 | sed -e 's/.vcf//g'`
#extract genotypes
grep -v "#" $1 | sed -e 's/\t\.:/\t\.\/\.:/g' | cut -f 10- | sed -e 's/[:][[:graph:]]*//g' | sed -e 's/\//\t/g' -e 's/|/\t/g' -e 's/\./-9/g' > $1_matrix
#transpose matrix
awk '
{
for (i=1; i<=NF; i++) {
a[NR,i] = $i
}
}
NF>p { p = NF }
END {
for(j=1; j<=p; j++) {
str=a[1,j]
for(i=2; i<=NR; i++){
str=str"\t"a[i,j];
}
print str
}
}' $1_matrix > $out_name.str
rm $1_matrix