Steffen Möller · Steffen Möller · c39a6543 · c39a6543 · ab934c30 · ab934c30
--- a/bin/SNPClassifier/Manuals/The SNP CSV File Description.doc
+++ b/bin/SNPClassifier/Manuals/The SNP CSV File Description.doc
--- a/INSTALL
+++ b/INSTALL
+## This (modified) SIFT version tested on gcc version 4.7.2
+## This SIFT version tested on old legacy BLAST package (blastpgp and formatdb)
+
+This SIFT installation is for
+ - submitting a single protein sequence
+ - submitting a protein alignment
+ - submitting a NCBI gi id.
+
+This SIFT installation is NOT for:
+ - submitting VCF files (please use SIFT 4G)
+ - submitting genomic coordinates of variants (please use SIFT 4G)
+ - SIFT indel prediction  
+Please go to sift-dna.org or sift-dna.org/sift4g to download code for 
+the above functionalities.
+ 
+1. INSTALL NCBI BLAST TOOLS 
+	SIFT uses PSI-BLAST and makeblastdb commands from BLAST+ (This has been tested for 2.4.0).
+
+	Download and install NCBI BLAST 2.4.0+ (or the latest) version at:
+ 
+	     ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/
+
+	SIFT uses NCBI's PSI-BLAST and makeblastdb.  Make sure these programs 
+	are inside your package (otherwise install 2.4.0 which has these 
+	programs).  
+		ls <NCBI_BLAST_DIR>/bin
+
+		makeblastdb
+		psiblast
+
+
+2. Download and format reference protein sequence database 
+	This step is required for SIFT sequence submission. It is optional 
+	if you are inputting a protein alignment or a NCBI gi id.
+
+	SIFT searches a database of protein sequences to find homologous
+        sequences.  You will need to download a database of protein sequences
+        and format it properly so that SIFT subroutines can read it.
+
+	2a. Download a protein database. Some options are:
+
+	    UniRef: 
+		ftp://ftp.ebi.ac.uk/pub/databases/fastafiles/uniref/ 
+
+	    SWISS-PROT/TrEMBL: 
+		ftp://ftp.ebi.ac.uk/pub/databases/fastafiles/uniprot/ 
+
+	    Refseq:
+		ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete/
+
+
+	    (I personally prefer uniref90.gz) 
+
+	2b. Format your database
+	    SIFT cannot process names of fasta sequences that contain
+	    delimiters such as "|" or ":". Also, it needs the first 10
+	    characters of the sequence name to be unique.  
+	
+	    Format the standard databases so that the sequence names are
+	    simpler.
+
+	    For UniRef, the format command is:
+	    zcat uniref90.gz | perl -pe 's/^>UR090:UniRef90_/>/'  > uniref90.fa
+
+	    For Swiss-Prot or TrEMBL databases, the format command is: 
+	    zcat uniprot_swissprot.gz | perl -pe 's/^>TR:/>/; s/^>SP:/>/' > uniprotkb_swissprot.fa
+
+	    For NCBI:
+            zcat complete.1000.protein.faa.gz |perl -pe 's/^>gi\|/>gi/' | perl -pe 's/\|/ /' > complete.1000.protein.faa
+
+            The names will be changed to the proper format for proper 
+	    parsing.
+
+            If you have your own protein sequence database and SIFT 
+	    is not properly recognizing the names, go to 
+            src/Alignment.c and modify "fix_names"
+            Recompile ALL programs.
+
+	2c.  Format your database:
+		<BLAST_DIR>/bin/makeblastdb -in <protein database fasta> -dbtype prot
+					  	
+3. UNPACK SIFT
+	
+	tar -zxvf sift-<version>.tar.gz
+
+	This will create a directory of the form, sift-<version>.  Change to
+	the directory.
+
+	cd sift-<version>
+
+
+You can SKIP STEPS 4 & 5 if you're using Linux. Linux executables are already in the bin directory. 
+
+4. COMPILE BLIMPS CODE: 
+	Go to the blimps directory:	
+		cd <SIFT_HOME>/blimps	
+	
+	Compile blimps by typing:
+		make clean
+		make all	
+		
+	Folders 'obj','lib' and 'include' will be created. 
+	Check that the blimps library 'libblimps.a' is generated in the 'lib'
+	folder. 
+	If 'libblimps.a' is present, this indicates that BLIMPS has compiled
+	successfully.
+	
+	
+5. COMPILE SIFT CODE:
+	Compile SIFT
+	Set the BLIMPS path in the file cccb (<SIFT_HOME>/src/cccb)
+	
+		cd <SIFT_HOME>/src
+	
+	Edit the file 'cccb'
+	
+		set b = <BLIMPS Home>		
+		set CC = <Path to GCC>	
+
+
+	Compile the SIFT executables.	
+		cd <SIFT_HOME>/src
+		./compile.csh	
+
+		
+	'compile.csh' will generate and move executable to <SIFT_HOME>/bin dirctory.
+	Check <SIFT_HOME>/bin directory for the following executables.
+		choose_seqs_via_psiblastseedmedian
+                clump_output_alignedseq
+                info_on_seqs
+                consensus_to_seq
+                psiblast_res_to_fasta_dbpairwise
+                seqs_from_psiblast_res
+
+                                                                          
+6. SET PATHS 
+
+	Edit the files in <SIFT_HOME>/bin to set environmental variables
+		SIFT_for_submitting_fasta_seq.csh
+		SIFT_for_submitting_NCBI_gi_id.csh
+		
+	# Set NCBI, BLIMPS and SIFT path in file 'SIFT_for_submitting_fasta_seq.csh' (<SIFT_HOME>/bin/SIFT_for_submitting_fasta_seq.csh)
+	
+		setenv NCBI <BLAST directory where psiblast and makeblastdb are>
+			
+		setenv SIFT_DIR <SIFT_HOME>			# Location of SIFT (use SIFT absolute path eg: /home/software/sift)
+			
+		setenv tmpdir = <SIFT_HOME>/tmp			# SIFT's temporary output directory (use sift absolute path eg: /home/software/sift/tmp) 
+
+
+7. RUNNING SIFT
+
+	# Before test/run SIFT program, change working directory to SIFT bin (<SIFT_HOME>/bin) directory and execute below SIFT command (important).
+	# ie, Current working directoy is '<SIFT_HOME>/bin'
+
+	A.  Input: Protein sequence. (SIFT chooses homologues).
+		Requires 3 inputs:
+		1) Protein sequence in fasta format.
+		2) Protein database to search.  These sequences are 
+		   assumed to be functional
+		3) File of substitutions to be predicted on (optional).  
+		   See test/lacI.subst for an example of the format. 
+		   This file is optional. Alternatively, you can print 
+	           scores for the entire protein sequence. 
+
+		Results will be in <tmpdir>/<seq_file>.SIFTprediction
+
+		COMMANDLINE FOR A LIST OF SUBSTITUTIONS:
+		Go to the bin directory <SIFT_HOME>:
+
+			cd <SIFT_HOME>
+
+		Type the commandline: 
+
+			bin/SIFT_for_submitting_fasta_seq.csh <seq file> <protein_database> <file of substitutions> 
+
+		EXAMPLE:
+			bin/SIFT_for_submitting_fasta_seq.csh test/lacI.fasta <protein_database> test/lacI.subst 
+
+		Results will appear in <tmpdir>/lacI.fasta.SIFTprediction. 
+
+		COMMANDLINE TO PRINT ALL SIFT SCORES
+		bin/SIFT_for_submitting_fasta_seq.csh <seq file> <protein_database> - 
+		A dash "-" replaces the list of substitutions.
+
+		Results will appear in <tmp_dir>/<seq file>.SIFTprediction  
+
+	B.	Input: Your own protein alignment (and the path to the environmental variable BLIMPS_DIR, which was set in SIFT.csh).
+
+		COMMANDLINE FORMAT FOR A LIST OF SUBSTITIONS:
+
+		cd <SIFT_DIR>
+		export BLIMPS_DIR=<blimps_path>
+		bin/info_on_seqs <protein alignment> <substitution file> <output file>
+
+		EXAMPLE:
+		Type:
+
+		cd <SIFT_DIR>
+                export BLIMPS_DIR=<blimps_path>
+		bin/info_on_seqs test/lacI.alignedfasta test/lacI.subst test/lacI.fasta.SIFTprediction
+
+		Prediction results will appear in 
+		test/lacI.fasta.SIFTprediction, read above for description 
+		of output.
+
+		COMMANDLINE TO PRINT ALL SIFT SCORES:
+
+		export BLIMPS_DIR=<blimps_path>
+		bin/info_on_seqs <protein alignment> - <output file>
+
+		Example:
+		Type :
+
+		export BLIMPS_DIR=<blimps_path>
+		bin/info_on_seqs test/lacI.alignedfasta - test/lacI.fasta.SIFTprediction
+
+		and scores for each position will appear in the file 
+		test/lacI.fast.SIFTprediction
+
+	C.      Input: BLink gi. 
+		Rather than using BLAST to search for homologous sequences,
+		homologues are retrieved from NCIB BLink. 
+		
+			csh ./SIFT_for_submitting_NCBI_gi_id.csh <gi id> <subst_file> BEST
+
+                1) <gi id> : the NCBI protein gi
+ 
+		   This protein ID is used to look up BLAST hits from NCBI
+                2) File of substitutions to be predicted on (optional).
+                   See test/lacI.subst for an example of the format.
+                   This file is optional. Alternatively, you can print
+                   scores for the entire protein sequence by entering a "-".
+		3) Type of hits to retrieve from BLink (optional).  The two options
+		   are BEST or ALL.  By default,ALL hits are retrived. To get
+	           reciprocal best hits , pass in "BEST".  
+                
+		Results will be stored in the $tmpdir/<gi id>.SIFTprediction
+
+                COMMANDLINE FOR A LIST OF SUBSTITUTIONS:
+                If you are in SIFT_DIR, the commandline is:
+
+                	csh ./SIFT_for_submitting_NCBI_gi_id.csh <gi id> <file of substitutions> <BEST or ALL> 
+
+                EXAMPLE:
+                If you have a list of substitutions, type the following:
+                csh bin/SIFT_for_submitting_NCBI_gi_id.csh 22209009 test/gi22209009.subst BEST 
+
+                Results will appear in $tmpdir/22209009.SIFTprediction
+
+                COMMANDLINE TO PRINT ALL SIFT SCORES
+               	csh bin/SIFT_for_submitting_NCBI_gi_id.csh <gi id> - BEST 
+
+		A dash "-" replaces the substitution file, and BEST is optional.
+                Results will appear in <gi id>.SIFT prediction.
+
+8.  OUTPUT
+	A. When a list of substitutions is submitted (.subst file)
+		Results will appear in <tmpdir>/lacI.fasta.SIFTprediction and
+                look something like:
+
+                K2S     TOLERATED 0.08 3.47 LOW CONFIDENCE
+                P3M     TOLERATED 0.08  3.35 LOW CONFIDENCE
+                V15K    INTOLERANT 0.00 2.84
+
+                According to this output, the SIFT score for K2S is 0.08
+                and the median information of the sequences that have an
+                amino acid represented at the position 2 is 3.47.  If this
+                number exceedes 3.25 the substitution is annotated as
+                having LOW CONFIDENCE (which means too few sequences were
+                represented at that position.)  There are enough sequences for
+                confidence in the V15K prediction.
+
+
+	B. When "-" is used instead of a substitution list, predictions for 
+	   all possible amino acid changes are printed out.
+
+		Results will appear in <tmp_dir>/<seq file>.SIFTprediction
+                Each row is a position
+                in the sequence (row 1 is amino acid position 1, row 2 is
+                amino acid 2) and
+                the SIFT scores for each amino acid substitution are printed for                each row.
+
--- a/README
+++ b/README
--- a/README_4.0.3
+++ b/README_4.0.3
-SIFT 4.0 includes three new tools to enable exome-wide analysis of single nucleotide variants and indels.
-
-SIFT_exome_nssnvs.pl
-SIFT_exome_indels.pl
-SNPClassifier
-
-(for SNPClassifier, see the documentation in bin/SNPClassifier/ directory)
-
-1. SIFT_exome_nssnvs.pl script takes as input, a list of multiple chromosome coordinates of coding
-single nucleotide variants and outputs variant annotation along with SIFT predictions and scores.
-This tool requires human variation databases built using SQLite3 that need to be downloaded before
-the tool can be used. 
-
-NOTE: This tool is also available on the SIFT website at
-http://sift.jcvi.org/www/SIFT_chr_coords_submit.html
-
-1a. Setting up:
-
-Human variation databases should be downloaded from the ftp location 
-ftp://ftp.jcvi.org/pub/data/sift/Human_db_36/ , unzipped and placed in 
-the directory SIFT_HOME/db/Human_db_version/
-
-Please see ftp://ftp.jcvi.org/pub/data/sift/Human_db_36/README (using your 
-browser) for more information about downloading and using the databases in
-standalone mode.
-
-1b. Preparing the input
-
-Input Format Example : RESIDUE BASED COORDINATE SYSTEM (comma separated) 3,81780820,-1,T/C
-2,43881517,1,A/T,#User Comment
-2,43857514,1,T/C
-6,88375602,1,G/A,#User Comment
-22,29307353,-1,T/A
-10,115912482,-1,C/T
-
-Format Example 2: SPACE BASED COORDINATE SYSTEM (comma separated) 3,81780819,81780820,-1,T/C
-2,43881516,43881517,1,A/T,#User Comment
-2,43857513,43857514,1,T/C
-6,88375601,88375602,1,G/A,#User Comment
-22,29307352,29307353,-1,T/A
-10,115912481,115912482,-1,C/T
-
-An example input file is provided: SIFT_HOME/test/snvs.input
-Format Description [comma separated: chromosome,coordinate,oientation,alleles,user comment(optional) ]
-Please do not use spaces except in the user comments field
-
-Coordinate System:
-SIFT accepts both reidue-based and a space-based coordinates for single nucleotide variants.
-If there is only one column of coordinates, as shown in Example 1 above, SIFT assumes the coordinate
-system is residue-based, if there are two columns, as shown in Example 2 above, SIFT assumes the
-coordinate system is space-based.
-
-The space-based coordinate system counts the spaces before and after bases rather than the bases themselves.
-Zero always refers to the space before the first base.
-
-The sequence 'ACGT' has coordinates (0,4) and its subsequence 'CG' has coordinates (1,3) as shown in Example 3 below.
-The difference between the start and end coordinates gives the sequence length. Misinterpretation of these
-coordinates can easily lead to 'off-by-one'. errors. Space-based coordinates become necessary when describing
-insertions/deletions and genomic rearrangements.
-
-Example 3:
-
-0 A 1 C 2 G 3 T 4
-
-In a residue based system as described in Example 4 below, each base is assigned a coordinate base on its
-absolute position, starting from 1. The sequence 'ACGT' has coordinates (1,4) and its subsequence 'CG' has
-coordinates (2,3).
-
-Example 4:
-A C G T
-1 2 3 4
-
-
-Orientation:
-Use 1 for positve strand and -1 for negative strand. If orientation is not known, use 1 as default.
-
-Alleles:
-Use 'base1/base2' where either base1 or base2 may be the reference allele. SIFT will predict for non-reference
-allele only. If you need prediction for reference allele, then use base1/base1 where base1 is the reference allele.
-
-1c. Running the tool
-
-cd to SIFT_home/bin directory and edit SIFT_exome_nssnvs.pl to change the line
-$ENV{'SIFT_HOME'} = '/usr/local/projects/SIFT/sift4.0/';
-to
-$ENV{'SIFT_HOME'} = '<YOUR SIFT_HOME_PATH>';
-
-Following is the usage of SIFT_exome_nssnvs.pl
-
-usage: 
-./SIFT_exome_nssnvs.pl 
-        -i <Query SNP filename with complete path>
-        -d <Variation db directory path>
-        -o <Optional: output file with complete path - default=/usr/local/projects/SIFT/sift4.0//tmp>
-
-To run the example input provided in the SIFT_HOME/test directory, 
-
-./SIFT_exome_nssnvs.pl -i ../test/snvs.input -d <SIFT_HOME>/db/Human_db_36/ 
-Your input data has been recognized to use SPACE based coordinate system. Your job id is 30072 and is currently running.  Your job has been partitioned into datasets of 1000 positions and the status of each job can be viewed /usr/local/projects/SIFT/sift4.0//tmp/30072/30072.outpage.txt. Once the status of a job is 'Complete', you may view the results. A partitioned job with 1000 input rows typically takes 6-7 min to complete.
-
-The output directory is SIFT_HOME/tmp/PID by default and PID in the above case is 30072
-
-The status of long running jobs (> 5000 input rows) may be viewed at <OUTPUT_DIR>/PID.outpage.txt
-
-2. SIFT_exome_indels.pl script takes as input, a list of multiple chromosome coordinates of coding
-insertion/deletion variants and outputs variant annotation. SIFT scores and predictions are not provided
-at this stage. This tool requires human coding information files that need to be downloaded before
-the tool can be used.
-
-NOTE: This tool is also available on SIFT website at 
-http://sift.jcvi.org/www/SIFT_chr_coords_indels_submit.html
-
-2a. Setting up: 
-
-Human coding information files should be downloaded from the ftp location
-ftp://ftp.jcvi.org/pub/data/sift/Coding_info_36/ , unzipped and placed in
-the directory SIFT_HOME/coding_info/Homo_sapien_version/
-
-Please see ftp://ftp.jcvi.org/pub/data/sift/Coding_info_36/README (using your
-browser) for more information about downloading and using the files with this
-tool or with SNPClassifier.
-
-2b. Preparing the input
-
-Format Example: SPACE BASED COORDINATE SYSTEM (comma separated) 10,102760304,102760304,1,GCGGCT,#User comment 1
-10,50205013,50205013,1,ACACACACACAC
-5,179134934,179134935,1,/,#User comment 2
-1,153108866,153108866,1,CTGCTGCTGCTG
-11,6368547,6368547,1,GCTGGCGCTGGC
-11,65081932,65081932,1,AGCAGC
-12,110521161,110521164,1,/
-12,116990733,116990736,1,/
-12,123453048,123453048,1,CTG
-12,131113090,131113090,1,GCA
-12,1932613,1932613,1,CTG
-
-
-Format Description [comma separated: chromosome,coordinate,oientation,alleles,user comment(optional) ]
-Please do not use spaces except in the user comments field
-
-Coordinate System:
-SIFT accepts only space-based coordinates for insertion / deletion variants.
-The space-based coordinate system counts the spaces before and after bases rather than the bases themselves.
-Zero always refers to the space before the first base.
-
-The sequence 'ACGT' has coordinates (0,4) and its subsequence 'CG' has coordinates (1,3) as shown in Example 1 below.
-The difference between the start and end coordinates gives the sequence length. Misinterpretation of these
-coordinates can easily lead to 'off-by-one' errors. Space-based coordinates become necessary when describing
-insertions/deletions and genomic rearrangements.
-
-Example 1:
-
-0 A 1 C 2 G 3 T 4
-Orientation:
-Use 1 for positive strand and -1 for negative strand. If orientation is not known, use 1 as default.
-
-Alleles:
-For Insertion, the begin and end coordinates should be same and the allele should be a string of inserted nucleotides in one of the following formats.
-1. ----/ATGC
-2. -/ATGC
-3. ATGC
-
-For Deletion, the difference between begin and end coordinates should be equal to the length of the deleted string. the allele can either be left blank or be specified in one of the followig formats
-1. ATGC/----
-2. ATGC/-
-3. /
-
-2c. Running the tool
-
-cd to SIFT_home/bin directory and edit SIFT_exome_indels.pl to change the line
-$ENV{'SIFT_HOME'} = '/usr/local/projects/SIFT/sift4.0/';
-to
-$ENV{'SIFT_HOME'} = '<YOUR SIFT_HOME_PATH>';
-
-Following is the usage of SIFT_exome_nssnvs.pl
-usage: 
-./SIFT_exome_indels.pl 
-        -i <Query indels filename with complete path>
-        -c <coding info directory path>
-        -d <Variation db directory path>
-        -o <Optional: output file with complete path - default=SIFT_HOME/tmp>
-
-        All values should be in local 0 space based coordinates.
-
-To run the example input provided in the SIFT_HOME/test directory,
-./SIFT_exome_nssnvs.pl -i ../test/indels.input -c <SIFT_HOME>/coding_info/Homo_sapien_36/ -d <SIFT_HOME>/db/Human_db_36/ 
-
-The default output directory is SIFT_HOME/tmp/PID.
-
-2d. Description of output 
-
-(This can also be viewed on the SIFT website at http://sift.jcvi.org/www/chr_coords_example_indels.html)
-
-Amino Acid Position Change
-
-This column contains the change coordinates within the original protein sequence and the modified 
-protein sequence. For example, the insertion of GCGGCT at location 102760304 of chromosome 10 of 
-Homo Sapiens (represented by input row: 0,102760304,102760304,1,GCGGCT) inserts two additional 
-amino acids Arginine 'R' and Serine 'S' at position 145 to 147 (space based coordinates) in the 
-modified protein sequence. 
-
- 
->ENST00000238965; MISMATCH = 145-145
-GPQEQGSPASCFETSPAGHATQASPYHPRACRGGFYLLPVNGFPEEEDNGELRERLGALK
-VSPSASAPRHPHKGIPPLQDVPVDAFTPLRIACTPPPQLPPVAPRPLRPNWLLTEPLSRE
-HPPQSQIRGRAQSRSRSRSRSRSRSSRGQGKSPGRRSPSPVPTPAPSMTNGRYHKPRKAR
-PPLPRPLDGEAAKVGAKQGPSESGTEGTAKEAAMKNPSGELKTVTLSKMKQSLGISISGG
-IESKVQPMVKIEKIFPGGAAFLSGALQAGFELVAVDGENLEQVTHQRAVDTIRRAYRNKA
-REPMELVVRVPGPSPRPSPSDSSALTDGGLPADHLPAHQPLDAAPVPAHWLPEPPTNPQT
-PPTDARLLQPTPSPAPSPALQTPDSKPAPSPRIP
- 
->ENST00000238965; MISMATCH = 145-147
-GPQEQGSPASCFETSPAGHATQASPYHPRACRGGFYLLPVNGFPEEEDNGELRERLGALK
-VSPSASAPRHPHKGIPPLQDVPVDAFTPLRIACTPPPQLPPVAPRPLRPNWLLTEPLSRE
-HPPQSQIRGRAQSRSRSRSRSRSRSrsSRGQGKSPGRRSPSPVPTPAPSMTNGRYHKPRK
-ARPPLPRPLDGEAAKVGAKQGPSESGTEGTAKEAAMKNPSGELKTVTLSKMKQSLGISIS
-GGIESKVQPMVKIEKIFPGGAAFLSGALQAGFELVAVDGENLEQVTHQRAVDTIRRAYRN
-KAREPMELVVRVPGPSPRPSPSDSSALTDGGLPADHLPAHQPLDAAPVPAHWLPEPPTNP
-QTPPTDARLLQPTPSPAPSPALQTPDSKPAPSPRIP
-
-
-Indel location
-
-This percentage indicates the approximate location of the indel in the protein. For example, 
-a value less than 50% means that the indel is located in the first half of the protein and is 
-close to the amino terminus, whereas a number greater than 50% means that the indel is closer 
-to the carboxy terminus.
-
-Transcript Visualization
-
-<---{}--{}[]--[*.]--[]--[]--[]--[]--[]--[]--[]{}---|
-
-The above example visualization mimics the structure of the transcript containing the indel.
-
-<--- indicates the 3' end
---| indicates the 5' end
-{}   indicate UTR 
-[]   indicates a coding exon
--   indicats an intron
-.    indicates the start of insertion or deletion
-*    indicates the end of deletion
-
-If the 3'end of the transcript appears to the left of the 5' end, as in this case, then the 
-transcript is transcribed from the negative strand. This transcript has two 3'UTRs, one 5'UTR, 
-nine exons and nine introns. The indel both starts and ends in the 8th coding exon.
-
-
-Nucleotide change
-
-The input allele (insertion or deletion) and +/- 5 base pairs are shown. For example,
-the user input for insertion variant "10,102760304,102760304,1,GCGGCT" will populate 
-this column with the following information
-cggct-GCGGCT-acggc
-
-whereas a user input for deletion variant "12,110521161,110521164,1,/" will populate 
-this column with the following information
-TGCTG-ctg-TTGCT
-
-For insertions, the inserted bases are displayed in uppercase and the flanking bases are 
-displayed in lowercase. For deletions, the deleted bases are displayed in lowercase whereas 
-the flanking bases are displayed in uppercase.
-
-
-Amino acid change
-
-This column displays the amino acid change caused by the indel. For example
-QQTT->QQqTT indicates the addition of amino acid Glutamine ('Q') in the modified protein sequence,
-whereas EEeDA->EEDA indicates the deletion of amino acid Glutamic acid, 'E' in the 
-modified protein sequence.
-
-
-Protein sequence change
-
-This column links  original and modified protein sequence files with regions of mismatch (caused due to indel) 
-colored in red. For example, an insertion represented by the user input 
-"1,153108866,153108866,1,CTGCTGCTGCTG" 
-causes an expansion in polyglutamine tract as shown in the following fasta format sequences. 
-The Fasta headers contain the Ensembl transcript ID along with the coordinates of change.
-
- 
->ENST00000271915; MISMATCH = 80-80
-MDTSGHFHDSGVGDLDEDPKCPCPSSGDEQQQQQQQQQQQQPPPPAPPAAPQQPLGPSLQ
-PQPPQLQQQQQQQQQQQQQQPPHPLSQLAQLQSQPVHPGLLHSSPTAFRAPPSSNSTAIL
-HPSSRQGSQLNLNDHLLGHSPSSTATSGPGGGSRHRQASPLVHRRDSNPFTEIAMSSCKY
-SGGVMKPLSRLSASRRNLIEAETEGQPLQLFSPSNPPEIVISSREDNHAHQTLLHHPNAT
-HNHQHAGTTASSTTFPKANKRKNQNIGYKLGHRRALFEKRKRLSDYALIFGMFGIVVMVI
-ETELSWGLYSKDSMFSLALKCLISLSTIILLGLIIAYHTREVQLFVIDNGADDWRIAMTY
-ERILYISLEMLVCAIHPIPGEYKFFWTARLAFSYTPSRAEADVDIILSIPMFLRLYLIAR
-VMLLHSKLFTDASSRSIGALNKINFNTRFVMKTLMTICPGTVLLVFSISLWIIAAWTVRV
-CERYHDQQDVTSNFLGAMWLISITFLSIGYGDMVPHTYCGKGVCLLTGIMGAGCTALVVA
-VVARKLELTKAEKHVHNFMMDTQLTKRIKNAAANVLRETWLIYKHTKLLKKIDHAKVRKH
-QRKFLQAIHQLRSVKMEQRKLSDQANTLVDLSKMQNVMYDLITELNDRSEDLEKQIGSLE
-SKLEHLTASFNSLPLLIADTLRQQQQQLLSAIIEARGVSVAVGTTHTPISDSPIGVSSTS
-FPTPYTSSSSC
- 
->ENST00000271915; MISMATCH = 80-84
-MDTSGHFHDSGVGDLDEDPKCPCPSSGDEQQQQQQQQQQQQPPPPAPPAAPQQPLGPSLQ
-PQPPQLQQQQQQQQQQQQQQqqqqPPHPLSQLAQLQSQPVHPGLLHSSPTAFRAPPSSNS
-TAILHPSSRQGSQLNLNDHLLGHSPSSTATSGPGGGSRHRQASPLVHRRDSNPFTEIAMS
-SCKYSGGVMKPLSRLSASRRNLIEAETEGQPLQLFSPSNPPEIVISSREDNHAHQTLLHH
-PNATHNHQHAGTTASSTTFPKANKRKNQNIGYKLGHRRALFEKRKRLSDYALIFGMFGIV
-VMVIETELSWGLYSKDSMFSLALKCLISLSTIILLGLIIAYHTREVQLFVIDNGADDWRI
-AMTYERILYISLEMLVCAIHPIPGEYKFFWTARLAFSYTPSRAEADVDIILSIPMFLRLY
-LIARVMLLHSKLFTDASSRSIGALNKINFNTRFVMKTLMTICPGTVLLVFSISLWIIAAW
-TVRVCERYHDQQDVTSNFLGAMWLISITFLSIGYGDMVPHTYCGKGVCLLTGIMGAGCTA
-LVVAVVARKLELTKAEKHVHNFMMDTQLTKRIKNAAANVLRETWLIYKHTKLLKKIDHAK
-VRKHQRKFLQAIHQLRSVKMEQRKLSDQANTLVDLSKMQNVMYDLITELNDRSEDLEKQI
-GSLESKLEHLTASFNSLPLLIADTLRQQQQQLLSAIIEARGVSVAVGTTHTPISDSPIGV
-SSTSFPTPYTSSSSC
-
-
-Causes Nonsense Mediated Decay
-
-Nonsense mediated decay (NMD) is a cellular mechanism of mRNA surveillance to detect 
-nonsense mutations and prevent the expression of truncated or erroneous proteins.
-This column indicates whether the input indel is likely to cause NMD. If NMD occurs, 
-then the indel is equivalent to a gene deletion because the mRNA is never translated.
-
-There is no NMD when:
-1) the resulting premature termination codon is in the last exon
-or-
-2) the resulting premature termintion codon is in the last 50 nucleotides in the second to last exon
-
-Repeat detected
-
-This column gets populated if the input insertion/deletion is found to expand or contract a 
-coding repeat region. For example, an input row '1,153108866,153108866,1,CTGCTGCTGCTG' causes 
-an insertion resulting in the expansion of a poly-glutamine repeat. A poly-glutamine repeat of 
-length 14 that expands to length 18 is illustrated in this column by 'PQL(q)14P-->PQL(q)18P'. 
-The repeat amino acid(s) are shown in parenthesis followed by the repeat number and bounded 
-by flanking amino acids.
-
-Warning: NCBI reference miscall
-
-If you receive a reference miscall warning in the coordinates column (first column) of the output 
-table, this means that your input coordinates overlap or contain a location that is not a true indel, 
-but likely to be an error in NCBI human genome reference sequence. 
-
- 
--- a/VERSION_UPDATE
+++ b/VERSION_UPDATE
@@ -34,4 +34,26 @@ VERSION 4.0.3 released March 8, 2010
 Change snv_db_engine to match SIFT website
 Clarify instructions  for SIFT_exome_indels.pl and SIFT_intersect_cds.pl 
 Make available web tools
- 
+
+VERSION 5.1.1, released January 17, 2014
+SIFT Standalone v5.1.1 release - Minor bug fixed for SIFT indels tool
+
+VERSION 5.2.0, released July 12, 2014
+SIFT 5.2.0 released. Code has been updated to be compatible with later versions of BLAST (tested on 2.2.28+)
+
+VERSION 5.2.1, released August 11, 2014
+SIFT 5.2.1 released. Fixed bug to increase coverage. BLAST results on sequences with repetitive sequences were not always parsed correctly and would give a segmentation fault. Bug fix will increase number of sequences with predictions. Prediction scoring remains unchanged.
+
+VERSION 6.0.0, released August 1, 2016
+Released SIFT 6.0.0. This release only features "Single Protein Tools", and includes BLIMPS code for easier installation. 
+
+VERSION 6.0.1, released August 15, 2016
+Released SIFT 6.0.1. Fixes a bug for parsing BLAST output. This release only features "Single Protein Tools", and includes BLIMPS code for easier installation. 
+
+VERSION 6.1.0, released December 5, 2016
+Changed blimps code sequences.h sequence_struct->undefined from int to double so that percent identity is compared correctly
+
+VERSION 6.2.0, released December 12, 2016
+Changes to get_BLink_seq.pl : (1)  https protocol for querying NCBI instead of htt (2) added "complete genomes" option (cg_only) (3) limit to 399 sequences, so that with query, it's 400 sequences. 
+
+VERSION 6.2.1, Fixed bus error by initializing variable. Also changed code to allow compilation without warnings. 
--- a/bin/Classify_SNPs_Indels.pl
+++ b/bin/Classify_SNPs_Indels.pl
--- a/bin/DNA_PROT_SUBROUTINES.pl
+++ b/bin/DNA_PROT_SUBROUTINES.pl
--- a/bin/IntersectFeatures.jar
+++ b/bin/IntersectFeatures.jar
--- a/bin/IntersectLocations.sh
+++ b/bin/IntersectLocations.sh
-#!/bin/sh
-bin="/usr/local/projects/SIFT/sift4.0/bin/"
-java -Xmx500m -jar $bin/IntersectFeatures.jar $1 $2 $3 $4 $5
--- a/bin/SIFT_exome_indels.pl
+++ b/bin/SIFT_exome_indels.pl
--- a/bin/SIFT_exome_nssnvs.pl
+++ b/bin/SIFT_exome_nssnvs.pl
-#!/usr/local/bin/perl
-use List::Util qw[min max];
-use File::Copy;
-use Getopt::Std;
-
-
-# This program is licensed to you under the Fred
-# Hutchinos Cancer Research Center (FHCRC)
-# NONCOMMERICAL LICENSE.  A copy of the license may be found at
-# http://blocks.fhcrc.org/sift/license.html and should be attached
-# to this software
-
-$| = 1;
-system("umask 006");
-$ENV{'SIFT_HOME'} = '/usr/local/projects/SIFT/sift4.0.2/';
-my $SIFT_HOME = $ENV{'SIFT_HOME'};
-use vars qw($opt_i $opt_d $opt_o $opt_A $opt_B $opt_C $opt_D $opt_E $opt_F $opt_G $opt_H $opt_I $opt_J $opt_K $opt_L);
-getopts("i:d:o:A:B:C:D:E:F:G:H:I:J:K:L:");
-my $usage = "usage: 
-$0 
-        -i <Query SNP filename with complete path>
-        -d <Variation db directory path>
-
-	<OPTIONAL>
-        -o <output file with complete path: default=$SIFT_HOME/tmp>
-	-A 1 to output Ensembl Gene ID: default: 0
-	-B 1 to output Gene Name: default: 0
-	-C 1 to output Gene Description: default: 0
-	-D 1 to output Ensembl Protein Family ID: default: 0
-	-E 1 to output Ensembl Protein Family Description: default: 0
-	-F 1 to output Ensembl Transcript Status (Known / Novel): default: 0
-	-G 1 to output Protein Family Size: default: 0
-	-H 1 to output Ka/Ks (Human-mouse): default: 0
-	-I 1 to output Ka/Ks (Human-macaque): default: 0
-	-J 1 to output OMIM Disease: default: 0
-	-K 1 to output Allele Frequencies (All Hapmap Populations - weighted average): default: 0
-	-L 1 to output Allele Frequencies (CEU Hapmap population): default: 0
-";
-if(!(
-        defined($opt_i) 
-        )){
-        print STDERR $usage;
-        die "No SNV list entered";
-}
-if(!(
-        defined($opt_d))){
-        print STDERR $usage;
-        die "No database file entered";
-}
-
-my $oo1 = defined($opt_A) ? 1:0;
-my $oo2 = defined($opt_B) ? 1:0;
-my $oo3 = defined($opt_C) ? 1:0;
-my $oo4 = defined($opt_D) ? 1:0;
-my $oo5 = defined($opt_E) ? 1:0;
-my $oo6 = defined($opt_F) ? 1:0;
-my $oo7 = defined($opt_G) ? 1:0;
-my $oo8 = defined($opt_H) ? 1:0;
-my $oo9 = defined($opt_I) ? 1:0;
-my $oo10 = defined($opt_J) ? 1:0;
-my $oo11 = defined($opt_K) ? 1:0;
-my $oo12 = defined($opt_L) ? 1:0;
-my $pid             = $$;
-my $bin             = "$SIFT_HOME/bin";
-my $tmp = defined($opt_o) ? "$opt_o/$pid" : "$SIFT_HOME/tmp/$pid";
-my $Variation_db_dir = $opt_d;
-mkdir $tmp;
-my $num_coords_per_split = 1000;
-require "$bin/SIFT_subroutines.pm";
-my $input_chr_file = $opt_i;
-
-my $output_options = "$oo1,$oo2,$oo3,$oo4,$oo5,$oo6,$oo7,$oo8,$oo9,$oo10,$oo11,$oo12";
-my $all_chr_file = "$tmp/$pid.allchrfile";
-copy ($input_chr_file , $all_chr_file);
-open (CHR_FILE,"$all_chr_file") || die ("Cannot open all chr file for validation");
-while (<CHR_FILE>){
-	if ($_ =~ /\d/ && $_ =~ /\,/){
-		$first_line = $_;
-		last;
-	}
-}
-close(CHR_FILE);
-if ($first_line =~ /[\d+,X,x,Y,y],\d+,\d+,\-?1,[A,T,G,C]\/[A,T,G,C]/i){
-	$COORD_SYSTEM = "SPACE";
-}
-elsif($first_line =~ /[\d+,x,X,y,Y],\d+,\-?1,[A,T,G,C]\/[A,T,G,C]/i){
-	$COORD_SYSTEM = "RESIDUE";
-}
-else{
-	print "Error: Incorrect input format. Please see <A HREF=\"\/www\/chr_coords_example.html\">Sample format</A>";
-	last;	
-}
-
-print
-"Your input data has been recognized to use $COORD_SYSTEM based coordinate system. Your job id is $pid and is currently running.  Your job has been partitioned into datasets of $num_coords_per_split positions and the status of each job can be viewed $tmp/$pid.outpage.txt. Once the status of a job is 'Complete', you may view the results. A partitioned job with $num_coords_per_split input rows typically takes 6-7 min to complete.\n";
-
-
-#prepare output status page
-open (OUTPAGE,">>$tmp/$pid.outpage.txt") || die ("cannot open outpage");
-print OUTPAGE "SIFT Results Status\n\n";
-$heading = "Job\tJob size\tJob ID\tJob status\tResults file\n";
-
-
-#if COORD SYSTEM is ABSOLUTE then convert chr file to space based.
-if ($COORD_SYSTEM eq "RESIDUE"){
-	open (CHR_FILE,"$all_chr_file") || die ("Cannot open all chr file");
-	open (CHR_FILE_NEW,">$all_chr_file.new") || die ("Cannot open new all chr file");
-	while (<CHR_FILE>){
-		chomp;
-		if ($_ !~ /\d/){next}
-		@elts = split /\,/, $_;
-		$chr = @elts[0];
-		$coord2 = @elts[1];
-		$coord1 = $coord2-1;
-		$orn = @elts[2];
-		$alleles = @elts[3];
-		$comment = @elts[4];
-		print CHR_FILE_NEW "$chr,$coord1,$coord2,$orn,$alleles,$comment\n";
-	}
-	close(CHR_FILE);
-	close (CHR_FILE_NEW);
-	system("mv $all_chr_file.new $all_chr_file");
-}
-
-
-open (CHR_FILE,"$all_chr_file") || die ("Cannot open all chr file");
-$count = 0;
-$new_pid = $pid+1;
-$input_set = 1;
-open (OUTFILE,">$tmp/$new_pid.chrfile");
-
-while (<CHR_FILE>){
-	chomp;
-	if ($_ !~ /\d/){next}
-	$count++;
-	if ($count % $num_coords_per_split== 0){
-		push @pid_list,$new_pid;
-		print OUTFILE "$_\n";
-                $job_size = $num_coords_per_split;
-		$start = $count-$job_size+1;
-		$end = $count ;
-                print OUTPAGE "Partitioned set $input_set\t";
-                print OUTPAGE "Input rows $start to $end\t";
-                print OUTPAGE "$new_pid\t";
-                print OUTPAGE "Not started\tNot available\n";
-		close(OUTFILE);
-		$input_set++;
-                $new_pid++;
-
-		open (OUTFILE,">$tmp/$new_pid.chrfile");
-	}
-	else{
-		print OUTFILE "$_\n";
-		
-	}
-}
-close(CHR_FILE);
-close(OUTFILE);
-$job_size = $count % $num_coords_per_split;
-$start = $end+1;
-$end = $start + $job_size-1;
-if ($job_size == 0){
-	system("rm -f $tmp/$new_pid.chrfile");
-	print OUTPAGE "Complete set\t";
-        print OUTPAGE "Input rows 1 to $end\t";
-        print OUTPAGE "$pid\t";
-        print OUTPAGE "Not started\tNot available\n";
-
-}
-else{
-	push @pid_list, $new_pid;
-	print OUTPAGE "Partitioned set $input_set\t";
-	print OUTPAGE "Input rows $start to $end\t";
-	print OUTPAGE "$new_pid\t";
-	print OUTPAGE "Not started\tNot available\n";
-	print OUTPAGE "Complete set\t";
-        print OUTPAGE "Input rows 1 to $end\t";
-        print OUTPAGE "$pid\t";
-        print OUTPAGE "Not started\tNot available\n";
-
-}
-print OUTPAGE "\n\n";
-print OUTPAGE "Batch Report\n";
-print OUTPAGE "Number of input (non-intronic) variants: \n";
-print OUTPAGE "Coding variants: \n";
-print OUTPAGE "Coding variants predicted: \n";
-print OUTPAGE "Tolerated: \n";
-print OUTPAGE "Damaging: \n";
-print OUTPAGE "Nonsynonymous: \n";
-print OUTPAGE "Synonymous: \n";
-print OUTPAGE "Novel: \n";
-
-close (OUTPAGE);
-#system("cp $tmp/$pid.outpage.txt $tmp/$pid.outpage.swap.txt");
-#direct to the outpage url
-#print "Location: $outpage_url\n\n";
-#print_outpage();
-
-open (BATCH_FILE,">$tmp/$pid.batchfile");
-for ($i = 0; $i < scalar @pid_list; $i++){
-	$new_pid = @pid_list[$i];
-	chomp $new_pid;
-	if ($i == scalar @pid_list -1){
-		print BATCH_FILE "$pid\t$new_pid\t$tmp/$new_pid.chrfile\t$organism\t$seq_identity_filter\t$COORD_SYSTEM\tLAST\t$output_options\t$address\n";
-	}
-	else{
-		print BATCH_FILE "$pid\t$new_pid\t$tmp/$new_pid.chrfile\t$organism\t$seq_identity_filter\t$COORD_SYSTEM\tNOT_LAST\t$output_options\t$address\n";
-	}
-}
-system("$bin/sift_feed_to_chr_coords_batch.pl $tmp/$pid.batchfile $tmp $Variation_db_dir");
-close (BATCH_FILE);
-
-
-
-
-
-
-
-
-
-
-
-
-
-sub print_outpage{
-	open(OUTPAGE, "$tmp/$pid.outpage.txt") || die("cannot open outpage");
-	while (<OUTPAGE>){
-        	print;
-	}
-	close (OUTPAGE);
-}
-
-
-
-
-
-
-
-
-sub general_parse {
-        local ( $content_type, $query_string ) = @_;
-        local ( %ans, @q, $pair, $loc, $boundary, $temp, $temp1 );
-
-        if ( $content_type eq "application/x-www-form-urlencoded" ) {
-
-                # break up into individual name/value lines
-                @q = split( /&/, $query_string );
-
-                foreach $pair (@q) {
-
-                        # break the name/value pairs up
-                        # use split rather than regular expressions because the value
-                        # may have
-                        #  newlines in it
-                        split( /=/, $pair, 2 );
-
-                        # change '+' to ' '
-                        $_[1] =~ s/\+/ /g;
-
-                        # change the escaped characters (must be after the split on '&'
-                        # and '=')
-                        $_[1] =~ s/%(..)/pack("c",&hextodec("$1"))/eg;
-
-                        $ans{ $_[0] } = $_[1];
-                }    #end of foreach $pair
-
-        }    #end of if ($content_type)
-        else {
-                $loc = index( $content_type, "boundary=" );
-                if ( $loc > 0 ) {
-                        $temp = substr( $content_type, $loc + 9 );
-
-                 #               Why is this necessary? (boundary= doesn't match actual)
-                        $boundary = "--" . $temp;
-
-                        # break up into individual name/value lines
-                        @q = split( /$boundary/, $query_string );
-
-                        foreach $pair (@q) {
-
-                                # break the name/value pairs up
-                                $loc = index( $pair, "name=" );
-                                $temp = substr( $pair, $loc + 5 );
-
-                                #         $loc = index($temp, "\n\n");
-                                $loc = index( $temp, "\n" );
-                                $temp1 = substr( $temp, $loc + 2 );
-
-                                #   Get rid of stuff after the name; including semicolon if any
-                                $loc_semi = index( $temp, ";" );
-                                $loc_eol  = index( $temp, "\n" );
-                                $loc      = $loc_eol;
-                                if ( $loc_semi > 0 && $loc_semi < $loc ) {
-                                        $loc = $loc_semi;
-                                }
-                                if ( $loc > 0 ) { $temp = substr( $temp, 0, $loc ); }
-
-                                #               Get rid of quotes around the name
-                                $temp =~ s/\"//g;
-
-                                #               Still has a trailing whitespace character ...
-                                $temp =~ s/\s//g;
-
-                  #               Substitute spaces with nothing
-                  #               Need to strip leading/ending whitespace off of $temp1,
-                  #               but be careful not to strip off internal CRs
-                  #               MAC file lines end in just \r, no \n, so makelis won't
-                  # find all
-                  #               of the sequences; DOS file lines end in \r\n, UNIX in
-                  #\n.
-                  #               Change \r\n to \n and then \r to \n
-#######PROGRAM -SPECIFIC!!!!!!!######################
-                 #In my case, I want to keep the newlines in fields which have "file" or
-                 # 'seq"
-                 # and remove newlines everywhere else.
-                 #if ( $temp =~ /file/ || $temp =~ /seq/ || $temp =~ /subst/ ) {
-                                $temp1 =~ s/\r\n/\n/g;
-                                $temp1 =~ s/\r/\n/g;
-
-                                #}
-
-                         # for other variables that are NOT files or file-like, remove extra
-                         #whitespace
-                         #else { $temp1 =~ s/\s//g; }
-                                if ( $temp ne "" ) { $ans{$temp} = $temp1; }
-                        }    # end of foreach
-                }    #end of if loc > 0
-                else {
-                        print "Cannot parse\n";
-                        print "content_type=$content_type\n";
-                        print "query_string=$query_string\n";
-                }
-        }
-        return %ans;
-
-        #print "</PRE>";
-}    # end of general_parse
-sub hextodec {
-        unpack( "N", pack( "H8", substr( "0" x 8 . shift, -8 ) ) );
-}
-
--- a/bin/SIFT_for_submitting_NCBI_gi_id.csh
+++ b/bin/SIFT_for_submitting_NCBI_gi_id.csh
@@ -12,18 +12,18 @@
 # BEST is recommended if high-confidence scores can be obtained 

 ### Set these for your installation
-#	Location of blastpgp
-setenv NCBI ../../blast
-setenv NCBI /usr/local/blast/
+#	Location of psiblast (absolute path)
+setenv NCBI /mnt2/pauline/ncbi-blast-2.4.0+/bin/
+#setenv NCBI /usr/local/blast/

-#	Location of BLIMPS
-setenv BLIMPS_DIR ../blimps
-
-#	Location of SIFT 
-setenv SIFT_DIR ../
+#	Location of SIFT (absolute path) 
+setenv SIFT_DIR /mnt2/pauline/sift6.2.1/

 #	SIFT's output files are written here
-set tmpdir = ../tmp
+set tmpdir = $SIFT_DIR/tmp/
+
+#       Location of BLIMPS. This should not need any editing
+setenv BLIMPS_DIR $SIFT_DIR/blimps

 ### Shouldn't need to make any more changes, look for output in $tmpdir 
 set bindir = $SIFT_DIR/bin
@@ -41,14 +41,13 @@ perl $bindir/perlscripts/get_BLINK_seq.pl $gi $tmpdir/$gi.unaligned $bestORall
 cat $unalignedseqs | perl -pe 's/\|//' | perl -pe 's/\|/\t/' | perl -pe 's/^\n//' | awk '{if ($1 ~ /^>/) { print $1} else { print;}}' > $unalignedseqs.2
        set alignedseqs = $unalignedseqs.aligned
        perl $bindir/perlscripts/separate_query_from_rest_of_seqs.pl $unalignedseqs.2  $unalignedseqs.queryseq  $unalignedseqs.database
-        $NCBI/formatdb -i $unalignedseqs.database -o T -p T
+        $NCBI/makeblastdb -in $unalignedseqs.database -dbtype prot
 # extremely large evalues and multipass threshold because want to make sure get
-#all the
-# sequences the user submits
-        $NCBI/blastpgp -d $unalignedseqs.database -i $unalignedseqs.queryseq -o $unalignedseqs.psiblastout -m 0 -j 4 -e 10 -h 1 -b 399
+#all the sequences the user submits
+        $NCBI/psiblast -db $unalignedseqs.database -query $unalignedseqs.queryseq -out $unalignedseqs.psiblastout -outfmt 0 -evalue 10 -num_descriptions 399 -num_alignments 399
        echo QUERY > $unalignedseqs.listseq
        grep ">" $unalignedseqs.database | cut -d" " -f1 | cut -c2- >> $unalignedseqs.listseq
-        $bindir/seqs_from_psiblast_res $unalignedseqs.psiblastout $unalignedseqs.listseq 4 $unalignedseqs.queryseq $alignedseqs $$
+        $bindir/seqs_from_psiblast_res $unalignedseqs.psiblastout $unalignedseqs.listseq 1 $unalignedseqs.queryseq $alignedseqs $$


 # final scoring 

--- a/bin/SIFT_for_submitting_fasta_seq.csh
+++ b/bin/SIFT_for_submitting_fasta_seq.csh
 #!/bin/csh
-#		SIFT.csh
+#               SIFT.csh
 # This program is licensed to you under the Fred
 # Hutchinos Cancer Research Center (FHCRC)
 # NONCOMMERICAL LICENSE.  A copy of the license may be found at
@@ -9,21 +9,21 @@

 # Argument 1: the protein sequence file in fasta format
 # Argument 2: the pathname to the protein sequence database
-# Argument 3: the substitution file (file containing amino acid substitutions to be predicted on. 
+# Argument 3: the substitution file (file containing amino acid substitutions to be predicted on.

 ### Set these for your installation
-#	Location of blastpgp
-setenv NCBI /usr/local/packages/blast-2.2.18/bin/
+#       Location of blastpgp
+setenv NCBI /mnt2/pauline/ncbi-blast-2.4.0+/bin/


-#	Location of BLIMPS
-setenv BLIMPS_DIR /opt/www/sift/sift3.0/blimps/
+#       Location of SIFT
+setenv SIFT_DIR /mnt2/pauline/sift6.2.1 

-#	Location of SIFT 
-setenv SIFT_DIR /opt/www/sift/sift3.0
+#       SIFT's output files are written here
+setenv tmpdir $SIFT_DIR/tmp/ 

-#	SIFT's output files are written here
-set tmpdir = /opt/www/sift/sift3.0/tmp
+#       Location of BLIMPS.This should not need any editing.
+setenv BLIMPS_DIR $SIFT_DIR/blimps

 ### Shouldn't need to make any more changes, look for output in $tmpsift
 set bindir = $SIFT_DIR/bin
@@ -32,7 +32,7 @@ set tail_of_root = $root_file:t
 set tmpfasta = $tmpdir/$tail_of_root.alignedfasta
 set tmpsift = $tmpdir/$tail_of_root.SIFTprediction

-$bindir/seqs_chosen_via_median_info.csh $1 $2 $4
+$bindir/seqs_chosen_via_median_info.csh $1 $2 2.75
 $bindir/info_on_seqs $tmpfasta $3 $tmpsift
 echo "Output in $tmpsift"


--- a/bin/SIFT_intersect_cds.pl
+++ b/bin/SIFT_intersect_cds.pl
-#!/usr/local/bin/perl
-$| = 1;
-use Getopt::Std;
-use File::Basename;
-
-# This program is licensed to you under the Fred
-# Hutchinos Cancer Research Center (FHCRC)
-# NONCOMMERICAL LICENSE.  A copy of the license may be found at
-# http://blocks.fhcrc.org/sift/license.html and should be attached
-# to this software
-#
-
-#       Set file permissions to rw-rw----
-system("umask 006");
-
-$ENV{'SIFT_HOME'} = '/usr/local/projects/SIFT/sift4.0/';
-my $SIFT_HOME = $ENV{'SIFT_HOME'};
-use vars qw ($opt_i $opt_c $opt_o);
-getopts ("i:c:o:");
-
-
-my $usage = "usage:
-$0
-	-i <List of variants>
-	-c <File with coding coordinates in gff format>
-
-	<OPTIONAL>
-        -o <output file with complete path: default=$SIFT_HOME/tmp>
-";
-
-if (! defined($opt_i)) { 
-        print STDERR $usage;
-        die "No SNV list entered";
-}
-
-
-if (! defined($opt_c)){
-        print STDERR $usage;
-        die "No coding file was entered";
-}
-
-
-my $bin             = "$SIFT_HOME/bin";
-my $tmp             = "$SIFT_HOME/tmp";
-my $all_chr_file =  $opt_i;
-my ($filename, $dir, $ext) = fileparse  ($all_chr_file, '\.*'); 
-my $outfilename = defined ($opt_o) ? "$opt_o" :  $tmp . "/" . $filename . "_codingOnly";
-
-my $coding_file = $opt_c; 
-#$coding_info_dir . "/Homo_sapien/ens.hum.ncbi36.ver41.cds.merge.gff";
-
-#Read input list of chromosome coordinates and add to all_chr_string
-my $all_chr_string;
-
-open (CHR_FILE,"$all_chr_file") || die "Cannot open $all_chr_file";
-while (<CHR_FILE>){
-        if ($_ =~ /\d/ && $_ =~ /\,/){
-                $first_line = $_;
-                last;
-        }
-}
-close(CHR_FILE);
-
-if ($first_line =~ /\d+,\d+,\d+,\-?1,/i){
-        $COORD_SYSTEM = "SPACE";
-}
-elsif($first_line =~ /\d+,\d+,\-?1,/i){
-        $COORD_SYSTEM = "RESIDUE";
-}
-else{
-        print "Incorrect input format. Please see http://sift.jcvi.org/www\/chr_coords_example.html";
-        last;
-}
-
-print
-"Your input data has been recognized to use $COORD_SYSTEM based coordinate system\n";
-# if COORD SYSTEM is SPACE, then there are insertions with same coordinates
-
-#if COORD SYSTEM is ABSOLUTE then convert chr file to space based.
-my $new_chr_file = "$SIFT_HOME/tmp/" . $filename .  ".gff";
-if ($COORD_SYSTEM eq "RESIDUE"){
-	convert_residue_to_space_gff ($all_chr_file, $new_chr_file);
-}
-if ($COORD_SYSTEM eq "SPACE") {
-	convert_space_with_insertions_gff ($all_chr_file, $new_chr_file);
-}
-	
-
- system ("sort -k1,1 -k4,4n -k5,5n $new_chr_file > $new_chr_file.sorted");
-my @lines_to_get = `$bin/IntersectLocations.sh $new_chr_file.sorted gff $coding_file gff simple | grep -v Total | cut -f1 | uniq`;
-
-my $all_chr_in_cds_file = $all_chr_file . ".cds.txt";
-
-my ($cds_count, $non_cds_count) = get_lines ($new_chr_file, 
-			$outfilename, 
-			@lines_to_get);
-
-print "Your filtered output is in $outfilename\n";
-
-exit(0);
-
-sub
-get_lines
-{
-	my  ($all_chr_file, $all_chr_in_cds_file, @lines_to_get)  = @_;
-
-	my %line_to_get_hash ;
-	# more elegant way to do this by iterating through @lines_to_get, but oh well
-	for (my $i=0; $i < @lines_to_get; $i++) {
-		chomp ($lines_to_get[$i]);
-		$lines_to_get_hash{$lines_to_get[$i]} = 1;
-	}
-	open (CHR_FILE, $all_chr_file) || die "can't open $all_chr_file";
-	open (OUT_CHR_FILE, ">$all_chr_in_cds_file") || die "can't open $all_chr_in_cds_file";
-	my $in_cds = 0;
-	my $not_in_cds = 0;
-        my $line;
-	while ($line = <CHR_FILE>){
-		chomp ($line);
-		my @fields = split (/\t/, $line);
-		my $line_num = $fields[1];
-		if (exists ($lines_to_get_hash{$line_num})) {
-			print OUT_CHR_FILE $fields[8] . "\n";
-			$in_cds++;
-		} else { 
-			$not_in_cds++;
-		}
-	}
-	close (CHR_FILE);
-	close (OUT_CHR_FILE);
-	return ($in_cds, $not_in_cds);
-}
-
-sub
-convert_space_with_insertions_gff
-{
-       my ($all_chr_file, $new_chr_file) =  @_;
-
-        open (CHR_FILE,"$all_chr_file") || die ("Cannot open all chr file");
-        open (CHR_FILE_NEW,">$new_chr_file") || die ("Cannot open $new_chr_file");
-        my $line_num = 0;
-        my $line;
-	while ($line = <CHR_FILE>){
-                chomp ($line);
-                if ($line !~ /\d/){next}
-                @elts = split (/\,/, $line);
-                $chr = @elts[0];
-                $coord1 = @elts[1];
-                $coord2 = @elts[2]; 
-                $orn = @elts[3];
-                $alleles = @elts[4];
-                $comment = @elts[5];
-		# for insertions
-		if ($coord1 == $coord2) {
-			$coord1 = $coord1 - 1;
-		}
-                print CHR_FILE_NEW "$chr\t$line_num\t$line_num\t$coord1\t$coord2\t.\t+\t.\t$line\n"; 
-                $line_num++;
-        }
-        close(CHR_FILE);
-        close (CHR_FILE_NEW);
-
-} # end convert_space_with_insertions_gff
-
-sub
-convert_residue_to_space_gff
-{
-	my ($all_chr_file, $new_chr_file) =  @_;
-
-   	open (CHR_FILE,"$all_chr_file") || die ("Cannot open all chr file");
-        open (CHR_FILE_NEW,">$new_chr_file") || die ("Cannot open $new_chr_file");
-	my $line_num = 0;
-        my $line;
-	while ($line = <CHR_FILE>){
-                chomp ($line);
-                if ($line !~ /\d/){next}
-                @elts = split (/\,/, $line);
-                $chr = @elts[0];
-                $coord2 = @elts[1];
-                $coord1 = $coord2-1;
-                $orn = @elts[2];
-                $alleles = @elts[3];
-                $comment = @elts[4];
-                print CHR_FILE_NEW "$chr\t$line_num\t$line_num\t$coord1\t$coord2\t.\t+\t.\t$line\n"; 
-		$line_num++;
-        }
-        close(CHR_FILE);
-        close (CHR_FILE_NEW);
-
-}
-
-
-
-# returns hash for a file, 2nd field is the key and the 3rd field
-# is the value 4th field, is the delimiter
-sub make_hash {
-	my ($file) = @_;
-	my %hash;
-	open( HASH, $file ) || die "can't open $file";
-	my $line;
-	while ( $line = <HASH> ) {
-		chomp($line);
-		if ( exists( $hash{$line} ) ) {
-			$hash{$line}++;
-		}
-		else {
-			$hash{$line} = 1;
-		}
-	}
-	close(HASH);
-	return (%hash);
-}
-
-
-sub round {
-    my($number) = shift;
-    return int($number + .5);
-}
-
--- a/bin/SIFT_subroutines.pm
+++ b/bin/SIFT_subroutines.pm
--- a/bin/SNPClassifier/Classify_SNPs.pl
+++ b/bin/SNPClassifier/Classify_SNPs.pl
--- a/bin/SNPClassifier/Extract_Coding_Info.pl
+++ b/bin/SNPClassifier/Extract_Coding_Info.pl
--- a/bin/SNPClassifier/Manuals/.svn/all-wcprops
+++ b/bin/SNPClassifier/Manuals/.svn/all-wcprops
--- a/bin/SNPClassifier/Manuals/.svn/entries
+++ b/bin/SNPClassifier/Manuals/.svn/entries
--- a/bin/SNPClassifier/Manuals/.svn/format
+++ b/bin/SNPClassifier/Manuals/.svn/format
-9