Commits on Source (8)
......@@ -567,6 +567,7 @@ main: {
#$Rscript .= "library(gplots)\n";
$Rscript .= "library(Biobase)\n";
$Rscript .= "library(qvalue)\n";
$Rscript .= "library(fastcluster)\n";
$Rscript .= "options(stringsAsFactors = FALSE)\n";
......@@ -1674,7 +1675,7 @@ main: {
print $ofh $Rscript;
close $ofh;
-&process_cmd("R --vanilla -q < $R_script_file");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script_file");
exit(0);
......
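A note on the flag change repeated throughout these commits: --vanilla is R shorthand for combining --no-save, --no-restore, --no-site-file, --no-init-file and --no-environ. Spelling out the first four while dropping --no-environ presumably keeps R reading environment files such as ~/.Renviron, so a user-level R_LIBS (where packages like fastcluster or edgeR may live) is still honored. The sketch below is illustrative only (run_r_script and my_analysis.R are made-up names, not part of the patch); it shows the same invocation style from Python.

import subprocess

# --vanilla minus --no-environ: fresh session, no saved workspace or profiles,
# but ~/.Renviron (and therefore R_LIBS) is still read.
R_FLAGS = ["--no-save", "--no-restore", "--no-site-file", "--no-init-file", "-q"]

def run_r_script(script_path):
    """Feed an R script to a non-interactive R session on stdin, as the Perl wrappers do."""
    with open(script_path) as script:
        subprocess.run(["R"] + R_FLAGS, stdin=script, check=True)

# example (hypothetical file): run_r_script("my_analysis.R")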
......@@ -87,6 +87,7 @@ main: {
print $ofh "library(cluster)\n";
#print $ofh "library(gplots)\n";
print $ofh "library(Biobase)\n";
print $ofh "library(fastcluster)\n";
print $ofh "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
print $ofh "load(\"$R_data_file\")\n";
......@@ -126,7 +127,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
exit(0);
......
......@@ -88,6 +88,7 @@ main: {
print $ofh "library(cluster)\n";
#print $ofh "library(gplots)\n";
print $ofh "library(Biobase)\n";
print $ofh "library(fastcluster)\n";
print $ofh "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
print $ofh "load(\"$R_data_file\")\n";
......@@ -180,7 +181,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
###################################################
......
......@@ -84,7 +84,7 @@ main: {
print $ofh "get_cluster_info(\"$R_data_file\")\n";
close $ofh;
-&process_cmd("R --vanilla -q --slave < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q --slave < $R_script");
open (my $fh, $matrix_data) or die "Error, cannot open file $matrix_data";
......
......@@ -108,7 +108,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
exit(0);
......
......@@ -98,7 +98,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
exit(0);
......
......@@ -109,7 +109,7 @@ sub run_edgeR_remove_batch_effect {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......
......@@ -132,7 +132,7 @@ __EORSCRIPT__
;
-my $cmd = "R --vanilla -q < $out_rscript";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $out_rscript";
my $ret = system($cmd);
if ($ret) {
die "Error, cmd: $cmd died with ret $ret";
......
......@@ -445,7 +445,7 @@ sub run_edgeR_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......@@ -538,7 +538,7 @@ sub run_DESeq2_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
&process_cmd($cmd);
return;
......@@ -614,7 +614,7 @@ sub run_limma_voom_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......@@ -713,7 +713,7 @@ sub run_ROTS_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......
......@@ -117,7 +117,10 @@ main: {
print $ofh "\n\n# GO-Seq protocol: build pwf based on ALL DE features\n";
print $ofh "missing_gene_lengths = sample_set_gene_ids[! sample_set_gene_ids %in% rownames(gene_lengths)]\n";
print $ofh "if (length(missing_gene_lengths) > 0) {\n";
print $ofh " stop(\"Error, missing gene lengths for features: \", paste(missing_gene_lengths, collapse=', '))\n";
print $ofh "}\n";
print $ofh "sample_set_gene_lengths = gene_lengths[sample_set_gene_ids,]\n";
print $ofh "GO_info_listed = GO_info_listed[ names(GO_info_listed) %in% sample_set_gene_ids ]\n";
......
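The GO-Seq lines above fail fast when any DE feature lacks a gene-length entry before the length table is subset. A rough Python equivalent of that guard, with made-up gene IDs (geneA/geneB/geneC are not from the source):

gene_lengths = {"geneA": 1200, "geneB": 800}
sample_set_gene_ids = ["geneA", "geneB", "geneC"]

# same idea as the R check: collect IDs with no length entry and stop with a clear message
missing = [g for g in sample_set_gene_ids if g not in gene_lengths]
if missing:
    raise SystemExit("Error, missing gene lengths for features: " + ", ".join(missing))

sample_set_gene_lengths = {g: gene_lengths[g] for g in sample_set_gene_ids}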
......@@ -118,7 +118,7 @@ sub run_TMM {
close $ofh;
-&process_cmd("R --vanilla -q < $tmm_norm_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $tmm_norm_script");
return("$counts_matrix_file.TMM_info.txt");
......
......@@ -38,7 +38,7 @@ main: {
print $ofh "dev.off();\n";
close $ofh;
-system("R --vanilla -q < __tmp_tiers_boxplot.R");
+system("R --no-save --no-restore --no-site-file --no-init-file -q < __tmp_tiers_boxplot.R");
exit($?);
}
......
......@@ -43,26 +43,38 @@ def main():
"GATK: env var \"$GATK_HOME\" with the path to GATK's bin\n"),
epilog="", formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument('--st_fa', '--supertranscript_fasta', dest="st_fa", type=str, required=True, help="Path to the SuperTranscripts fasta file.")
+parser.add_argument('--st_fa', '--supertranscript_fasta', dest="st_fa", type=str, required=True,
+help="Path to the SuperTranscripts fasta file.")
-parser.add_argument('--st_gtf', '--supertranscript_gtf', dest="st_gtf", type=str, required=True, help="Path to the SuperTranscript gtf file.")
+parser.add_argument('--st_gtf', '--supertranscript_gtf', dest="st_gtf", type=str, required=True,
+help="Path to the SuperTranscript gtf file.")
group = parser.add_mutually_exclusive_group(required=True)
-group.add_argument('-p', '--paired', dest="paired_reads", type=str, nargs=2, help="Pair of paired ends read files.")
+group.add_argument('-p', '--paired', dest="paired_reads", type=str, nargs=2,
+help="Pair of paired ends read files.")
-group.add_argument('-s', '--single', dest="single_reads", type=str, help="Single reads file.")
+group.add_argument('-s', '--single', dest="single_reads", type=str,
+help="Single reads file.")
-parser.add_argument('-o', '--output', dest="out_path", type=str, required=True, help="Path to the folder where to generate the output.")
+group.add_argument("-S", "--samples_file", dest="samples_file", type=str,
+help="Trinity samples file (fmt: condition_name replicate_name /path/to/reads_1.fq /path/to/reads_2.fq (tab-delimited, single sample per line))")
-parser.add_argument('-l', '--sjdbOverhang', dest="sjdbOverhang", default=150, type=int, help="Size of the reads (used for STAR --sjdbOverhang). default=150")
+parser.add_argument('-o', '--output', dest="out_path", type=str, required=True,
+help="Path to the folder where to generate the output.")
-parser.add_argument('-t', '--threads', dest="nthreads", type=str, default="4", help="Number of threads to use for tools that are multithreaded.")
+parser.add_argument('-l', '--sjdbOverhang', dest="sjdbOverhang", default=150, type=int,
+help="Size of the reads (used for STAR --sjdbOverhang). default=150")
-parser.add_argument('-m', '--maxram', dest="maxram", type=str, default="50000000000", help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
+parser.add_argument('-t', '--threads', dest="nthreads", type=str, default="4",
+help="Number of threads to use for tools that are multithreaded.")
+parser.add_argument('-m', '--maxram', dest="maxram", type=str, default="50000000000",
+help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
-parser.add_argument("--STAR_genomeGenerate_opts", type=str, default="", help="options to pass through to STAR's genomeGenerate function")
+parser.add_argument("--STAR_genomeGenerate_opts", type=str, default="",
+help="options to pass through to STAR's genomeGenerate function")
args = parser.parse_args()
......@@ -76,13 +88,31 @@ def main():
if not GATK_HOME:
exit("Error, missing path to GATK in $GATK_HOME.")
# get real paths before changing working directory in case they are relative paths
if args.paired_reads:
reads_paths = [os.path.realpath(f) for f in args.paired_reads]
-else:
+elif args.single_reads:
reads_paths = [os.path.realpath(args.single_reads)]
elif args.samples_file:
left_fq_list = list()
right_fq_list = list()
with open(args.samples_file) as fh:
for line in fh:
line = line.rstrip()
if not re.match(r"\w", line):
continue
fields = line.split("\t")
left_fq = fields[2]
left_fq_list.append(os.path.realpath(left_fq))
if len(fields) > 3:
right_fq = fields[3]
right_fq_list.append(os.path.realpath(right_fq))
reads_paths = [",".join(left_fq_list)]
if right_fq_list:
reads_paths.append(",".join(right_fq_list))
else:
raise RuntimeError("no reads specified") # should never get here
st_fa_path = os.path.realpath(args.st_fa)
......@@ -160,6 +190,10 @@ def main():
pipeliner.run()
##
## GATK settings based on best practices:
## https://software.broadinstitute.org/gatk/documentation/article.php?id=3891
##
# clean and convert sam file with Picard-Tools
logger.info("Cleaning and Converting sam file with Picard-Tools.")
......
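The new --samples_file branch above collapses every left read file into one comma-joined string and every right read file into another, the comma-separated multi-file form STAR accepts for --readFilesIn. A minimal standalone sketch of that parsing, factored into a helper for illustration (parse_samples_file is not a function in the script):

import os
import re

def parse_samples_file(samples_file):
    """Collapse a tab-delimited Trinity samples file into comma-joined left/right read lists."""
    left, right = [], []
    with open(samples_file) as fh:
        for line in fh:
            line = line.rstrip()
            if not re.match(r"\w", line):
                continue  # skip blank or non-data lines
            fields = line.split("\t")
            left.append(os.path.realpath(fields[2]))
            if len(fields) > 3:
                right.append(os.path.realpath(fields[3]))
    reads = [",".join(left)]
    if right:
        reads.append(",".join(right))
    return reads

# e.g. a two-sample paired-end file would yield something like:
# ["condA_rep1_1.fq,condB_rep1_1.fq", "condA_rep1_2.fq,condB_rep1_2.fq"]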
......@@ -224,7 +224,7 @@ main: {
}
-$cmd = "R --vanilla -q < $dexseq_rscript";
+$cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $dexseq_rscript";
$pipeliner->add_commands(new Command($cmd, "$analysis_token.dexseq.$top_genes_plot.ok"));
$pipeliner->run();
......
......@@ -44,6 +44,9 @@ def main():
parser.add_argument("--no_squeeze", required=False, action="store_true",
default=False, help="don't merge unbranched stretches of node identifiers")
parser.add_argument("--no_refinement", required=False, action="store_true",
default=False, help="don't refine splice graph by further collapsing allelic variants")
parser.add_argument("--incl_cdna", required=False, action="store_true",
default=False, help="rewrite Trinity fasta file using simplified graph structure")
......@@ -110,6 +113,10 @@ def main():
node_path_obj_list.append(n_path)
#print(str(n_path))
# adjust for unique prefix yet shared suffix of FST nodes
node_path_obj_list = Node_path.Node_path.adjust_for_fst_nodes(tgraph, node_path_obj_list)
# generate multiple alignment
start_time = time.time()
......@@ -125,18 +132,34 @@ def main():
if args.verbose:
logger.info("Final splice_model_alignment for Gene {} :\n{}\n".format(gene_name, str(splice_model_alignment)))
################################
## Splice graph refinement stage
squeezed_splice_model = splice_model_alignment
if args.no_squeeze:
logger.info("--no_squeeze set, so not squeezing structure")
#squeezed_splice_model = Splice_model_refiner.refine_alignment(squeezed_splice_model, reset_node_ids=True) # True important here... can have repeat nodes
else:
# SQUEEZE
squeezed_splice_model = splice_model_alignment.squeeze()
logger.debug("Squeezed splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
if not args.no_refinement:
squeezed_splice_model = Splice_model_refiner.refine_alignment(squeezed_splice_model, reset_node_ids=True) # True important here... can have repeat nodes
logger.debug("After seq-refinement of splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
squeezed_splice_model = Splice_model_refiner.remove_redundant_paths(squeezed_splice_model)
logger.debug("After removing redundant paths of splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
if not args.no_squeeze:
# re-squeeze
squeezed_splice_model = squeezed_splice_model.squeeze()
logger.debug("Post-refinement and final squeeze of splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
# done with splice graph refinement.
squeezed_splice_model.reassign_node_loc_ids_by_align_order()
......
......@@ -29,7 +29,9 @@ class Compact_graph_whole:
def compact_unbranched(self, tgraph):
-for node in tgraph.get_all_nodes():
+all_nodes = list(tgraph.get_all_nodes())
+for node in all_nodes:
if node.is_dead():
continue
......
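The change above snapshots get_all_nodes() into a list before looping, because compaction prunes and merges nodes as it goes and mutating a dict- or set-backed collection while iterating it raises at the next iteration step. A generic illustration of the hazard (toy data, not Trinity objects):

nodes = {"n1", "n2", "n3"}
try:
    for n in nodes:
        nodes.discard("n2")      # simulate pruning a node mid-iteration
except RuntimeError as err:
    print("direct iteration failed:", err)   # set changed size during iteration

for n in list(nodes):            # snapshot first, as compact_unbranched now does
    nodes.discard(n)             # safe: we iterate the copy and mutate the original
print(nodes)                     # set()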
......@@ -426,7 +426,7 @@ class Gene_splice_modeler:
writes the multiply aligned isoform sequences to an output filehandle
"""
-transcript_names = malign_dict.keys()
+transcript_names = list(malign_dict.keys())
alignment_length = len(malign_dict[ transcript_names[ 0 ] ])
......
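The list() wrapper above is a Python 3 fix: dict.keys() returns a view object that cannot be indexed, so transcript_names[0] would raise a TypeError. A tiny demonstration with made-up alignment data:

malign_dict = {"isoformA": "ACGT--AC", "isoformB": "ACGTTTAC"}

keys_view = malign_dict.keys()
try:
    keys_view[0]
except TypeError as err:
    print(err)                                 # 'dict_keys' object is not subscriptable

transcript_names = list(malign_dict.keys())
print(transcript_names[0])                     # isoformA
print(len(malign_dict[transcript_names[0]]))   # alignment length: 8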
......@@ -76,3 +76,98 @@ class Node_path:
return path_str
@staticmethod
def adjust_for_fst_nodes(tgraph, node_path_list):
"""
fst nodes have extra 5' sequence compared to their corresponding non-fst nodes.
If both the fst and non-fst versions of a node exist, the fst node must be modified so that
it is separated from its 5' extension and the core of the node (the suffix) is shared.
input: TGraph obj, list of node_path objects.
The node_path objects are modified in-place as needed.
An fst node is truncated to its unique prefix and the non-fst node is integrated into the path.
returns the node_path_list with any required adjustments
"""
# get list of fst nodes requiring adjustment
fst_nodes_require_adj = list()
nodes = tgraph.get_all_nodes()
for node in nodes:
node_id = node.get_loc_id()
if re.search("fst", node_id):
core_node_id = re.sub("fst", "", node_id)
core_node = tgraph.retrieve_node(core_node_id)
if core_node is not None:
fst_nodes_require_adj.append( (node, core_node) )
if not fst_nodes_require_adj:
# nothing to do
logger.debug("no FST nodes to adjust")
return node_path_list
logger.debug("Adjusting FST nodes: {}".format(fst_nodes_require_adj))
old_fst_node_to_new_fst_nodes = dict()
fst_nodes_to_delete = list()
# perform node modifications:
for (fst_node, core_node) in fst_nodes_require_adj:
fst_node_seq = fst_node.get_seq()
core_node_seq = core_node.get_seq()
# reverse both sequences and verify that the core sequence is a suffix of the fst sequence.
fst_node_seq_rev = fst_node_seq[::-1]
core_node_seq_rev = core_node_seq[::-1]
if not re.match(core_node_seq_rev, fst_node_seq_rev):
raise RuntimeError("Error, core_node_seq:\n{}\nis not a suffix of fst seq:\n{}\n".format(core_node_seq, fst_node_seq))
prefix_endpt = len(fst_node_seq) - len(core_node_seq)
if prefix_endpt == 0:
assert fst_node_seq == core_node_seq, "Error, prefix starts at first position but sequences are not equivalent"
core_node.add_transcripts(fst_node.get_transcripts())
old_fst_node_to_new_fst_nodes[fst_node] = [core_node]
fst_nodes_to_delete.append(fst_node)
else:
prefix_string = fst_node_seq[0:prefix_endpt]
logger.debug("FST-SEQ-EXTRACTION\n\nFSTseq:\n{}\n\nCOREseq:\n{}\n\nPREFIXseq:\n{}\n\n".format(fst_node_seq, core_node_seq, prefix_string))
core_node.add_transcripts(fst_node.get_transcripts())
old_fst_node_to_new_fst_nodes[fst_node] = [fst_node, core_node]
fst_node.set_seq(prefix_string)
# now perform node path updates
for node_path in node_path_list:
nodes = node_path.get_path()
first_node = nodes[0]
if first_node in old_fst_node_to_new_fst_nodes:
# must replace
replacement_node_list = old_fst_node_to_new_fst_nodes[first_node]
if len(replacement_node_list) == 1:
# swap it out
nodes[0] = replacement_node_list[0]
elif len(replacement_node_list) == 2:
nodes[0] = replacement_node_list[1]
nodes.insert(0, replacement_node_list[0])
else:
raise RuntimeError("shouldn't get here")
# purge the nodes targeted for deletion
for node in fst_nodes_to_delete:
tgraph.prune_node(node)
return node_path_list
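A standalone illustration of the prefix/suffix arithmetic adjust_for_fst_nodes relies on, with made-up sequences (and str.endswith in place of the reversed re.match check used above):

fst_node_seq  = "GGGAACCTTACGT"    # fst node: unique 5' prefix plus the shared core
core_node_seq = "CCTTACGT"         # non-fst node: the shared core (must be a suffix)

assert fst_node_seq.endswith(core_node_seq)

prefix_endpt  = len(fst_node_seq) - len(core_node_seq)
prefix_string = fst_node_seq[:prefix_endpt]
print(prefix_string)   # GGGAA -> the fst node keeps only this prefix,
                       # and the core node is spliced in after it in the path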
......@@ -66,9 +66,6 @@ def refine_alignment(node_alignment_obj, reset_node_ids=False):
splice_graph_node_alignment = splice_graph_to_node_alignment(refined_tgraph)
splice_graph_node_alignment = remove_redundant_paths(splice_graph_node_alignment)
return(splice_graph_node_alignment)
......
......@@ -104,6 +104,18 @@ class TGraph:
self.node_cache.pop(node.get_loc_id())
def retrieve_node(self, node_id):
"""
does not instantiate, only retrieves.
If loc_node_id is not in the graph, returns None
"""
if node_id in self.node_cache:
return self.node_cache[node_id]
else:
return None
def get_gene_id(self):
return self.gene_id
......
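retrieve_node is the lookup adjust_for_fst_nodes uses to test whether a non-fst twin of an fst node already exists; it only consults the cache and never creates a node. A toy sketch of that get-or-None contract (node_cache contents are made up):

node_cache = {"n5": "node object for n5"}

def retrieve_node(node_id):
    """Return the cached node for node_id, or None if it was never instantiated."""
    return node_cache.get(node_id)    # dict.get provides the None fallback

print(retrieve_node("n5"))       # the core node exists -> the fst node gets adjusted
print(retrieve_node("n7"))       # None -> no such node; nothing to reconcile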