@@ -63,8 +63,9 @@ GOBubble(reduced_circ, labels = 2.8)
GOCircle(circ)
# Generate a circular visualization of selected terms
-IDs <- c('GO:0007507', 'GO:0001568', 'GO:0001944', 'GO:0048729', 'GO:0048514', 'GO:0005886', 'GO:0008092', 'GO:0008047')
-GOCircle(circ, nsub = IDs)
+# (set the ids to your liking)
+#IDs <- c('GO:0007507', 'GO:0001568', 'GO:0001944', 'GO:0048729', 'GO:0048514', 'GO:0005886', 'GO:0008092', 'GO:0008047')
+#GOCircle(circ, nsub = IDs)
# Generate a circular visualization for 10 terms
GOCircle(circ, nsub = 10)
@@ -72,5 +73,5 @@ GOCircle(circ, nsub = 10)
##### Now, the rest is up to you. ;-)
quit(save = "no", status = 0, runLast = FALSE)
quit(save = "yes", status = 0, runLast = FALSE)
#!/usr/bin/env Rscript
files=list.files(pattern=".*DE_results$")
pdf("logFC.histograms.pdf")
for (file in files) {
message("analyzing: ", file)
data = read.table(file, header=T)
hist(data$log2FoldChange, br=50, main=file)
}
@@ -50,6 +50,9 @@ main: {
chomp;
my @ids = split(/\t/);
$ids[0] =~ s/\|.*$//;
+if (my $new_id = $new_ids{$ids[0]}) {
+    $ids[0] = $new_id;
+}
......
@@ -306,7 +306,7 @@ sub parse_sample_info {
my %samples;
-open (my $fh, $sample_file) or die $!;
+open (my $fh, $sample_file) or die "Error, cannot locate samples file: $sample_file";
while (<$fh>) {
unless (/\w/) { next; }
if (/^\#/) { next; } # allow comments
@@ -521,8 +521,10 @@ sub run_DESeq2_sample_pair {
print $ofh "res\$padj[is.na(res\$padj)] <- 1\n"; # Carsten Kuenne
print $ofh "res = as.data.frame(res[order(res\$pvalue),])\n"; # rank by pvalue
## output results
print $ofh "write.table(as.data.frame(res[order(res\$pvalue),]), file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE)\n";
print $ofh "write.table(res, file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE)\n";
print $ofh "write.table(rnaseqMatrix, file=\'$output_prefix.DESeq2.count_matrix\', sep='\t', quote=FALSE)\n";
......
@@ -20,6 +20,8 @@ import Pipeliner
logger = None
+UTILDIR = os.path.sep.join([os.path.dirname(os.path.realpath(__file__)), "util"])
def main():
@@ -59,6 +61,9 @@ def main():
parser.add_argument('-m', '--maxram', dest="maxram", type=str, default="50000000000", help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
parser.add_argument("--STAR_genomeGenerate_opts", type=str, default="", help="options to pass through to STAR's genomeGenerate function")
args = parser.parse_args()
@@ -108,7 +113,8 @@ def main():
else:
pipeliner.add_commands([Pipeliner.Command("java -jar " + PICARD_HOME + "/picard.jar" +
" CreateSequenceDictionary R=" + st_fa_path +
" O=" + dict_file,
" O=" + dict_file +
" VALIDATION_STRINGENCY=LENIENT ",
"picard_dict_st.ok")])
pipeliner.run()
@@ -120,9 +126,12 @@ def main():
" --runMode genomeGenerate" +
" --genomeDir star_genome_idx " +
" --genomeFastaFiles {} ".format(st_fa_path) +
" --genomeSAindexNbases 8 " + # as per A. Dobin
" --sjdbGTFfile {} ".format(st_gtf_path) +
" --sjdbOverhang {} ".format(args.sjdbOverhang) +
" --limitGenomeGenerateRAM {}".format(args.maxram) )
" --limitGenomeGenerateRAM {}".format(args.maxram) +
" {} ".format(args.STAR_genomeGenerate_opts)
)
pipeliner.add_commands([
Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
@@ -160,6 +169,7 @@ def main():
" AddOrReplaceReadGroups " +
"I=Aligned.sortedByCoord.out.bam " +
"O=rg_added_sorted.bam " +
" VALIDATION_STRINGENCY=SILENT " +
"SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample",
"add_read_groups.ok"),
@@ -169,10 +179,22 @@ def main():
"CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
"mark_dups.ok"),
Pipeliner.Command("java -jar " + PICARD_HOME + "/picard.jar ValidateSamFile " +
"I=dedupped.bam " +
"IGNORE_WARNINGS=true " +
"MAX_OUTPUT=100000 " +
"IGNORE=MATE_NOT_FOUND " +
"O=dedupped.bam.validation",
"bam_validate.ok",
ignore_error=True),
Pipeliner.Command(UTILDIR + "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
"make_valid_dedupped_bam.ok"),
Pipeliner.Command("java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
"-T SplitNCigarReads -R " + st_fa_path +
" -I dedupped.bam -o splitNCigar.bam " +
" -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS",
" -I dedupped.valid.bam -o splitNCigar.bam " +
" -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS --validation_strictness LENIENT",
"splitNCigarReads.ok")
])
pipeliner.run()
......
#!/usr/bin/env perl
use strict;
use warnings;
my $usage = "\n\t\tusage: $0 target.bam errors.txt output.bam\n\n";
my $target_bam = $ARGV[0] or die $usage;
my $errors_file = $ARGV[1] or die $usage;
my $output_bam = $ARGV[2] or die $usage;
main: {
my %error_reads = &parse_error_reads($errors_file);
if (%error_reads) {
my $in_cmd = "samtools view -h $target_bam | ";
my $out_cmd = " | samtools view -Sb -o $output_bam";
open(my $in_fh, $in_cmd) or die "Error, cannot open cmd: $in_cmd";
open(my $out_fh, $out_cmd) or die "Error, cannot open cmd: $out_cmd";
while (my $line = <$in_fh>) {
if ($line =~ /^\@/) {
# header line
print $out_fh $line;
}
else {
my @x = split(/\t/, $line);
unless ($error_reads{$x[0]}) {
print $out_fh $line;
}
}
}
close $in_fh;
close $out_fh;
&process_cmd("samtools index $output_bam");
my $output_index = $output_bam;
$output_index =~ s/\.bam$/\.bai/;
rename("$output_bam.bai", $output_index);
}
else {
&process_cmd("ln -s $target_bam $output_bam");
###################
# include the index
$target_bam =~ s/\.bam$/\.bai/;
$output_bam =~ s/\.bam$/\.bai/;
&process_cmd("ln -s $target_bam $output_bam");
}
exit(0);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
die "Error, CMD: $cmd died with ret $ret";
}
return;
}
####
sub parse_error_reads {
my ($file) = @_;
# examples
#ERROR: Record 6714568, Read name NS500412:143:HJWV5AFXX:1:21307:19673:9975, Read CIGAR M operator maps off end of reference
#ERROR: Record 6714569, Read name NS500412:143:HJWV5AFXX:3:21405:18461:10525, Read CIGAR M operator maps off end of reference
#ERROR: Record 6714570, Read name NS500412:143:HJWV5AFXX:1:21307:21758:18659, Alignment start (262144) must be <= reference sequence length (241) on reference TRINITY_DN11029_c1_g1
#ERROR: Record 6714570, Read name NS500412:143:HJWV5AFXX:1:21307:21758:18659, Read CIGAR M operator maps off end of reference
#ERROR: Record 6714571, Read name NS500412:143:HJWV5AFXX:2:11202:8325:4497, Alignment start (262144) must be <= reference sequence length (241) on reference TRINITY_DN11029_c1_g1
my %error_reads;
open(my $fh, $file) or die "Error, cannot open file: $file";
while (<$fh>) {
if (/ERROR: Record (\d+), Read name (\S+), /) {
my $read_name = $2;
$error_reads{$read_name} = 1;
}
}
close $fh;
return(%error_reads);
}
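The parse_error_reads() sub above keys entirely off the `Read name` field of Picard's ValidateSamFile report, as in the sample ERROR lines quoted in its comments. A minimal Python sketch of the same extraction (a hypothetical standalone helper, not part of the repo):

#!/usr/bin/env python
# Hypothetical sketch mirroring parse_error_reads() in clean_bam.pl:
# collect the read names flagged in a Picard ValidateSamFile report.
import re

def parse_error_reads(validation_file):
    """Return the set of read names appearing in ERROR lines."""
    error_reads = set()
    with open(validation_file) as fh:
        for line in fh:
            m = re.search(r"ERROR: Record \d+, Read name (\S+), ", line)
            if m:
                error_reads.add(m.group(1))
    return error_reads

# e.g.: error_reads = parse_error_reads("dedupped.bam.validation")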
@@ -11,6 +11,8 @@ use File::Basename;
my $CPU = 2;
+my $top_genes_plot = 50;
my $usage = <<__EOUSAGE__;
#################################################################
@@ -33,6 +35,8 @@ my $usage = <<__EOUSAGE__;
#
# --CPU <int> default: $CPU
#
+# --top_genes_plot <int> default: $top_genes_plot
+#
################################################################
@@ -59,6 +63,8 @@ my $SS_lib_type = "";
'CPU=i' => \$CPU,
'aligner=s' => \$aligner,
'SS_lib_type=s' => \$SS_lib_type,
+'top_genes_plot=i' => \$top_genes_plot,
);
@@ -194,7 +200,6 @@ main: {
print $ofh "library(DEXSeq)\n";
print $ofh "samples_info = read.table(\"$samples_table_file\", header=T, row.names=1)\n";
print $ofh "dxd = DEXSeqDataSetFromHTSeq(as.vector(samples_info\$counts_filename), sampleData=samples_info, design = ~ sample + exon + condition:exon, flattenedfile=\"$genes_gtf_file.dexseq.gff\")\n";
print $ofh "pdf(\"$out_prefix.pdf\")\n";
print $ofh "dxd = estimateSizeFactors( dxd )\n";
print $ofh "dxd = estimateDispersions( dxd )\n";
print $ofh "plotDispEsts( dxd )\n";
@@ -202,14 +207,25 @@ main: {
print $ofh "dxd = estimateExonFoldChanges( dxd, fitExpToVar=\"condition\")\n";
print $ofh "dxr1 = DEXSeqResults( dxd )\n";
print $ofh "dxr1.sorted = dxr1[order(dxr1\$padj),]\n";
print $ofh "save(list = ls(all=TRUE), file = \"$out_prefix.Rdata\")\n";
print $ofh "write.table(dxr1.sorted, file=\"$out_prefix.results.dat\", quote=F, sep=\"\t\")\n";
print $ofh "plotMA( dxr1, cex=0.8 )\n";
print $ofh "pdf(\"$out_prefix.pdf\")\n";
print $ofh "top_genes = unique(dxr1.sorted\$groupID[dxr1.sorted\$padj < 0.1 & ! is.na(dxr1.sorted\$padj)])\n";
print $ofh "top_genes = top_genes[1:min($top_genes_plot, length(top_genes))]\n";
print $ofh "message(\"Top $top_genes_plot genes: (\", paste(top_genes, collapse=','), \")\")\n";
print $ofh "for (gene in top_genes) { \n"
. " plotDEXSeq( dxr1 , gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 , expression=FALSE, norCounts=TRUE, splicing=TRUE, displayTranscripts=TRUE)\n"
. "}\n";
#print $ofh "plotMA( dxr1, cex=0.8 )\n";
close $ofh;
}
$cmd = "R --vanilla -q < $dexseq_rscript";
$pipeliner->add_commands(new Command($cmd, "$analysis_token.dexseq.ok"));
$pipeliner->add_commands(new Command($cmd, "$analysis_token.dexseq.$top_genes_plot.ok"));
$pipeliner->run();
@@ -226,13 +242,26 @@ sub parse_samples_file {
open (my $fh, $samples_file) or die "Error, cannot open file: $samples_file";
while (<$fh>) {
if (/^\#/) { next; }
unless (/\w/) { next; }
chomp;
-my @x = split(/\t/);
+my $line = $_;
+my @x = split(/\s+/);
my $condition = $x[0];
my $replicate = $x[1];
my $left_fq = $x[2];
my $right_fq = $x[3]; # only if paired-end
+unless ($left_fq) {
+    die "Error, cannot find required fields in sample file line: $line";
+}
+unless (-s $left_fq) {
+    die "Error, cannot locate left_fq: $left_fq as specified in samples file line: $line";
+}
+if ($right_fq && ! -s $right_fq) {
+    die "Error, cannot locate right_fq: $right_fq as specified in samples file line: $line";
+}
+if ($seen{$replicate}) {
+    die "Error, replicate name: [$replicate] must be unique among all replicate names. Please update your samples file";
+}
......
#!/usr/bin/env perl
use strict;
use warnings;
use lib($ENV{EUK_MODULES});
use Fasta_reader;
my $usage = "\n\n\tusage: $0 trin_before.fa trin_after.fa\n\n";
my $trin_before_file = $ARGV[0] or die $usage;
my $trin_after_file = $ARGV[1] or die $usage;
main: {
my %trin_seqs_before = &parse_trin_fa($trin_before_file);
my %trin_seqs_after = &parse_trin_fa($trin_after_file);
foreach my $acc (keys %trin_seqs_after) {
my $seq_before = $trin_seqs_before{$acc} or die "Error, no before transcript for [$acc]";
my $seq_after = $trin_seqs_after{$acc} or die "Error, no after transcript for [$acc]";
if ($seq_before eq $seq_after) {
print "* $acc identical\n";
}
else {
my $filename = $acc;
$filename =~ s/\W/_/g;
open(my $ofh, ">$filename") or die $!;
print $ofh ">before\n$seq_before\n"
. ">after\n$seq_after\n";
close $ofh;
my $ret = system("clustalw2 $filename");
if ($ret) {
die $!;
}
}
}
exit(0);
}
####
sub parse_trin_fa {
my ($file) = @_;
my $fasta_reader = new Fasta_reader($file);
my %seqs = $fasta_reader->retrieve_all_seqs_hash();
return(%seqs);
}
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import TGraph
import TNode
import Node_alignment
import Compact_graph_whole
from Topological_sort import Topological_sort
import TGLOBALS
logger = logging.getLogger(__name__)
class Compact_graph_partial(Compact_graph_whole.Compact_graph_whole):
def __init__(self):
pass
def compact_graph(self, tgraph):
compacted_flag = True
round = 0
while compacted_flag:
round += 1
logger.debug("\n\t### Partial graph compaction, Round: {}".format(round))
tgraph.clear_touch_settings() # start fresh
compacted_flag = False # reset for this round
sorted_nodes = Topological_sort.topologically_sort(tgraph.get_all_nodes())
if TGLOBALS.DEBUG:
dot_filename = "ladeda.BEGIN.compacted_partial.round_{}.dot".format(round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
for node in sorted_nodes:
if node.get_touched_val() > 0:
continue
logger.debug("\n\n/////////// R[{}] Exploring compaction from node: {}\n\n".format(round, node.toString()))
# try compact upward
prev_nodes = node.get_prev_nodes()
if len(prev_nodes) > 1 and self.untouched(prev_nodes) and self.all_have_lower_topological_orderings(node, prev_nodes):
if self.compact_upward(prev_nodes):
compacted_flag = True
next_nodes = node.get_next_nodes()
if len(next_nodes) > 1 and self.untouched(next_nodes) and self.all_have_higher_topological_orderings(node, next_nodes):
if self.compact_downward(next_nodes):
compacted_flag = True
if compacted_flag:
if TGLOBALS.DEBUG:
dot_filename = "ladeda.END.compacted_partial.round_{}.dot".format(round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
self.compact_unbranched(tgraph)
####################
## Upward Compaction
def compact_upward(self, prev_nodes):
logger.debug("compact_upward: Partial {}".format(prev_nodes))
prev_nodes = list(prev_nodes)
for i in range(0, len(prev_nodes)-1):
for j in range(i+1, len(prev_nodes)):
if self.compact_upward_node_pair(prev_nodes[i], prev_nodes[j]):
return True
return False
def compact_upward_node_pair(self, node_A, node_B):
logger.debug("compact_upward_node_pair:\nnode_A: {}\nnode_B: {}".format(node_A.toString(), node_B.toString()))
# ensure they're on parallel paths in the graph.
if (node_A.is_ancestral(node_B) or node_B.is_ancestral(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
# reverse the strings and check number of mismatches
seqA_rev = seqA[::-1]
seqB_rev = seqB[::-1]
num_matches = 0
for i in range(0, shorter_len):
if seqA_rev[i] == seqB_rev[i]:
num_matches += 1
else:
break
if num_matches == 0:
logger.debug("no matching suffix between {} and {}".format(node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
shared_seq = seqA[-1*num_matches:]
uniqA_seq = seqA[0:(len(seqA)-num_matches)]
uniqB_seq = seqB[0:(len(seqB)-num_matches)]
logger.debug("Suffices:\nseqA:{}\nSeqB:{}\nShared:{}".format(seqA, seqB, shared_seq))
assert len(seqA) == len(shared_seq) + len(uniqA_seq)
assert len(seqB) == len(shared_seq) + len(uniqB_seq)
##### operations needed:
#
# A A
# \ \
# X --> C --X
# / /
# B B
#
# - create C and set to partial shared sequence
# - add B and A transcripts to C
# - A and B link to C only
# - C recapitulates all outgoing edges from A and B
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
combined_transcripts = node_A.get_transcripts().union(node_B.get_transcripts())
combined_loc_id = "Upart:" + node_A.get_loc_id() + "," + node_B.get_loc_id()
node_C = tgraph.get_node(combined_transcripts, combined_loc_id, shared_seq)
node_A.set_seq(uniqA_seq)
node_B.set_seq(uniqB_seq)
# reset next nodes from A,B to C
all_next_nodes = node_A.get_next_nodes().union(node_B.get_next_nodes())
tgraph.prune_edges([node_A], node_A.get_next_nodes())
tgraph.prune_edges([node_B], node_B.get_next_nodes())
tgraph.add_edges([node_C], all_next_nodes)
node_C.touch()
if len(uniqA_seq) == 0:
# no longer need this now
tgraph.add_edges(node_A.get_prev_nodes(), [node_C])
tgraph.prune_node(node_A)
else:
tgraph.add_edges([node_A], [node_C])
node_A.touch()
if len(uniqB_seq) == 0:
# no longer need it
tgraph.add_edges(node_B.get_prev_nodes(), [node_C])
tgraph.prune_node(node_B)
else:
tgraph.add_edges([node_B], [node_C])
node_B.touch()
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("node_C after mods: {}".format(node_C.toString()))
logger.debug("\n\n\t*** partially UP-compacted nodes: {} and {} + {} ***".format(node_A, node_B, node_C))
return True
######################
## Downward compaction
def compact_downward(self, next_nodes):
logger.debug("compact_downward: Partial {}".format(next_nodes))
next_nodes = list(next_nodes)
for i in range(0, len(next_nodes)-1):
for j in range(i+1, len(next_nodes)):
if self.compact_downward_node_pair(next_nodes[i], next_nodes[j]):
return True
return False
def compact_downward_node_pair(self, node_A, node_B):
logger.debug("compact_downward_node_pair:\nnode_A: {}\nnode_B: {}".format(node_A.toString(), node_B.toString()))
# ensure they're on parallel paths in the graph.
if (node_A.is_descendant(node_B) or node_B.is_descendant(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
num_matches = 0
for i in range(0, shorter_len):
if seqA[i] == seqB[i]:
num_matches += 1
else:
break
if num_matches == 0:
logger.debug("-no matching prefix match between {} and {}".format(node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
shared_seq = seqA[0:num_matches]
uniqA_seq = seqA[num_matches:]
uniqB_seq = seqB[num_matches:]
logger.debug("Prefixes:\nseqA:{}\nSeqB:{}\nShared:{}".format(seqA, seqB, shared_seq))
##### operations needed:
#
# A A
# / /
# -- X --> -- X -- C
# \ \
# B B
#
# - find shared prefix of A,B
# - create C for shared prefix, shorten A,B
# - link A,B down from C
# - C takes on all prev nodes from A,B
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
combined_transcripts = node_A.get_transcripts().union(node_B.get_transcripts())
combined_loc_id = "Dpart:" + node_A.get_loc_id() + "," + node_B.get_loc_id()
node_C = tgraph.get_node(combined_transcripts, combined_loc_id, shared_seq)
node_A.set_seq(uniqA_seq)
node_B.set_seq(uniqB_seq)
# reset prev nodes from A,B to C
all_prev_nodes = node_A.get_prev_nodes().union(node_B.get_prev_nodes())
tgraph.prune_edges(node_A.get_prev_nodes(), [node_A])
tgraph.prune_edges(node_B.get_prev_nodes(), [node_B])
tgraph.add_edges(all_prev_nodes, [node_C])
node_C.touch()
if len(uniqA_seq) == 0:
# no longer need this now
tgraph.add_edges([node_C], node_A.get_next_nodes())
tgraph.prune_node(node_A)
else:
tgraph.add_edges([node_C], [node_A])
node_A.touch()
if len(uniqB_seq) == 0:
# no longer need it
tgraph.add_edges([node_C], node_B.get_next_nodes())
tgraph.prune_node(node_B)
else:
tgraph.add_edges([node_C], [node_B])
node_B.touch()
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("node_C after mods: {}".format(node_C.toString()))
logger.debug("\n\n\t*** partially compacted nodes: {} and {} + {} ***".format(node_A, node_B, node_C))
return True
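The partial compactor above merges two parallel nodes only where they share a common suffix (upward) or prefix (downward), measured by the character-scan loops in compact_upward_node_pair() and compact_downward_node_pair(). A standalone sketch of that measurement with a worked example (hypothetical helpers, assuming plain string sequences):

# Hypothetical helpers mirroring the suffix/prefix scans used by the
# partial compactor above; not part of the module itself.
def shared_suffix_len(seq_a, seq_b):
    """Number of identical trailing characters, scanning from the ends."""
    n = 0
    for ca, cb in zip(reversed(seq_a), reversed(seq_b)):
        if ca != cb:
            break
        n += 1
    return n

def shared_prefix_len(seq_a, seq_b):
    """Number of identical leading characters."""
    n = 0
    for ca, cb in zip(seq_a, seq_b):
        if ca != cb:
            break
        n += 1
    return n

# e.g. shared_suffix_len("GGATTC", "TTC") == 3, so an upward merge would
# split "GGATTC" into a unique "GGA" node plus a shared "TTC" node C,
# exactly as in the A/B/X -> C diagram in compact_upward_node_pair().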
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import TGraph
import TNode
import Node_alignment
from Topological_sort import Topological_sort
import TGLOBALS
logger = logging.getLogger(__name__)
MAX_MM_RATE = 0.05
class Compact_graph_whole:
def __init__(self):
pass
def compact_unbranched(self, tgraph):
for node in tgraph.get_all_nodes():
if node.is_dead():
continue
prev_nodes = list(node.get_prev_nodes())
if len(prev_nodes) == 1:
# merge them
prev_node = prev_nodes[0]
if len(prev_node.get_next_nodes()) != 1:
# prev node is downward branched... no dice
continue
logger.debug("linear compaction of nodes {} and {}".format(prev_node, node))
node.add_transcripts(prev_node.get_transcripts())
node.set_seq(prev_node.get_seq() + node.get_seq())
tgraph.add_edges(prev_node.get_prev_nodes(), [node])
tgraph.prune_node(prev_node)
def compact_graph(self, tgraph, num_allowed_variants):
compacted_flag = True
round = 0
while compacted_flag:
round += 1
logger.debug("\n\t### Num allowed variants: {}, Round: {}".format(num_allowed_variants, round))
tgraph.clear_touch_settings() # start fresh
compacted_flag = False # reset for this round
sorted_nodes = Topological_sort.topologically_sort(tgraph.get_all_nodes())
if TGLOBALS.DEBUG:
dot_filename = "ladeda.compacted_whole.BEGIN.num_allowed_{}.round_{}.dot".format(num_allowed_variants, round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
for node in sorted_nodes:
if node.get_touched_val() > 0:
continue
logger.debug("\n\n///////// R[{}] Exploring compaction at node: {}\n\n".format(round, node.toString()))
# try compact upward
prev_nodes = node.get_prev_nodes()
if len(prev_nodes) > 1 and self.untouched(prev_nodes) and self.all_have_lower_topological_orderings(node, prev_nodes):
if self.compact_upward(prev_nodes, num_allowed_variants):
compacted_flag = True
next_nodes = node.get_next_nodes()
if len(next_nodes) > 1 and self.untouched(next_nodes) and self.all_have_higher_topological_orderings(node, next_nodes):
if self.compact_downward(next_nodes, num_allowed_variants):
compacted_flag = True
if compacted_flag:
if TGLOBALS.DEBUG:
dot_filename = "ladeda.compacted_whole.END.num_allowed_{}.round_{}.dot".format(num_allowed_variants, round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
self.compact_unbranched(tgraph)
def untouched(self, node_list):
for node in node_list:
if node.get_touched_val() > 1:
return False
return True
def all_have_lower_topological_orderings(self, node, other_nodes_list):
topo_order = node.get_topological_order()
if topo_order < 0:
raise RuntimeError("Error, must run topological ordering first")
for other_node in other_nodes_list:
if other_node.get_topological_order() > topo_order:
return False
return True
def all_have_higher_topological_orderings(self, node, other_nodes_list):
topo_order = node.get_topological_order()
if topo_order < 0:
raise RuntimeError("Error, must run topological ordering first")
for other_node in other_nodes_list:
if other_node.get_topological_order() < topo_order:
return False
return True
####################
## Upward Compaction
def compact_upward(self, prev_nodes, num_allowed_variants):
logger.debug("compact_upward: {} max_allowed_variants: {}".format(prev_nodes, num_allowed_variants))
prev_nodes = list(prev_nodes)
for i in range(0, len(prev_nodes)-1):
for j in range(i+1, len(prev_nodes)):
if self.compact_upward_node_pair(prev_nodes[i], prev_nodes[j], num_allowed_variants):
return True
return False
def compact_upward_node_pair(self, node_A, node_B, num_allowed_variants):
logger.debug("compact_upward_node_pair({}, {}, num_allowed_variants: {}".format(node_A, node_B, num_allowed_variants))
# ensure they're on parallel paths in the graph.
if (node_A.is_ancestral(node_B) or node_B.is_ancestral(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
logger.debug("-not ancestral: {}, {}".format(node_A, node_B))
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
# reverse the strings and check number of mismatches
seqA_rev = seqA[::-1]
seqB_rev = seqB[::-1]
num_mm = 0
for i in range(0, shorter_len):
if seqA_rev[i] != seqB_rev[i]:
num_mm += 1
if num_mm > num_allowed_variants:
logger.debug("num_mm: {} exceeds {} for node pair: {} and {}".format(num_mm, num_allowed_variants, node_A, node_B))
return False
if num_mm / shorter_len > MAX_MM_RATE:
logger.debug("num_mm: {} / len: {} exceeds max mm rate: {} for node pair: {} and {}".format(num_mm, shorter_len,
MAX_MM_RATE, node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
##### operations needed:
#
# A
# \
# -- X
# /
# B
#
# -add B transcripts to A
# -adjust all B-downward edges to A
# -if B is longer than A, set upward to A and trim B part to unique region
# otherwise, remove B altogether.
#
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
node_A.add_transcripts(node_B.get_transcripts())
tgraph.add_edges( [node_A], node_B.get_next_nodes() )
tgraph.prune_edges( [node_B], node_B.get_next_nodes() )
node_A.touch()
node_B.touch()
if len(seqB) > shorter_len:
delta = len(seqB) - shorter_len
seqB = seqB[0:delta]
node_B.set_seq(seqB)
tgraph.add_edges([node_B], [node_A])
else:
# remove node_B prev edges and attach them to A
tgraph.add_edges(node_B.get_prev_nodes(), [node_A])
# remove B altogether
tgraph.prune_node(node_B)
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("\n\n\t*** compacted nodes: {} and {} ***".format(node_A, node_B))
return True
######################
## Downward compaction
def compact_downward(self, next_nodes, num_allowed_variants):
logger.debug("compact_downward: {} max_allowed_variants: {}".format(next_nodes, num_allowed_variants))
next_nodes = list(next_nodes)
for i in range(0, len(next_nodes)-1):
for j in range(i+1, len(next_nodes)):
if self.compact_downward_node_pair(next_nodes[i], next_nodes[j], num_allowed_variants):
return True
return False
def compact_downward_node_pair(self, node_A, node_B, num_allowed_variants):
# ensure they're on parallel paths in the graph.
if (node_A.is_descendant(node_B) or node_B.is_descendant(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
num_mm = 0
for i in range(0, shorter_len):
if seqA[i] != seqB[i]:
num_mm += 1
if num_mm > num_allowed_variants:
logger.debug("num_mm: {} exceeds {} for node pair: {} and {}".format(num_mm, num_allowed_variants, node_A, node_B))
return False
if num_mm / shorter_len > MAX_MM_RATE:
logger.debug("num_mm: {} / len: {} exceeds max mm rate: {} for node pair: {} and {}".format(num_mm, shorter_len,
MAX_MM_RATE, node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
##### operations needed:
#
# A
# /
# -- X
# \
# B
#
# -add B transcripts to A
# -adjust all B-downward edges to A
# -if B is longer than A, set downward to A and trim B part to unique region
# otherwise, remove B altogether.
#
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
node_A.add_transcripts(node_B.get_transcripts())
tgraph.add_edges(node_B.get_prev_nodes(), [node_A])
tgraph.prune_edges(node_B.get_prev_nodes(), [node_B])
node_A.touch()
node_B.touch()
if len(seqB) > shorter_len:
seqB = seqB[shorter_len:]
node_B.set_seq(seqB)
tgraph.add_edges([node_A], [node_B])
else:
# remove node_B prev edges and attach them to A
tgraph.add_edges([node_A], node_B.get_next_nodes())
# remove B altogether
tgraph.prune_node(node_B)
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("\n\n\t*** compacted nodes: {} and {} ***".format(node_A, node_B))
return True
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
logger = logging.getLogger(__name__)
class DP_matrix:
"""
defines the dynamic programming matrix for the node multiple alignments
"""
@staticmethod
def build_DP_matrix(num_rows, num_cols):
dp_matrix = list()
for i in range(0, num_rows+1):
row = []
for j in range(0, num_cols+1):
struct = { 'i' : i,
'j' : j,
'bt' : -1,
'score' : 0,
}
row.append(struct)
dp_matrix.append(row)
return dp_matrix
@staticmethod
def toString(dp_matrix):
nrow = len(dp_matrix)
ncol = len(dp_matrix[0])
logger.debug("Matrix is {} X {}".format(nrow, ncol))
ret_text = ""
for row in dp_matrix:
str_row = [ str(x['score']) for x in row ]
ret_text += "\t".join(str_row) + "\n"
return ret_text
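A quick usage sketch for the matrix factory above (assuming DP_matrix.py is importable under that name): build_DP_matrix(num_rows, num_cols) returns a (num_rows+1) x (num_cols+1) grid of cell dicts, with row 0 and column 0 serving as alignment boundary cells.

import DP_matrix

dp = DP_matrix.DP_matrix.build_DP_matrix(3, 4)
print(len(dp), len(dp[0]))               # 4 5 -> (num_rows+1) x (num_cols+1)
dp[1][1]['score'] = 7                    # each cell is a dict: {'i', 'j', 'bt', 'score'}
print(DP_matrix.DP_matrix.toString(dp))  # tab-delimited dump of the scores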
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TGraph
import TNode
import Node_path
import Node_alignment
from GraphCycleException import GraphCycleException
import Topological_sort
import DP_matrix
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
class Gene_splice_modeler:
"""
Builds supertranscripts.
object instance members:
gene_id : str
alignments : list of Node_alignment objects
"""
def __init__(self, gene_id, node_path_obj_list):
"""
initialize alignments list with simple single 'alignment' objects with
each path as an individual alignment with just its path nodes.
params:
gene_id : str
node_path_obj_list : list of Node_path objects, each Node_path corresponding to an individual Trinity isoform
"""
self.gene_id = gene_id
self.alignments = list()
logger.debug("Gene_splice_modeler inputs: {}".format(node_path_obj_list))
for node_path_obj in node_path_obj_list:
transcript_name = node_path_obj.get_transcript_name()
alignment_obj = Node_alignment.Node_alignment.get_single_seq_node_alignment(node_path_obj)
self.alignments.append(alignment_obj)
def get_gene_id(self):
return self.gene_id
def build_splice_model(self):
"""
method to construct the supertranscript.
Tries 2 approaches:
a. If there is no obvious repetitive node structure and the graph forms a DAG,
we build a splice graph and topologically sort the nodes.
b. If there is repetitive structure, we fall back on a multiple alignment-based method to
organize the relationships among nodes across isoforms; the multiple alignment then yields
the linear ordering for the supertranscript.
"""
if not self.alignment_contains_repeat_node():
# no obvious cycles
try:
return self.topological_order_splice_model()
except GraphCycleException:
# have a more complex cycle here...
# try again w/ mult align approach.
return self.multiple_alignment_splice_model()
else:
return self.multiple_alignment_splice_model()
def alignment_contains_repeat_node(self):
for alignment in self.alignments:
loc_ids = set()
for i in range(0, alignment.width()):
node_obj = alignment.get_representative_column_node(i)
loc_id = node_obj.get_loc_id()
if loc_id in loc_ids:
return True
loc_ids.add(loc_id)
return False
def topological_order_splice_model(self):
"""
Build supertranscript using simpler topological sorting of the nodes.
"""
logger.debug("\tusing topological sort method.\n");
gene_id = self.get_gene_id()
## make a generic graph.
graph = TGraph.TGraph(gene_id)
for alignment in self.alignments:
logger.debug("topological_order_splice_model, input alignment: " + str(alignment))
node_list = alignment.get_aligned_nodes()[0] # should be unaligned here, so just ordered path list.
transcript_name = alignment.get_transcript_names()[0]
logger.debug("topological_order_splice_model, node list: " + str(node_list))
for i in range(0, len(node_list)):
node_obj = node_list[i]
loc_id = node_obj.get_loc_id()
generic_node = graph.get_node(transcript_name, loc_id, node_obj.get_seq()) # rely on Node class caching system
logger.debug("generic node: " + str(generic_node))
if i > 0:
# set prev node info
prev_node_obj = node_list[i-1]
prev_generic_node = graph.get_node(transcript_name, prev_node_obj.get_loc_id(), prev_node_obj.get_seq())
generic_node.add_prev_node(prev_generic_node)
if i < len(node_list) - 1:
next_node_obj = node_list[i+1]
next_generic_node = graph.get_node(transcript_name, next_node_obj.get_loc_id(), next_node_obj.get_seq())
generic_node.add_next_node(next_generic_node)
logger.debug("Before sorting nodes: " + str(graph))
topologically_sorted_nodes = Topological_sort.Topological_sort.topologically_sort(graph.get_all_nodes())
logger.debug("Topologically sorted nodes: " + str(topologically_sorted_nodes))
# index loc node ids
aligned_loc_id_pos = dict()
for i in range(0, len(topologically_sorted_nodes)):
loc_id = topologically_sorted_nodes[i].get_loc_id()
aligned_loc_id_pos[loc_id] = i
new_alignments = list()
transcript_ids = list()
for alignment in self.alignments:
transcript_ids.append(alignment.get_transcript_names()[0]) # really should only be one here.
new_alignment = [None for i in topologically_sorted_nodes]
for node in alignment.get_aligned_nodes()[0]:
loc_id = node.get_loc_id()
new_idx = aligned_loc_id_pos[loc_id]
new_alignment[new_idx] = node
new_alignments.append(new_alignment)
splice_graph_model = Node_alignment.Node_alignment(gene_id, transcript_ids, new_alignments)
logger.debug("Splice graph model: " + str(splice_graph_model))
return splice_graph_model
def multiple_alignment_splice_model(self):
"""
Multiple alignment algorithm for dealing with repeat nodes:
For each best matching pair of transcripts (or aligned transcripts),
perform alignment, and replace aligned pair with a single alignment object.
"""
logger.debug("\tusing mult alignment method.\n");
alignments = self.alignments
if len(alignments) == 1:
# no alignment is necessary.
return alignments[0]
# determine initial path similarity
similarity_matrix = Gene_splice_modeler.compute_similarity_matrix(self.alignments)
logger.debug("Similarity matrix:\n" + str(similarity_matrix))
## build multiple alignment in a hierarchical way
while len(similarity_matrix) > 1:
# set diag to -1 to avoid any zero ties w/ self-vals
for i in range(0,len(alignments)):
similarity_matrix[ i ][ i ] = -1
## find best pair
best_pair_idx = int(numpy.argmax(similarity_matrix))
num_alignments = len(similarity_matrix)
best_pair_idx_1 = int(best_pair_idx / num_alignments)
best_pair_idx_2 = best_pair_idx % num_alignments
## merge pair into single alignment
align_a = alignments[ best_pair_idx_1 ]
align_b = alignments[ best_pair_idx_2 ]
align_merged = Gene_splice_modeler.merge_alignments(align_a, align_b)
## recompute matrix
new_alignment_list = list()
for i in range(0, len(alignments)):
if i not in (best_pair_idx_1, best_pair_idx_2):
new_alignment_list.append(alignments[ i ])
new_alignment_list.append(align_merged)
alignments = new_alignment_list
logger.debug("\nUpdated alignments:\n" + str(alignments))
similarity_matrix = Gene_splice_modeler.compute_similarity_matrix(alignments)
logger.debug("Similarity matrix:\n" + str(similarity_matrix))
if len(alignments) > 1:
raise RuntimeError("Error, should only have one alignment but have {} alignments after merge".format(len(alignments)))
return alignments[0]
@staticmethod
def compute_similarity_matrix(alignments_list):
"""
similarity matrix indicates number of shared nodes between each pair of isoforms.
"""
num_alignments = len(alignments_list)
sim_matrix = numpy.zeros( (num_alignments, num_alignments), dtype='int_' )
for i in range(0, num_alignments-1):
align_i = alignments_list[i]
for j in range(i+1, num_alignments):
align_j = alignments_list[j]
common_nodes = Node_alignment.Node_alignment.compute_number_common_nodes(align_i, align_j)
num_common_nodes = len(common_nodes)
sim_matrix[ i ][ j ] = num_common_nodes
return sim_matrix
@staticmethod
def merge_alignments(align_a, align_b):
"""
Computes a mismatch-free multiple alignment (just matches and gaps) between two Node_alignment objects.
Returns a single Node_alignment object containing the merged contents of align_a and align_b.
"""
logger.debug("Merging alignments {} and {}".format(align_a, align_b))
## ensure the transcripts are disjoint
transcript_names_align_A = set(align_a.get_transcript_names())
transcript_names_align_B = set(align_b.get_transcript_names())
if not set.isdisjoint(transcript_names_align_A, transcript_names_align_B):
raise RuntimeError("Error, transcripts in alignments to merge are not disjoint: {} and {}".format(transcript_names_align_A, transcript_names_align_B))
width_a = align_a.width()
width_b = align_b.width()
# do global alignments w/o penalizing end gaps
dp_matrix = DP_matrix.DP_matrix.build_DP_matrix(width_a, width_b)
# put align B across top (cols) and align A at side (row)
# init the matrix zero rows
for i in range(1, width_a+1):
dp_matrix[ i ][ 0 ]['bt'] = 'DEL_B' # UP
for j in range(1, width_b+1):
dp_matrix[ 0 ][ j ]['bt'] = 'DEL_A' # LEFT
# score the DP matrix
for i in range(1, width_a+1):
for j in range(1, width_b+1):
score_cell_match = Gene_splice_modeler.get_match_score(align_a, i-1, align_b, j-1) # score matrix is 1-based, align is 0-based
score_diag = dp_matrix[ i-1 ][ j-1 ]['score'] + score_cell_match
score_del_a = dp_matrix[ i ][ j-1 ]['score']
score_del_b = dp_matrix[ i-1 ][ j ]['score']
if score_cell_match > 0 and score_diag >= score_del_a and score_diag >= score_del_b:
dp_matrix[ i ][ j ]['score'] = score_diag
dp_matrix[ i ][ j ]['bt'] = 'DIAG'
elif score_del_a >= score_del_b:
dp_matrix[ i ][ j ]['score'] = score_del_a
dp_matrix[ i ][ j ]['bt'] = 'DEL_A'
else:
dp_matrix[ i ][ j ]['score'] = score_del_b
dp_matrix[ i ][ j ]['bt'] = 'DEL_B'
#logger.debug("DP_matrix:\n" + DP_matrix.toString(dp_matrix))
"""
# get max score
max_score = 0
max_i = -1
max_j = -1
for i in range(0,width_a+1):
score = dp_matrix[ i ][ width_b ]['score']
if score > max_score:
max_score = score
max_i = i
max_j = width_b
for j in range(0, width_b+1):
score = dp_matrix[ width_a ][ j ]['score']
if score > max_score:
max_score = score
max_i = width_a
max_j = j
logger.info("found max score {} at position: ({},{})".format(max_score, max_i, max_j))
"""
# keep as global alignment
max_i = width_a
max_j = width_b
# backtrack
i = max_i
j = max_j
all_merged_alignment_nodes_list = list()
while i > 0 or j > 0:
score_struct = dp_matrix[ i ][ j ]
nodes_align_a = align_a.get_node_LIST_at_column_pos(i-1) # again, remember align has zero-based coords, whereas dp_matrix is 1-based
nodes_align_b = align_b.get_node_LIST_at_column_pos(j-1)
align_nodes = list()
bt_dir = score_struct['bt']
#logger.debug("backtrack-dir: " + bt_dir)
if bt_dir == 'DIAG':
i -= 1
j -= 1
align_nodes = nodes_align_a + nodes_align_b
elif bt_dir == 'DEL_B': # UP
i -= 1
align_nodes += nodes_align_a
for x in range(0,len(nodes_align_b)):
align_nodes.append(None)
elif bt_dir == 'DEL_A': # LEFT
j -= 1
for x in range(0,len(nodes_align_a)):
align_nodes.append(None)
align_nodes += nodes_align_b
else:
raise RuntimeError("bt: ({},{}), bt_dir not defined".format(i,j))
all_merged_alignment_nodes_list.append(align_nodes)
all_merged_alignment_nodes_list.reverse()
logger.debug("Merged alignment nodes list: " + str(all_merged_alignment_nodes_list) )
# prep merged alignment obj
merged_transcript_name_list = align_a.get_transcript_names() + align_b.get_transcript_names()
node_obj_matrix = list()
# iterate through each node list, reorganize into a matrix
for i in range(0,len(merged_transcript_name_list)):
row = list()
for node_obj_list in all_merged_alignment_nodes_list:
row.append(node_obj_list[i])
node_obj_matrix.append(row)
logger.debug("merged alignment node matrix:\n" + str(node_obj_matrix))
merged_alignment_obj = Node_alignment.Node_alignment(align_a.get_gene_id(), merged_transcript_name_list, node_obj_matrix)
logger.debug("merged alignment obj:\n" + str(merged_alignment_obj))
#sys.exit(1) # DEBUG
return merged_alignment_obj
@staticmethod
def get_match_score(align_a, idx_a, align_b, idx_b):
"""
determines whether the given column positions in the two alignments share a node identifier
"""
node_set_a = align_a.get_node_set_at_column_pos(idx_a)
node_set_b = align_b.get_node_set_at_column_pos(idx_b)
node_set_a = Node_alignment.Node_alignment.get_node_loc_ids(node_set_a)
node_set_b = Node_alignment.Node_alignment.get_node_loc_ids(node_set_b)
if (set.intersection(node_set_a, node_set_b)):
return 1 # match
else:
return 0 # no match
@staticmethod
def write_malign(gene_name, malign_dict, ofh, align_width=100):
"""
writes the multiply aligned isoform sequences to an output filehandle
"""
transcript_names = list(malign_dict.keys())  # list() so indexing works under Python 3 as well
alignment_length = len(malign_dict[ transcript_names[ 0 ] ])
align_start = 0
align_text = ""
while align_start < alignment_length:
for transcript_name in transcript_names:
align_region = malign_dict[ transcript_name ][ align_start : min(alignment_length, align_start + align_width) ]
align_text += transcript_name + "\t" + align_region + "\n"
align_text += "\n" # spacer between alignment blocks
align_start += align_width
ofh.write("// {}\n\n{}\n".format(gene_name, align_text))
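One subtle step in multiple_alignment_splice_model() above is decoding numpy.argmax, which returns an index into the flattened similarity matrix; integer division and modulus recover the (row, column) pair to merge. A small self-contained illustration:

# Sketch of the pair-selection step in multiple_alignment_splice_model():
# decode the flat argmax index back into (row, col).
import numpy

sim = numpy.array([[-1,  2,  0],
                   [ 2, -1,  5],
                   [ 0,  5, -1]])       # diagonal pre-set to -1, as above
flat_idx = int(numpy.argmax(sim))       # 5
n = len(sim)
best_i, best_j = flat_idx // n, flat_idx % n
print(best_i, best_j)                   # 1 2 -> merge alignments 1 and 2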
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
logger = logging.getLogger(__name__)
class GraphCycleException(Exception):
pass
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TNode
import TGraph
logger = logging.getLogger(__name__)
class Node_alignment:
"""
Object has two members:
transcript_names = [ transA,
transB,
transC,
...
]
aligned_nodes = [ [transA_node_1, transA_node_2, ... ],
[transB_node_1, transB_node_2, ... ],
[ None, transC_node_1, ... ],
]
Note, can have None at node positions to include gaps.
"""
GAP = None
def __init__(self, gene_id, transcript_name_list, node_obj_matrix):
self.gene_id = gene_id
self.transcript_names = transcript_name_list
self.aligned_nodes = node_obj_matrix
def get_gene_id(self):
return self.gene_id
def set_gene_id(self, gene_id):
self.gene_id = gene_id
def get_transcript_names(self):
# accessor
return self.transcript_names
def get_aligned_nodes(self):
# accessor
return self.aligned_nodes
@staticmethod
def get_single_seq_node_alignment(path_obj):
"""
Factory method:
constructs a Node_alignment object from a Node_path object
mostly just reshaping the info for use with the multiple alignment methods.
"""
node_list = list()
for node_obj in path_obj.get_path():
node_list.append(node_obj)
transcript_name = path_obj.get_transcript_name()
alignment_obj = Node_alignment(transcript_name, [transcript_name], [node_list])
return alignment_obj
@staticmethod
def compute_number_common_nodes(align_A, align_B):
"""
given two Node_alignment objects, counts the number of shared nodes
"""
node_set_a = Node_alignment.get_node_set(align_A)
node_set_b = Node_alignment.get_node_set(align_B)
node_set_a = Node_alignment.get_node_loc_ids(node_set_a)
node_set_b = Node_alignment.get_node_loc_ids(node_set_b)
common_nodes = set.intersection(node_set_a, node_set_b)
return common_nodes
@staticmethod
def get_node_loc_ids(node_set):
"""
private static method
gets the set of loc_ids among all nodes in the given node set
"""
loc_ids_set = set()
for node in node_set:
loc_id = node.get_loc_id()
loc_ids_set.add(loc_id)
return loc_ids_set
@staticmethod
def get_node_set(align_obj):
"""
extracts the set of unique Node objects from the Node_alignment object
"""
num_trans = len(align_obj)
alignment_width = align_obj.width()
node_set = set()
for align_num in range(0,num_trans):
for align_pos in range(0,alignment_width):
node_obj = align_obj.aligned_nodes[ align_num ][ align_pos ]
if node_obj is not None:
node_set.add(node_obj)
return node_set
def get_node_set_at_column_pos(self, col_pos):
"""
At a given column of the Node_alignment, extracts the set of unique nodes
"""
# FIXME: since we're not dealing with mismatched nodes, there really should only be one node here
# that's shared among the different alignments
# Need refactoring across the next few methods as well for the same reason.
node_objs = set()
for i in range(0, len(self)):
node_obj = self.aligned_nodes[ i ][ col_pos ]
if node_obj is not None:
node_objs.add(node_obj)
return node_objs
def get_representative_column_node(self, col_pos):
node_list = list(self.get_node_set_at_column_pos(col_pos))
return node_list[ 0 ]
def get_node_LIST_at_column_pos(self, col_pos):
node_objs = list()
for i in range(0, len(self)):
node_obj = self.aligned_nodes[ i ][ col_pos ]
node_objs.append(node_obj)
return node_objs
def get_node_occupancy_at_column_pos(self, col_pos):
node_list = self.get_node_LIST_at_column_pos(col_pos)
occupancy_list = list()
for node in node_list:
if node is None:
occupancy_list.append(False)
else:
occupancy_list.append(True)
return occupancy_list
def append_node_to_each_entry(self, node_obj):
for i in range(0, len(self)):
self.aligned_nodes[ i ].append(node_obj)
def append_node_according_to_occupancy_pattern(self, node_obj, occupancy_pattern):
for i in range(0, len(self)):
if occupancy_pattern[i] is True:
self.aligned_nodes[ i ].append(node_obj)
else:
self.aligned_nodes[ i ].append(None)
def add_column(self, column_node_list):
num_alignments = len(self)
if len(column_node_list) != num_alignments:
raise RuntimeError("Error, column size differs from num_alignments")
for i in range(0,num_alignments):
self.aligned_nodes[ i ].append(column_node_list[ i ])
def __len__(self):
"""
number of transcripts represented in the alignment
"""
return(len(self.transcript_names))
def width (self):
"""
width of the alignment (number of columns)
"""
return(len(self.aligned_nodes[0]))
def __repr__(self):
num_transcripts = len(self.transcript_names)
ret_text = "\n# Alignment obj contains: {} transcripts: {}\n\n".format(num_transcripts, ",".join(self.transcript_names))
align_width = self.width()
NODES_PER_LINE = 10
# each alignment block
for i in range(0, align_width, NODES_PER_LINE):
# report alignment for each entry
for j in range(0,num_transcripts):
transcript_name = self.transcript_names[ j ]
aligned_nodes_entry = self.aligned_nodes[ j ]
ret_text += "{}".format(transcript_name)
for x in range(i, i+NODES_PER_LINE):
if x >= align_width:
break
ret_text += "\t{}".format(aligned_nodes_entry[ x ])
ret_text += "\n" # end of current line
ret_text += "\n" # spacer between alignment blocks
#ret_text += "Align [{}] trans {} : path {}".format(i, transcript_name, str(aligned_nodes_entry)) + "\n"
return ret_text
def squeeze(self):
"""
merge unbranched nodes into single nodes
"""
num_transcripts = len(self)
width = self.width()
node_obj_matrix = list()
for i in range(0,num_transcripts):
node_obj_matrix.append([])
squeezed_alignment = Node_alignment(self.get_gene_id(), self.get_transcript_names(), node_obj_matrix)
# walk the node list and merge unbranched stretches into single nodes
block_breakpoints = []
prev_col_node_set = self.get_node_occupancy_at_column_pos(0)
for i in range(1,width):
node_column_set = self.get_node_occupancy_at_column_pos(i)
#print("Comparing {} to {} == {}".format(prev_col_node_set, node_column_set, prev_col_node_set == node_column_set))
if node_column_set != prev_col_node_set:
block_breakpoints.append(i)
prev_col_node_set = node_column_set
block_breakpoints.append(width)
logger.debug("Block_breakpoints: {}".format(block_breakpoints))
blocked_nodes = list()
for i in range(0, width+1):
if i in block_breakpoints:
# found block terminator
node_to_add = None
if len(blocked_nodes) > 1:
node_to_add = TNode.TNode.merge_nodes(blocked_nodes)
else:
node_to_add = blocked_nodes[ 0 ]
blocked_node_occupancy = self.get_node_occupancy_at_column_pos(i-1)
squeezed_alignment.append_node_according_to_occupancy_pattern(node_to_add, blocked_node_occupancy)
blocked_nodes = list() # reinit
# add to running block
if i < width:
blocked_nodes.append(self.get_representative_column_node(i))
return squeezed_alignment
def to_gene_fasta_and_gtf(self, gene_name):
transcript_names = self.get_transcript_names()
gene_seq = ""
# init transcript gtf records
transcript_to_gtf_lines = dict()
transcript_to_malign = dict()
transcript_to_Trinity_fa_seq = dict()
transcript_to_Trinity_fa_path = dict()
for transcript_name in transcript_names:
transcript_to_gtf_lines[ transcript_name ] = ""
transcript_to_malign[ transcript_name ] = ""
transcript_to_Trinity_fa_path[ transcript_name ] = list()
transcript_to_Trinity_fa_seq[ transcript_name ] = ""
for i in range(0,self.width()):
node_obj = self.get_representative_column_node(i)
node_seq = node_obj.get_seq()
if len(node_seq) == 0:
raise RuntimeError("Error, node seq of length zero: node=" + str(node_obj))
node_id = node_obj.get_loc_id()
node_occupancy = self.get_node_occupancy_at_column_pos(i)
pos_start = len(gene_seq) + 1
gene_seq += node_obj.get_seq()
pos_end = len(gene_seq)
# include gtf record for transcripts
for j in range(0,len(transcript_names)):
transcript_name = transcript_names[ j ]
if node_occupancy[ j ] is True:
# make gtf record
transcript_to_gtf_lines[ transcript_name ] += "\t".join([gene_name, "Trinity_gene", "exon",
str(pos_start), str(pos_end), '.', '+', '.',
"gene_id \"{}\"; transcript_id \"{}\"\n".format(
gene_name, transcript_name) ] )
transcript_to_malign[ transcript_name ] += node_seq
# build Trinity fasta sequence and path info:
cdna_seq_len = len(transcript_to_Trinity_fa_seq[ transcript_name ])
rel_node_start = cdna_seq_len # index starting at zero
rel_node_end = cdna_seq_len + len(node_seq) -1
transcript_to_Trinity_fa_seq[ transcript_name ] += node_seq
transcript_to_Trinity_fa_path[ transcript_name ].append("{}:{}-{}".format(node_id, rel_node_start, rel_node_end))
else:
for x in range(0,len(node_seq)):
transcript_to_malign[ transcript_name ] += '.'
# build mini-gtf section
gene_gtf = "\n".join(transcript_to_gtf_lines.values())
# build Trinity fasta text
trinity_fasta_text = ""
for transcript_name in transcript_names:
transcript_seq = transcript_to_Trinity_fa_seq[transcript_name]
path_list = transcript_to_Trinity_fa_path[transcript_name]
#logger.debug("path list: " + str(path_list))
path_list_text = " ".join(path_list)
trinity_fasta_text += ">{} len={} path=[{}]\n{}\n".format(transcript_name, len(transcript_seq),
path_list_text, transcript_seq)
return (gene_seq, gene_gtf, trinity_fasta_text, transcript_to_malign)
def reassign_node_loc_ids_by_align_order(self):
for i in range(0,self.width()):
repr_node = self.get_representative_column_node(i)
repr_node.set_loc_id(str(i))
def to_splice_graph(self, gene_name, reset_node_ids=False):
aligned_nodes = self.get_aligned_nodes()
width = self.width()
refined_tgraph = TGraph.TGraph(gene_name)
new_node_list = list()
for i in range(0,width):
repr_node = self.get_representative_column_node(i)
transcripts = repr_node.get_transcripts()
loc_id = repr_node.get_loc_id()
if reset_node_ids:
loc_id = "loc_" + str(i)
new_node = refined_tgraph.get_node(transcripts, loc_id, repr_node.get_seq())
new_node_list.append(new_node)
#############
# build graph
for iso_node_alignment in aligned_nodes:
prev = None
for i in range(0,width):
if iso_node_alignment[i] != None:
if prev != None:
refined_tgraph.add_edges([prev], [new_node_list[i]])
prev = new_node_list[i]
logger.debug("New graph node listing:")
for node in new_node_list:
logger.debug(node.toString())
return refined_tgraph
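The squeeze() method above finds block boundaries by scanning column occupancy patterns: a new block starts wherever the set of transcripts covering a column changes. A self-contained sketch of that breakpoint scan (hypothetical helper mirroring the loop in squeeze()):

# Hypothetical helper mirroring the breakpoint scan in Node_alignment.squeeze().
def block_breakpoints(occupancy_columns):
    """occupancy_columns: one True/False-per-transcript list per alignment column."""
    breakpoints = []
    prev = occupancy_columns[0]
    for i, col in enumerate(occupancy_columns[1:], start=1):
        if col != prev:
            breakpoints.append(i)   # occupancy changed -> block boundary
            prev = col
    breakpoints.append(len(occupancy_columns))  # terminal boundary, as in squeeze()
    return breakpoints

# Two transcripts over 5 columns; transcript 2 is absent from the middle column:
cols = [[True, True], [True, True], [True, False], [True, True], [True, True]]
print(block_breakpoints(cols))   # [2, 3, 5] -> columns 0-1, 2, and 3-4 merge into three nodes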
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TNode
import Trinity_util
logger = logging.getLogger(__name__)
class Node_path:
"""
Object representation of the connected set of Node objects that represents a reconstructed isoform's traversal of the graph
Instance members:
transcript_name : (str) name of the isoform
node_obj_list : (list) of Node objects
"""
def __init__(self, tgraph, transcript_name, path_string, sequence):
"""
constructor, instantiates Node_path and builds vertices in the graph
"""
self.transcript_name = transcript_name
self.node_obj_list = list()
node_descr_list = re.findall(r"\d+:\d+\-\d+", path_string)
# 1st node is special (full kmer prefix included)
first_kmer_flag = False
obj_node_list = list()
for node_descr in node_descr_list:
(loc_node_id, node_coord_range) = node_descr.split(":")
(lend,rend) = node_coord_range.split("-")
lend = int(lend)
rend = int(rend)
if not first_kmer_flag:
first_kmer_flag = True
loc_node_id += 'fst'
# use factory call to instantiate node objects:
node_obj = tgraph.get_node(transcript_name,
loc_node_id, sequence[lend:rend+1]) # coords in path were already zero-based
self.node_obj_list.append(node_obj)
def get_transcript_name(self):
return self.transcript_name
def get_path(self):
return self.node_obj_list
def __repr__(self):
node_str_list = list()
for node in self.node_obj_list:
node_str_list.append(str(node))
path_str = "--".join(node_str_list)
return path_str
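The regex in Node_path.__init__ pulls `id:start-end` node descriptors out of a Trinity path annotation. A small sketch of that parse on a made-up path string (coordinates zero-based and end-inclusive, as the constructor's slice sequence[lend:rend+1] implies):

# Sketch of the path-string parse in Node_path.__init__ on a made-up
# Trinity-style path annotation (hypothetical example data).
import re

path_string = "path=[0:0-99 1:100-250 0:251-300]"
for node_descr in re.findall(r"\d+:\d+\-\d+", path_string):
    loc_node_id, coord_range = node_descr.split(":")
    lend, rend = (int(x) for x in coord_range.split("-"))
    print(loc_node_id, lend, rend)   # node id plus its slice of the transcript sequence
# prints: 0 0 99 / 1 100 250 / 0 251 300 -- the repeated node id "0" is the
# kind of repeat that forces the multiple-alignment path in Gene_splice_modeler.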
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import TGraph
import TNode
import Node_alignment
import Topological_sort
from Compact_graph_whole import Compact_graph_whole
from Compact_graph_partial import Compact_graph_partial
import TGLOBALS
logger = logging.getLogger(__name__)
MAX_MM_RATE = 0.05
def refine_alignment(node_alignment_obj, reset_node_ids=False):
"""
Create a new splice graph based on the node alignment obj.
Since some nodes may show up as duplicate (repeat) nodes, assign each a unique ID
"""
logger.debug("refine_alignment({})".format(node_alignment_obj))
# convert to splice graph
refined_tgraph = node_alignment_obj.to_splice_graph("^^SGRAPH2^^", reset_node_ids)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.pre.dot")
graph_compactor = Compact_graph_whole()
graph_compactor.compact_unbranched(refined_tgraph)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.linear_compact.dot")
###########
#
for allowed_variants in (0, 1, 2):
graph_compactor.compact_graph(refined_tgraph, allowed_variants)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.compact.m{}.dot".format(allowed_variants))
## now extract prefix and suffix matches
partial_graph_compactor = Compact_graph_partial()
partial_graph_compactor.compact_graph(refined_tgraph)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.final.dot")
# convert compacted graph into a node alignment obj
splice_graph_node_alignment = splice_graph_to_node_alignment(refined_tgraph)
splice_graph_node_alignment = remove_redundant_paths(splice_graph_node_alignment)
return(splice_graph_node_alignment)
def splice_graph_to_node_alignment(tgraph):
topologically_sorted_nodes = Topological_sort.Topological_sort.topologically_sort(tgraph.get_all_nodes())
logger.debug("Topologically sorted nodes: " + str(topologically_sorted_nodes))
# index loc node ids
aligned_loc_id_pos = dict()
for i in range(0, len(topologically_sorted_nodes)):
loc_id = topologically_sorted_nodes[i].get_loc_id()
aligned_loc_id_pos[loc_id] = i
new_alignments = list()
transcript_ids = set()
for node in topologically_sorted_nodes:
transcript_ids = transcript_ids.union(node.get_transcripts())
transcript_ids = list(transcript_ids)
for transcript_id in transcript_ids:
new_alignment = [None for i in topologically_sorted_nodes]
for node in topologically_sorted_nodes:
if transcript_id in node.get_transcripts():
loc_id = node.get_loc_id()
new_idx = aligned_loc_id_pos[loc_id]
new_alignment[new_idx] = node
new_alignments.append(new_alignment)
splice_graph_node_alignment = Node_alignment.Node_alignment(tgraph.get_gene_id(), transcript_ids, new_alignments)
logger.debug("Splice graph node_alignment: " + str(splice_graph_node_alignment))
return(splice_graph_node_alignment)
def remove_redundant_paths(node_alignment):
gene_id = node_alignment.get_gene_id()
transcript_names = node_alignment.get_transcript_names()
aligned_nodes = node_alignment.get_aligned_nodes()
num_transcripts_before_reduction = len(transcript_names)
# do all pairwise comparisons
# check for containments
containments = set()
for i in range(0,len(aligned_nodes)-1):
for j in range(i+1, len(aligned_nodes)):
if a_contains_b(aligned_nodes[i], aligned_nodes[j]):
containments.add(j)
elif a_contains_b(aligned_nodes[j], aligned_nodes[i]):
containments.add(i)
if containments:
adj_transcript_names = list()
adj_aligned_nodes = list()
for i in range(0, len(aligned_nodes)):
if i not in containments:
adj_transcript_names.append(transcript_names[i])
adj_aligned_nodes.append(aligned_nodes[i])
adj_splice_graph_node_alignment = Node_alignment.Node_alignment(gene_id, adj_transcript_names, adj_aligned_nodes)
num_after_reduction = len(adj_transcript_names)
logger.debug("Containments found, reporting reduced set {} of {} = {:.2f}%".format(
num_after_reduction, num_transcripts_before_reduction,
num_after_reduction/num_transcripts_before_reduction*100))
return adj_splice_graph_node_alignment
else:
logger.debug("No containments found")
return node_alignment # no changes
def get_first_node_idx(node_list):
# find starting place for comparison
begin_idx = -1
for i in range(0,len(node_list)):
if node_list[i] is not None:
begin_idx = i
break
if begin_idx < 0:
raise RuntimeError("Error, didn't find first non-none value among {} and {}".format(node_list_A, node_list_B))
return begin_idx
def get_end_node_idx(node_list):
# find ending place for comparison
end_idx = -1
for i in reversed(range(0, len(node_list))):
if node_list[i] is not None:
end_idx = i
break
if end_idx < 0:
raise RuntimeError("Error, didn't find last non-none value among {} and {}".format(node_list_A, node_list_B))
return end_idx
def a_contains_b(node_list_A, node_list_B):
A_start = get_first_node_idx(node_list_A)
A_end = get_end_node_idx(node_list_A)
B_start = get_first_node_idx(node_list_B)
B_end = get_end_node_idx(node_list_B)
if not (A_start <= B_start and A_end >= B_end):
return False # no containment
# ensure that in overlapping region, they have identical nodes
for i in range(B_start, B_end+1):
# if we see any difference, then not compatible
if node_list_A[i] != node_list_B[i]:
return False
# must be compatible and contained
return True
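
# Worked example of the containment test above (hypothetical rows over the
# same alignment columns):
#
#   A: [n1,   n2, n3, n4  ]
#   B: [None, n2, n3, None]
#
# a_contains_b(A, B) is True: B spans columns 1..2, A spans 0..3, and the two
# rows hold identical nodes over B's span. If B carried a different node at
# column 2, the rows would be incompatible and the function would return False.
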
#!/usr/bin/env python
# module-wide debug flag, referenced elsewhere as TGLOBALS.DEBUG
DEBUG = False
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TNode
logger = logging.getLogger(__name__)
class TGraph:
def __init__(self, gene_id):
self.node_cache = dict()
self.gene_id = gene_id
def get_node(self, transcript_id, loc_node_id, node_seq):
"""
Instantiates Node objects, and stores them in a graph.
*** use this method for instantiating all Node objects ***
use clear_node_cache() to clear the graph
"""
logger.debug("{}\t{}".format(loc_node_id, node_seq))
        if len(node_seq) == 0:
            raise RuntimeError("Error, get_node() requires a non-empty node_seq")
if loc_node_id in self.node_cache:
node_obj = self.node_cache[ loc_node_id ]
node_obj.add_transcripts(transcript_id)
if node_obj.seq != node_seq:
errmsg = "ERROR: have conflicting node sequences for {} node_id: {}\n".format(self.get_gene_id(),
loc_node_id) + "{}\n vs. \n{}".format(node_obj.seq, node_seq)
logger.critical(errmsg)
raise RuntimeError(errmsg)
else:
return node_obj
else:
# instantiate a new one
node_obj = TNode.TNode(self, transcript_id, loc_node_id, node_seq)
self.node_cache[ loc_node_id ] = node_obj
return node_obj
    def get_all_nodes(self):
        # return a list copy; under Python 3 a bare dict view cannot be indexed or sorted by callers
        return list(self.node_cache.values())
def clear_node_cache(self):
"""
clears the graph
"""
self.node_cache.clear()
def clear_touch_settings(self):
"""
clear the touch settings for each of the nodes
"""
for node in self.get_all_nodes():
node.clear_touch()
def add_edges(self, from_nodes_list, to_nodes_list):
for from_node in from_nodes_list:
for to_node in to_nodes_list:
from_node.add_next_node(to_node)
to_node.add_prev_node(from_node)
def prune_edges(self, from_nodes_list, to_nodes_list):
for from_node in from_nodes_list:
for to_node in to_nodes_list:
from_node.remove_next_node(to_node)
to_node.remove_prev_node(from_node)
def prune_node(self, node):
logger.debug("pruning node: {}".format(node))
self.prune_edges(node.get_prev_nodes(), [node])
self.prune_edges([node], node.get_next_nodes())
node.dead = True
self.node_cache.pop(node.get_loc_id())
def get_gene_id(self):
return self.gene_id
    def draw_graph(self, filename):
        # write the graph in Graphviz DOT format
        with open(filename, 'w') as ofh:
            ofh.write("digraph G {\n")
            for node in self.node_cache.values():
                node_seq = node.get_seq()
                gene_node_id = node.get_gene_node_id()
                ofh.write("{} [label=\"{}:Len{}:T{}:{}\"]\n".format(node.get_id(), gene_node_id, len(node_seq),
                                                                    node.get_topological_order(), node_seq))
                for next_node in node.get_next_nodes():
                    ofh.write("{}->{}\n".format(node.get_id(), next_node.get_id()))
            ofh.write("}\n")  # close the digraph
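
# Minimal usage sketch of the TGraph API defined above (node ids and sequences
# here are hypothetical, for illustration only):
#
#   tgraph = TGraph("gene_1")
#   n1 = tgraph.get_node("iso_1", "n1", "ACGT")
#   n2 = tgraph.get_node("iso_1", "n2", "TTGA")
#   tgraph.add_edges([n1], [n2])
#   tgraph.draw_graph("gene_1.dot")  # render with Graphviz: dot -Tpdf gene_1.dot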
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TGraph
logger = logging.getLogger(__name__)
class TNode:
"""
generic Trinity graph node object representing a node in the Trinity isoform reconstruction graph
Node's are objects within a gene and can be shared among transcript isoforms.
instance members include:
tgraph : (TGraph obj) graph for the Trinity gene, which will hold the nodes.
transcripts: list(str) names of the isoforms that contains this node.
loc_node_id : (int) identifier of the node
seq : (str) nucleotide sequence for this node in the transcript
len : (int) length of the node sequence
prev : (set) node objects connected as parental nodes in the graph
next : (set) node objects connected as descendant nodes in the graph
class members include:
merged_nodeset_counter : (int) tracking nodes that get merged under squeeze operations.
"""
node_cache = dict()
merged_nodeset_counter = 0
all_nodes_counter = 0
def __init__(self, tgraph, transcript_id, loc_node_id, node_seq):
"""
constructor, but don't use directly.... instead, use TGraph.get_node() factory function
"""
if len(node_seq) == 0:
raise RuntimeError("Error, TNode instantiation requires node sequence of length > 0")
self.tgraph = tgraph
self.transcripts = set()
self.add_transcripts(transcript_id)
self.loc_node_id = loc_node_id
self.seq = node_seq
self.len = len(node_seq)
TNode.all_nodes_counter += 1
self._id = TNode.all_nodes_counter
#logger.info("{}\t{}".format(loc_node_id, node_seq))
self.prev = set()
self.next = set()
self.stashed_prev = set() # for manipulation during topological sorting
self.stashed_next = set()
self.touched = 0
self.dead = False
self.topological_order = -1 # updated on topological sorting
#########################
## various Node ID values
#########################
def get_id(self):
# a private unique identifier for all nodes
return self._id
def get_loc_id(self):
return self.loc_node_id
def set_loc_id(self, loc_node_id):
self.loc_node_id = loc_node_id
def get_gene_id(self):
return self.tgraph.get_gene_id()
def get_gene_node_id(self):
gene_id = self.get_gene_id()
loc_id = self.get_loc_id()
node_id = gene_id + "::" + loc_id
return node_id
def get_touched_val(self):
return self.touched
def is_dead(self):
return(self.dead)
def is_ancestral(self, node, visited=None):
if visited is None:
visited = set() #init round
#logger.debug("is_ancestral search from {} of node {}".format(self, node))
if node == self:
#logger.debug("node is self")
return True
if node in self.prev:
#logger.debug("node in self.prev")
return True
else:
#logger.debug("continuing search")
visited.add(self)
#logger.debug("visited: {}".format(visited))
for prev_node in self.prev:
#logger.debug("cascading towards prev_node: {}".format(prev_node))
if prev_node in visited:
#logger.debug("prev_node in visited")
pass
else:
#logger.debug("prev_node not in visited")
found = prev_node.is_ancestral(node, visited)
if found:
return True
return False
def is_descendant(self, node, visited=None):
        if visited is None:
            visited = set()  # init round
if node == self:
return True
if node in self.next:
return True
else:
visited.add(self)
for next_node in self.next:
if next_node not in visited:
found = next_node.is_descendant(node, visited)
if found:
return True
return False
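
    # Example of the ancestry traversal above (hypothetical chain a -> b -> c):
    # c.is_ancestral(a) walks c.prev to b, then b.prev to a, and returns True.
    # The shared 'visited' set ensures each node is expanded at most once, so
    # diamonds or cycles in the graph cannot cause infinite recursion.
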
## Other accessors
def get_graph(self):
return self.tgraph
def get_seq(self):
return self.seq
def set_seq(self, seq):
self.seq = seq
def get_topological_order(self):
return self.topological_order
def set_topological_order(self, topo_order):
self.topological_order = topo_order
def get_transcripts(self):
return self.transcripts
def add_transcripts(self, transcript_name_or_set):
        if isinstance(transcript_name_or_set, set):
            self.transcripts.update(transcript_name_or_set)
        elif isinstance(transcript_name_or_set, str):
            self.transcripts.add(transcript_name_or_set)
        else:
            raise RuntimeError("Error, parameter must be a string or a set")
def get_prev_nodes(self):
return set(self.prev)
def get_next_nodes(self):
return set(self.next)
def add_next_node(self, next_node_obj):
self.next.add(next_node_obj)
def remove_next_node(self, remove_node_obj):
self.next.remove(remove_node_obj)
def stash_next_node(self, stash_node_obj):
self.remove_next_node(stash_node_obj)
self.stashed_next.add(stash_node_obj)
def add_prev_node(self, prev_node_obj):
self.prev.add(prev_node_obj)
def remove_prev_node(self, remove_node_obj):
self.prev.remove(remove_node_obj)
def stash_prev_node(self, stash_node_obj):
self.remove_prev_node(stash_node_obj)
self.stashed_prev.add(stash_node_obj)
def restore_stashed_nodes(self):
self.prev.update(self.stashed_prev)
self.stashed_prev = set()
self.next.update(self.stashed_next)
self.stashed_next = set()
def get_prev_node_loc_ids(self):
loc_ids = list()
for node in self.get_prev_nodes():
loc_ids.append(node.get_loc_id())
return loc_ids
def get_next_node_loc_ids(self):
loc_ids = list()
for node in self.get_next_nodes():
loc_ids.append(node.get_loc_id())
return loc_ids
def __repr__(self):
return(self.loc_node_id)
## Touching nodes
def touch(self):
self.touched += 1
def untouch(self):
self.touched -= 1
def clear_touch(self):
self.touched = 0
def toString(self):
txt = str("prev: " + str(self.get_prev_node_loc_ids()) +
", me: " + str(self.get_loc_id()) +
", next: " + str(self.get_next_node_loc_ids()) +
", transcripts: " + str(self.transcripts) +
", " + self.get_seq())
if self.topological_order >= 0:
txt += ", topo_order={}".format(self.topological_order)
if self.dead:
txt += " ** dead ** "
return txt
@classmethod
def merge_nodes(cls, node_list):
"""
Merges linear stretches of nodes into a single new node that has
concatenated sequences of the input nodes
"""
merged_node_seq = ""
TNode.merged_nodeset_counter += 1
merged_loc_node_id = "M{}".format(TNode.merged_nodeset_counter)
for node_obj in node_list:
seq = node_obj.get_seq()
merged_node_seq += seq
        transcripts = node_list[0].get_transcripts()  # merged nodes are expected to share a transcript set; take the first node's
tgraph = node_list[0].get_graph()
merged_node = TNode(tgraph, transcripts, merged_loc_node_id, merged_node_seq)
return merged_node
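
# Minimal sketch of the factory + merge pattern defined above (ids and
# sequences are hypothetical; assumes a TGraph instance as in TGraph.py):
#
#   g = TGraph.TGraph("gene_1")
#   a = g.get_node("iso_1", "a", "ACG")
#   b = g.get_node("iso_1", "b", "TTT")
#   m = TNode.merge_nodes([a, b])
#   # m.get_seq() == "ACGTTT"; m receives a fresh "M<counter>" loc id and
#   # inherits the transcript set of the first node in the list.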