@@ -63,8 +63,9 @@ GOBubble(reduced_circ, labels = 2.8)
GOCircle(circ)
# Generate a circular visualization of selected terms
-IDs <- c('GO:0007507', 'GO:0001568', 'GO:0001944', 'GO:0048729', 'GO:0048514', 'GO:0005886', 'GO:0008092', 'GO:0008047')
-GOCircle(circ, nsub = IDs)
+# (set the ids to your liking)
+#IDs <- c('GO:0007507', 'GO:0001568', 'GO:0001944', 'GO:0048729', 'GO:0048514', 'GO:0005886', 'GO:0008092', 'GO:0008047')
+#GOCircle(circ, nsub = IDs)
# Generate a circular visualization for 10 terms
GOCircle(circ, nsub = 10)
@@ -72,5 +73,5 @@ GOCircle(circ, nsub = 10)
##### Now, the rest is up to you. ;-)
quit(save = "no", status = 0, runLast = FALSE)
quit(save = "yes", status = 0, runLast = FALSE)
#!/usr/bin/env Rscript
files=list.files(pattern=".*DE_results$")
pdf("logFC.histograms.pdf")
for (file in files) {
message("analyzing: ", file)
data = read.table(file, header=T)
hist(data$log2FoldChange, br=50, main=file)
}
@@ -50,6 +50,9 @@ main: {
chomp;
my @ids = split(/\t/);
$ids[0] =~ s/\|.*$//;
+if (my $new_id = $new_ids{$ids[0]}) {
+    $ids[0] = $new_id;
+}
......
@@ -306,7 +306,7 @@ sub parse_sample_info {
my %samples;
-open (my $fh, $sample_file) or die $!;
+open (my $fh, $sample_file) or die "Error, cannot locate samples file: $sample_file";
while (<$fh>) {
unless (/\w/) { next; }
if (/^\#/) { next; } # allow comments
@@ -521,8 +521,10 @@ sub run_DESeq2_sample_pair {
print $ofh "res\$padj[is.na(res\$padj)] <- 1\n"; # Carsten Kuenne
print $ofh "res = as.data.frame(res[order(res\$pvalue),])\n"; # rank by pvalue
## output results
print $ofh "write.table(as.data.frame(res[order(res\$pvalue),]), file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE)\n";
print $ofh "write.table(res, file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE)\n";
print $ofh "write.table(rnaseqMatrix, file=\'$output_prefix.DESeq2.count_matrix\', sep='\t', quote=FALSE)\n";
......
@@ -20,6 +20,8 @@ import Pipeliner
logger = None
+UTILDIR = os.path.sep.join([os.path.dirname(os.path.realpath(__file__)), "util"])
def main():
@@ -59,6 +61,9 @@ def main():
parser.add_argument('-m', '--maxram', dest="maxram", type=str, default="50000000000", help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
parser.add_argument("--STAR_genomeGenerate_opts", type=str, default="", help="options to pass through to STAR's genomeGenerate function")
args = parser.parse_args()
@@ -108,7 +113,8 @@ def main():
else:
pipeliner.add_commands([Pipeliner.Command("java -jar " + PICARD_HOME + "/picard.jar" +
" CreateSequenceDictionary R=" + st_fa_path +
" O=" + dict_file,
" O=" + dict_file +
" VALIDATION_STRINGENCY=LENIENT ",
"picard_dict_st.ok")])
pipeliner.run()
@@ -120,9 +126,12 @@ def main():
" --runMode genomeGenerate" +
" --genomeDir star_genome_idx " +
" --genomeFastaFiles {} ".format(st_fa_path) +
" --genomeSAindexNbases 8 " + # as per A. Dobin
" --sjdbGTFfile {} ".format(st_gtf_path) +
" --sjdbOverhang {} ".format(args.sjdbOverhang) +
" --limitGenomeGenerateRAM {}".format(args.maxram) )
" --limitGenomeGenerateRAM {}".format(args.maxram) +
" {} ".format(args.STAR_genomeGenerate_opts)
)
pipeliner.add_commands([
Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
@@ -160,6 +169,7 @@ def main():
" AddOrReplaceReadGroups " +
"I=Aligned.sortedByCoord.out.bam " +
"O=rg_added_sorted.bam " +
" VALIDATION_STRINGENCY=SILENT " +
"SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample",
"add_read_groups.ok"),
@@ -169,10 +179,22 @@ def main():
"CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
"mark_dups.ok"),
Pipeliner.Command("java -jar " + PICARD_HOME + "/picard.jar ValidateSamFile " +
"I=dedupped.bam " +
"IGNORE_WARNINGS=true " +
"MAX_OUTPUT=100000 " +
"IGNORE=MATE_NOT_FOUND " +
"O=dedupped.bam.validation",
"bam_validate.ok",
ignore_error=True),
Pipeliner.Command(UTILDIR + "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
"make_valid_dedupped_bam.ok"),
Pipeliner.Command("java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
"-T SplitNCigarReads -R " + st_fa_path +
" -I dedupped.bam -o splitNCigar.bam " +
" -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS",
" -I dedupped.valid.bam -o splitNCigar.bam " +
" -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS --validation_strictness LENIENT",
"splitNCigarReads.ok")
])
pipeliner.run()
......
#!/usr/bin/env perl
use strict;
use warnings;
my $usage = "\n\t\tusage: $0 target.bam errors.txt output.bam\n\n";
my $target_bam = $ARGV[0] or die $usage;
my $errors_file = $ARGV[1] or die $usage;
my $output_bam = $ARGV[2] or die $usage;
main: {
my %error_reads = &parse_error_reads($errors_file);
if (%error_reads) {
my $in_cmd = "samtools view -h $target_bam | ";
my $out_cmd = " | samtools view -Sb -o $output_bam";
open(my $in_fh, $in_cmd) or die "Error, cannot open cmd: $in_cmd";
open(my $out_fh, $out_cmd) or die "Error, cannot open cmd: $out_cmd";
while (my $line = <$in_fh>) {
if ($line =~ /^\@/) {
# header line
print $out_fh $line;
}
else {
my @x = split(/\t/, $line);
unless ($error_reads{$x[0]}) {
print $out_fh $line;
}
}
}
close $in_fh;
close $out_fh;
&process_cmd("samtools index $output_bam");
my $output_index = $output_bam;
$output_index =~ s/\.bam$/\.bai/;
rename("$output_bam.bai", $output_index);
}
else {
&process_cmd("ln -s $target_bam $output_bam");
###################
# include the index
$target_bam =~ s/\.bam$/\.bai/;
$output_bam =~ s/\.bam$/\.bai/;
&process_cmd("ln -s $target_bam $output_bam");
}
exit(0);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
die "Error, CMD: $cmd died with ret $ret";
}
return;
}
####
sub parse_error_reads {
my ($file) = @_;
# examples
#ERROR: Record 6714568, Read name NS500412:143:HJWV5AFXX:1:21307:19673:9975, Read CIGAR M operator maps off end of reference
#ERROR: Record 6714569, Read name NS500412:143:HJWV5AFXX:3:21405:18461:10525, Read CIGAR M operator maps off end of reference
#ERROR: Record 6714570, Read name NS500412:143:HJWV5AFXX:1:21307:21758:18659, Alignment start (262144) must be <= reference sequence length (241) on reference TRINITY_DN11029_c1_g1
#ERROR: Record 6714570, Read name NS500412:143:HJWV5AFXX:1:21307:21758:18659, Read CIGAR M operator maps off end of reference
#ERROR: Record 6714571, Read name NS500412:143:HJWV5AFXX:2:11202:8325:4497, Alignment start (262144) must be <= reference sequence length (241) on reference TRINITY_DN11029_c1_g1
my %error_reads;
open(my $fh, $file) or die "Error, cannot open file: $file";
while (<$fh>) {
if (/ERROR: Record (\d+), Read name (\S+), /) {
my $read_name = $2;
$error_reads{$read_name} = 1;
}
}
close $fh;
return(%error_reads);
}
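The parse_error_reads() sub above keys entirely off the `Read name` field of Picard's ValidateSamFile report, as in the sample ERROR lines quoted in its comments. A minimal Python sketch of the same extraction (a hypothetical standalone helper, not part of the repo):

#!/usr/bin/env python
# Hypothetical sketch mirroring parse_error_reads() in clean_bam.pl:
# collect the read names flagged in a Picard ValidateSamFile report.
import re

def parse_error_reads(validation_file):
    """Return the set of read names appearing in ERROR lines."""
    error_reads = set()
    with open(validation_file) as fh:
        for line in fh:
            m = re.search(r"ERROR: Record \d+, Read name (\S+), ", line)
            if m:
                error_reads.add(m.group(1))
    return error_reads

# e.g.: error_reads = parse_error_reads("dedupped.bam.validation")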
@@ -11,6 +11,8 @@ use File::Basename;
my $CPU = 2;
+my $top_genes_plot = 50;
my $usage = <<__EOUSAGE__;
#################################################################
@@ -33,6 +35,8 @@ my $usage = <<__EOUSAGE__;
#
# --CPU <int> default: $CPU
#
+# --top_genes_plot <int> default: $top_genes_plot
+#
################################################################
@@ -59,6 +63,8 @@ my $SS_lib_type = "";
'CPU=i' => \$CPU,
'aligner=s' => \$aligner,
'SS_lib_type=s' => \$SS_lib_type,
+'top_genes_plot=i' => \$top_genes_plot,
);
@@ -194,7 +200,6 @@ main: {
print $ofh "library(DEXSeq)\n";
print $ofh "samples_info = read.table(\"$samples_table_file\", header=T, row.names=1)\n";
print $ofh "dxd = DEXSeqDataSetFromHTSeq(as.vector(samples_info\$counts_filename), sampleData=samples_info, design = ~ sample + exon + condition:exon, flattenedfile=\"$genes_gtf_file.dexseq.gff\")\n";
print $ofh "pdf(\"$out_prefix.pdf\")\n";
print $ofh "dxd = estimateSizeFactors( dxd )\n";
print $ofh "dxd = estimateDispersions( dxd )\n";
print $ofh "plotDispEsts( dxd )\n";
@@ -202,14 +207,25 @@ main: {
print $ofh "dxd = estimateExonFoldChanges( dxd, fitExpToVar=\"condition\")\n";
print $ofh "dxr1 = DEXSeqResults( dxd )\n";
print $ofh "dxr1.sorted = dxr1[order(dxr1\$padj),]\n";
print $ofh "save(list = ls(all=TRUE), file = \"$out_prefix.Rdata\")\n";
print $ofh "write.table(dxr1.sorted, file=\"$out_prefix.results.dat\", quote=F, sep=\"\t\")\n";
print $ofh "plotMA( dxr1, cex=0.8 )\n";
print $ofh "pdf(\"$out_prefix.pdf\")\n";
print $ofh "top_genes = unique(dxr1.sorted\$groupID[dxr1.sorted\$padj < 0.1 & ! is.na(dxr1.sorted\$padj)])\n";
print $ofh "top_genes = top_genes[1:min($top_genes_plot, length(top_genes))]\n";
print $ofh "message(\"Top $top_genes_plot genes: (\", paste(top_genes, collapse=','), \")\")\n";
print $ofh "for (gene in top_genes) { \n"
. " plotDEXSeq( dxr1 , gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 , expression=FALSE, norCounts=TRUE, splicing=TRUE, displayTranscripts=TRUE)\n"
. "}\n";
#print $ofh "plotMA( dxr1, cex=0.8 )\n";
close $ofh;
}
$cmd = "R --vanilla -q < $dexseq_rscript";
$pipeliner->add_commands(new Command($cmd, "$analysis_token.dexseq.ok"));
$pipeliner->add_commands(new Command($cmd, "$analysis_token.dexseq.$top_genes_plot.ok"));
$pipeliner->run();
@@ -226,13 +242,26 @@ sub parse_samples_file {
open (my $fh, $samples_file) or die "Error, cannot open file: $samples_file";
while (<$fh>) {
if (/^\#/) { next; }
unless (/\w/) { next; }
chomp;
-my @x = split(/\t/);
+my $line = $_;
+my @x = split(/\s+/);
my $condition = $x[0];
my $replicate = $x[1];
my $left_fq = $x[2];
my $right_fq = $x[3]; # only if paired-end
+unless ($left_fq) {
+    die "Error, cannot find required fields in sample file line: $line";
+}
+unless (-s $left_fq) {
+    die "Error, cannot locate left_fq: $left_fq as specified in samples file line: $line";
+}
+if ($right_fq && ! -s $right_fq) {
+    die "Error, cannot locate right_fq: $right_fq as specified in samples file line: $line";
+}
+if ($seen{$replicate}) {
+    die "Error, replicate name: [$replicate] must be unique among all replicate names. Please update your samples file";
+}
......
#!/usr/bin/env perl
use strict;
use warnings;
use lib($ENV{EUK_MODULES});
use Fasta_reader;
my $usage = "\n\n\tusage: $0 trin_before.fa trin_after.fa\n\n";
my $trin_before_file = $ARGV[0] or die $usage;
my $trin_after_file = $ARGV[1] or die $usage;
main: {
my %trin_seqs_before = &parse_trin_fa($trin_before_file);
my %trin_seqs_after = &parse_trin_fa($trin_after_file);
foreach my $acc (keys %trin_seqs_after) {
my $seq_before = $trin_seqs_before{$acc} or die "Error, no before transcript for [$acc]";
my $seq_after = $trin_seqs_after{$acc} or die "Error, no after transcript for [$acc]";
if ($seq_before eq $seq_after) {
print "* $acc identical\n";
}
else {
my $filename = $acc;
$filename =~ s/\W/_/g;
open(my $ofh, ">$filename") or die $!;
print $ofh ">before\n$seq_before\n"
. ">after\n$seq_after\n";
close $ofh;
my $ret = system("clustalw2 $filename");
if ($ret) {
die $!;
}
}
}
exit(0);
}
####
sub parse_trin_fa {
my ($file) = @_;
my $fasta_reader = new Fasta_reader($file);
my %seqs = $fasta_reader->retrieve_all_seqs_hash();
return(%seqs);
}
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import TGraph
import TNode
import Node_alignment
import Compact_graph_whole
from Topological_sort import Topological_sort
import TGLOBALS
logger = logging.getLogger(__name__)
class Compact_graph_partial(Compact_graph_whole.Compact_graph_whole):
def __init__(self):
pass
def compact_graph(self, tgraph):
compacted_flag = True
round = 0
while compacted_flag:
round += 1
logger.debug("\n\t### Partial graph compaction, Round: {}".format(round))
tgraph.clear_touch_settings() # start fresh
compacted_flag = False # reset for this round
sorted_nodes = Topological_sort.topologically_sort(tgraph.get_all_nodes())
if TGLOBALS.DEBUG:
dot_filename = "ladeda.BEGIN.compacted_partial.round_{}.dot".format(round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
for node in sorted_nodes:
if node.get_touched_val() > 0:
continue
logger.debug("\n\n/////////// R[{}] Exploring compaction from node: {}\n\n".format(round, node.toString()))
# try compact upward
prev_nodes = node.get_prev_nodes()
if len(prev_nodes) > 1 and self.untouched(prev_nodes) and self.all_have_lower_topological_orderings(node, prev_nodes):
if self.compact_upward(prev_nodes):
compacted_flag = True
next_nodes = node.get_next_nodes()
if len(next_nodes) > 1 and self.untouched(next_nodes) and self.all_have_higher_topological_orderings(node, next_nodes):
if self.compact_downward(next_nodes):
compacted_flag = True
if compacted_flag:
if TGLOBALS.DEBUG:
dot_filename = "ladeda.END.compacted_partial.round_{}.dot".format(round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
self.compact_unbranched(tgraph)
####################
## Upward Compaction
def compact_upward(self, prev_nodes):
logger.debug("compact_upward: Partial {}".format(prev_nodes))
prev_nodes = list(prev_nodes)
for i in range(0, len(prev_nodes)-1):
for j in range(i+1, len(prev_nodes)):
if self.compact_upward_node_pair(prev_nodes[i], prev_nodes[j]):
return True
return False
def compact_upward_node_pair(self, node_A, node_B):
logger.debug("compact_upward_node_pair:\nnode_A: {}\nnode_B: {}".format(node_A.toString(), node_B.toString()))
# ensure they're on parallel paths in the graph.
if (node_A.is_ancestral(node_B) or node_B.is_ancestral(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
# reverse the strings and check number of mismatches
seqA_rev = seqA[::-1]
seqB_rev = seqB[::-1]
num_matches = 0
for i in range(0, shorter_len):
if seqA_rev[i] == seqB_rev[i]:
num_matches += 1
else:
break
if num_matches == 0:
logger.debug("no matching suffix between {} and {}".format(node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
shared_seq = seqA[-1*num_matches:]
uniqA_seq = seqA[0:(len(seqA)-num_matches)]
uniqB_seq = seqB[0:(len(seqB)-num_matches)]
logger.debug("Suffices:\nseqA:{}\nSeqB:{}\nShared:{}".format(seqA, seqB, shared_seq))
assert len(seqA) == len(shared_seq) + len(uniqA_seq)
assert len(seqB) == len(shared_seq) + len(uniqB_seq)
##### operations needed:
#
# A A
# \ \
# X --> C --X
# / /
# B B
#
# - create C and set to partial shared sequence
# - add B and A transcripts to C
# - A and B link to C only
# - C recapitulates all outgoing edges from A and B
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
combined_transcripts = node_A.get_transcripts().union(node_B.get_transcripts())
combined_loc_id = "Upart:" + node_A.get_loc_id() + "," + node_B.get_loc_id()
node_C = tgraph.get_node(combined_transcripts, combined_loc_id, shared_seq)
node_A.set_seq(uniqA_seq)
node_B.set_seq(uniqB_seq)
# reset next nodes from A,B to C
all_next_nodes = node_A.get_next_nodes().union(node_B.get_next_nodes())
tgraph.prune_edges([node_A], node_A.get_next_nodes())
tgraph.prune_edges([node_B], node_B.get_next_nodes())
tgraph.add_edges([node_C], all_next_nodes)
node_C.touch()
if len(uniqA_seq) == 0:
# no longer need this now
tgraph.add_edges(node_A.get_prev_nodes(), [node_C])
tgraph.prune_node(node_A)
else:
tgraph.add_edges([node_A], [node_C])
node_A.touch()
if len(uniqB_seq) == 0:
# no longer need it
tgraph.add_edges(node_B.get_prev_nodes(), [node_C])
tgraph.prune_node(node_B)
else:
tgraph.add_edges([node_B], [node_C])
node_B.touch()
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("node_C after mods: {}".format(node_C.toString()))
logger.debug("\n\n\t*** partially UP-compacted nodes: {} and {} + {} ***".format(node_A, node_B, node_C))
return True
######################
## Downward compaction
def compact_downward(self, next_nodes):
logger.debug("compact_downward: Partial {}".format(next_nodes))
next_nodes = list(next_nodes)
for i in range(0, len(next_nodes)-1):
for j in range(i+1, len(next_nodes)):
if self.compact_downward_node_pair(next_nodes[i], next_nodes[j]):
return True
return False
def compact_downward_node_pair(self, node_A, node_B):
logger.debug("compact_downward_node_pair:\nnode_A: {}\nnode_B: {}".format(node_A.toString(), node_B.toString()))
# ensure they're on parallel paths in the graph.
if (node_A.is_descendant(node_B) or node_B.is_descendant(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
num_matches = 0
for i in range(0, shorter_len):
if seqA[i] == seqB[i]:
num_matches += 1
else:
break
if num_matches == 0:
logger.debug("-no matching prefix match between {} and {}".format(node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
shared_seq = seqA[0:num_matches]
uniqA_seq = seqA[num_matches:]
uniqB_seq = seqB[num_matches:]
logger.debug("Prefixes:\nseqA:{}\nSeqB:{}\nShared:{}".format(seqA, seqB, shared_seq))
##### operations needed:
#
# A A
# / /
# -- X --> -- X -- C
# \ \
# B B
#
# - find shared prefix of A,B
# - create C for shared prefix, shorten A,B
# - link A,B down from C
# - C takes on all prev nodes from A,B
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
combined_transcripts = node_A.get_transcripts().union(node_B.get_transcripts())
combined_loc_id = "Dpart:" + node_A.get_loc_id() + "," + node_B.get_loc_id()
node_C = tgraph.get_node(combined_transcripts, combined_loc_id, shared_seq)
node_A.set_seq(uniqA_seq)
node_B.set_seq(uniqB_seq)
# reset prev nodes from A,B to C
all_prev_nodes = node_A.get_prev_nodes().union(node_B.get_prev_nodes())
tgraph.prune_edges(node_A.get_prev_nodes(), [node_A])
tgraph.prune_edges(node_B.get_prev_nodes(), [node_B])
tgraph.add_edges(all_prev_nodes, [node_C])
node_C.touch()
if len(uniqA_seq) == 0:
# no longer need this now
tgraph.add_edges([node_C], node_A.get_next_nodes())
tgraph.prune_node(node_A)
else:
tgraph.add_edges([node_C], [node_A])
node_A.touch()
if len(uniqB_seq) == 0:
# no longer need it
tgraph.add_edges([node_C], node_B.get_next_nodes())
tgraph.prune_node(node_B)
else:
tgraph.add_edges([node_C], [node_B])
node_B.touch()
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("node_C after mods: {}".format(node_C.toString()))
logger.debug("\n\n\t*** partially compacted nodes: {} and {} + {} ***".format(node_A, node_B, node_C))
return True
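The partial compactor above merges two parallel nodes only where they share a common suffix (upward) or prefix (downward), measured by the character-scan loops in compact_upward_node_pair() and compact_downward_node_pair(). A standalone sketch of that measurement with a worked example (hypothetical helpers, assuming plain string sequences):

# Hypothetical helpers mirroring the suffix/prefix scans used by the
# partial compactor above; not part of the module itself.
def shared_suffix_len(seq_a, seq_b):
    """Number of identical trailing characters, scanning from the ends."""
    n = 0
    for ca, cb in zip(reversed(seq_a), reversed(seq_b)):
        if ca != cb:
            break
        n += 1
    return n

def shared_prefix_len(seq_a, seq_b):
    """Number of identical leading characters."""
    n = 0
    for ca, cb in zip(seq_a, seq_b):
        if ca != cb:
            break
        n += 1
    return n

# e.g. shared_suffix_len("GGATTC", "TTC") == 3, so an upward merge would
# split "GGATTC" into a unique "GGA" node plus a shared "TTC" node C,
# exactly as in the A/B/X -> C diagram in compact_upward_node_pair().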
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import TGraph
import TNode
import Node_alignment
from Topological_sort import Topological_sort
import TGLOBALS
logger = logging.getLogger(__name__)
MAX_MM_RATE = 0.05
class Compact_graph_whole:
def __init__(self):
pass
def compact_unbranched(self, tgraph):
for node in tgraph.get_all_nodes():
if node.is_dead():
continue
prev_nodes = list(node.get_prev_nodes())
if len(prev_nodes) == 1:
# merge them
prev_node = prev_nodes[0]
if len(prev_node.get_next_nodes()) != 1:
# prev node is downward branched... no dice
continue
logger.debug("linear compaction of nodes {} and {}".format(prev_node, node))
node.add_transcripts(prev_node.get_transcripts())
node.set_seq(prev_node.get_seq() + node.get_seq())
tgraph.add_edges(prev_node.get_prev_nodes(), [node])
tgraph.prune_node(prev_node)
def compact_graph(self, tgraph, num_allowed_variants):
compacted_flag = True
round = 0
while compacted_flag:
round += 1
logger.debug("\n\t### Num allowed variants: {}, Round: {}".format(num_allowed_variants, round))
tgraph.clear_touch_settings() # start fresh
compacted_flag = False # reset for this round
sorted_nodes = Topological_sort.topologically_sort(tgraph.get_all_nodes())
if TGLOBALS.DEBUG:
dot_filename = "ladeda.compacted_whole.BEGIN.num_allowed_{}.round_{}.dot".format(num_allowed_variants, round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
for node in sorted_nodes:
if node.get_touched_val() > 0:
continue
logger.debug("\n\n///////// R[{}] Exploring compaction at node: {}\n\n".format(round, node.toString()))
# try compact upward
prev_nodes = node.get_prev_nodes()
if len(prev_nodes) > 1 and self.untouched(prev_nodes) and self.all_have_lower_topological_orderings(node, prev_nodes):
if self.compact_upward(prev_nodes, num_allowed_variants):
compacted_flag = True
next_nodes = node.get_next_nodes()
if len(next_nodes) > 1 and self.untouched(next_nodes) and self.all_have_higher_topological_orderings(node, next_nodes):
if self.compact_downward(next_nodes, num_allowed_variants):
compacted_flag = True
if compacted_flag:
if TGLOBALS.DEBUG:
dot_filename = "ladeda.compacted_whole.END.num_allowed_{}.round_{}.dot".format(num_allowed_variants, round)
tgraph.draw_graph(dot_filename)
logger.debug("-wrote dot: {}".format(dot_filename))
self.compact_unbranched(tgraph)
def untouched(self, node_list):
for node in node_list:
if node.get_touched_val() > 1:
return False
return True
def all_have_lower_topological_orderings(self, node, other_nodes_list):
topo_order = node.get_topological_order()
if topo_order < 0:
raise RuntimeError("Error, must run topological ordering first")
for other_node in other_nodes_list:
if other_node.get_topological_order() > topo_order:
return False
return True
def all_have_higher_topological_orderings(self, node, other_nodes_list):
topo_order = node.get_topological_order()
if topo_order < 0:
raise RuntimeError("Error, must run topological ordering first")
for other_node in other_nodes_list:
if other_node.get_topological_order() < topo_order:
return False
return True
####################
## Upward Compaction
def compact_upward(self, prev_nodes, num_allowed_variants):
logger.debug("compact_upward: {} max_allowed_variants: {}".format(prev_nodes, num_allowed_variants))
prev_nodes = list(prev_nodes)
for i in range(0, len(prev_nodes)-1):
for j in range(i+1, len(prev_nodes)):
if self.compact_upward_node_pair(prev_nodes[i], prev_nodes[j], num_allowed_variants):
return True
return False
def compact_upward_node_pair(self, node_A, node_B, num_allowed_variants):
logger.debug("compact_upward_node_pair({}, {}, num_allowed_variants: {}".format(node_A, node_B, num_allowed_variants))
# ensure they're on parallel paths in the graph.
if (node_A.is_ancestral(node_B) or node_B.is_ancestral(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
logger.debug("-not ancestral: {}, {}".format(node_A, node_B))
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
# reverse the strings and check number of mismatches
seqA_rev = seqA[::-1]
seqB_rev = seqB[::-1]
num_mm = 0
for i in range(0, shorter_len):
if seqA_rev[i] != seqB_rev[i]:
num_mm += 1
if num_mm > num_allowed_variants:
logger.debug("num_mm: {} exceeds {} for node pair: {} and {}".format(num_mm, num_allowed_variants, node_A, node_B))
return False
if num_mm / shorter_len > MAX_MM_RATE:
logger.debug("num_mm: {} / len: {} exceeds max mm rate: {} for node pair: {} and {}".format(num_mm, shorter_len,
MAX_MM_RATE, node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
##### operations needed:
#
# A
# \
# -- X
# /
# B
#
# -add B transcripts to A
# -adjust all B-downward edges to A
# -if B is longer than A, set upward to A and trim B part to unique region
# otherwise, remove B altogether.
#
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
node_A.add_transcripts(node_B.get_transcripts())
tgraph.add_edges( [node_A], node_B.get_next_nodes() )
tgraph.prune_edges( [node_B], node_B.get_next_nodes() )
node_A.touch()
node_B.touch()
if len(seqB) > shorter_len:
delta = len(seqB) - shorter_len
seqB = seqB[0:delta]
node_B.set_seq(seqB)
tgraph.add_edges([node_B], [node_A])
else:
# remove node_B prev edges and attach them to A
tgraph.add_edges(node_B.get_prev_nodes(), [node_A])
# remove B altogether
tgraph.prune_node(node_B)
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("\n\n\t*** compacted nodes: {} and {} ***".format(node_A, node_B))
return True
######################
## Downward compaction
def compact_downward(self, next_nodes, num_allowed_variants):
logger.debug("compact_downward: {} max_allowed_variants: {}".format(next_nodes, num_allowed_variants))
next_nodes = list(next_nodes)
for i in range(0, len(next_nodes)-1):
for j in range(i+1, len(next_nodes)):
if self.compact_downward_node_pair(next_nodes[i], next_nodes[j], num_allowed_variants):
return True
return False
def compact_downward_node_pair(self, node_A, node_B, num_allowed_variants):
# ensure they're on parallel paths in the graph.
if (node_A.is_descendant(node_B) or node_B.is_descendant(node_A)):
logger.debug("-not parallel paths. excluding compaction of {} and {}".format(node_A, node_B))
return False
# switch A,B so A is the shorter sequence node
if len(node_B.get_seq()) < len(node_A.get_seq()):
(node_A, node_B) = (node_B, node_A)
seqA = node_A.get_seq()
seqB = node_B.get_seq()
shorter_len = len(seqA)
num_mm = 0
for i in range(0, shorter_len):
if seqA[i] != seqB[i]:
num_mm += 1
if num_mm > num_allowed_variants:
logger.debug("num_mm: {} exceeds {} for node pair: {} and {}".format(num_mm, num_allowed_variants, node_A, node_B))
return False
if num_mm / shorter_len > MAX_MM_RATE:
logger.debug("num_mm: {} / len: {} exceeds max mm rate: {} for node pair: {} and {}".format(num_mm, shorter_len,
MAX_MM_RATE, node_A, node_B))
return False
# go ahead and merge the two in their region of common overlap
tgraph = node_A.get_graph()
##### operations needed:
#
# A
# /
# -- X
# \
# B
#
# -add B transcripts to A
# -adjust all B-downward edges to A
# -if B is longer than A, set downward to A and trim B part to unique region
# otherwise, remove B altogether.
#
logger.debug("node_A before mods: {}".format(node_A.toString()))
logger.debug("node_B before mods: {}".format(node_B.toString()))
node_A.add_transcripts(node_B.get_transcripts())
tgraph.add_edges(node_B.get_prev_nodes(), [node_A])
tgraph.prune_edges(node_B.get_prev_nodes(), [node_B])
node_A.touch()
node_B.touch()
if len(seqB) > shorter_len:
seqB = seqB[shorter_len:]
node_B.set_seq(seqB)
tgraph.add_edges([node_A], [node_B])
else:
# remove node_B prev edges and attach them to A
tgraph.add_edges([node_A], node_B.get_next_nodes())
# remove B altogether
tgraph.prune_node(node_B)
logger.debug("node_A after mods: {}".format(node_A.toString()))
logger.debug("node_B after mods: {}".format(node_B.toString()))
logger.debug("\n\n\t*** compacted nodes: {} and {} ***".format(node_A, node_B))
return True
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
logger = logging.getLogger(__name__)
class DP_matrix:
"""
defines the dynamic programming matrix for the node multiple alignments
"""
@staticmethod
def build_DP_matrix(num_rows, num_cols):
dp_matrix = list()
for i in range(0, num_rows+1):
row = []
for j in range(0, num_cols+1):
struct = { 'i' : i,
'j' : j,
'bt' : -1,
'score' : 0,
}
row.append(struct)
dp_matrix.append(row)
return dp_matrix
@staticmethod
def toString(dp_matrix):
nrow = len(dp_matrix)
ncol = len(dp_matrix[0])
logger.debug("Matrix is {} X {}".format(nrow, ncol))
ret_text = ""
for row in dp_matrix:
str_row = [ str(x['score']) for x in row ]
ret_text += "\t".join(str_row) + "\n"
return ret_text
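A quick usage sketch for the matrix factory above (assuming DP_matrix.py is importable under that name): build_DP_matrix(num_rows, num_cols) returns a (num_rows+1) x (num_cols+1) grid of cell dicts, with row 0 and column 0 serving as alignment boundary cells.

import DP_matrix

dp = DP_matrix.DP_matrix.build_DP_matrix(3, 4)
print(len(dp), len(dp[0]))               # 4 5 -> (num_rows+1) x (num_cols+1)
dp[1][1]['score'] = 7                    # each cell is a dict: {'i', 'j', 'bt', 'score'}
print(DP_matrix.DP_matrix.toString(dp))  # tab-delimited dump of the scores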
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TGraph
import TNode
import Node_path
import Node_alignment
from GraphCycleException import GraphCycleException
import Topological_sort
import DP_matrix
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
class Gene_splice_modeler:
"""
Builds supertranscripts.
object instance members:
gene_id : str
alignments : list of Node_alignment objects
"""
def __init__(self, gene_id, node_path_obj_list):
"""
initialize alignments list with simple single 'alignment' objects with
each path as an individual alignment with just its path nodes.
params:
gene_id : str
node_path_obj_list : list of Node_path objects, each Node_path corresponding to an individual Trinity isoform
"""
self.gene_id = gene_id
self.alignments = list()
logger.debug("Gene_splice_modeler inputs: {}".format(node_path_obj_list))
for node_path_obj in node_path_obj_list:
transcript_name = node_path_obj.get_transcript_name()
alignment_obj = Node_alignment.Node_alignment.get_single_seq_node_alignment(node_path_obj)
self.alignments.append(alignment_obj)
def get_gene_id(self):
return self.gene_id
def build_splice_model(self):
"""
method to construct the supertranscript.
Tries 2 approaches:
a. If there is no obvious repetitive node structure and the graph forms a DAG,
we build a splice graph and topologically sort the nodes.
b. If there is repetitive structure, we fall back on a multiple alignment-based method to
organize the relationships among nodes across isoforms; the multiple alignment then yields
the linear ordering for the supertranscript.
"""
if not self.alignment_contains_repeat_node():
# no obvious cycles
try:
return self.topological_order_splice_model()
except GraphCycleException:
# have a more complex cycle here...
# try again w/ mult align approach.
return self.multiple_alignment_splice_model()
else:
return self.multiple_alignment_splice_model()
def alignment_contains_repeat_node(self):
for alignment in self.alignments:
loc_ids = set()
for i in range(0, alignment.width()):
node_obj = alignment.get_representative_column_node(i)
loc_id = node_obj.get_loc_id()
if loc_id in loc_ids:
return True
loc_ids.add(loc_id)
return False
def topological_order_splice_model(self):
"""
Build supertranscript using simpler topological sorting of the nodes.
"""
logger.debug("\tusing topological sort method.\n");
gene_id = self.get_gene_id()
## make a generic graph.
graph = TGraph.TGraph(gene_id)
for alignment in self.alignments:
logger.debug("topological_order_splice_model, input alignment: " + str(alignment))
node_list = alignment.get_aligned_nodes()[0] # should be unaligned here, so just ordered path list.
transcript_name = alignment.get_transcript_names()[0]
logger.debug("topological_order_splice_model, node list: " + str(node_list))
for i in range(0, len(node_list)):
node_obj = node_list[i]
loc_id = node_obj.get_loc_id()
generic_node = graph.get_node(transcript_name, loc_id, node_obj.get_seq()) # rely on Node class caching system
logger.debug("generic node: " + str(generic_node))
if i > 0:
# set prev node info
prev_node_obj = node_list[i-1]
prev_generic_node = graph.get_node(transcript_name, prev_node_obj.get_loc_id(), prev_node_obj.get_seq())
generic_node.add_prev_node(prev_generic_node)
if i < len(node_list) - 1:
next_node_obj = node_list[i+1]
next_generic_node = graph.get_node(transcript_name, next_node_obj.get_loc_id(), next_node_obj.get_seq())
generic_node.add_next_node(next_generic_node)
logger.debug("Before sorting nodes: " + str(graph))
topologically_sorted_nodes = Topological_sort.Topological_sort.topologically_sort(graph.get_all_nodes())
logger.debug("Topologically sorted nodes: " + str(topologically_sorted_nodes))
# index loc node ids
aligned_loc_id_pos = dict()
for i in range(0, len(topologically_sorted_nodes)):
loc_id = topologically_sorted_nodes[i].get_loc_id()
aligned_loc_id_pos[loc_id] = i
new_alignments = list()
transcript_ids = list()
for alignment in self.alignments:
transcript_ids.append(alignment.get_transcript_names()[0]) # really should only be one here.
new_alignment = [None for i in topologically_sorted_nodes]
for node in alignment.get_aligned_nodes()[0]:
loc_id = node.get_loc_id()
new_idx = aligned_loc_id_pos[loc_id]
new_alignment[new_idx] = node
new_alignments.append(new_alignment)
splice_graph_model = Node_alignment.Node_alignment(gene_id, transcript_ids, new_alignments)
logger.debug("Splice graph model: " + str(splice_graph_model))
return splice_graph_model
def multiple_alignment_splice_model(self):
"""
Multiple alignment algorithm for dealing with repeat nodes:
For each best matching pair of transcripts (or aligned transcripts),
perform alignment, and replace aligned pair with a single alignment object.
"""
logger.debug("\tusing mult alignment method.\n");
alignments = self.alignments
if len(alignments) == 1:
# no alignment is necessary.
return alignments[0]
# determine initial path similarity
similarity_matrix = Gene_splice_modeler.compute_similarity_matrix(self.alignments)
logger.debug("Similarity matrix:\n" + str(similarity_matrix))
## build multiple alignment in a hierarchical way
while len(similarity_matrix) > 1:
# set diag to -1 to avoid any zero ties w/ self-vals
for i in range(0,len(alignments)):
similarity_matrix[ i ][ i ] = -1
## find best pair
best_pair_idx = int(numpy.argmax(similarity_matrix))
num_alignments = len(similarity_matrix)
best_pair_idx_1 = int(best_pair_idx / num_alignments)
best_pair_idx_2 = best_pair_idx % num_alignments
## merge pair into single alignment
align_a = alignments[ best_pair_idx_1 ]
align_b = alignments[ best_pair_idx_2 ]
align_merged = Gene_splice_modeler.merge_alignments(align_a, align_b)
## recompute matrix
new_alignment_list = list()
for i in range(0, len(alignments)):
if i not in (best_pair_idx_1, best_pair_idx_2):
new_alignment_list.append(alignments[ i ])
new_alignment_list.append(align_merged)
alignments = new_alignment_list
logger.debug("\nUpdated alignments:\n" + str(alignments))
similarity_matrix = Gene_splice_modeler.compute_similarity_matrix(alignments)
logger.debug("Similarity matrix:\n" + str(similarity_matrix))
if len(alignments) > 1:
raise RuntimeError("Error, should only have one alignment but have {} alignments after merge".format(len(alignments)))
return alignments[0]
@staticmethod
def compute_similarity_matrix(alignments_list):
"""
similarity matrix indicates number of shared nodes between each pair of isoforms.
"""
num_alignments = len(alignments_list)
sim_matrix = numpy.zeros( (num_alignments, num_alignments), dtype='int_' )
for i in range(0, num_alignments-1):
align_i = alignments_list[i]
for j in range(i+1, num_alignments):
align_j = alignments_list[j]
common_nodes = Node_alignment.Node_alignment.compute_number_common_nodes(align_i, align_j)
num_common_nodes = len(common_nodes)
sim_matrix[ i ][ j ] = num_common_nodes
return sim_matrix
@staticmethod
def merge_alignments(align_a, align_b):
"""
Computes a mismatch-free multiple alignment (just matches and gaps) between two Node_alignment objects.
Returns a single Node_alignment object containing the merged contents of align_a and align_b.
"""
logger.debug("Merging alignments {} and {}".format(align_a, align_b))
## ensure the transcripts are disjoint
transcript_names_align_A = set(align_a.get_transcript_names())
transcript_names_align_B = set(align_b.get_transcript_names())
if not set.isdisjoint(transcript_names_align_A, transcript_names_align_B):
raise RuntimeError("Error, transcripts in alignments to merge are not disjoint: {} and {}".format(transcript_names_align_A, transcript_names_align_B))
width_a = align_a.width()
width_b = align_b.width()
# do global alignments w/o penalizing end gaps
dp_matrix = DP_matrix.DP_matrix.build_DP_matrix(width_a, width_b)
# put align B across top (cols) and align A at side (row)
# init the matrix zero rows
for i in range(1, width_a+1):
dp_matrix[ i ][ 0 ]['bt'] = 'DEL_B' # UP
for j in range(1, width_b+1):
dp_matrix[ 0 ][ j ]['bt'] = 'DEL_A' # LEFT
# score the DP matrix
for i in range(1, width_a+1):
for j in range(1, width_b+1):
score_cell_match = Gene_splice_modeler.get_match_score(align_a, i-1, align_b, j-1) # score matrix is 1-based, align is 0-based
score_diag = dp_matrix[ i-1 ][ j-1 ]['score'] + score_cell_match
score_del_a = dp_matrix[ i ][ j-1 ]['score']
score_del_b = dp_matrix[ i-1 ][ j ]['score']
if score_cell_match > 0 and score_diag >= score_del_a and score_diag >= score_del_b:
dp_matrix[ i ][ j ]['score'] = score_diag
dp_matrix[ i ][ j ]['bt'] = 'DIAG'
elif score_del_a >= score_del_b:
dp_matrix[ i ][ j ]['score'] = score_del_a
dp_matrix[ i ][ j ]['bt'] = 'DEL_A'
else:
dp_matrix[ i ][ j ]['score'] = score_del_b
dp_matrix[ i ][ j ]['bt'] = 'DEL_B'
#logger.debug("DP_matrix:\n" + DP_matrix.toString(dp_matrix))
"""
# get max score
max_score = 0
max_i = -1
max_j = -1
for i in range(0,width_a+1):
score = dp_matrix[ i ][ width_b ]['score']
if score > max_score:
max_score = score
max_i = i
max_j = width_b
for j in range(0, width_b+1):
score = dp_matrix[ width_a ][ j ]['score']
if score > max_score:
max_score = score
max_i = width_a
max_j = j
logger.info("found max score {} at position: ({},{})".format(max_score, max_i, max_j))
"""
# keep as global alignment
max_i = width_a
max_j = width_b
# backtrack
i = max_i
j = max_j
all_merged_alignment_nodes_list = list()
while i > 0 or j > 0:
score_struct = dp_matrix[ i ][ j ]
nodes_align_a = align_a.get_node_LIST_at_column_pos(i-1) # again, remember align has zero-based coords, whereas dp_matrix is 1-based
nodes_align_b = align_b.get_node_LIST_at_column_pos(j-1)
align_nodes = list()
bt_dir = score_struct['bt']
#logger.debug("backtrack-dir: " + bt_dir)
if bt_dir == 'DIAG':
i -= 1
j -= 1
align_nodes = nodes_align_a + nodes_align_b
elif bt_dir == 'DEL_B': # UP
i -= 1
align_nodes += nodes_align_a
for x in range(0,len(nodes_align_b)):
align_nodes.append(None)
elif bt_dir == 'DEL_A': # LEFT
j -= 1
for x in range(0,len(nodes_align_a)):
align_nodes.append(None)
align_nodes += nodes_align_b
else:
raise RuntimeError("bt: ({},{}), bt_dir not defined".format(i,j))
all_merged_alignment_nodes_list.append(align_nodes)
all_merged_alignment_nodes_list.reverse()
logger.debug("Merged alignment nodes list: " + str(all_merged_alignment_nodes_list) )
# prep merged alignment obj
merged_transcript_name_list = align_a.get_transcript_names() + align_b.get_transcript_names()
node_obj_matrix = list()
# iterate through each node list, reorganize into a matrix
for i in range(0,len(merged_transcript_name_list)):
row = list()
for node_obj_list in all_merged_alignment_nodes_list:
row.append(node_obj_list[i])
node_obj_matrix.append(row)
logger.debug("merged alignment node matrix:\n" + str(node_obj_matrix))
merged_alignment_obj = Node_alignment.Node_alignment(align_a.get_gene_id(), merged_transcript_name_list, node_obj_matrix)
logger.debug("merged alignment obj:\n" + str(merged_alignment_obj))
#sys.exit(1) # DEBUG
return merged_alignment_obj
@staticmethod
def get_match_score(align_a, idx_a, align_b, idx_b):
"""
determines whether the given column positions in the two alignments share a node identifier
"""
node_set_a = align_a.get_node_set_at_column_pos(idx_a)
node_set_b = align_b.get_node_set_at_column_pos(idx_b)
node_set_a = Node_alignment.Node_alignment.get_node_loc_ids(node_set_a)
node_set_b = Node_alignment.Node_alignment.get_node_loc_ids(node_set_b)
if (set.intersection(node_set_a, node_set_b)):
return 1 # match
else:
return 0 # no match
@staticmethod
def write_malign(gene_name, malign_dict, ofh, align_width=100):
"""
writes the multiply aligned isoform sequences to an output filehandle
"""
transcript_names = list(malign_dict.keys())  # list() so indexing works under Python 3 as well
alignment_length = len(malign_dict[ transcript_names[ 0 ] ])
align_start = 0
align_text = ""
while align_start < alignment_length:
for transcript_name in transcript_names:
align_region = malign_dict[ transcript_name ][ align_start : min(alignment_length, align_start + align_width) ]
align_text += transcript_name + "\t" + align_region + "\n"
align_text += "\n" # spacer between alignment blocks
align_start += align_width
ofh.write("// {}\n\n{}\n".format(gene_name, align_text))
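One subtle step in multiple_alignment_splice_model() above is decoding numpy.argmax, which returns an index into the flattened similarity matrix; integer division and modulus recover the (row, column) pair to merge. A small self-contained illustration:

# Sketch of the pair-selection step in multiple_alignment_splice_model():
# decode the flat argmax index back into (row, col).
import numpy

sim = numpy.array([[-1,  2,  0],
                   [ 2, -1,  5],
                   [ 0,  5, -1]])       # diagonal pre-set to -1, as above
flat_idx = int(numpy.argmax(sim))       # 5
n = len(sim)
best_i, best_j = flat_idx // n, flat_idx % n
print(best_i, best_j)                   # 1 2 -> merge alignments 1 and 2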
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
logger = logging.getLogger(__name__)
class GraphCycleException(Exception):
pass
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TNode
import TGraph
logger = logging.getLogger(__name__)
class Node_alignment:
"""
Object has two members:
transcript_names = [ transA,
transB,
transC,
...
]
aligned_nodes = [ [transA_node_1, transA_node_2, ... ],
[transB_node_1, transB_node_2, ... ],
[ None, transC_node_1, ... ],
]
Note, can have None at node positions to include gaps.
"""
GAP = None
def __init__(self, gene_id, transcript_name_list, node_obj_matrix):
self.gene_id = gene_id
self.transcript_names = transcript_name_list
self.aligned_nodes = node_obj_matrix
def get_gene_id(self):
return self.gene_id
def set_gene_id(self, gene_id):
self.gene_id = gene_id
def get_transcript_names(self):
# accessor
return self.transcript_names
def get_aligned_nodes(self):
# accessor
return self.aligned_nodes
@staticmethod
def get_single_seq_node_alignment(path_obj):
"""
Factory method:
constructs a Node_alignment object from a Node_path object
mostly just reshaping the info for use with the multiple alignment methods.
"""
node_list = list()
for node_obj in path_obj.get_path():
node_list.append(node_obj)
transcript_name = path_obj.get_transcript_name()
alignment_obj = Node_alignment(transcript_name, [transcript_name], [node_list])
return alignment_obj
@staticmethod
def compute_number_common_nodes(align_A, align_B):
"""
given two Node_alignment objects, counts the number of shared nodes
"""
node_set_a = Node_alignment.get_node_set(align_A)
node_set_b = Node_alignment.get_node_set(align_B)
node_set_a = Node_alignment.get_node_loc_ids(node_set_a)
node_set_b = Node_alignment.get_node_loc_ids(node_set_b)
common_nodes = set.intersection(node_set_a, node_set_b)
return common_nodes
@staticmethod
def get_node_loc_ids(node_set):
"""
private static method
gets the set of loc_ids among all nodes in the given node set
"""
loc_ids_set = set()
for node in node_set:
loc_id = node.get_loc_id()
loc_ids_set.add(loc_id)
return loc_ids_set
@staticmethod
def get_node_set(align_obj):
"""
extracts the set of unique Node objects from the Node_alignment object
"""
num_trans = len(align_obj)
alignment_width = align_obj.width()
node_set = set()
for align_num in range(0,num_trans):
for align_pos in range(0,alignment_width):
node_obj = align_obj.aligned_nodes[ align_num ][ align_pos ]
if node_obj is not None:
node_set.add(node_obj)
return node_set
def get_node_set_at_column_pos(self, col_pos):
"""
At a given column of the Node_alignment, extracts the set of unique nodes
"""
# FIXME: since we're not dealing with mismatched nodes, there really should only be one node here
# that's shared among the different alignments
# Need refactoring across the next few methods as well for the same reason.
node_objs = set()
for i in range(0, len(self)):
node_obj = self.aligned_nodes[ i ][ col_pos ]
if node_obj is not None:
node_objs.add(node_obj)
return node_objs
def get_representative_column_node(self, col_pos):
node_list = list(self.get_node_set_at_column_pos(col_pos))
return node_list[ 0 ]
def get_node_LIST_at_column_pos(self, col_pos):
node_objs = list()
for i in range(0, len(self)):
node_obj = self.aligned_nodes[ i ][ col_pos ]
node_objs.append(node_obj)
return node_objs
def get_node_occupancy_at_column_pos(self, col_pos):
node_list = self.get_node_LIST_at_column_pos(col_pos)
occupancy_list = list()
for node in node_list:
if node is None:
occupancy_list.append(False)
else:
occupancy_list.append(True)
return occupancy_list
def append_node_to_each_entry(self, node_obj):
for i in range(0, len(self)):
self.aligned_nodes[ i ].append(node_obj)
def append_node_according_to_occupancy_pattern(self, node_obj, occupancy_pattern):
for i in range(0, len(self)):
if occupancy_pattern[i] is True:
self.aligned_nodes[ i ].append(node_obj)
else:
self.aligned_nodes[ i ].append(None)
def add_column(self, column_node_list):
num_alignments = len(self)
if len(column_node_list) != num_alignments:
raise RuntimeError("Error, column size differs from num_alignments")
for i in range(0,num_alignments):
self.aligned_nodes[ i ].append(column_node_list[ i ])
def __len__(self):
"""
number of transcripts represented in the alignment
"""
return(len(self.transcript_names))
def width (self):
"""
width of the alignment (number of columns)
"""
return(len(self.aligned_nodes[0]))
def __repr__(self):
num_transcripts = len(self.transcript_names)
ret_text = "\n# Alignment obj contains: {} transcripts: {}\n\n".format(num_transcripts, ",".join(self.transcript_names))
align_width = self.width()
NODES_PER_LINE = 10
# each alignment block
for i in range(0, align_width, NODES_PER_LINE):
# report alignment for each entry
for j in range(0,num_transcripts):
transcript_name = self.transcript_names[ j ]
aligned_nodes_entry = self.aligned_nodes[ j ]
ret_text += "{}".format(transcript_name)
for x in range(i, i+NODES_PER_LINE):
if x >= align_width:
break
ret_text += "\t{}".format(aligned_nodes_entry[ x ])
ret_text += "\n" # end of current line
ret_text += "\n" # spacer between alignment blocks
#ret_text += "Align [{}] trans {} : path {}".format(i, transcript_name, str(aligned_nodes_entry)) + "\n"
return ret_text
def squeeze(self):
"""
merge unbranched nodes into single nodes
"""
num_transcripts = len(self)
width = self.width()
node_obj_matrix = list()
for i in range(0,num_transcripts):
node_obj_matrix.append([])
squeezed_alignment = Node_alignment(self.get_gene_id(), self.get_transcript_names(), node_obj_matrix)
# walk the node list and merge unbranched stretches into single nodes
block_breakpoints = []
prev_col_node_set = self.get_node_occupancy_at_column_pos(0)
for i in range(1,width):
node_column_set = self.get_node_occupancy_at_column_pos(i)
#print("Comparing {} to {} == {}".format(prev_col_node_set, node_column_set, prev_col_node_set == node_column_set))
if node_column_set != prev_col_node_set:
block_breakpoints.append(i)
prev_col_node_set = node_column_set
block_breakpoints.append(width)
logger.debug("Block_breakpoints: {}".format(block_breakpoints))
blocked_nodes = list()
for i in range(0, width+1):
if i in block_breakpoints:
# found block terminator
node_to_add = None
if len(blocked_nodes) > 1:
node_to_add = TNode.TNode.merge_nodes(blocked_nodes)
else:
node_to_add = blocked_nodes[ 0 ]
blocked_node_occupancy = self.get_node_occupancy_at_column_pos(i-1)
squeezed_alignment.append_node_according_to_occupancy_pattern(node_to_add, blocked_node_occupancy)
blocked_nodes = list() # reinit
# add to running block
if i < width:
blocked_nodes.append(self.get_representative_column_node(i))
return squeezed_alignment
def to_gene_fasta_and_gtf(self, gene_name):
transcript_names = self.get_transcript_names()
gene_seq = ""
# init transcript gtf records
transcript_to_gtf_lines = dict()
transcript_to_malign = dict()
transcript_to_Trinity_fa_seq = dict()
transcript_to_Trinity_fa_path = dict()
for transcript_name in transcript_names:
transcript_to_gtf_lines[ transcript_name ] = ""
transcript_to_malign[ transcript_name ] = ""
transcript_to_Trinity_fa_path[ transcript_name ] = list()
transcript_to_Trinity_fa_seq[ transcript_name ] = ""
for i in range(0,self.width()):
node_obj = self.get_representative_column_node(i)
node_seq = node_obj.get_seq()
if len(node_seq) == 0:
raise RuntimeError("Error, node seq of length zero: node=" + str(node_obj))
node_id = node_obj.get_loc_id()
node_occupancy = self.get_node_occupancy_at_column_pos(i)
pos_start = len(gene_seq) + 1
gene_seq += node_obj.get_seq()
pos_end = len(gene_seq)
# include gtf record for transcripts
for j in range(0,len(transcript_names)):
transcript_name = transcript_names[ j ]
if node_occupancy[ j ] is True:
# make gtf record
transcript_to_gtf_lines[ transcript_name ] += "\t".join([gene_name, "Trinity_gene", "exon",
str(pos_start), str(pos_end), '.', '+', '.',
"gene_id \"{}\"; transcript_id \"{}\"\n".format(
gene_name, transcript_name) ] )
transcript_to_malign[ transcript_name ] += node_seq
# build Trinity fasta sequence and path info:
cdna_seq_len = len(transcript_to_Trinity_fa_seq[ transcript_name ])
rel_node_start = cdna_seq_len # index starting at zero
rel_node_end = cdna_seq_len + len(node_seq) -1
transcript_to_Trinity_fa_seq[ transcript_name ] += node_seq
transcript_to_Trinity_fa_path[ transcript_name ].append("{}:{}-{}".format(node_id, rel_node_start, rel_node_end))
else:
for x in range(0,len(node_seq)):
transcript_to_malign[ transcript_name ] += '.'
# build mini-gtf section
gene_gtf = "\n".join(transcript_to_gtf_lines.values())
# build Trinity fasta text
trinity_fasta_text = ""
for transcript_name in transcript_names:
transcript_seq = transcript_to_Trinity_fa_seq[transcript_name]
path_list = transcript_to_Trinity_fa_path[transcript_name]
#logger.debug("path list: " + str(path_list))
path_list_text = " ".join(path_list)
trinity_fasta_text += ">{} len={} path=[{}]\n{}\n".format(transcript_name, len(transcript_seq),
path_list_text, transcript_seq)
return (gene_seq, gene_gtf, trinity_fasta_text, transcript_to_malign)
def reassign_node_loc_ids_by_align_order(self):
for i in range(0,self.width()):
repr_node = self.get_representative_column_node(i)
repr_node.set_loc_id(str(i))
def to_splice_graph(self, gene_name, reset_node_ids=False):
aligned_nodes = self.get_aligned_nodes()
width = self.width()
refined_tgraph = TGraph.TGraph(gene_name)
new_node_list = list()
for i in range(0,width):
repr_node = self.get_representative_column_node(i)
transcripts = repr_node.get_transcripts()
loc_id = repr_node.get_loc_id()
if reset_node_ids:
loc_id = "loc_" + str(i)
new_node = refined_tgraph.get_node(transcripts, loc_id, repr_node.get_seq())
new_node_list.append(new_node)
#############
# build graph
for iso_node_alignment in aligned_nodes:
prev = None
for i in range(0,width):
if iso_node_alignment[i] != None:
if prev != None:
refined_tgraph.add_edges([prev], [new_node_list[i]])
prev = new_node_list[i]
logger.debug("New graph node listing:")
for node in new_node_list:
logger.debug(node.toString())
return refined_tgraph
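The squeeze() method above finds block boundaries by scanning column occupancy patterns: a new block starts wherever the set of transcripts covering a column changes. A self-contained sketch of that breakpoint scan (hypothetical helper mirroring the loop in squeeze()):

# Hypothetical helper mirroring the breakpoint scan in Node_alignment.squeeze().
def block_breakpoints(occupancy_columns):
    """occupancy_columns: one True/False-per-transcript list per alignment column."""
    breakpoints = []
    prev = occupancy_columns[0]
    for i, col in enumerate(occupancy_columns[1:], start=1):
        if col != prev:
            breakpoints.append(i)   # occupancy changed -> block boundary
            prev = col
    breakpoints.append(len(occupancy_columns))  # terminal boundary, as in squeeze()
    return breakpoints

# Two transcripts over 5 columns; transcript 2 is absent from the middle column:
cols = [[True, True], [True, True], [True, False], [True, True], [True, True]]
print(block_breakpoints(cols))   # [2, 3, 5] -> columns 0-1, 2, and 3-4 merge into three nodes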
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TNode
import Trinity_util
logger = logging.getLogger(__name__)
class Node_path:
"""
Object representation of the connected set of Node objects that represents a reconstructed isoform's traversal of the graph
Instance members:
transcript_name : (str) name of the isoform
node_obj_list : (list) of Node objects
"""
def __init__(self, tgraph, transcript_name, path_string, sequence):
"""
constructor, instantiates Node_path and builds vertices in the graph
"""
self.transcript_name = transcript_name
self.node_obj_list = list()
node_descr_list = re.findall(r"\d+:\d+\-\d+", path_string)
# 1st node is special (full kmer prefix included)
first_kmer_flag = False
obj_node_list = list()
for node_descr in node_descr_list:
(loc_node_id, node_coord_range) = node_descr.split(":")
(lend,rend) = node_coord_range.split("-")
lend = int(lend)
rend = int(rend)
if not first_kmer_flag:
first_kmer_flag = True
loc_node_id += 'fst'
# use factory call to instantiate node objects:
node_obj = tgraph.get_node(transcript_name,
loc_node_id, sequence[lend:rend+1]) # coords in path were already zero-based
self.node_obj_list.append(node_obj)
def get_transcript_name(self):
return self.transcript_name
def get_path(self):
return self.node_obj_list
def __repr__(self):
node_str_list = list()
for node in self.node_obj_list:
node_str_list.append(str(node))
path_str = "--".join(node_str_list)
return path_str
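The regex in Node_path.__init__ pulls `id:start-end` node descriptors out of a Trinity path annotation. A small sketch of that parse on a made-up path string (coordinates zero-based and end-inclusive, as the constructor's slice sequence[lend:rend+1] implies):

# Sketch of the path-string parse in Node_path.__init__ on a made-up
# Trinity-style path annotation (hypothetical example data).
import re

path_string = "path=[0:0-99 1:100-250 0:251-300]"
for node_descr in re.findall(r"\d+:\d+\-\d+", path_string):
    loc_node_id, coord_range = node_descr.split(":")
    lend, rend = (int(x) for x in coord_range.split("-"))
    print(loc_node_id, lend, rend)   # node id plus its slice of the transcript sequence
# prints: 0 0 99 / 1 100 250 / 0 251 300 -- the repeated node id "0" is the
# kind of repeat that forces the multiple-alignment path in Gene_splice_modeler.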
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import TGraph
import TNode
import Node_alignment
import Topological_sort
from Compact_graph_whole import Compact_graph_whole
from Compact_graph_partial import Compact_graph_partial
import TGLOBALS
logger = logging.getLogger(__name__)
MAX_MM_RATE = 0.05
def refine_alignment(node_alignment_obj, reset_node_ids=False):
"""
Create a new splice graph based on the node alignment obj.
Since some nodes may show up as duplicate (repeat) nodes, assign each a unique ID
"""
logger.debug("refine_alignment({})".format(node_alignment_obj))
# convert to splice graph
refined_tgraph = node_alignment_obj.to_splice_graph("^^SGRAPH2^^", reset_node_ids)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.pre.dot")
graph_compactor = Compact_graph_whole()
graph_compactor.compact_unbranched(refined_tgraph)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.linear_compact.dot")
###########
#
for allowed_variants in (0, 1, 2):
graph_compactor.compact_graph(refined_tgraph, allowed_variants)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.compact.m{}.dot".format(allowed_variants))
## now extract prefix and suffix matches
partial_graph_compactor = Compact_graph_partial()
partial_graph_compactor.compact_graph(refined_tgraph)
if TGLOBALS.DEBUG:
refined_tgraph.draw_graph("ladeda.final.dot")
# convert compacted graph into a node alignment obj
splice_graph_node_alignment = splice_graph_to_node_alignment(refined_tgraph)
splice_graph_node_alignment = remove_redundant_paths(splice_graph_node_alignment)
return(splice_graph_node_alignment)
def splice_graph_to_node_alignment(tgraph):
topologically_sorted_nodes = Topological_sort.Topological_sort.topologically_sort(tgraph.get_all_nodes())
logger.debug("Topologically sorted nodes: " + str(topologically_sorted_nodes))
# index loc node ids
aligned_loc_id_pos = dict()
for i in range(0, len(topologically_sorted_nodes)):
loc_id = topologically_sorted_nodes[i].get_loc_id()
aligned_loc_id_pos[loc_id] = i
new_alignments = list()
transcript_ids = set()
for node in topologically_sorted_nodes:
transcript_ids = transcript_ids.union(node.get_transcripts())
transcript_ids = list(transcript_ids)
for transcript_id in transcript_ids:
new_alignment = [None for i in topologically_sorted_nodes]
for node in topologically_sorted_nodes:
if transcript_id in node.get_transcripts():
loc_id = node.get_loc_id()
new_idx = aligned_loc_id_pos[loc_id]
new_alignment[new_idx] = node
new_alignments.append(new_alignment)
splice_graph_node_alignment = Node_alignment.Node_alignment(tgraph.get_gene_id(), transcript_ids, new_alignments)
logger.debug("Splice graph node_alignment: " + str(splice_graph_node_alignment))
return(splice_graph_node_alignment)
def remove_redundant_paths(node_alignment):
gene_id = node_alignment.get_gene_id()
transcript_names = node_alignment.get_transcript_names()
aligned_nodes = node_alignment.get_aligned_nodes()
num_transcripts_before_reduction = len(transcript_names)
# do all pairwise comparisons
# check for containments
containments = set()
for i in range(0,len(aligned_nodes)-1):
for j in range(i+1, len(aligned_nodes)):
if a_contains_b(aligned_nodes[i], aligned_nodes[j]):
containments.add(j)
elif a_contains_b(aligned_nodes[j], aligned_nodes[i]):
containments.add(i)
if containments:
adj_transcript_names = list()
adj_aligned_nodes = list()
for i in range(0, len(aligned_nodes)):
if i not in containments:
adj_transcript_names.append(transcript_names[i])
adj_aligned_nodes.append(aligned_nodes[i])
adj_splice_graph_node_alignment = Node_alignment.Node_alignment(gene_id, adj_transcript_names, adj_aligned_nodes)
num_after_reduction = len(adj_transcript_names)
logger.debug("Containments found, reporting reduced set {} of {} = {:.2f}%".format(
num_after_reduction, num_transcripts_before_reduction,
num_after_reduction/num_transcripts_before_reduction*100))
return adj_splice_graph_node_alignment
else:
logger.debug("No containments found")
return node_alignment # no changes
def get_first_node_idx(node_list):
# find starting place for comparison
begin_idx = -1
for i in range(0,len(node_list)):
if node_list[i] is not None:
begin_idx = i
break
if begin_idx < 0:
raise RuntimeError("Error, didn't find first non-none value among {} and {}".format(node_list_A, node_list_B))
return begin_idx
def get_end_node_idx(node_list):
# find ending place for comparison
end_idx = -1
for i in reversed(range(0, len(node_list))):
if node_list[i] is not None:
end_idx = i
break
if end_idx < 0:
raise RuntimeError("Error, didn't find last non-none value among {} and {}".format(node_list_A, node_list_B))
return end_idx
def a_contains_b(node_list_A, node_list_B):
A_start = get_first_node_idx(node_list_A)
A_end = get_end_node_idx(node_list_A)
B_start = get_first_node_idx(node_list_B)
B_end = get_end_node_idx(node_list_B)
if not (A_start <= B_start and A_end >= B_end):
return False # no containment
# ensure that in overlapping region, they have identical nodes
for i in range(B_start, B_end+1):
# if we see any difference, then not compatible
if node_list_A[i] != node_list_B[i]:
return False
# must be compatible and contained
return True
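
# Worked example of the containment test above (hypothetical rows over the
# same alignment columns):
#
#   A: [n1,   n2, n3, n4  ]
#   B: [None, n2, n3, None]
#
# a_contains_b(A, B) is True: B spans columns 1..2, A spans 0..3, and the two
# rows hold identical nodes over B's span. If B carried a different node at
# column 2, the rows would be incompatible and the function would return False.
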
#!/usr/bin/env python
# module-wide debug flag, referenced elsewhere as TGLOBALS.DEBUG
DEBUG = False
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TNode
logger = logging.getLogger(__name__)
class TGraph:
def __init__(self, gene_id):
self.node_cache = dict()
self.gene_id = gene_id
def get_node(self, transcript_id, loc_node_id, node_seq):
"""
Instantiates Node objects, and stores them in a graph.
*** use this method for instantiating all Node objects ***
use clear_node_cache() to clear the graph
"""
logger.debug("{}\t{}".format(loc_node_id, node_seq))
        if len(node_seq) == 0:
            raise RuntimeError("Error, get_node() requires a non-empty node_seq")
if loc_node_id in self.node_cache:
node_obj = self.node_cache[ loc_node_id ]
node_obj.add_transcripts(transcript_id)
if node_obj.seq != node_seq:
errmsg = "ERROR: have conflicting node sequences for {} node_id: {}\n".format(self.get_gene_id(),
loc_node_id) + "{}\n vs. \n{}".format(node_obj.seq, node_seq)
logger.critical(errmsg)
raise RuntimeError(errmsg)
else:
return node_obj
else:
# instantiate a new one
node_obj = TNode.TNode(self, transcript_id, loc_node_id, node_seq)
self.node_cache[ loc_node_id ] = node_obj
return node_obj
    def get_all_nodes(self):
        # return a list copy; under Python 3 a bare dict view cannot be indexed or sorted by callers
        return list(self.node_cache.values())
def clear_node_cache(self):
"""
clears the graph
"""
self.node_cache.clear()
def clear_touch_settings(self):
"""
clear the touch settings for each of the nodes
"""
for node in self.get_all_nodes():
node.clear_touch()
def add_edges(self, from_nodes_list, to_nodes_list):
for from_node in from_nodes_list:
for to_node in to_nodes_list:
from_node.add_next_node(to_node)
to_node.add_prev_node(from_node)
def prune_edges(self, from_nodes_list, to_nodes_list):
for from_node in from_nodes_list:
for to_node in to_nodes_list:
from_node.remove_next_node(to_node)
to_node.remove_prev_node(from_node)
def prune_node(self, node):
logger.debug("pruning node: {}".format(node))
self.prune_edges(node.get_prev_nodes(), [node])
self.prune_edges([node], node.get_next_nodes())
node.dead = True
self.node_cache.pop(node.get_loc_id())
def get_gene_id(self):
return self.gene_id
    def draw_graph(self, filename):
        # write the graph in Graphviz DOT format
        with open(filename, 'w') as ofh:
            ofh.write("digraph G {\n")
            for node in self.node_cache.values():
                node_seq = node.get_seq()
                gene_node_id = node.get_gene_node_id()
                ofh.write("{} [label=\"{}:Len{}:T{}:{}\"]\n".format(node.get_id(), gene_node_id, len(node_seq),
                                                                    node.get_topological_order(), node_seq))
                for next_node in node.get_next_nodes():
                    ofh.write("{}->{}\n".format(node.get_id(), next_node.get_id()))
            ofh.write("}\n")  # close the digraph
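
# Minimal usage sketch of the TGraph API defined above (node ids and sequences
# here are hypothetical, for illustration only):
#
#   tgraph = TGraph("gene_1")
#   n1 = tgraph.get_node("iso_1", "n1", "ACGT")
#   n2 = tgraph.get_node("iso_1", "n2", "TTGA")
#   tgraph.add_edges([n1], [n2])
#   tgraph.draw_graph("gene_1.dot")  # render with Graphviz: dot -Tpdf gene_1.dot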
#!/usr/bin/env python
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
import numpy
import time
import TGraph
logger = logging.getLogger(__name__)
class TNode:
"""
generic Trinity graph node object representing a node in the Trinity isoform reconstruction graph
Node's are objects within a gene and can be shared among transcript isoforms.
instance members include:
tgraph : (TGraph obj) graph for the Trinity gene, which will hold the nodes.
transcripts: list(str) names of the isoforms that contains this node.
loc_node_id : (int) identifier of the node
seq : (str) nucleotide sequence for this node in the transcript
len : (int) length of the node sequence
prev : (set) node objects connected as parental nodes in the graph
next : (set) node objects connected as descendant nodes in the graph
class members include:
merged_nodeset_counter : (int) tracking nodes that get merged under squeeze operations.
"""
node_cache = dict()
merged_nodeset_counter = 0
all_nodes_counter = 0
def __init__(self, tgraph, transcript_id, loc_node_id, node_seq):
"""
constructor, but don't use directly.... instead, use TGraph.get_node() factory function
"""
if len(node_seq) == 0:
raise RuntimeError("Error, TNode instantiation requires node sequence of length > 0")
self.tgraph = tgraph
self.transcripts = set()
self.add_transcripts(transcript_id)
self.loc_node_id = loc_node_id
self.seq = node_seq
self.len = len(node_seq)
TNode.all_nodes_counter += 1
self._id = TNode.all_nodes_counter
#logger.info("{}\t{}".format(loc_node_id, node_seq))
self.prev = set()
self.next = set()
self.stashed_prev = set() # for manipulation during topological sorting
self.stashed_next = set()
self.touched = 0
self.dead = False
self.topological_order = -1 # updated on topological sorting
#########################
## various Node ID values
#########################
def get_id(self):
# a private unique identifier for all nodes
return self._id
def get_loc_id(self):
return self.loc_node_id
def set_loc_id(self, loc_node_id):
self.loc_node_id = loc_node_id
def get_gene_id(self):
return self.tgraph.get_gene_id()
def get_gene_node_id(self):
gene_id = self.get_gene_id()
loc_id = self.get_loc_id()
node_id = gene_id + "::" + loc_id
return node_id
def get_touched_val(self):
return self.touched
def is_dead(self):
return(self.dead)
def is_ancestral(self, node, visited=None):
if visited is None:
visited = set() #init round
#logger.debug("is_ancestral search from {} of node {}".format(self, node))
if node == self:
#logger.debug("node is self")
return True
if node in self.prev:
#logger.debug("node in self.prev")
return True
else:
#logger.debug("continuing search")
visited.add(self)
#logger.debug("visited: {}".format(visited))
for prev_node in self.prev:
#logger.debug("cascading towards prev_node: {}".format(prev_node))
if prev_node in visited:
#logger.debug("prev_node in visited")
pass
else:
#logger.debug("prev_node not in visited")
found = prev_node.is_ancestral(node, visited)
if found:
return True
return False
def is_descendant(self, node, visited=None):
        if visited is None:
            visited = set()  # init round
if node == self:
return True
if node in self.next:
return True
else:
visited.add(self)
for next_node in self.next:
if next_node not in visited:
found = next_node.is_descendant(node, visited)
if found:
return True
return False
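
    # Example of the ancestry traversal above (hypothetical chain a -> b -> c):
    # c.is_ancestral(a) walks c.prev to b, then b.prev to a, and returns True.
    # The shared 'visited' set ensures each node is expanded at most once, so
    # diamonds or cycles in the graph cannot cause infinite recursion.
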
## Other accessors
def get_graph(self):
return self.tgraph
def get_seq(self):
return self.seq
def set_seq(self, seq):
self.seq = seq
def get_topological_order(self):
return self.topological_order
def set_topological_order(self, topo_order):
self.topological_order = topo_order
def get_transcripts(self):
return self.transcripts
def add_transcripts(self, transcript_name_or_set):
        if isinstance(transcript_name_or_set, set):
            self.transcripts.update(transcript_name_or_set)
        elif isinstance(transcript_name_or_set, str):
            self.transcripts.add(transcript_name_or_set)
        else:
            raise RuntimeError("Error, parameter must be a string or a set")
def get_prev_nodes(self):
return set(self.prev)
def get_next_nodes(self):
return set(self.next)
def add_next_node(self, next_node_obj):
self.next.add(next_node_obj)
def remove_next_node(self, remove_node_obj):
self.next.remove(remove_node_obj)
def stash_next_node(self, stash_node_obj):
self.remove_next_node(stash_node_obj)
self.stashed_next.add(stash_node_obj)
def add_prev_node(self, prev_node_obj):
self.prev.add(prev_node_obj)
def remove_prev_node(self, remove_node_obj):
self.prev.remove(remove_node_obj)
def stash_prev_node(self, stash_node_obj):
self.remove_prev_node(stash_node_obj)
self.stashed_prev.add(stash_node_obj)
def restore_stashed_nodes(self):
self.prev.update(self.stashed_prev)
self.stashed_prev = set()
self.next.update(self.stashed_next)
self.stashed_next = set()
def get_prev_node_loc_ids(self):
loc_ids = list()
for node in self.get_prev_nodes():
loc_ids.append(node.get_loc_id())
return loc_ids
def get_next_node_loc_ids(self):
loc_ids = list()
for node in self.get_next_nodes():
loc_ids.append(node.get_loc_id())
return loc_ids
def __repr__(self):
return(self.loc_node_id)
## Touching nodes
def touch(self):
self.touched += 1
def untouch(self):
self.touched -= 1
def clear_touch(self):
self.touched = 0
def toString(self):
txt = str("prev: " + str(self.get_prev_node_loc_ids()) +
", me: " + str(self.get_loc_id()) +
", next: " + str(self.get_next_node_loc_ids()) +
", transcripts: " + str(self.transcripts) +
", " + self.get_seq())
if self.topological_order >= 0:
txt += ", topo_order={}".format(self.topological_order)
if self.dead:
txt += " ** dead ** "
return txt
@classmethod
def merge_nodes(cls, node_list):
"""
Merges linear stretches of nodes into a single new node that has
concatenated sequences of the input nodes
"""
merged_node_seq = ""
TNode.merged_nodeset_counter += 1
merged_loc_node_id = "M{}".format(TNode.merged_nodeset_counter)
for node_obj in node_list:
seq = node_obj.get_seq()
merged_node_seq += seq
        transcripts = node_list[0].get_transcripts()  # merged nodes are expected to share a transcript set; take the first node's
tgraph = node_list[0].get_graph()
merged_node = TNode(tgraph, transcripts, merged_loc_node_id, merged_node_seq)
return merged_node
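
# Minimal sketch of the factory + merge pattern defined above (ids and
# sequences are hypothetical; assumes a TGraph instance as in TGraph.py):
#
#   g = TGraph.TGraph("gene_1")
#   a = g.get_node("iso_1", "a", "ACG")
#   b = g.get_node("iso_1", "b", "TTT")
#   m = TNode.merge_nodes([a, b])
#   # m.get_seq() == "ACGTTT"; m receives a fresh "M<counter>" loc id and
#   # inherits the transcript set of the first node in the list.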