Commits on Source (8)
......@@ -567,6 +567,7 @@ main: {
#$Rscript .= "library(gplots)\n";
$Rscript .= "library(Biobase)\n";
$Rscript .= "library(qvalue)\n";
$Rscript .= "library(fastcluster)\n";
$Rscript .= "options(stringsAsFactors = FALSE)\n";
......@@ -1674,7 +1675,7 @@ main: {
print $ofh $Rscript;
close $ofh;
-&process_cmd("R --vanilla -q < $R_script_file");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script_file");
exit(0);
......
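A note on the flag change repeated throughout these commits: --vanilla is R shorthand for combining --no-save, --no-restore, --no-site-file, --no-init-file and --no-environ. Spelling out the first four while dropping --no-environ presumably keeps R reading environment files such as ~/.Renviron, so a user-level R_LIBS (where packages like fastcluster or edgeR may live) is still honored. The sketch below is illustrative only (run_r_script and my_analysis.R are made-up names, not part of the patch); it shows the same invocation style from Python.

import subprocess

# --vanilla minus --no-environ: fresh session, no saved workspace or profiles,
# but ~/.Renviron (and therefore R_LIBS) is still read.
R_FLAGS = ["--no-save", "--no-restore", "--no-site-file", "--no-init-file", "-q"]

def run_r_script(script_path):
    """Feed an R script to a non-interactive R session on stdin, as the Perl wrappers do."""
    with open(script_path) as script:
        subprocess.run(["R"] + R_FLAGS, stdin=script, check=True)

# example (hypothetical file): run_r_script("my_analysis.R")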
......@@ -87,6 +87,7 @@ main: {
print $ofh "library(cluster)\n";
#print $ofh "library(gplots)\n";
print $ofh "library(Biobase)\n";
print $ofh "library(fastcluster)\n";
print $ofh "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
print $ofh "load(\"$R_data_file\")\n";
......@@ -126,7 +127,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
exit(0);
......
......@@ -88,6 +88,7 @@ main: {
print $ofh "library(cluster)\n";
#print $ofh "library(gplots)\n";
print $ofh "library(Biobase)\n";
print $ofh "library(fastcluster)\n";
print $ofh "source(\"$FindBin::RealBin/R/heatmap.3.R\")\n";
print $ofh "load(\"$R_data_file\")\n";
......@@ -180,7 +181,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
###################################################
......
......@@ -84,7 +84,7 @@ main: {
print $ofh "get_cluster_info(\"$R_data_file\")\n";
close $ofh;
-&process_cmd("R --vanilla -q --slave < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q --slave < $R_script");
open (my $fh, $matrix_data) or die "Error, cannot open file $matrix_data";
......
......@@ -108,7 +108,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
exit(0);
......
......@@ -98,7 +98,7 @@ main: {
close $ofh;
-&process_cmd("R --vanilla -q < $R_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $R_script");
exit(0);
......
......@@ -109,7 +109,7 @@ sub run_edgeR_remove_batch_effect {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......
......@@ -132,7 +132,7 @@ __EORSCRIPT__
;
-my $cmd = "R --vanilla -q < $out_rscript";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $out_rscript";
my $ret = system($cmd);
if ($ret) {
die "Error, cmd: $cmd died with ret $ret";
......
......@@ -445,7 +445,7 @@ sub run_edgeR_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......@@ -538,7 +538,7 @@ sub run_DESeq2_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
&process_cmd($cmd);
return;
......@@ -614,7 +614,7 @@ sub run_limma_voom_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......@@ -713,7 +713,7 @@ sub run_ROTS_sample_pair {
close $ofh;
## Run R-script
-my $cmd = "R --vanilla -q < $Rscript_name";
+my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";
eval {
......
......@@ -117,7 +117,10 @@ main: {
print $ofh "\n\n# GO-Seq protocol: build pwf based on ALL DE features\n";
print $ofh "missing_gene_lengths = sample_set_gene_ids[! sample_set_gene_ids %in% rownames(gene_lengths)]\n";
print $ofh "if (length(missing_gene_lengths) > 0) {\n";
print $ofh " stop(\"Error, missing gene lengths for features: \", paste(missing_gene_lengths, collapse=', '))\n";
print $ofh "}\n";
print $ofh "sample_set_gene_lengths = gene_lengths[sample_set_gene_ids,]\n";
print $ofh "GO_info_listed = GO_info_listed[ names(GO_info_listed) %in% sample_set_gene_ids ]\n";
......
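The GO-Seq lines above fail fast when any DE feature lacks a gene-length entry before the length table is subset. A rough Python equivalent of that guard, with made-up gene IDs (geneA/geneB/geneC are not from the source):

gene_lengths = {"geneA": 1200, "geneB": 800}
sample_set_gene_ids = ["geneA", "geneB", "geneC"]

# same idea as the R check: collect IDs with no length entry and stop with a clear message
missing = [g for g in sample_set_gene_ids if g not in gene_lengths]
if missing:
    raise SystemExit("Error, missing gene lengths for features: " + ", ".join(missing))

sample_set_gene_lengths = {g: gene_lengths[g] for g in sample_set_gene_ids}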
......@@ -118,7 +118,7 @@ sub run_TMM {
close $ofh;
-&process_cmd("R --vanilla -q < $tmm_norm_script");
+&process_cmd("R --no-save --no-restore --no-site-file --no-init-file -q < $tmm_norm_script");
return("$counts_matrix_file.TMM_info.txt");
......
......@@ -38,7 +38,7 @@ main: {
print $ofh "dev.off();\n";
close $ofh;
-system("R --vanilla -q < __tmp_tiers_boxplot.R");
+system("R --no-save --no-restore --no-site-file --no-init-file -q < __tmp_tiers_boxplot.R");
exit($?);
}
......
......@@ -43,26 +43,38 @@ def main():
"GATK: env var \"$GATK_HOME\" with the path to GATK's bin\n"),
epilog="", formatter_class=argparse.RawTextHelpFormatter)
-parser.add_argument('--st_fa', '--supertranscript_fasta', dest="st_fa", type=str, required=True, help="Path to the SuperTranscripts fasta file.")
+parser.add_argument('--st_fa', '--supertranscript_fasta', dest="st_fa", type=str, required=True,
+help="Path to the SuperTranscripts fasta file.")
-parser.add_argument('--st_gtf', '--supertranscript_gtf', dest="st_gtf", type=str, required=True, help="Path to the SuperTranscript gtf file.")
+parser.add_argument('--st_gtf', '--supertranscript_gtf', dest="st_gtf", type=str, required=True,
+help="Path to the SuperTranscript gtf file.")
group = parser.add_mutually_exclusive_group(required=True)
-group.add_argument('-p', '--paired', dest="paired_reads", type=str, nargs=2, help="Pair of paired ends read files.")
+group.add_argument('-p', '--paired', dest="paired_reads", type=str, nargs=2,
+help="Pair of paired ends read files.")
-group.add_argument('-s', '--single', dest="single_reads", type=str, help="Single reads file.")
+group.add_argument('-s', '--single', dest="single_reads", type=str,
+help="Single reads file.")
-parser.add_argument('-o', '--output', dest="out_path", type=str, required=True, help="Path to the folder where to generate the output.")
+group.add_argument("-S", "--samples_file", dest="samples_file", type=str,
+help="Trinity samples file (fmt: condition_name replicate_name /path/to/reads_1.fq /path/to/reads_2.fq (tab-delimited, single sample per line))")
-parser.add_argument('-l', '--sjdbOverhang', dest="sjdbOverhang", default=150, type=int, help="Size of the reads (used for STAR --sjdbOverhang). default=150")
+parser.add_argument('-o', '--output', dest="out_path", type=str, required=True,
+help="Path to the folder where to generate the output.")
-parser.add_argument('-t', '--threads', dest="nthreads", type=str, default="4", help="Number of threads to use for tools that are multithreaded.")
+parser.add_argument('-l', '--sjdbOverhang', dest="sjdbOverhang", default=150, type=int,
+help="Size of the reads (used for STAR --sjdbOverhang). default=150")
-parser.add_argument('-m', '--maxram', dest="maxram", type=str, default="50000000000", help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
+parser.add_argument('-t', '--threads', dest="nthreads", type=str, default="4",
+help="Number of threads to use for tools that are multithreaded.")
+parser.add_argument('-m', '--maxram', dest="maxram", type=str, default="50000000000",
+help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
-parser.add_argument("--STAR_genomeGenerate_opts", type=str, default="", help="options to pass through to STAR's genomeGenerate function")
+parser.add_argument("--STAR_genomeGenerate_opts", type=str, default="",
+help="options to pass through to STAR's genomeGenerate function")
args = parser.parse_args()
......@@ -76,13 +88,31 @@ def main():
if not GATK_HOME:
exit("Error, missing path to GATK in $GATK_HOME.")
# get real paths before changing working directory in case they are relative paths
if args.paired_reads:
reads_paths = [os.path.realpath(f) for f in args.paired_reads]
-else:
+elif args.single_reads:
reads_paths = [os.path.realpath(args.single_reads)]
elif args.samples_file:
left_fq_list = list()
right_fq_list = list()
with open(args.samples_file) as fh:
for line in fh:
line = line.rstrip()
if not re.match(r"\w", line):
continue
fields = line.split("\t")
left_fq = fields[2]
left_fq_list.append(os.path.realpath(left_fq))
if len(fields) > 3:
right_fq = fields[3]
right_fq_list.append(os.path.realpath(right_fq))
reads_paths = [",".join(left_fq_list)]
if right_fq_list:
reads_paths.append(",".join(right_fq_list))
else:
raise RuntimeError("no reads specified") # should never get here
st_fa_path = os.path.realpath(args.st_fa)
......@@ -160,6 +190,10 @@ def main():
pipeliner.run()
##
## GATK settings based on best practices:
## https://software.broadinstitute.org/gatk/documentation/article.php?id=3891
##
# clean and convert sam file with Picard-Tools
logger.info("Cleaning and Converting sam file with Picard-Tools.")
......
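The new --samples_file branch above collapses every left read file into one comma-joined string and every right read file into another, the comma-separated multi-file form STAR accepts for --readFilesIn. A minimal standalone sketch of that parsing, factored into a helper for illustration (parse_samples_file is not a function in the script):

import os
import re

def parse_samples_file(samples_file):
    """Collapse a tab-delimited Trinity samples file into comma-joined left/right read lists."""
    left, right = [], []
    with open(samples_file) as fh:
        for line in fh:
            line = line.rstrip()
            if not re.match(r"\w", line):
                continue  # skip blank or non-data lines
            fields = line.split("\t")
            left.append(os.path.realpath(fields[2]))
            if len(fields) > 3:
                right.append(os.path.realpath(fields[3]))
    reads = [",".join(left)]
    if right:
        reads.append(",".join(right))
    return reads

# e.g. a two-sample paired-end file would yield something like:
# ["condA_rep1_1.fq,condB_rep1_1.fq", "condA_rep1_2.fq,condB_rep1_2.fq"]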
......@@ -224,7 +224,7 @@ main: {
}
-$cmd = "R --vanilla -q < $dexseq_rscript";
+$cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $dexseq_rscript";
$pipeliner->add_commands(new Command($cmd, "$analysis_token.dexseq.$top_genes_plot.ok"));
$pipeliner->run();
......
......@@ -44,6 +44,9 @@ def main():
parser.add_argument("--no_squeeze", required=False, action="store_true",
default=False, help="don't merge unbranched stretches of node identifiers")
parser.add_argument("--no_refinement", required=False, action="store_true",
default=False, help="don't refine splice graph by further collapsing allelic variants")
parser.add_argument("--incl_cdna", required=False, action="store_true",
default=False, help="rewrite Trinity fasta file using simplified graph structure")
......@@ -110,6 +113,10 @@ def main():
node_path_obj_list.append(n_path)
#print(str(n_path))
# adjust for unique prefix yet shared suffix of FST nodes
node_path_obj_list = Node_path.Node_path.adjust_for_fst_nodes(tgraph, node_path_obj_list)
# generate multiple alignment
start_time = time.time()
......@@ -125,18 +132,34 @@ def main():
if args.verbose:
logger.info("Final splice_model_alignment for Gene {} :\n{}\n".format(gene_name, str(splice_model_alignment)))
################################
## Splice graph refinement stage
squeezed_splice_model = splice_model_alignment
if args.no_squeeze:
logger.info("--no_squeeze set, so not squeezing structure")
#squeezed_splice_model = Splice_model_refiner.refine_alignment(squeezed_splice_model, reset_node_ids=True) # True important here... can have repeat nodes
else:
# SQUEEZE
squeezed_splice_model = splice_model_alignment.squeeze()
logger.debug("Squeezed splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
if not args.no_refinement:
squeezed_splice_model = Splice_model_refiner.refine_alignment(squeezed_splice_model, reset_node_ids=True) # True important here... can have repeat nodes
logger.debug("After seq-refinement of splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
squeezed_splice_model = Splice_model_refiner.remove_redundant_paths(squeezed_splice_model)
logger.debug("After removing redundant paths of splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
if not args.no_squeeze:
# re-squeeze
squeezed_splice_model = squeezed_splice_model.squeeze()
logger.debug("Post-refinement and final squeeze of splice model for Gene {}:\n{}\n".format(gene_name, str(squeezed_splice_model)))
# done with splice graph refinement.
squeezed_splice_model.reassign_node_loc_ids_by_align_order()
......
......@@ -29,7 +29,9 @@ class Compact_graph_whole:
def compact_unbranched(self, tgraph):
-for node in tgraph.get_all_nodes():
+all_nodes = list(tgraph.get_all_nodes())
+for node in all_nodes:
if node.is_dead():
continue
......
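The change above snapshots get_all_nodes() into a list before looping, because compaction prunes and merges nodes as it goes and mutating a dict- or set-backed collection while iterating it raises at the next iteration step. A generic illustration of the hazard (toy data, not Trinity objects):

nodes = {"n1", "n2", "n3"}
try:
    for n in nodes:
        nodes.discard("n2")      # simulate pruning a node mid-iteration
except RuntimeError as err:
    print("direct iteration failed:", err)   # set changed size during iteration

for n in list(nodes):            # snapshot first, as compact_unbranched now does
    nodes.discard(n)             # safe: we iterate the copy and mutate the original
print(nodes)                     # set()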
......@@ -426,7 +426,7 @@ class Gene_splice_modeler:
writes the multiply aligned isoform sequences to an output filehandle
"""
-transcript_names = malign_dict.keys()
+transcript_names = list(malign_dict.keys())
alignment_length = len(malign_dict[ transcript_names[ 0 ] ])
......
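The list() wrapper above is a Python 3 fix: dict.keys() returns a view object that cannot be indexed, so transcript_names[0] would raise a TypeError. A tiny demonstration with made-up alignment data:

malign_dict = {"isoformA": "ACGT--AC", "isoformB": "ACGTTTAC"}

keys_view = malign_dict.keys()
try:
    keys_view[0]
except TypeError as err:
    print(err)                                 # 'dict_keys' object is not subscriptable

transcript_names = list(malign_dict.keys())
print(transcript_names[0])                     # isoformA
print(len(malign_dict[transcript_names[0]]))   # alignment length: 8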
......@@ -76,3 +76,98 @@ class Node_path:
return path_str
@staticmethod
def adjust_for_fst_nodes(tgraph, node_path_list):
"""
fst nodes have extra 5' sequence compared to their corresponding non-fst nodes.
If both the fst and non-fst versions of a node exist, the fst node must be modified so that
it is separated from its 5' extension and the core of the node (the suffix) is shared.
input: TGraph obj, list of node_path objects.
The node_path objects are modified in-place as needed.
An fst node is truncated to its unique prefix and the non-fst node is integrated into the path.
returns the node_path_list with any required adjustments
"""
# get list of fst nodes requiring adjustment
fst_nodes_require_adj = list()
nodes = tgraph.get_all_nodes()
for node in nodes:
node_id = node.get_loc_id()
if re.search("fst", node_id):
core_node_id = re.sub("fst", "", node_id)
core_node = tgraph.retrieve_node(core_node_id)
if core_node is not None:
fst_nodes_require_adj.append( (node, core_node) )
if not fst_nodes_require_adj:
# nothing to do
logger.debug("no FST nodes to adjust")
return node_path_list
logger.debug("Adjusting FST nodes: {}".format(fst_nodes_require_adj))
old_fst_node_to_new_fst_nodes = dict()
fst_nodes_to_delete = list()
# perform node modifications:
for (fst_node, core_node) in fst_nodes_require_adj:
fst_node_seq = fst_node.get_seq()
core_node_seq = core_node.get_seq()
# reverse both sequences and verify that the core sequence is a suffix of the fst sequence.
fst_node_seq_rev = fst_node_seq[::-1]
core_node_seq_rev = core_node_seq[::-1]
if not re.match(core_node_seq_rev, fst_node_seq_rev):
raise RuntimeError("Error, core_node_seq:\n{}\nis not a suffix of fst seq:\n{}\n".format(core_node_seq, fst_node_seq))
prefix_endpt = len(fst_node_seq) - len(core_node_seq)
if prefix_endpt == 0:
assert fst_node_seq == core_node_seq, "Error, prefix starts at first position but sequences are not equivalent"
core_node.add_transcripts(fst_node.get_transcripts())
old_fst_node_to_new_fst_nodes[fst_node] = [core_node]
fst_nodes_to_delete.append(fst_node)
else:
prefix_string = fst_node_seq[0:prefix_endpt]
logger.debug("FST-SEQ-EXTRACTION\n\nFSTseq:\n{}\n\nCOREseq:\n{}\n\nPREFIXseq:\n{}\n\n".format(fst_node_seq, core_node_seq, prefix_string))
core_node.add_transcripts(fst_node.get_transcripts())
old_fst_node_to_new_fst_nodes[fst_node] = [fst_node, core_node]
fst_node.set_seq(prefix_string)
# now perform node path updates
for node_path in node_path_list:
nodes = node_path.get_path()
first_node = nodes[0]
if first_node in old_fst_node_to_new_fst_nodes:
# must replace
replacement_node_list = old_fst_node_to_new_fst_nodes[first_node]
if len(replacement_node_list) == 1:
# swap it out
nodes[0] = replacement_node_list[0]
elif len(replacement_node_list) == 2:
nodes[0] = replacement_node_list[1]
nodes.insert(0, replacement_node_list[0])
else:
raise RuntimeError("shouldn't get here")
# purge the nodes targeted for deletion
for node in fst_nodes_to_delete:
tgraph.prune_node(node)
return node_path_list
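A standalone illustration of the prefix/suffix arithmetic adjust_for_fst_nodes relies on, with made-up sequences (and str.endswith in place of the reversed re.match check used above):

fst_node_seq  = "GGGAACCTTACGT"    # fst node: unique 5' prefix plus the shared core
core_node_seq = "CCTTACGT"         # non-fst node: the shared core (must be a suffix)

assert fst_node_seq.endswith(core_node_seq)

prefix_endpt  = len(fst_node_seq) - len(core_node_seq)
prefix_string = fst_node_seq[:prefix_endpt]
print(prefix_string)   # GGGAA -> the fst node keeps only this prefix,
                       # and the core node is spliced in after it in the path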
......@@ -66,9 +66,6 @@ def refine_alignment(node_alignment_obj, reset_node_ids=False):
splice_graph_node_alignment = splice_graph_to_node_alignment(refined_tgraph)
splice_graph_node_alignment = remove_redundant_paths(splice_graph_node_alignment)
return(splice_graph_node_alignment)
......
......@@ -104,6 +104,18 @@ class TGraph:
self.node_cache.pop(node.get_loc_id())
def retrieve_node(self, node_id):
"""
does not instantiate, only retrieves.
If loc_node_id is not in the graph, returns None
"""
if node_id in self.node_cache:
return self.node_cache[node_id]
else:
return None
def get_gene_id(self):
return self.gene_id
......
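retrieve_node is the lookup adjust_for_fst_nodes uses to test whether a non-fst twin of an fst node already exists; it only consults the cache and never creates a node. A toy sketch of that get-or-None contract (node_cache contents are made up):

node_cache = {"n5": "node object for n5"}

def retrieve_node(node_id):
    """Return the cached node for node_id, or None if it was never instantiated."""
    return node_cache.get(node_id)    # dict.get provides the None fallback

print(retrieve_node("n5"))       # the core node exists -> the fst node gets adjusted
print(retrieve_node("n7"))       # None -> no such node; nothing to reconcile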