Skip to content
Commits on Source (5)
[submodule "trinityrnaseq.wiki"]
path = trinityrnaseq.wiki
url = https://github.com/trinityrnaseq/trinityrnaseq.wiki.git
[submodule "trinity_ext_sample_data"]
path = trinity_ext_sample_data
url = https://github.com/trinityrnaseq/trinity_ext_sample_data.git
#!/usr/bin/env perl
use strict;
use warnings;
use Carp;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
my $usage = <<__EOUSAGE__;
#####################################################
#
# --DE_results <string> DE_results file
#
# --gene_trans_map <string> gene-trans-map file
#
#####################################################
__EOUSAGE__
;
my $help_flag;
my $DE_results_file;
my $gene_trans_map_file;
&GetOptions ( 'h' => \$help_flag,
'DE_results=s' => \$DE_results_file,
'gene_trans_map=s' => \$gene_trans_map_file,
);
if ($help_flag) {
die $usage;
}
unless ($DE_results_file && $gene_trans_map_file) {
die $usage;
}
main: {
my %trans_to_gene = &parse_gene_trans_map($gene_trans_map_file);
my %gene_to_up_down_trans;
open(my $fh, $DE_results_file) or die "Error, cannot open file $DE_results_file";
my $header = <$fh>;
chomp $header;
my @header_fields = split(/\t/, $header);
my %header_col_index;
while (<$fh>) {
chomp;
my @fields = split(/\t/);
unless (%header_col_index) {
if (scalar(@fields) == scalar(@header_fields) + 1) {
unshift(@header_fields, "");
}
}
my $trans_name = $fields[0];
my $gene_name = $trans_to_gene{$trans_name} or die "Error, cannot find gene_id corresponding to trans [$trans_name] ";
my $struct = { name => $trans_name };
for (my $i = 1; $i <= $#fields; $i++) {
my $col_header = $header_fields[$i];
my $val = $fields[$i];
$struct->{$col_header} = $val;
}
my $fdr = $struct->{FDR};
unless (defined $fdr) {
die "Error, no FDR set for @fields";
}
unless ($fdr <= 0.05) { next; }
my $logFC = $struct->{logFC};
unless (defined $logFC) {
die "Error, no logFC set for @fields";
}
unless (abs($logFC) >= 1) { # 2-fold min
next;
}
if ($logFC > 0) {
push (@{$gene_to_up_down_trans{$gene_name}->{UP}}, $struct);
}
else {
push(@{$gene_to_up_down_trans{$gene_name}->{DOWN}}, $struct);
}
}
close $fh;
foreach my $gene_name (keys %gene_to_up_down_trans) {
my @up;
if (exists $gene_to_up_down_trans{$gene_name}->{UP}) {
@up = @{$gene_to_up_down_trans{$gene_name}->{UP}};
}
my @down;
if (exists $gene_to_up_down_trans{$gene_name}->{DOWN}) {
@down = @{$gene_to_up_down_trans{$gene_name}->{DOWN}};
}
if (@up && @down) {
## got DTU candidate:
print "# $gene_name\n";
foreach my $up_entry (@up) {
print join("\t", "UP", &struct_to_string($up_entry)) . "\n";
}
foreach my $down_entry (@down) {
print join("\t", "DOWN", &struct_to_string($down_entry)) . "\n";
}
print "\n"; # spacer
}
}
exit(0);
}
####
sub parse_gene_trans_map {
my ($gene_trans_map_file) = @_;
my %trans_to_gene;
open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file";
while (<$fh>) {
chomp;
unless (/\w/) { next; }
if (/^\#/) { next; }
my ($gene, $trans) = split(/\t/);
$trans_to_gene{$trans} = $gene;
}
return(%trans_to_gene);
}
####
sub struct_to_string {
my ($struct) = @_;
my $ret_text = join("\t", $struct->{name}, $struct->{sampleA}, $struct->{sampleB},
$struct->{logFC}, $struct->{FDR});
return($ret_text);
}
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("argparse"))
parser = ArgumentParser(description="See GOplot documentation: https://wencke.github.io/")
parser$add_argument("--EC_david", help="EC.david input file", required=TRUE)
parser$add_argument("--EC_genelist", help="EC.genelist input file", required=TRUE)
parser$add_argument("--pdf_outfile", help="pdf output filename", required=TRUE)
args = parser$parse_args()
library(GOplot)
EC.david = read.table(args$EC_david, sep='\t', header=T)
EC.genelist = read.table(args$EC_genelist, sep='\t', header=T)
# plotting commands from: https://wencke.github.io/
circ <- circle_dat(EC.david, EC.genelist)
message("Generating plot pdf file: ", args$pdf_outfile)
pdf(file=args$pdf_outfile)
###############################
## The modified barplot (GOBar)
###############################
# Generate a simple barplot
GOBar(subset(circ, category == 'BP'))
# Facet the barplot according to the categories of the terms
GOBar(circ, display = 'multiple')
# Facet the barplot, add a title and change the colour scale for the z-score
GOBar(circ, display = 'multiple', title = 'Z-score coloured barplot', zsc.col = c('yellow', 'black', 'cyan'))
#############################
## The bubble plot (GOBubble)
#############################
# Generate the bubble plot with a label threshold of 3
GOBubble(circ, labels = 3)
# Add a title, change the colour of the circles, facet the plot according to the categories and change the label threshold
GOBubble(circ, title = 'Bubble plot', col = c('orange', 'darkred', 'gold'), display = 'multiple', labels = 3)
# Colour the background according to the category
GOBubble(circ, title = 'Bubble plot with background colour', display = 'multiple', labels = 3)
# Reduce redundant terms with a gene overlap >= 0.75...
reduced_circ <- reduce_overlap(circ, overlap = 0.75)
# ...and plot it
GOBubble(reduced_circ, labels = 2.8)
###########################################################################################
## Circular visualization of the results of gene- annotation enrichment analysis (GOCircle)
###########################################################################################
#Generate a circular visualization of the results of gene- annotation enrichment analysis
GOCircle(circ)
# Generate a circular visualization of selected terms
IDs <- c('GO:0007507', 'GO:0001568', 'GO:0001944', 'GO:0048729', 'GO:0048514', 'GO:0005886', 'GO:0008092', 'GO:0008047')
GOCircle(circ, nsub = IDs)
# Generate a circular visualization for 10 terms
GOCircle(circ, nsub = 10)
##### Now, the rest is up to you. ;-)
quit(save = "no", status = 0, runLast = FALSE)
#!/usr/bin/env Rscript
### Glimma plots for results from
# $TRINITY_HOME/Analysis/DifferentialExpression/run_DE_analysis.pl
# written by Ken Field, Bucknell University
# based on Glimma vignette
# https://bioconductor.org/packages/release/bioc/html/Glimma.html
# by Shian Su, Matthew E. Ritchie
# further modified by bhaas for incorporation into Trinity package.
## ---------------------------------------------
## in case you need to install Glimma:
# source("https://bioconductor.org/biocLite.R")
# biocLite("Glimma")
## ---------------------------------------------
suppressPackageStartupMessages(library("argparse"))
parser = ArgumentParser()
parser$add_argument("--samples_file", help="samples file", required=TRUE)
parser$add_argument("--DE_results", help="DE_results file", required=TRUE)
parser$add_argument("--counts_matrix", help="matrix of raw counts", required=TRUE)
args = parser$parse_args()
DE_results_file = args$DE_results
counts_matrix_file = args$counts_matrix
samples_file = args$samples_file
source("https://bioconductor.org/biocLite.R")
biocLite("Glimma")
biocLite("argparse")
biocLite("edgeR")
library(edgeR)
library(limma)
library(Glimma)
### Load DE_results file and sort by transcript names
DE_results <- read.delim(DE_results_file, row.names=1, stringsAsFactors=FALSE)
DE_results <- DE_results[order(row.names(DE_results)),]
### Load cpm table and refactor
### Make sure you load the correct genes or transcripts version
rawcounts <- read.table(counts_matrix_file, header=T, row.names=1, com='')
rawcounts <- rawcounts[order(row.names(rawcounts)),]
samples <- read.delim(samples_file, header=FALSE, stringsAsFactors=FALSE, com='#')
colnames(samples) <- c("Group","Sample")
rawcounts <- rawcounts[,samples$Sample]
rnaseqMatrix = round(rawcounts)
rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,]
conditions = factor(samples$Group)
exp_study = DGEList(counts=rnaseqMatrix, group=conditions)
exp_study = calcNormFactors(exp_study)
exp_study = estimateCommonDisp(exp_study)
exp_study = estimateTagwiseDisp(exp_study)
cpm_table <- cpm(exp_study)
cpm_table = cpm_table[rownames(DE_results),]
### Optional, Merge the DE_results and cpm_table
cpm.DE <- cbind(cpm_table, DE_results)
write.csv(cpm.DE, file="cpm.DE_results.csv")
### Set significance level for different colors on plot
DE_results$GeneID <- rownames(DE_results)
DE_results$logCPM <- DE_results$logCPM
DE_results$Sig <- as.numeric(DE_results$FDR < 0.001)
### Make the Glimma MA-plot
glMDPlot(DE_results, counts=cpm_table, samples=samples$Sample,
anno=cpm.DE, groups=samples$Group,
xval="logCPM", yval="logFC",
display.columns=colnames(cpm.DE),
folder="glimma_MA", html="MA-plot",
search.by="GeneID", status=DE_results$Sig, cols=c("red","gray","blue") )
### Make the Glimma Volcano Plot
DE_results$logFDR <- -log(DE_results$FDR+1e-100)
glMDPlot(DE_results, xval="logFC", yval="logFDR", counts=cpm_table,
anno=cpm.DE, groups=samples$Group, samples = samples$Sample,
status=DE_results$Sig, display.columns=colnames(cpm.DE),
folder="glimma_volcano", html="volcano", launch=TRUE)
This diff is collapsed.
......@@ -39,9 +39,9 @@ matrix_to_color_assignments = function(matrix_m, col=NULL, by=c("matrix", "row",
if (by == "matrix") {
min_val = min(matrix_m)
min_val = min(matrix_m, na.rm=T)
matrix_m = matrix_m - min_val
max_val = max(matrix_m)
max_val = max(matrix_m, na.rm=T)
matrix_m = matrix_m / max_val * num_colors
#print(matrix_m)
matrix_m = apply(matrix_m, 1:2, function(x) ifelse (x<1, as.character(col[1]), as.character(col[x])));
......@@ -51,8 +51,8 @@ matrix_to_color_assignments = function(matrix_m, col=NULL, by=c("matrix", "row",
else {
row_or_col_only_color_selector_func = function(x) {
a = min(x);
b = max(x);
a = min(x, na.rm=T);
b = max(x, na.rm=T);
c = (x-a)/(b-a) * num_colors;
c = round(c);
c = ifelse (c<1, 1, c);
......@@ -62,7 +62,11 @@ matrix_to_color_assignments = function(matrix_m, col=NULL, by=c("matrix", "row",
}
if (by == "row") {
matrix_m = t(apply(matrix_m, 1, row_or_col_only_color_selector_func));
matrix_m = apply(matrix_m, 1, row_or_col_only_color_selector_func);
print(matrix_m)
print("dim matrix_m after apply"); print(dim(matrix_m))
matrix_m = t(matrix_m);
print("dim matrix_m after transpose: "); print(dim(matrix_m))
}
else {
# by column
......
plot_MA = function(logCounts, logFoldChange, FDR, xlab="logCounts", ylab="logFC", title="MA plot", pch=20) {
plot_MA = function(featureNames, logCounts, logFoldChange, FDR, xlab="logCounts", ylab="logFC", title="MA plot", pch=20, top_gene_labels_show=20) {
plot(logCounts, logFoldChange, col=ifelse(FDR<=0.05, "red", "black"), xlab=xlab, ylab=ylab, main=title, pch=pch);;
text(logCounts[1:top_gene_labels_show], logFoldChange[1:top_gene_labels_show], labels=featureNames[1:top_gene_labels_show], cex= 0.7, pos=3)
}
plot_Volcano = function(logFoldChange, FDR, xlab="logFC", ylab="-1*log10(FDR)", title="Volcano plot", pch=20) {
plot_Volcano = function(featureNames, logFoldChange, FDR, xlab="logFC", ylab="-1*log10(FDR)", title="Volcano plot", pch=20, top_gene_labels_show=20) {
plot(logFoldChange, -1*log10(FDR), col=ifelse(FDR<=0.05, "red", "black"), xlab=xlab, ylab=ylab, main=title, pch=pch);
text(logFoldChange[1:top_gene_labels_show], (-1*log10(FDR))[1:top_gene_labels_show], labels=featureNames[1:top_gene_labels_show], cex= 0.7, pos=3)
}
plot_MA_and_Volcano = function(logCounts, logFoldChange, FDR, xlab="logCounts", ylab="logFC", title="MA plot") {
plot_MA_and_Volcano = function(featureNames, logCounts, logFoldChange, FDR, xlab="logCounts", ylab="logFC", title="MA plot") {
def.par = par(no.readonly = TRUE) # save default, for resetting...
gridlayout = matrix(c(1:4),nrow=2,ncol=2, byrow=TRUE);
layout(gridlayout, widths=c(1,1,1,1), heights=c(1,1,1,1))
#gridlayout = matrix(c(1:4),nrow=2,ncol=2, byrow=TRUE);
#layout(gridlayout, widths=c(1,1,1,1), heights=c(1,1,1,1))
plot_MA(logCounts, logFoldChange, FDR);
plot_Volcano(logFoldChange, FDR);
plot_MA(featureNames, logCounts, logFoldChange, FDR);
plot_Volcano(featureNames, logFoldChange, FDR);
# draw again, but use a smaller dot for data points
plot_MA(logCounts, logFoldChange, FDR, pch='.');
plot_Volcano(logFoldChange, FDR, pch='.');
#plot_MA(logCounts, logFoldChange, FDR, pch='.');
#plot_Volcano(logFoldChange, FDR, pch='.');
par(def.par)
......
##################################################################################
## From: https://stackoverflow.com/questions/22410606/violin-plot-with-list-input
## and slightly modified to my liking here.
vioplot2<-function (x, ..., range = 1.5, h = NULL, ylim = NULL, names = NULL,
horizontal = FALSE, col = c("magenta"), border = "black", lty = 1,
lwd = 1, rectCol = "black", colMed = "white", pchMed = 19,
at, add = FALSE, wex = 1, drawRect = TRUE)
{
library(sm)
if(!is.list(x)){
datas <- list(x, ...)
} else{
datas<-x
}
n <- length(datas)
if (missing(at))
at <- 1:n
upper <- vector(mode = "numeric", length = n)
lower <- vector(mode = "numeric", length = n)
q1 <- vector(mode = "numeric", length = n)
q3 <- vector(mode = "numeric", length = n)
med <- vector(mode = "numeric", length = n)
base <- vector(mode = "list", length = n)
height <- vector(mode = "list", length = n)
baserange <- c(Inf, -Inf)
args <- list(display = "none")
if (!(is.null(h)))
args <- c(args, h = h)
for (i in 1:n) {
data <- datas[[i]]
data.min <- min(data)
data.max <- max(data)
q1[i] <- quantile(data, 0.25)
q3[i] <- quantile(data, 0.75)
med[i] <- median(data)
iqd <- q3[i] - q1[i]
upper[i] <- min(q3[i] + range * iqd, data.max)
lower[i] <- max(q1[i] - range * iqd, data.min)
est.xlim <- c(min(lower[i], data.min), max(upper[i],
data.max))
smout <- do.call("sm.density", c(list(data, xlim = est.xlim),
args))
hscale <- 0.4/max(smout$estimate) * wex
base[[i]] <- smout$eval.points
height[[i]] <- smout$estimate * hscale
t <- range(base[[i]])
baserange[1] <- min(baserange[1], t[1])
baserange[2] <- max(baserange[2], t[2])
}
if (!add) {
xlim <- if (n == 1)
at + c(-0.5, 0.5)
else range(at) + min(diff(at))/2 * c(-1, 1)
if (is.null(ylim)) {
ylim <- baserange
}
}
if (is.null(names)) {
label <- 1:n
}
else {
label <- names
}
boxwidth <- 0.05 * wex
if (!add)
plot.new()
if (!horizontal) {
if (!add) {
plot.window(xlim = xlim, ylim = ylim)
axis(2)
axis(1, at = at, label = label)
}
box()
for (i in 1:n) {
polygon(c(at[i] - height[[i]], rev(at[i] + height[[i]])),
c(base[[i]], rev(base[[i]])), col = col[i], border = border,
lty = lty, lwd = lwd)
if (drawRect) {
lines(at[c(i, i)], c(lower[i], upper[i]), lwd = lwd,
lty = lty)
rect(at[i] - boxwidth/2, q1[i], at[i] + boxwidth/2,
q3[i], col = rectCol)
points(at[i], med[i], pch = pchMed, col = colMed)
}
}
}
else {
if (!add) {
plot.window(xlim = ylim, ylim = xlim)
axis(1)
axis(2, at = at, label = label)
}
box()
for (i in 1:n) {
polygon(c(base[[i]], rev(base[[i]])), c(at[i] - height[[i]],
rev(at[i] + height[[i]])), col = col[i], border = border,
lty = lty, lwd = lwd)
if (drawRect) {
lines(c(lower[i], upper[i]), at[c(i, i)], lwd = lwd,
lty = lty)
rect(q1[i], at[i] - boxwidth/2, q3[i], at[i] +
boxwidth/2, col = rectCol)
points(med[i], at[i], pch = pchMed, col = colMed)
}
}
}
invisible(list(upper = upper, lower = lower, median = med,
q1 = q1, q3 = q3))
}
......@@ -13,7 +13,7 @@ my $usage = <<__EOUSAGE__;
#
# Required:
#
# --matrix <string> TMM.EXPR.matrix
# --matrix|m <string> TMM.EXPR.matrix
#
# Optional:
#
......@@ -28,7 +28,7 @@ my $usage = <<__EOUSAGE__;
#
# Misc:
#
# --samples <string> sample-to-replicate mappings (provided to run_DE_analysis.pl)
# --samples|s <string> sample-to-replicate mappings (provided to run_DE_analysis.pl)
#
# --max_DE_genes_per_comparison <int> extract only up to the top number of DE features within each pairwise comparison.
# This is useful when you have massive numbers of DE features but still want to make
......@@ -43,6 +43,7 @@ my $usage = <<__EOUSAGE__;
# --GO_annots <string> GO annotations file
# --gene_lengths <string> lengths of genes file
#
# --include_GOplot optional: will generate inputs to GOplot and attempt to make a preliminary pdf plot/report for it.
#
##############################################################
......@@ -74,12 +75,12 @@ my $order_columns_by_samples_file = 0;
my $examine_GO_enrichment_flag;
my $GO_annots_file;
my $gene_lengths_file;
my $RUN_GOPLOT = 0;
&GetOptions ( 'h' => \$help_flag,
'matrix=s' => \$matrix_file,
'matrix|m=s' => \$matrix_file,
'P=f' => \$p_value,
'C=f' => \$log2_fold_change,
'output=s' => \$output_prefix,
......@@ -93,11 +94,15 @@ my $gene_lengths_file;
'examine_GO_enrichment' => \$examine_GO_enrichment_flag,
'GO_annots=s' => \$GO_annots_file,
'gene_lengths=s' => \$gene_lengths_file,
'include_GOplot' => \$RUN_GOPLOT,
'samples=s' => \$samples_file,
'samples|s=s' => \$samples_file,
"order_columns_by_samples_file" => \$order_columns_by_samples_file,
);
......@@ -205,7 +210,7 @@ main: {
{
open (my $ofh, ">$diff_expr_matrix") or die "Error, cannot write to file $diff_expr_matrix";
print $ofh "\t$matrix_header\n";
print $ofh "$matrix_header\n";
foreach my $acc (keys %diffExpr) {
my $counts_row = $read_count_rows{$acc} or die "Error, no read counts row for $acc";
print $ofh join("\t", $acc, $counts_row) . "\n";
......@@ -284,8 +289,7 @@ sub parse_result_files_find_diffExp {
foreach my $result_file (@$result_files_aref) {
$result_file =~ /matrix\.(\S+)_vs_(\S+)\.[^\.]+\.DE_results$/ or die "Error, cannot extract condition names from $result_file";
my ($condA, $condB) = ($1, $2);
my ($condA, $condB) = &get_sample_pairs_from_DE_results_file($result_file);
my $pairwise_samples_file = "$result_file.samples";
{
......@@ -308,26 +312,28 @@ sub parse_result_files_find_diffExp {
my $condA_up_subset_file = "$result_file_out_prefix.$condA-UP.subset";
my $condB_up_subset_file = "$result_file_out_prefix.$condB-UP.subset";
my $either_subset_file = "$result_file_out_prefix.DE.subset";
open (my $condA_up_ofh, ">$condA_up_subset_file") or die "Error, cannot write to $condA_up_subset_file";
open (my $condB_up_ofh, ">$result_file_out_prefix.$condB-UP.subset") or die "Error, cannot write to $condB_up_subset_file";
open (my $condB_up_ofh, ">$condB_up_subset_file") or die "Error, cannot write to $condB_up_subset_file";
open (my $cond_either_up_ofh, ">$either_subset_file") or die "Error, cannot write to $either_subset_file";
my $count = 0;
my $header = <$fh>;
chomp $header;
unless ($header =~ /^id\t/) {
print $condA_up_ofh "id\t";
print $condB_up_ofh "id\t";
}
print $condA_up_ofh "$header\t$fpkm_matrix_header\n";
print $condB_up_ofh "$header\t$fpkm_matrix_header\n";
print $cond_either_up_ofh "$header\t$fpkm_matrix_header\n";
my $countA = 0;
my $countB = 0;
my %condA_up_genes;
my %condB_up_genes;
my %either_up_genes;
my $rank = 0;
while (<$fh>) {
......@@ -355,7 +361,13 @@ sub parse_result_files_find_diffExp {
my $matrix_counts = $read_fpkm_rows_href->{$id} || die "Error, no counts from matrix for $id";
if ($log_fold_change > 0) {
######################################
# log fold changes should be log2(A/B)
$either_up_genes{$id} = $rank;
print $cond_either_up_ofh "$line\t$matrix_counts\n";
if ($log_fold_change < 0) {
print $condB_up_ofh "$line\t$matrix_counts\n";
$countB++;
......@@ -382,17 +394,57 @@ sub parse_result_files_find_diffExp {
# conditionB enriched heatmaps
&write_matrix_generate_heatmap("$condB_up_subset_file.matrix", \%condB_up_genes, $read_fpkm_rows_href, $fpkm_matrix_header, $pairwise_samples_file);
# either enriched heatmaps
&write_matrix_generate_heatmap("$either_subset_file.matrix", \%either_up_genes, $read_fpkm_rows_href, $fpkm_matrix_header, $pairwise_samples_file);
}
## do GO enrichment analysis
if ($examine_GO_enrichment_flag) {
my $cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file --lengths $gene_lengths_file --genes_single_factor $condA_up_subset_file";
my $background_file = $result_file;
$background_file =~ s/\.DE_results$/\.count_matrix/ or die "Error, cannot modify $result_file to count_matrix name";
my $cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file "
. " --lengths $gene_lengths_file --genes_single_factor $condA_up_subset_file"
. " --background $background_file ";
&process_cmd($cmd) if $countA;
$cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file --lengths $gene_lengths_file --genes_single_factor $condB_up_subset_file";
$cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file "
. " --lengths $gene_lengths_file --genes_single_factor $condB_up_subset_file"
. " --background $background_file ";
&process_cmd($cmd) if $countB;
if ($countA + $countB) {
$cmd = "$FindBin::RealBin/run_GOseq.pl --GO_assignments $GO_annots_file "
. " --lengths $gene_lengths_file --genes_single_factor $either_subset_file"
. " --background $background_file ";
&process_cmd($cmd);
if ($RUN_GOPLOT) {
$cmd = "$FindBin::RealBin/prep_n_run_GOplot.pl --GO_annots $GO_annots_file "
. " --DE_subset $either_subset_file "
. " --DE_GO_enriched $either_subset_file.GOseq.enriched "
. " --tmpdir $either_subset_file.GOseq.enriched.GOplot_dat"
. " --pdf_filename $either_subset_file.GOseq.enriched.GOplot_dat.pdf";
eval {
&process_cmd($cmd);
};
if ($@) {
# can't afford for this to be fatal, so just reporting a failure message for this part.
print STDERR "WARNING: GOplot failed to run successfully on $either_subset_file.GOseq.enriched\n";
}
}
}
}
......@@ -402,7 +454,7 @@ sub parse_result_files_find_diffExp {
{
# write DE count matrix:
my $num_DE_counts_outfile = "numDE_feature_counts.P${max_p_value}_C${min_abs_log2_fold_change}.matrix";
my $num_DE_counts_outfile = "DE_feature_counts.P${max_p_value}_C${min_abs_log2_fold_change}.matrix";
open (my $ofh, ">$num_DE_counts_outfile") or die "Error, cannot write to file $num_DE_counts_outfile";
my @conditions = sort keys %DE_pair_counts;
print $ofh "\t" . join("\t", @conditions) . "\n";
......@@ -480,3 +532,26 @@ sub write_matrix_generate_heatmap {
return;
}
####
sub get_sample_pairs_from_DE_results_file {
my ($result_file) = @_;
my ($header, $line) = split(/\n/, `head -n2 $result_file`);
unless ($header =~ /^sampleA\tsampleB/) {
die "Error, header [$header] of $result_file lacks expected formatting starting with sampleA\tsampleB";
}
my @header_comps = split(/\t/, $header);
my @x = split(/\t/, $line);
unless (scalar(@x) == scalar(@header_comps) + 1) {
die "Error, top lines of result file: $result_file do not meet expectations for the R data.frame ";
}
my $sampleA = $x[1];
my $sampleB = $x[2];
return($sampleA, $sampleB);
}
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("argparse"))
initial.options <- commandArgs(trailingOnly = FALSE)
file.arg.name <- "--file="
script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
script.dirname <- dirname(script.name)
source(paste0(script.dirname,"/R/rnaseq_plot_funcs.R"))
parser = ArgumentParser(description="generate MA plots in DE directory")
parser$add_argument("--top_genes_label", help="number of top DE genes to label in the plot", type="integer", default=20)
args = parser$parse_args()
DE_result_files_vec = dir(pattern='DE_results$')
pdf("DE_results.MA_plots.pdf")
for (DE_file in DE_result_files_vec) {
message("plotting MA plot for: ", DE_file)
data = read.table(DE_file, header=T, row.names=1)
plot_MA(rownames(data), data$logCPM, data$logFC, data$FDR, title=DE_file, top_gene_labels_show=args$top_genes_label)
}
dev.off()
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("argparse"))
initial.options <- commandArgs(trailingOnly = FALSE)
file.arg.name <- "--file="
script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
script.dirname <- dirname(script.name)
source(paste0(script.dirname,"/R/rnaseq_plot_funcs.R"))
parser = ArgumentParser(description="generate volcano plots in DE directory")
parser$add_argument("--top_genes_label", help="number of top DE genes to label in the plot", type="integer", default=20)
args = parser$parse_args()
DE_result_files_vec = dir(pattern='DE_results$')
pdf("DE_results.volcano_plots.pdf")
for (DE_file in DE_result_files_vec) {
message("plotting volcano for: ", DE_file)
data = read.table(DE_file, header=T, row.names=1)
plot_Volcano(rownames(data), data$logFC, data$FDR, title=DE_file, top_gene_labels_show=args$top_genes_label)
}
dev.off()
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use lib ("$FindBin::Bin/../../PerlLib");
use DelimParser;
use File::Basename;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
my $help_flag;
my $usage = <<__EOUSAGE__;
###################################################################
#
# --GO_annots <string> Trinotate_report.xls.gene_ontology (genes or transcripts - be consistent w/ DE analysis being done here)
#
# --DE_subset <string> The 'DE_analysis.DE_subset' file.
#
# --DE_GO_enriched <string> The 'DE.subset.GOseq.enriched' file.
#
# --tmpdir <string> Path to put prep files for GOplot
#
# --pdf_filename <string> filename for pdf plot output file
#
###################################################################
__EOUSAGE__
;
my $gene_ontology_assignments_file;
my $enriched_file;
my $DE_results_file;
my $tmpdir;
my $pdf_filename;
&GetOptions ( 'h' => \$help_flag,
'GO_annots=s' => \$gene_ontology_assignments_file,
'DE_subset=s' => \$DE_results_file,
'DE_GO_enriched=s' => \$enriched_file,
'pdf_filename=s' => \$pdf_filename,
'tmpdir=s' => \$tmpdir,
);
if ($help_flag) {
die $usage;
}
unless ($gene_ontology_assignments_file && $enriched_file && $DE_results_file && $tmpdir && $pdf_filename) {
die $usage;
}
main: {
if (! -d $tmpdir) {
&process_cmd("mkdir -p $tmpdir");
}
my %enriched_GO = &parse_GO_enriched($enriched_file);
my %DE_genes = &parse_DE_genes($DE_results_file);
my %genes_with_enriched_GO = &parse_GO_assignments($gene_ontology_assignments_file, \%enriched_GO, \%DE_genes);
## Write the EC.david file:
my $EC_david_file = "$tmpdir/EC.david";
{
open (my $ofh, ">$EC_david_file") or die "Error, cannot write to $EC_david_file";
my $tab_writer = new DelimParser::Writer($ofh, "\t", ['Category', 'ID', 'Term', 'Genes', 'adj_pval']);
foreach my $row (values %enriched_GO) {
# format wanted:
# Category ID Term
#1 BP GO:0007507 heart development
# Genes
# 1 DLC1, NRP2, NRP1, EDN1, PDLIM3, GJA1, TTN, GJA5, ZIC3, TGFB2, CERKL, GATA6, COL4A3BP, GAB1, SEMA3C, MKL2, SLC22A5, MB, PTPRJ, RXRA, VANGL2, MYH6, TNNT2, HHEX, MURC, MIB1, FOXC2, FOXC1, ADAM19, MYL2, TCAP, EGLN1, SOX9, ITGB1, CHD7, HEXIM1, PKD2, NFATC4, PCSK5, ACTC1, TGFBR2, NF1, HSPG2, SMAD3, TBX1, TNNI3, CSRP3, FOXP1, KCNJ8, PLN, TSC2, ATP6V0A1, TGFBR3, HDAC9
# adj_pval
# 1 2.17e-06
my $go_term_info = $row->{go_term};
my ($go_type, $go_descr) = split(/\s+/, $go_term_info, 2);
my $go_id = $row->{'category'};
my $genes = $genes_with_enriched_GO{$go_id};
unless ($genes) {
die "Error, no genes extracted for GO category: $go_id $go_type $go_descr ";
}
$tab_writer->write_row( { 'Category' => $go_type,
'ID' => $go_id,
'Term' => $go_descr,
'Genes' => $genes,
'adj_pval' => $row->{over_represented_FDR},
} );
}
close $ofh;
}
## write the EC.genelist file:
my $EC_genelist_file = "$tmpdir/EC.genelist";
{
open (my $ofh, ">$EC_genelist_file") or die $!;
my $tab_writer = new DelimParser::Writer($ofh, "\t", ['ID', 'logFC', 'adj.P.Val']);
foreach my $DE_info_href (sort {$a->{'adj.P.Val'}<=>$b->{'adj.P.Val'}} values %DE_genes) {
$tab_writer->write_row( { 'ID' => $DE_info_href->{ID},
'logFC' => $DE_info_href->{logFC},
'adj.P.Val' => $DE_info_href->{'adj.P.Val'},
}
);
}
close $ofh;
}
my $cmd = "$FindBin::RealBin/GOplot.Rscript --EC_david $EC_david_file --EC_genelist $EC_genelist_file --pdf_outfile $pdf_filename";
&process_cmd($cmd);
exit(0);
}
####
sub parse_DE_genes {
my ($file) = @_;
my %genes;
open (my $fh, $file) or die "Error, cannot open file $file";
my $header = <$fh>;
unless ($header =~ /^sample/) {
die "Error, file: $file has unexpected format... no 'sample' starting header.";
}
while (<$fh>) {
chomp;
my @x = split(/\t/);
my $gene_id = $x[0];
my $logFC = $x[3];
my $fdr = $x[6];
$genes{$gene_id} = { ID => $gene_id,
logFC => $logFC,
'adj.P.Val' => $fdr,
};
}
return(%genes);
}
####
sub parse_GO_enriched {
my ($enriched_file) = @_;
open (my $fh, $enriched_file) or die "Error, cannot open file $enriched_file";
my $tab_reader = new DelimParser::Reader($fh, "\t");
my %enriched_GO;
while (my $row = $tab_reader->get_row()) {
my $category = $row->{category} or die "Error, cannot identify 'category' value";
$enriched_GO{$category} = $row;
}
return(%enriched_GO);
}
####
sub parse_GO_assignments {
my ($gene_ontology_assignment_file, $enriched_GO_href, $DE_genes_href) = @_;
my %GO_to_gene_ids;
open (my $fh, $gene_ontology_assignment_file) or die "Error, cannot open file $gene_ontology_assignment_file";
while (<$fh>) {
chomp;
my ($gene_id, $GO_assignments) = split(/\t/);
unless (exists $DE_genes_href->{$gene_id}) { next; }
my @GO = split(/,/, $GO_assignments);
foreach my $GO_id (@GO) {
if (exists $enriched_GO_href->{$GO_id}) {
$GO_to_gene_ids{$GO_id}->{$gene_id} = 1;
}
}
}
close $fh;
my %genes_enriched_GO;
foreach my $GO_id (keys %GO_to_gene_ids) {
my $gene_ids_href = $GO_to_gene_ids{$GO_id};
my @gene_list = sort keys %$gene_ids_href;
$genes_enriched_GO{$GO_id} = join(", ", @gene_list);
}
return(%genes_enriched_GO);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
die "Error, CMD: $cmd died with ret $ret";
}
return;
}
#!/usr/bin/env perl
use strict;
use warnings;
use Carp;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use FindBin;
use lib ("$FindBin::Bin/../../PerlLib");
use Fasta_reader;
use Data::Dumper;
my $help_flag;
my $usage = <<__EOUSAGE__;
####################################################################################
#
# required:
#
# --transcripts_fasta <string> fasta file containing transcript sequences.
#
# --gene_trans_map <string> gene-trans mapping file (format: gene_id(tab)transcript_id)
#
# --expr_matrix <string> expression matrix (fpkm or tpm, ideally TMM-normalized)
#
# --samples_file <string> samples file
#
# --min_pct_iso <float> minimum percent of total isoform expression
#
# optional:
#
# --out_prefix <string> output filename prefix ("pruned.\${min_pct_iso}"
#
####################################################################################
__EOUSAGE__
;
my $gene_trans_map_file;
my $expr_matrix;
my $samples_file;
my $min_pct_iso;
my $out_prefix;
my $transcripts_fasta_file;
&GetOptions ( 'h' => \$help_flag,
'gene_trans_map=s' => \$gene_trans_map_file,
'expr_matrix=s' => \$expr_matrix,
'transcripts_fasta=s' => \$transcripts_fasta_file,
'samples_file=s' => \$samples_file,
'min_pct_iso=s' => \$min_pct_iso,
'out_prefix=s' => \$out_prefix,
);
unless ($gene_trans_map_file && $expr_matrix && $samples_file && $min_pct_iso && $transcripts_fasta_file) {
die $usage;
}
unless ($out_prefix) {
$out_prefix = "pruned.$min_pct_iso";
}
main: {
print STDERR "-parsing samples file: $samples_file\n";
my %sample_to_replicates = &parse_samples_file($samples_file);
print STDERR "-parsing gene-trans mapping file: $gene_trans_map_file\n";
my %gene_to_trans = &parse_gene_trans_relationships($gene_trans_map_file);
print STDERR "-parsing expr matrix: $expr_matrix\n";
my %trans_to_avg_expr = &parse_expr_matrix($expr_matrix, \%sample_to_replicates);
my $out_log_filename = "$out_prefix.log";
my $out_fasta_filename = "$out_prefix.fasta";
open(my $out_log_fh, ">$out_log_filename") or die "Error, cannot write to $out_log_filename";
open(my $out_fasta_fh, ">$out_fasta_filename") or die "Error, cannot write to $out_fasta_filename";
print STDERR "-pruning isoforms\n";
## decide on which transcripts to keep
my %trans_retain;
foreach my $gene (keys %gene_to_trans) {
my @trans_ids = keys %{$gene_to_trans{$gene}};
if (scalar(@trans_ids) == 1) {
# no alt splicing, just keep it.
$trans_retain{ $trans_ids[0] } = 1;
next;
}
my %trans_to_sum_expr_all_conditions;
my @trans_to_sample_expr;
my %sample_to_sum_expr;
foreach my $sample_name (keys %sample_to_replicates) {
my $sum_expr = 0;
my @trans_to_expr;
foreach my $trans_id (@trans_ids) {
my $expr = $trans_to_avg_expr{$trans_id}->{$sample_name};
unless (defined $expr) {
die "Error, no expression set for trans_id: [$trans_id] and sample type: [$sample_name] ";
}
$trans_to_sum_expr_all_conditions{$trans_id} += $expr;
push (@trans_to_sample_expr, [$trans_id, $sample_name, $expr]);
$sample_to_sum_expr{$sample_name} += $expr;
}
}
# prioritize transcript by order across experiments
@trans_to_sample_expr = reverse sort {$trans_to_sum_expr_all_conditions{$a->[0]} <=> $trans_to_sum_expr_all_conditions{$b->[0]}} @trans_to_sample_expr;
my %seen_sample;
foreach my $trans_info_aref (@trans_to_sample_expr) {
# always retain top isoform in any sample type.
my ($trans_id, $sample_name, $expr) = @$trans_info_aref;
$expr = sprintf("%.3f", $expr);
my $retain_flag = 0;
my $selected_top_entry = 0;
if (! $seen_sample{$sample_name}) {
# top expr entry for that sample
$seen_sample{$sample_name} = 1;
$selected_top_entry = 1;
$retain_flag = 1;
}
my $sum_sample_expr = $sample_to_sum_expr{$sample_name};
my $pct_iso = 0;
if ($sum_sample_expr > 0) {
$pct_iso = sprintf("%.2f", $expr / $sum_sample_expr * 100);
if ($pct_iso >= $min_pct_iso) {
$retain_flag = 1;
}
}
if ($retain_flag) {
$trans_retain{$trans_id} = 1;
}
## output log entry
my $out_text = join("\t", $gene, $trans_id, $sample_name, $expr, $pct_iso);
if ($retain_flag) {
$out_text .= "\t[RETAINED]";
}
if ($selected_top_entry) {
$out_text .= "\t[TOP_EXPR]";
}
print $out_log_fh "$out_text\n";
}
print $out_log_fh "\n"; # spacer between genes.
}
close $out_log_fh;
my $fasta_reader = new Fasta_reader($transcripts_fasta_file) or die $!;
my %to_retrieve = %trans_retain;
print STDERR "-outputting filtered transcripts as: $out_fasta_filename\n";
while (my $seq_obj = $fasta_reader->next()) {
my $acc = $seq_obj->get_accession();
if ($trans_retain{$acc}) {
my $fasta_entry = $seq_obj->get_FASTA_format();
chomp $fasta_entry;
print $out_fasta_fh "$fasta_entry\n";
delete($to_retrieve{$acc});
}
}
if (%to_retrieve) {
die "Failed to retrieve fasta records for transcripts: " . Dumper(\%to_retrieve);
}
close $out_fasta_fh;
print STDERR "-done. See outputs: $out_prefix.\*\n\n";
exit(0);
}
####
sub parse_gene_trans_relationships {
my ($gene_trans_map_file) = @_;
my %gene_to_trans;
open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file";
while (<$fh>) {
unless (/\w/) { next; }
if (/^\#/) { next; }
chomp;
my ($gene_id, $trans_id) = split(/\t/);
$gene_to_trans{$gene_id}->{$trans_id} = 1;
}
close $fh;
return(%gene_to_trans);
}
####
sub parse_samples_file {
my ($filename) = @_;
my %sample_to_reps;
open (my $fh, $filename) or die "Error, cannot open file $filename";
while (<$fh>) {
chomp;
my ($condition, $replicate_name, @rest) = split(/\t/);
$sample_to_reps{$condition}->{$replicate_name} = 1;
}
return(%sample_to_reps);
}
####
sub parse_expr_matrix {
my ($expr_matrix, $sample_to_replicates_href) = @_;
open(my $fh, $expr_matrix) or die "Error, cannot open file $expr_matrix";
my $header = <$fh>;
chomp $header;
my @header_fields = split(/\t/, $header);
my %replicate_to_col_number;
my %trans_expr;
my $first_data_row = 1;
while(<$fh>) {
chomp;
my @x = split(/\t/);
# process column info as needed.
if ($first_data_row) {
if (scalar(@header_fields) == scalar(@x) - 1) {
# adjust header
unshift(@header_fields, '');
}
# assign header column numbers
for (my $i = 1; $i <= $#header_fields; $i++) {
my $rep_name = $header_fields[$i];
$replicate_to_col_number{$rep_name} = $i;
}
$first_data_row = 0;
}
my $trans_id = $x[0];
## process data, compute averages
foreach my $sample_name (keys %$sample_to_replicates_href) {
my @expr_vals;
my @rep_names = keys %{$sample_to_replicates_href->{$sample_name}};
foreach my $rep_name (@rep_names) {
my $col_id = $replicate_to_col_number{$rep_name} || die "Error, no column identified for replicate name: [$rep_name] ";
my $expr_val = $x[$col_id];
push (@expr_vals, $expr_val);
}
my $avg_expr = &sum(@expr_vals) / scalar(@expr_vals);
$trans_expr{$trans_id}->{$sample_name} = $avg_expr;
}
}
return(%trans_expr);
}
####
sub sum {
my @vals = @_;
my $sum_val = 0;
foreach my $v (@vals) {
$sum_val += $v;
}
return($sum_val);
}
#!/usr/bin/env perl
use strict;
use warnings;
use Carp;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use FindBin;
use Data::Dumper;
my $help_flag;
my $usage = <<__EOUSAGE__;
####################################################################################
#
# required:
#
# --gtf <string> gtf transcript structure file
#
# --pruned_transcripts_fasta <string> fasta file containing transcript sequences.
#
####################################################################################
__EOUSAGE__
;
my $gtf_file;
my $transcripts_fasta_file;
&GetOptions ( 'h' => \$help_flag,
'gtf=s' => \$gtf_file,
'pruned_transcripts_fasta=s' => \$transcripts_fasta_file,
);
unless ($gtf_file && $transcripts_fasta_file) {
die $usage;
}
main: {
my %trans_ids_want;
{
open (my $fh, $transcripts_fasta_file) or die "Error, cannot open file $transcripts_fasta_file";
while (<$fh>) {
if (/^>(\S+)/) {
my $trans_id = $1;
$trans_ids_want{$trans_id} = 1;
}
}
close $fh;
}
my %remain_to_find_in_gtf = %trans_ids_want;
open(my $fh, $gtf_file) or die $!;
while (<$fh>) {
unless (/\w/) { next; }
if (/^\#/) { next; }
my $line = $_;
if (/transcript_id \"([^\"]+)\"/) {
my $trans_id = $1;
if ($trans_ids_want{$trans_id}) {
print $line;
delete $remain_to_find_in_gtf{$trans_id};
}
}
}
close $fh;
if (%remain_to_find_in_gtf) {
die "Error, didn't encounter entries for the following transcripts in the gtf file: " . Dumper(\%remain_to_find_in_gtf);
}
exit(0);
}
......@@ -15,6 +15,8 @@ use Data::Dumper;
my $ROTS_B = 500;
my $ROTS_K = 5000;
my $MIN_REPS_MIN_CPM = "2,1";
my $usage = <<__EOUSAGE__;
......@@ -42,7 +44,10 @@ my $usage = <<__EOUSAGE__;
#
# General options:
#
# --min_rowSum_counts <int> default: 2 (only those rows of matrix meeting requirement will be tested)
# --min_reps_min_cpm <string> default: $MIN_REPS_MIN_CPM (format: 'min_reps,min_cpm')
# At least min count of replicates must have cpm values > min cpm value.
# (ie. filtMatrix = matrix[rowSums(cpm(matrix)> min_cpm) >= min_reps, ] adapted from edgeR manual)
# Note, ** if no --samples_file, default for min_reps is set = 1 **
#
# --output|o name of directory to place outputs (default: \$method.\$pid.dir)
#
......@@ -90,7 +95,7 @@ __EOUSAGE__
my $matrix_file;
my $method;
my $samples_file;
my $min_rowSum_counts = 2;
my $help_flag;
my $output_dir;
my $dispersion; # I've used 0.1 myself - but read the manual to guide your choice.
......@@ -107,7 +112,7 @@ my $make_tar_gz_file = 0;
'method=s' => \$method,
'samples_file|s=s' => \$samples_file,
'output|o=s' => \$output_dir,
'min_rowSum_counts=i' => \$min_rowSum_counts,
'min_reps_min_cpm=s' => \$MIN_REPS_MIN_CPM,
'dispersion=f' => \$dispersion,
'reference_sample=s' => \$reference_sample,
......@@ -149,6 +154,17 @@ unless ($method =~ /^(edgeR|DESeq2|voom|ROTS|GLM)$/) {
die "Error, do not recognize --method [$method]";
}
my ($MIN_REPS, $MIN_CPM) = split(/,/, $MIN_REPS_MIN_CPM);
if ($samples_file) {
unless ($MIN_REPS > 0 && $MIN_CPM > 0) {
die "Error, --min_reps_min_cpm $MIN_REPS_MIN_CPM must include values > 0 in comma-delimited format. ex. '2,1' ";
}
}
else {
print STDERR "-note, no biological replicates identified, so setting min reps = $MIN_REPS.\n";
$MIN_REPS = 1;
}
main: {
......@@ -388,7 +404,7 @@ sub run_edgeR_sample_pair {
print $ofh "col_ordering = c(" . join(",", @rep_column_indices) . ")\n";
print $ofh "rnaseqMatrix = data[,col_ordering]\n";
print $ofh "rnaseqMatrix = round(rnaseqMatrix)\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=$min_rowSum_counts,]\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(cpm(rnaseqMatrix) > $MIN_CPM) >= $MIN_REPS,]\n";
print $ofh "conditions = factor(c(rep(\"$sample_A\", $num_rep_A), rep(\"$sample_B\", $num_rep_B)))\n";
print $ofh "\n";
print $ofh "exp_study = DGEList(counts=rnaseqMatrix, group=conditions)\n";
......@@ -410,13 +426,20 @@ sub run_edgeR_sample_pair {
print $ofh "et = exactTest(exp_study, pair=c(\"$sample_A\", \"$sample_B\"), dispersion=$dispersion)\n";
}
print $ofh "tTags = topTags(et,n=NULL)\n";
print $ofh "write.table(tTags, file=\'$output_prefix.edgeR.DE_results\', sep='\t', quote=F, row.names=T)\n";
print $ofh "result_table = tTags\$table\n";
print $ofh "result_table = data.frame(sampleA=\"$sample_A\", sampleB=\"$sample_B\", result_table)\n";
## reset logfc so it's A/B instead of B/A to be consistent with DESeq2
print $ofh "result_table\$logFC = -1 * result_table\$logFC\n";
print $ofh "write.table(result_table, file=\'$output_prefix.edgeR.DE_results\', sep='\t', quote=F, row.names=T)\n";
print $ofh "write.table(rnaseqMatrix, file=\'$output_prefix.edgeR.count_matrix\', sep='\t', quote=F, row.names=T)\n";
## generate MA and Volcano plots
print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.edgeR.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "result_table = tTags\$table\n";
print $ofh "plot_MA_and_Volcano(result_table\$logCPM, result_table\$logFC, result_table\$FDR)\n";
print $ofh "plot_MA_and_Volcano(rownames(result_table), result_table\$logCPM, result_table\$logFC, result_table\$FDR)\n";
print $ofh "dev.off()\n";
close $ofh;
......@@ -466,7 +489,7 @@ sub run_DESeq2_sample_pair {
## write R-script to run DESeq
open (my $ofh, ">$Rscript_name") or die "Error, cannot write to $Rscript_name";
print $ofh "library(edgeR)\n";
print $ofh "library(DESeq2)\n";
print $ofh "\n";
......@@ -474,7 +497,7 @@ sub run_DESeq2_sample_pair {
print $ofh "col_ordering = c(" . join(",", @rep_column_indices) . ")\n";
print $ofh "rnaseqMatrix = data[,col_ordering]\n";
print $ofh "rnaseqMatrix = round(rnaseqMatrix)\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=$min_rowSum_counts,]\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(cpm(rnaseqMatrix) > $MIN_CPM) >= $MIN_REPS,]\n";
print $ofh "conditions = data.frame(conditions=factor(c(rep(\"$sample_A\", $num_rep_A), rep(\"$sample_B\", $num_rep_B))))\n";
print $ofh "rownames(conditions) = colnames(rnaseqMatrix)\n";
print $ofh "ddsFullCountTable <- DESeqDataSetFromMatrix(\n"
......@@ -482,29 +505,31 @@ sub run_DESeq2_sample_pair {
. " colData = conditions,\n"
. " design = ~ conditions)\n";
print $ofh "dds = DESeq(ddsFullCountTable)\n";
print $ofh "res = results(dds)\n";
print $ofh "contrast=c(\"conditions\",\"$sample_A\",\"$sample_B\")\n";
print $ofh "res = results(dds, contrast)\n";
# adj from: Carsten Kuenne, thx!
##recreates baseMeanA and baseMeanB columns that are not created by default DESeq2 anymore
print $ofh "baseMeanA <- rowMeans(counts(dds, normalized=TRUE)[,colData(dds)\$condition == \"$sample_A\"])\n";
print $ofh "baseMeanB <- rowMeans(counts(dds, normalized=TRUE)[,colData(dds)\$condition == \"$sample_B\"])\n";
print $ofh "baseMeanA <- rowMeans(counts(dds, normalized=TRUE)[,colData(dds)\$conditions == \"$sample_A\"])\n";
print $ofh "baseMeanB <- rowMeans(counts(dds, normalized=TRUE)[,colData(dds)\$conditions == \"$sample_B\"])\n";
print $ofh "res = cbind(baseMeanA, baseMeanB, as.data.frame(res))\n";
##adds an “id” column headline for column 0
print $ofh "res = cbind(id=rownames(res), as.data.frame(res))\n";
print $ofh "res = cbind(sampleA=\"$sample_A\", sampleB=\"$sample_B\", as.data.frame(res))\n";
print $ofh "res\$padj[is.na(res\$padj)] <- 1\n"; # Carsten Kuenne
##set row.names to false to accomodate change above
## output results
print $ofh "write.table(as.data.frame(res[order(res\$pvalue),]), file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE, row.names=F)\n";
print $ofh "write.table(as.data.frame(res[order(res\$pvalue),]), file=\'$output_prefix.DESeq2.DE_results\', sep='\t', quote=FALSE)\n";
print $ofh "write.table(rnaseqMatrix, file=\'$output_prefix.DESeq2.count_matrix\', sep='\t', quote=FALSE)\n";
## generate MA and Volcano plots
print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.DESeq2.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "plot_MA_and_Volcano(log2(res\$baseMean+1), res\$log2FoldChange, res\$padj)\n";
print $ofh "plot_MA_and_Volcano(rownames(res), log2(res\$baseMean+1), res\$log2FoldChange, res\$padj)\n";
print $ofh "dev.off()\n";
......@@ -555,7 +580,7 @@ sub run_limma_voom_sample_pair {
print $ofh "col_ordering = c(" . join(",", @rep_column_indices) . ")\n";
print $ofh "rnaseqMatrix = data[,col_ordering]\n";
print $ofh "rnaseqMatrix = round(rnaseqMatrix)\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=$min_rowSum_counts,]\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(cpm(rnaseqMatrix) > $MIN_CPM) >= $MIN_REPS,]\n";
print $ofh "conditions = factor(c(rep(\"$sample_A\", $num_rep_A), rep(\"$sample_B\", $num_rep_B)))\n";
print $ofh "\n";
print $ofh "design = model.matrix(~conditions)\n";
......@@ -570,16 +595,18 @@ sub run_limma_voom_sample_pair {
print $ofh "# output results, including average expression val for each feature\n";
print $ofh "c = cpm(x)\n";
print $ofh "m = apply(c, 1, mean)\n";
print $ofh "tTags\$logFC = -1 * tTags\$logFC # make A/B instead of B/A\n";
print $ofh "tTags2 = cbind(tTags, logCPM=log2(m[rownames(tTags)]))\n";
print $ofh "DE_matrix = data.frame(logFC=tTags\$logFC, logCPM=tTags2\$logCPM, PValue=tTags\$'P.Value', FDR=tTags\$'adj.P.Val')\n";
print $ofh "DE_matrix = data.frame(sampleA=\"$sample_A\", sampleB=\"$sample_B\", logFC=tTags\$logFC, logCPM=tTags2\$logCPM, PValue=tTags\$'P.Value', FDR=tTags\$'adj.P.Val')\n";
print $ofh "rownames(DE_matrix) = rownames(tTags)\n";
print $ofh "write.table(DE_matrix, file=\'$output_prefix.voom.DE_results\', sep='\t', quote=F, row.names=T)\n";
print $ofh "write.table(rnaseqMatrix, file=\'$output_prefix.voom.count_matrix\', sep='\t', quote=F, row.names=T)\n";
## generate MA and Volcano plots
print $ofh "# MA and volcano plots\n";
print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.voom.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "plot_MA_and_Volcano(tTags2\$logCPM, tTags\$logFC, tTags\$'adj.P.Val')\n";
print $ofh "plot_MA_and_Volcano(rownames(tTags2), tTags2\$logCPM, tTags\$logFC, tTags\$'adj.P.Val')\n";
print $ofh "dev.off()\n";
close $ofh;
......@@ -640,7 +667,7 @@ sub run_ROTS_sample_pair {
print $ofh "col_ordering = c(" . join(",", @rep_column_indices) . ")\n";
print $ofh "rnaseqMatrix = data[,col_ordering]\n";
print $ofh "rnaseqMatrix = round(rnaseqMatrix)\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=$min_rowSum_counts,]\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(cpm(rnaseqMatrix) > $MIN_CPM) >= $MIN_REPS,]\n";
print $ofh "conditions = factor(c(rep(\"$sample_A\", $num_rep_A), rep(\"$sample_B\", $num_rep_B)))\n";
print $ofh "\n";
print $ofh "design = model.matrix(~conditions)\n";
......@@ -667,15 +694,17 @@ sub run_ROTS_sample_pair {
print $ofh "logFC = log2(FC)\n";
print $ofh "results = summary(res_voom, fdr=0.1)\n";
print $ofh "feature_order = rownames(results)\n";
print $ofh "final_table = data.frame(logCPM=log2(m+1)[feature_order], CPM_A=mean_sampleA_cpm[feature_order], CPM_B=mean_sampleB_cpm[feature_order], logFC=logFC[feature_order], results)\n";
print $ofh "final_table = data.frame(sampleA=\"$sample_A\", sampleB=\"$sample_B\", logCPM=log2(m+1)[feature_order], CPM_A=mean_sampleA_cpm[feature_order], CPM_B=mean_sampleB_cpm[feature_order], logFC=logFC[feature_order], results)\n";
print $ofh "write.table(final_table, file=\"$output_prefix.ROTS.DE_results\", quote=F, sep='\t')\n";
print $ofh "write.table(rnaseqMatrix, file=\"$output_prefix.ROTS.count_matrix\", quote=F, sep='\t')\n";
## generate MA and Volcano plots
print $ofh "# MA and volcano plots\n";
print $ofh "source(\"$FindBin::RealBin/R/rnaseq_plot_funcs.R\")\n";
print $ofh "pdf(\"$output_prefix.voom.DE_results.MA_n_Volcano.pdf\")\n";
print $ofh "plot_MA_and_Volcano(final_table\$logCPM, final_table\$logFC, final_table\$FDR)\n";
print $ofh "plot_MA_and_Volcano(rownames(final_table), final_table\$logCPM, final_table\$logFC, final_table\$FDR)\n";
print $ofh "dev.off()\n";
......@@ -737,7 +766,8 @@ sub run_GLM {
print $ofh "data = read.table(\"$matrix_file\", header=T, row.names=1, com='')\n";
print $ofh "rnaseqMatrix = round(data)\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=$min_rowSum_counts,]\n";
print $ofh "rnaseqMatrix = rnaseqMatrix[rowSums(cpm(rnaseqMatrix) > $MIN_CPM) >= $MIN_REPS,]\n";
print $ofh "design = model.matrix(~0+groups)\n";
print $ofh "colnames(design) = levels(groups)\n";
......
......@@ -2,6 +2,7 @@
use strict;
use warnings;
use FindBin;
use Getopt::Long qw(:config no_ignore_case bundling pass_through);
......@@ -15,24 +16,30 @@ my $usage = <<__EOUSAGE__;
#
# --GO_assignments <string> extracted GO assignments with format: feature_id <tab> GO:000001,GO:00002,...
#
# --lengths <string> feature lengths with format: feature_id <tab> length
# --lengths <string> feature lengths file with format: feature_id <tab> length
#
# --background <string> gene ids file that defines the full population of genes to consider in testing.
# Ideally, these represent the genes that are expressed and relevant to this test
# as opposed to using all genes in the genome.
#
###############################################################################################
__EOUSAGE__
;
my ($factor_labeling, $GO_file, $help_flag, $lengths_file, $genes_single_factor_file);
my $background_file;
&GetOptions("factor_labeling=s" => \$factor_labeling,
"GO_assignments=s" => \$GO_file,
"lengths=s" => \$lengths_file,
"genes_single_factor=s" => \$genes_single_factor_file,
"background=s" => \$background_file,
"help|h" => \$help_flag,
);
......@@ -43,7 +50,7 @@ if ($help_flag) {
}
unless (($factor_labeling || $genes_single_factor_file) && $GO_file && $lengths_file) {
unless (($factor_labeling || $genes_single_factor_file) && $GO_file && $lengths_file && $background_file) {
die $usage;
}
......@@ -53,6 +60,12 @@ main: {
my $Rscript = "__runGOseq.R";
open (my $ofh, ">$Rscript") or die $!;
print $ofh "library(goseq)\n";
print $ofh "library(GO.db)\n";
print $ofh "library(qvalue)\n";
print $ofh "# capture list of genes for functional enrichment testing\n";
if ($genes_single_factor_file) {
print $ofh "factor_labeling = read.table(\"$genes_single_factor_file\", row.names=1)\n";
print $ofh "factor_labeling[,1] = rep('custom_list', dim(factor_labeling)[1])\n";
......@@ -64,18 +77,25 @@ main: {
print $ofh "colnames(factor_labeling) = c('type')\n";
print $ofh "factor_list = unique(factor_labeling[,1])\n";
print $ofh "DE_genes = rownames(factor_labeling)\n";
print $ofh "gene_lengths = read.table(\"$lengths_file\", header=T, row.names=1)\n";
print $ofh "\n\n# get gene lengths\n";
print $ofh "gene_lengths = read.table(\"$lengths_file\", header=T, row.names=1, com='')\n";
print $ofh "gene_lengths = as.matrix(gene_lengths[,1,drop=F])\n";
print $ofh "\n\n# get background gene list\n";
print $ofh "background = read.table(\"$background_file\", header=T, row.names=1)\n";
print $ofh "background.gene_ids = rownames(background)\n";
print $ofh "background.gene_ids = unique(c(background.gene_ids, DE_genes))\n";
print $ofh "sample_set_gene_ids = background.gene_ids\n";
print $ofh "\n\n# parse GO assignments\n";
print $ofh "GO_info = read.table(\"$GO_file\", header=F, row.names=1,stringsAsFactors=F)\n";
print $ofh "GO_info_listed = apply(GO_info, 1, function(x) unlist(strsplit(x,',')))\n";
print $ofh "names(GO_info_listed) = rownames(GO_info)\n";
print $ofh "features_with_GO = rownames(GO_info)\n";
print $ofh "lengths_features_with_GO = gene_lengths[features_with_GO,]\n";
print $ofh "get_GO_term_descr = function(x) {\n";
print $ofh " d = 'none';\n"
. " go_info = GOTERM[[x]];\n"
......@@ -84,28 +104,39 @@ main: {
. "}\n";
print $ofh "# build pwf based on ALL DE features\n";
print $ofh "cat_genes_vec = as.integer(features_with_GO %in% rownames(factor_labeling))\n";
print $ofh "library(goseq)\n";
print $ofh "library(GO.db)\n";
print $ofh "library(qvalue)\n";
print $ofh "\n\n#organize go_id -> list of genes\n";
print $ofh "GO_to_gene_list = list()\n";
print $ofh "for (gene_id in intersect(names(GO_info_listed), sample_set_gene_ids)) {\n";
print $ofh " go_list = GO_info_listed[[gene_id]]\n";
print $ofh " for (go_id in go_list) {\n";
print $ofh " GO_to_gene_list[[go_id]] = c(GO_to_gene_list[[go_id]], gene_id)\n";
print $ofh " }\n";
print $ofh "}\n";
print $ofh "\n\n# GO-Seq protocol: build pwf based on ALL DE features\n";
print $ofh "pwf=nullp(cat_genes_vec,bias.data=lengths_features_with_GO)\n";
print $ofh "rownames(pwf) = names(GO_info_listed)\n";
print $ofh "sample_set_gene_lengths = gene_lengths[sample_set_gene_ids,]\n";
print $ofh "GO_info_listed = GO_info_listed[ names(GO_info_listed) %in% sample_set_gene_ids ]\n";
print $ofh "cat_genes_vec = as.integer(sample_set_gene_ids %in% rownames(factor_labeling))\n";
print $ofh "pwf=nullp(cat_genes_vec, bias.data=sample_set_gene_lengths)\n";
print $ofh "rownames(pwf) = sample_set_gene_ids\n";
print $ofh "\n\n# perform functional enrichment testing for each category.\n";
print $ofh "for (feature_cat in factor_list) {\n";
print $ofh " message('Processing category: ', feature_cat)\n";
print $ofh " cat_genes_vec = as.integer(features_with_GO %in% rownames(factor_labeling)[factor_labeling\$type == feature_cat])\n";
#print $ofh " names(cat_genes_vec) = features_with_GO\n";
print $ofh " gene_ids_in_feature_cat = rownames(factor_labeling)[factor_labeling\$type == feature_cat]\n";
print $ofh " cat_genes_vec = as.integer(sample_set_gene_ids %in% gene_ids_in_feature_cat)\n";
print $ofh " pwf\$DEgenes = cat_genes_vec\n";
print $ofh " res = goseq(pwf,gene2cat=GO_info_listed)\n";
print $ofh " res = goseq(pwf,gene2cat=GO_info_listed, use_genes_without_cat=TRUE)\n";
## Process the over-represented
print $ofh " ## over-represented categories:\n";
#print $ofh " res\$over_represented_FDR = p.adjust(res\$over_represented_pvalue, method='BH')\n";
print $ofh " pvals = res\$over_represented_pvalue\n";
print $ofh " pvals[pvals > 1 - 1e-10] = 1 - 1e-10\n";
print $ofh " q = qvalue(pvals)\n";
......@@ -121,6 +152,13 @@ main: {
print $ofh " descr = unlist(lapply(result_table\$category, get_GO_term_descr))\n";
print $ofh " result_table\$go_term = descr;\n";
print $ofh " result_table\$gene_ids = do.call(rbind, lapply(result_table\$category, function(x) { \n" .
" gene_list = GO_to_gene_list[[x]]\n" .
" gene_list = gene_list[gene_list %in% gene_ids_in_feature_cat]\n" .
" paste(gene_list, collapse=', ');\n" .
" }) )\n";
print $ofh " write.table(result_table[order(result_table\$over_represented_pvalue),], file=go_enrich_filename, sep='\t', quote=F, row.names=F)\n";
......@@ -146,11 +184,6 @@ main: {
print $ofh " write.table(result_table[order(result_table\$under_represented_pvalue),], file=go_depleted_filename, sep='\t', quote=F, row.names=F)\n";
print $ofh "}\n";
close $ofh;
......
#!/usr/bin/env Rscript
args<-commandArgs(TRUE)
if (length(args) == 0) {
stop("usage: validate_UP_subset.Rscript samples.txt");
}
samples_txt_file <- args[1]
validate_UP_subset <- function(filename, samples_list) {
cat("validating: ", filename, "\n")
data <- read.table(filename, header=T, stringsAsFactors=F)
if (nrow(data) == 0) {
message("no DE entries reported here.")
return(0)
}
sampleA <- data[['sampleA']][1]
sampleB <- data[['sampleB']][1]
sampleA_rep_names <- samples_list[[sampleA]][['replicate']]
sampleB_rep_names <- samples_list[[sampleB]][['replicate']]
sampleA_idx <- which(colnames(data) %in% sampleA_rep_names)
sampleB_idx <- which(colnames(data) %in% sampleB_rep_names)
sampleA_mean <- apply(data, 1, function(x) mean(as.numeric(x[sampleA_idx])))
sampleB_mean <- apply(data, 1, function(x) mean(as.numeric(x[sampleB_idx])))
cat("sampleA_mean: ", sampleA_mean, "\n")
cat("sampleB_mean: ", sampleB_mean, "\n")
num_discordant <- sum(sampleA_mean/sampleB_mean < 0)
cat("num discordant: ", num_discordant, "\n")
return(num_discordant)
}
samples_info <- read.table(samples_txt_file, stringsAsFactors=F)
colnames(samples_info) <- c('sample', 'replicate')
samples_list <- split(samples_info, samples_info$sample)
files <- dir(pattern="*-UP.subset$")
errflag <- 0
for (file in files) {
num_discordant <- validate_UP_subset(file, samples_list)
if (num_discordant > 0) {
errflag <- 1
warning("File: (", file, ") has ", num_discordant, " discordant features")
}
else {
message("File: (", file, ") passes test.")
}
}
quit("no", errflag)
......@@ -14,7 +14,7 @@ my $help_flag;
my $min_per_id = 99;
my $max_per_gap = 1;
my $min_per_length = 100;
my $min_per_length = 99;
my $usage = <<__EOUSAGE__;
......
......@@ -7,8 +7,11 @@ use lib ("$FindBin::RealBin/../../../PerlLib");
use SingleLinkageClusterer;
use PSL_parser;
use Process_cmd;
use Getopt::Long qw(:config no_ignore_case bundling);
use DelimParser;
use File::Basename;
......@@ -245,8 +248,7 @@ sub resolve_overlaps {
my $key = join("$;", $gene_id, $trans_id);
my $struct = $mapping_info{$key} or die "Error, cannot find alignment for $key";
print $ofh $struct->{psl}->get_line();
print $ofh $struct->{line} . "\n";
}
......@@ -300,46 +302,40 @@ sub examine_blat_mappings {
my %mappings;
my $psl_parser = new PSL_parser($blat_output);
while (my $psl_entry = $psl_parser->get_next()) {
#print $psl_entry->toString();
my $cmd = "$FindBin::Bin/blat_psl_to_align_summary_stats.pl $blat_output > $blat_output.stats";
&process_cmd($cmd);
my $trans_assembly = $psl_entry->get_Q_name();
my $gene_id = $psl_entry->get_T_name();
my $per_id = $psl_entry->get_per_id();
open(my $fh, "$blat_output.stats") or die "Error, cannot open file $blat_output.stats";
my $tab_reader = new DelimParser::Reader($fh, "\t");
while (my $row = $tab_reader->get_row()) {
my $num_gap_opens = $psl_entry->get_T_gap_count() + $psl_entry->get_Q_gap_count();
my $num_gap_bases = $psl_entry->get_T_gap_bases() + $psl_entry->get_Q_gap_bases();
my $num_matches = $psl_entry->get_match_count();
my $num_mismatches = $psl_entry->get_mismatch_count();
#print $psl_entry->toString();
my $line = $tab_reader->reconstruct_line_from_row($row);
my ($trans_end5, $trans_end3) = $psl_entry->get_Q_span();
my ($gene_end5, $gene_end3) = $psl_entry->get_T_span();
my $trans_assembly = $row->{query};
my $gene_id = $row->{target};
my $per_id = $row->{per_id};
my $orient = $psl_entry->get_strand();
my $orient = $row->{strand};
if ($forward_orient && $orient eq '-') { next; }
if ($per_id < $min_per_id) { next; }
my $gene_seq_len = $psl_entry->get_T_size();
my $percent_gapped = $row->{per_gap};
my $percent_gapped = ($num_gap_bases) / ($num_matches + $num_mismatches) * 100;
if ($percent_gapped > $max_per_gap ) {
#print STDERR "\%gapped = $percent_gapped, so skipping.\n";
next; # too gappy
}
my ($gene_lend, $gene_rend) = sort {$a<=>$b} ($gene_end5, $gene_end3);
my ($gene_lend, $gene_rend) = ($row->{target_lend}, $row->{target_rend});
my ($trans_lend, $trans_rend) = sort {$a<=>$b} ($trans_end5, $trans_end3);
my ($trans_lend, $trans_rend) = ($row->{query_lend}, $row->{query_rend});
my $delta = ($gene_lend - 1) + ($gene_seq_len - $gene_rend);
my $pct_len = 100 - ($delta/$gene_seq_len * 100);
my $pct_len = $row->{per_len};
#print STDERR "PERCENT_LEN: $pct_len (vs. $min_per_length)\n";
......@@ -351,7 +347,7 @@ sub examine_blat_mappings {
## score the alignment:
my $score = (5 * $num_matches) - (4 * $num_mismatches) - (20 * $num_gap_opens) - log($num_gap_bases + 1);
my $score = $row->{score};
my $struct = $mapping_info{$pair};
......@@ -378,16 +374,14 @@ sub examine_blat_mappings {
strand => $orient,
psl => $psl_entry,
line => $line,
};
}
}
}
return(%mappings);
}
......
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use lib ("$FindBin::RealBin/../../../PerlLib");
use PSL_parser;
use DelimParser;
my $usage = "\n\n\tusage: $0 blat_output.pslx\n\n";
my $blat_output = $ARGV[0] or die $usage;
main: {
my @column_fields = ("target", "query", "per_id", "per_gap", "score",
"target_lend", "target_rend", "query_lend", "query_rend",
"strand", "per_len");
my $tab_writer = new DelimParser::Writer(*STDOUT, "\t", \@column_fields);
my $psl_parser = new PSL_parser($blat_output);
while (my $psl_entry = $psl_parser->get_next()) {
#print $psl_entry->toString();
my $trans_assembly = $psl_entry->get_Q_name();
my $gene_id = $psl_entry->get_T_name();
my $per_id = $psl_entry->get_per_id();
my $num_gap_opens = $psl_entry->get_T_gap_count() + $psl_entry->get_Q_gap_count();
my $num_gap_bases = $psl_entry->get_T_gap_bases() + $psl_entry->get_Q_gap_bases();
my $num_matches = $psl_entry->get_match_count();
my $num_mismatches = $psl_entry->get_mismatch_count();
my ($trans_end5, $trans_end3) = $psl_entry->get_Q_span();
my ($gene_end5, $gene_end3) = $psl_entry->get_T_span();
my $orient = $psl_entry->get_strand();
my $gene_seq_len = $psl_entry->get_T_size();
my $percent_gapped = ($num_gap_bases) / ($num_matches + $num_mismatches) * 100;
my ($gene_lend, $gene_rend) = sort {$a<=>$b} ($gene_end5, $gene_end3);
my ($trans_lend, $trans_rend) = sort {$a<=>$b} ($trans_end5, $trans_end3);
my $delta = ($gene_lend - 1) + ($gene_seq_len - $gene_rend);
my $pct_len = 100 - ($delta/$gene_seq_len * 100);
$pct_len = sprintf("%.2f", $pct_len);
## score the alignment:
my $score = (5 * $num_matches) - (4 * $num_mismatches) - (20 * $num_gap_opens) - log($num_gap_bases + 1);
my %row = (
target => $gene_id,
query => $trans_assembly,
per_id => $per_id,
score => sprintf("%.2f", $score),
target_lend => $gene_lend,
target_rend => $gene_rend,
query_lend => $trans_lend,
query_rend => $trans_rend,
strand => $orient,
per_gap => sprintf("%.2f", $percent_gapped),
per_len => $pct_len,
#psl => $psl_entry,
);
$tab_writer->write_row(\%row);
}
}