[submodule "TransDecoder.lrgTests"]
path = TransDecoder.lrgTests
url = https://github.com/TransDecoder/TransDecoder.lrgTests.git
[submodule "TransDecoder.wiki"]
path = TransDecoder.wiki
url = https://github.com/TransDecoder/TransDecoder.wiki.git
## v5.0.0 August 26, 2017
-algorithm updates: the frame[0] score must be > 0 and the maximum across the first 3 reading frames (instead of all 6); the orf with the highest frame[0] score is chosen, allowing for minimal overlap among selected predictions (see the sketch after this entry).
-option --single_best_only provides the single longest of the selected orfs per contig.
-long orfs unlikely to appear in random sequence are automatically selected as candidates with this minimal long orf length set dynamically according to GC content.
-orf score and blast or pfam info are propagated to the gff3 output
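A rough sketch of that frame-score test (illustrative Perl, not the actual TransDecoder source; the subroutine name is made up):

sub passes_frame_score_test {
    # Markov scores for the three forward-strand reading frames of one candidate ORF
    my ($frame0_score, $frame1_score, $frame2_score) = @_;
    my $max_other = ($frame1_score > $frame2_score) ? $frame1_score : $frame2_score;
    # keep the ORF only if the in-frame score is positive and beats the other forward frames
    return ($frame0_score > 0 && $frame0_score > $max_other) ? 1 : 0;
}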
## v4.1.0
-single best orf now selected by default. If more than the single best orf is wanted, use the --all_good_orfs parameter.
-start codon refinement is now done by default. To turn it off and get the original behavior of extending to the longest orf position, use parameter: --no_refine_starts
-cdhit has been removed and replaced by our own fast method for removing redundancies.
-selection of coding regions is strictly governed by Markov-based likelihood scores across reading frames. No auto-retention of long orfs by default, but can be activated by parameter: --retain_long_orfs_length
** all v4 releases pre-v4.1 were fairly quickly retracted due to bugs and insufficient benchmarking **
## v3.0.2 release Oct 31, 2016
minor bugfix release - when checking that required utilities are installed, an absolute path (leading '/') is no longer required
## v3.0.0 release April 26, 2016
TransDecoder v3.0.0 includes the following changes:
TransDecoder.LongOrfs now includes the parameter '--gene_trans_map' as a way to retain the gene identifier information. In the case of Cufflinks and Trinity, the gene identifiers will automatically be recognized and retained. For PASA and other inputs, it is necessary to provide the gene-to-transcript identifier mappings in order to generate isoform-clustered output files (gff3).
TransDecoder.Predict now includes the flag '--single_best_orf' to retain only the single 'best' ORF per transcript. ORFs are prioritized according to homology information (if given blast and pfam results) and by sequence length, with longer ORFs preferred.
Codon phase information is now included in the GFF3 output files.
The .mRNA files that were generated by default for genome-free TransDecoder runs are now deprecated; the .cds and .pep files are still provided.
The sample data sets include examples for running TransDecoder in a few different contexts, including starting from Trinity, PASA, or Cufflinks data.
More useful logging information is provided so it's clearer how many orfs are being retained and which are being eliminated along the way.
## 2016-03-11 v2.1 release
-added cpu parameter to TransDecoder.Predict
-retaining gene identifier information from cufflinks output
......
SHELL := /bin/bash
all:
cd ./transdecoder_plugins/ && $(MAKE) all
@echo Nothing to build. Run \'make test\' to run example executions on each of the sample data sets provided.
clean:
cd ./transdecoder_plugins/ && $(MAKE) clean
cd ./sample_data && ./cleanme.pl
cd ./sample_data && make clean
test:
cd ./sample_data/ && ./runMe.sh
cd ./sample_data/ && make test
#!/usr/bin/env perl
# classes for DelimParser::Reader and DelimParser::Writer
package DelimParser;
use strict;
use warnings;
use Carp;
####
sub new {
my ($packagename, $fh, $delimiter) = @_;
unless ($fh && $delimiter) {
confess "Error, need filehandle and delimiter params";
}
my $self = { _delim => $delimiter,
_fh => $fh,
# set below in _init()
_column_headers => [],
};
bless ($self, $packagename);
return($self);
}
####
sub get_fh {
my $self = shift;
return($self->{_fh});
}
####
sub get_delim {
my $self = shift;
return($self->{_delim});
}
####
sub get_column_headers {
my $self = shift;
return(@{$self->{_column_headers}});
}
####
sub set_column_headers {
my $self = shift;
my (@columns) = @_;
$self->{_column_headers} = \@columns;
return;
}
####
sub get_num_columns {
    my $self = shift;
    my @column_headers = $self->get_column_headers();
    return(scalar(@column_headers)); # number of columns, not the string length of the list
}
###
sub reconstruct_header_line {
my $self = shift;
my @column_headers = $self->get_column_headers();
my $header_line = join($self->get_delim(), @column_headers); # use the configured delimiter, not a hardcoded tab
return($header_line);
}
###
sub reconstruct_line_from_row {
my $self = shift;
my $row_href = shift;
unless ($row_href && ref $row_href) {
confess "Error, must set row_href as param";
}
my @column_headers = $self->get_column_headers();
my @vals;
foreach my $col_header (@column_headers) {
my $val = $row_href->{$col_header};
push (@vals, $val);
}
my $row_text = join($self->get_delim(), @vals); # use the configured delimiter, not a hardcoded tab
return($row_text);
}
##################################################
package DelimParser::Reader;
use strict;
use warnings;
use Carp;
use Data::Dumper;
our @ISA;
push (@ISA, 'DelimParser');
sub new {
my ($packagename) = shift;
my $self = $packagename->DelimParser::new(@_);
$self->_init();
return($self);
}
####
sub _init {
my $self = shift;
my $fh = $self->get_fh();
my $delim = $self->get_delim();
my $header_row = <$fh>;
unless (defined $header_row) {
    confess "Error, no header row read.";
}
chomp $header_row;
my @fields = split(/$delim/, $header_row);
$self->set_column_headers(@fields);
return;
}
####
sub get_row {
my $self = shift;
my $fh = $self->get_fh();
my $line = <$fh>;
unless ($line) {
return(undef); # eof
}
my $delim = $self->get_delim();
my @fields = split(/$delim/, $line);
chomp $fields[$#fields]; ## it's important that this is done after the delimiter splitting in case the last field is actually empty.
my @column_headers = $self->get_column_headers();
my $num_col = scalar (@column_headers);
my $num_fields = scalar(@fields);
if ($num_col != $num_fields) {
confess "Error, line: [$line] " . Dumper(\@fields) . " is lacking $num_col fields: " . Dumper(\@column_headers);
}
my %dict;
foreach my $colname (@column_headers) {
my $field = shift @fields;
$dict{$colname} = $field;
}
return(\%dict);
}
##################################################
package DelimParser::Writer;
use strict;
use warnings;
use Carp;
our @ISA;
push (@ISA, 'DelimParser');
sub new {
my ($packagename) = shift;
my ($ofh, $delim, $column_fields_aref, $FLAGS) = @_;
## FLAGS can be:
# NO_WRITE_HEADER|...
unless (ref $column_fields_aref eq 'ARRAY') {
confess "Error, need constructor params: ofh, delim, column_fields_aref";
}
my $self = $packagename->DelimParser::new($ofh, $delim);
$self->_initialize($column_fields_aref, $FLAGS);
return($self);
}
####
sub _initialize {
my $self = shift;
my $column_fields_aref = shift;
my $FLAGS = shift;
unless (ref $column_fields_aref eq 'ARRAY') {
confess "Error, require column_fields_aref as param";
}
my $ofh = $self->get_fh();
my $delim = $self->get_delim();
$self->set_column_headers(@$column_fields_aref);
unless (defined($FLAGS) && $FLAGS =~ /NO_WRITE_HEADER/) {
my $output_line = join($delim, @$column_fields_aref);
print $ofh "$output_line\n";
}
return;
}
####
sub write_row {
my $self = shift;
my $dict_href = shift;
unless (ref $dict_href eq "HASH") {
confess "Error, need dict_href as param";
}
my $num_dict_fields = scalar(keys %$dict_href);
my @column_headers = $self->get_column_headers();
my $delim = $self->get_delim();
my @out_fields;
for my $column_header (@column_headers) {
my $field = $dict_href->{$column_header};
unless (defined $field) {
confess "Error, missing value for required column field: $column_header";
}
if ($field =~ /$delim/) {
# don't allow any delimiters to contaminate the field value, otherwise it'll introduce offsets.
$field =~ s/$delim/ /g;
}
# also avoid newlines, which will also break the output formatting.
if ($field =~ /\n/) {
$field =~ s/\n/ /g;
}
push (@out_fields, $field);
}
my $outline = join($delim, @out_fields); # join with the configured delimiter (fields were sanitized against it above)
my $ofh = $self->get_fh();
print $ofh "$outline\n";
return;
}
1; #EOM
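A minimal usage sketch for DelimParser (illustrative only; the file names are placeholders):

use DelimParser;

open(my $ifh, '<', 'input.tsv') or die "Error, cannot open input.tsv: $!";
open(my $ofh, '>', 'output.tsv') or die "Error, cannot write output.tsv: $!";

my $reader = DelimParser::Reader->new($ifh, "\t");
# reuse the input header columns for the output
my $writer = DelimParser::Writer->new($ofh, "\t", [ $reader->get_column_headers() ]);

while (my $row = $reader->get_row()) {
    $writer->write_row($row);   # $row is a { column_name => value } hashref
}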
package PWM;
use strict;
use warnings;
use Carp;
my $PSEUDOCOUNT = 0.1;
srand(1); # reproducibility
###
sub new {
my $packagename = shift;
my $self = { pos_freqs => [],
pos_probs => [],
_pwm_built_flag => 0,
};
bless($self, $packagename);
return($self);
}
sub is_pwm_built {
my ($self) = @_;
return($self->{_pwm_built_flag});
}
sub has_pos_freqs {
my ($self) = @_;
if (@{$self->{pos_freqs}}) {
return(1);
}
else {
return(0);
}
}
sub add_feature_seq_to_pwm {
my ($self, $feature_seq) = @_;
$feature_seq = uc $feature_seq;
my $pwm_len = $self->get_pwm_length();
if ($pwm_len && length($feature_seq) != $pwm_len) {
confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched.";
}
if ($feature_seq =~ /[^GATC]/) {
print STDERR "Error, feature_seq: $feature_seq contains non-GATC chars... skipping\n";
return;
}
my @chars = split(//, $feature_seq);
for (my $i = 0; $i <= $#chars; $i++) {
my $char = $chars[$i];
$self->{pos_freqs}->[$i]->{$char}++;
}
return;
}
sub remove_feature_seq_from_pwm {
my ($self, $feature_seq) = @_;
if (! $self->has_pos_freqs()) {
confess "Error, pwm obj doesn't have position frequencies set";
}
$feature_seq = uc $feature_seq;
my $pwm_len = $self->get_pwm_length();
if ($pwm_len && length($feature_seq) != $pwm_len) {
confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched.";
}
if ($feature_seq =~ /[^GATC]/) {
print STDERR "Warning, feature_seq: $feature_seq contains non-GATC chars... skipping.\n";
return;
}
my @chars = split(//, $feature_seq);
for (my $i = 0; $i <= $#chars; $i++) {
my $char = $chars[$i];
$self->{pos_freqs}->[$i]->{$char}--;
}
return;
}
sub get_pwm_length {
my ($self) = @_;
my $pwm_length;
if ($self->is_pwm_built()) {
$pwm_length = $#{$self->{pos_probs}} + 1;
}
else {
$pwm_length = $#{$self->{pos_freqs}} + 1;
}
return($pwm_length);
}
sub simulate_feature {
my ($self) = @_;
if (! $self->is_pwm_built()) {
confess "Error, pwm not built yet";
}
my $probs_aref = $self->{pos_probs};
my @chars = qw(G A T C);
my $feature_seq = "";
my $pwm_length = $self->get_pwm_length();
for (my $i = 0; $i < $pwm_length; $i++) {
my $sum_prob = 0;
my $selected_flag = 0;
#print "rand val: $rand_val\n";
my $tot_prob = 0;
for (my $j = 0; $j <= $#chars; $j++) {
my $char = $chars[$j];
my $p = $probs_aref->[$i]->{$char};
$tot_prob += $p;
}
my $rand_val = rand($tot_prob); # tot_prob should be 1 or very close...
for (my $j = 0; $j <= $#chars; $j++) {
my $char = $chars[$j];
my $p = $probs_aref->[$i]->{$char};
#print "char: $char, p: $p\n";
$sum_prob += $p;
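# on the last char, clamp the cumulative probability to 1 so floating-point rounding cannot leave rand_val unmatched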
if ($j == $#chars) {
$sum_prob = 1;
}
if ($rand_val <= $sum_prob) {
# choose char
$feature_seq .= $char;
$selected_flag = 1;
last;
}
}
if (! $selected_flag) {
croak "Error, didn't select a random char for feature seq";
}
}
return($feature_seq);
}
####
sub write_pwm_file {
my ($self, $filename) = @_;
unless ($self->is_pwm_built()) {
croak "Error, pwm needs to be built first before writing to file";
}
open (my $ofh, ">$filename") or croak "Error, cannot write to file: $filename";
my $pos_probs_aref = $self->{pos_probs};
my $pwm_length = $self->get_pwm_length();
my @chars = qw(G A T C);
print $ofh join("\t", "pos", @chars) . "\n";
for (my $i = 0; $i < $pwm_length; $i++) {
my @vals = ($i);
foreach my $char (@chars) {
my $prob = $pos_probs_aref->[$i]->{$char} || 0;
push (@vals, $prob);
}
print $ofh join("\t", @vals) . "\n";
}
close $ofh;
return;
}
####
sub build_pwm {
my ($self) = @_;
my $pos_freqs_aref = $self->{pos_freqs};
for (my $i = 0; $i <= $#{$pos_freqs_aref}; $i++) {
my @chars = keys %{$pos_freqs_aref->[$i]};
my $sum = 0;
foreach my $char (@chars) {
my $val = $pos_freqs_aref->[$i]->{$char};
$sum += $val;
}
# now convert to relative freqs
@chars = qw(G A T C); # the complete set of chars we care about.
foreach my $char (@chars) {
my $val = $pos_freqs_aref->[$i]->{$char} || 0;
my $prob = sprintf("%.6f", ($val + $PSEUDOCOUNT) / ($sum + 4 * $PSEUDOCOUNT) );
$self->{pos_probs}->[$i]->{$char} = $prob;
}
}
$self->{_pwm_built_flag} = 1;
return;
}
sub score_pwm_using_base_freqs {
my ($self, $target_sequence, $base_freqs_href, %options ) = @_;
### Options can include:
###
### mask => [ coordA, coordB, coordC ], # pwm positions to ignore during scoring
### pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive
###
for my $key (keys %options) {
if (! grep {$_ eq $key} ('mask', 'pwm_range')) {
confess "Error, option $key is not recognized";
}
}
#print STDERR "target_seq: [$target_sequence]\n";
$target_sequence = uc $target_sequence;
unless ($target_sequence =~ /^[GATC]+$/) {
# can only score GATC-containing sequences.
return("NA");
}
if (! $self->is_pwm_built()) {
croak("pwm not built yet!");
}
my $pwm_length = $self->get_pwm_length();
if (length($target_sequence) != $pwm_length) {
croak "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length";
}
my %mask;
if (my $mask_positions_aref = $options{'mask'}) {
%mask = map { + $_ => 1 } @$mask_positions_aref;
}
my $motif_score = 0;
my @seqarray = split(//, $target_sequence);
my ($pwm_start, $pwm_end) = (0, $pwm_length-1);
if (my $pwm_range_aref = $options{'pwm_range'}) {
($pwm_start, $pwm_end) = @$pwm_range_aref;
}
for (my $i = $pwm_start; $i <= $pwm_end; $i++) {
if ($mask{$i}) {
next;
}
my $char = $seqarray[$i];
my $prob = $self->{pos_probs}->[$i]->{$char};
unless ($prob) {
return("NA");
}
my $prob_rand = $base_freqs_href->{$char};
unless ($prob_rand) {
die "Error, no non-zero probability value specified for char [$char] ";
}
my $loglikelihood = log($prob/$prob_rand);
$motif_score += $loglikelihood;
}
return($motif_score);
}
sub score_plus_minus_pwm {
my ($self, $target_sequence, $pwm_minus, %options) = @_;
### Options can include:
###
### mask => [ coordA, coordB, coordC ], # pwm positions to ignore during scoring
### pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive
###
for my $key (keys %options) {
if (! grep {$_ eq $key} ('mask', 'pwm_range')) {
confess "Error, option $key is not recognized";
}
}
#print STDERR "target_seq: [$target_sequence]\n";
$target_sequence = uc $target_sequence;
unless ($target_sequence =~ /^[GATC]+$/) {
# can only score GATC-containing sequences.
return("NA");
}
if (! $self->is_pwm_built()) {
croak("pwm not built yet!");
}
my $pwm_length = $self->get_pwm_length();
if (length($target_sequence) != $pwm_length) {
croak "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length";
}
my %mask;
if (my $mask_positions_aref = $options{'mask'}) {
%mask = map { + $_ => 1 } @$mask_positions_aref;
}
my $motif_score = 0;
my @seqarray = split(//, $target_sequence);
my ($pwm_start, $pwm_end) = (0, $pwm_length-1);
if (my $pwm_range_aref = $options{'pwm_range'}) {
($pwm_start, $pwm_end) = @$pwm_range_aref;
}
for (my $i = $pwm_start; $i <= $pwm_end; $i++) {
if ($mask{$i}) {
#print STDERR "masking $i\n";
next;
}
my $char = $seqarray[$i];
my $prob = $self->{pos_probs}->[$i]->{$char};
unless ($prob) {
return("NA");
}
my $prob_rand = $pwm_minus->{pos_probs}->[$i]->{$char};
unless ($prob_rand) {
die "Error, no non-zero probability value specified for char [$char] ";
}
my $loglikelihood = log($prob/$prob_rand);
$motif_score += $loglikelihood;
}
return($motif_score);
}
####
sub load_pwm_from_file {
my ($self, $pwm_file) = @_;
open(my $fh, $pwm_file) or confess "Error, cannot open file: $pwm_file";
my $header = <$fh>;
chomp $header;
my @chars = split(/\t/, $header);
shift @chars;
while (<$fh>) {
chomp;
my @x = split(/\t/);
my $idx_pos = shift @x;
for (my $i = 0; $i <= $#chars; $i++) {
my $char = $chars[$i];
my $prob = $x[$i];
$self->{pos_probs}->[$idx_pos]->{$char} = $prob;
}
}
close $fh;
$self->{_pwm_built_flag} = 1;
return;
}
1; #EOM
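A minimal usage sketch for PWM (illustrative only; the training sequences and uniform background frequencies are made up):

use PWM;

my $pwm = PWM->new();
# train on aligned, fixed-length feature sequences (e.g., windows around start codons)
foreach my $seq ('ATGGC', 'ATGCC', 'ATGGA') {
    $pwm->add_feature_seq_to_pwm($seq);
}
$pwm->build_pwm();

# score a same-length candidate as a log-likelihood ratio against background base frequencies
my %base_freqs = (G => 0.25, A => 0.25, T => 0.25, C => 0.25);
my $score = $pwm->score_pwm_using_base_freqs('ATGGC', \%base_freqs);

# draw a random sequence from the trained position probabilities
my $sim_seq = $pwm->simulate_feature();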
package Pipeliner;
use strict;
use warnings;
use Carp;
use Cwd;
################################
## Verbose levels:
## 1: see CMD string
## 2: see stderr during process
my $VERBOSE = 0;
################################
####################
## Static methods:
####################
####
sub ensure_full_path {
my ($path, $ADD_GZ_FIFO_FLAG) = @_;
unless ($path =~ m|^/|) {
$path = cwd() . "/$path";
}
if ($ADD_GZ_FIFO_FLAG && $path =~ /\.gz$/) {
$path = "<(zcat $path)";
}
return($path);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
die "Error, CMD: $cmd died with ret $ret";
}
return;
}
################
## Obj methods:
################
####
sub new {
my $packagename = shift;
my %params = @_;
if ($params{-verbose}) {
$VERBOSE = $params{-verbose};
}
my $cmds_log = $params{-cmds_log};
unless ($cmds_log) {
$cmds_log = "pipeliner.$$.cmds";
}
open (my $ofh, ">$cmds_log") or confess "Error, cannot write to $cmds_log";
my $self = {
cmd_objs => [],
checkpoint_dir => undef,
cmds_log_ofh => $ofh,
};
bless ($self, $packagename);
return($self);
}
sub add_commands {
my $self = shift;
my @cmds = @_;
foreach my $cmd (@cmds) {
unless (ref($cmd) =~ /Command/) {
confess "Error, need Command object as param";
}
my $checkpoint_file = $cmd->get_checkpoint_file();
if ($checkpoint_file !~ m|^/|) {
if (my $checkpoint_dir = $self->get_checkpoint_dir()) {
$checkpoint_file = "$checkpoint_dir/$checkpoint_file";
$cmd->reset_checkpoint_file($checkpoint_file);
}
}
push (@{$self->{cmd_objs}}, $cmd);
}
return $self;
}
sub set_checkpoint_dir {
my $self = shift;
my ($checkpoint_dir) = @_;
if (! -d $checkpoint_dir) {
confess "Error, cannot locate checkpointdir: $checkpoint_dir";
}
$self->{checkpoint_dir} = $checkpoint_dir;
}
sub get_checkpoint_dir {
my $self = shift;
return($self->{checkpoint_dir});
}
sub has_commands {
my $self = shift;
if ($self->_get_commands()) {
return(1);
}
else {
return(0);
}
}
sub run {
my $self = shift;
my $cmds_log_ofh = $self->{cmds_log_ofh};
foreach my $cmd_obj ($self->_get_commands()) {
my $cmdstr = $cmd_obj->get_cmdstr();
print $cmds_log_ofh "$cmdstr\n";
my $checkpoint_file = $cmd_obj->get_checkpoint_file();
if (-e $checkpoint_file) {
print STDERR "-- Skipping CMD: $cmdstr, checkpoint exists.\n" if $VERBOSE;
}
else {
print STDERR "* Running CMD: $cmdstr\n" if $VERBOSE;
my $tmp_stderr = "tmp.$$.stderr";
if (-e $tmp_stderr) {
unlink($tmp_stderr);
}
unless ($VERBOSE == 2) {
$cmdstr .= " 2>$tmp_stderr";
}
my $ret = system($cmdstr);
if ($ret) {
if (-e $tmp_stderr) {
system("cat $tmp_stderr");
unlink($tmp_stderr);
}
confess "Error, cmd: $cmdstr died with ret $ret";
}
else {
`touch $checkpoint_file`;
if ($?) {
confess "Error creating checkpoint file: $checkpoint_file";
}
}
if (-e $tmp_stderr) {
unlink($tmp_stderr);
}
}
}
# reset in case reusing the pipeline obj
$self->{cmd_objs} = []; # reinit
return;
}
sub _get_commands {
my $self = shift;
return(@{$self->{cmd_objs}});
}
package Command;
use strict;
use warnings;
use Carp;
sub new {
my $packagename = shift;
my ($cmdstr, $checkpoint_file) = @_;
unless ($cmdstr && $checkpoint_file) {
confess "Error, need cmdstr and checkpoint filename as params";
}
my $self = { cmdstr => $cmdstr,
checkpoint_file => $checkpoint_file,
};
bless ($self, $packagename);
return($self);
}
####
sub get_cmdstr {
my $self = shift;
return($self->{cmdstr});
}
####
sub get_checkpoint_file {
my $self = shift;
return($self->{checkpoint_file});
}
####
sub reset_checkpoint_file {
my $self = shift;
my $checkpoint_file = shift;
$self->{checkpoint_file} = $checkpoint_file;
}
1; #EOM
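A minimal usage sketch for Pipeliner and Command (illustrative only; the shell commands and checkpoint names are made up):

use Pipeliner;

my $pipeliner = new Pipeliner('-verbose' => 1);
mkdir("checkpoints") unless -d "checkpoints";
$pipeliner->set_checkpoint_dir("checkpoints");   # directory must exist before this call
$pipeliner->add_commands(new Command("sort words.txt > words.sorted", "sort.ok"));
$pipeliner->add_commands(new Command("uniq words.sorted > words.uniq", "uniq.ok"));
$pipeliner->run();   # commands whose checkpoint files already exist are skipped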
package Process_cmd;
use strict;
use warnings;
use Carp;
use Cwd;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(process_cmd ensure_full_path);
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
confess "Error, cmd:\n$cmd\n died with ret ($ret)";
}
return;
}
sub ensure_full_path {
my ($path) = @_;
unless ($path =~ m|^/|) {
$path = cwd() . "/$path";
}
return($path);
}
1; #EOM
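A short usage sketch for Process_cmd (illustrative only):

use Process_cmd;

process_cmd("echo hello > hello.txt");        # logs "CMD: ..." to STDERR and confesses on nonzero exit
my $abs_path = ensure_full_path("hello.txt"); # prepends cwd() when the path is relative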
#!/usr/bin/env perl
####
sub nucs_in_common {
my ($e5, $e3, $g5, $g3) = @_;
($e5, $e3) = sort {$a<=>$b} ($e5, $e3);
($g5, $g3) = sort {$a<=>$b} ($g5, $g3);
unless (&coordsets_overlap([$e5,$e3], [$g5, $g3])) {
return(0);
}
my $length = abs ($e3 - $e5) + 1;
my $diff1 = ($e3 - $g3);
$diff1 = ($diff1 > 0) ? $diff1 : 0;
my $diff2 = ($g5 - $e5);
$diff2 = ($diff2 > 0) ? $diff2 : 0;
my $overlap_length = $length - $diff1 - $diff2;
return ($overlap_length);
}
####
sub coordsets_overlap {
my ($coordset_A_aref, $coordset_B_aref) = @_;
my ($lend_A, $rend_A) = sort {$a<=>$b} @$coordset_A_aref;
my ($lend_B, $rend_B) = sort {$a<=>$b} @$coordset_B_aref;
if ($lend_A <= $rend_B && $rend_A >= $lend_B) {
## yes, overlap
return (1);
}
else {
return (0);
}
}
####
sub coordset_A_encapsulates_B {
my ($coordset_A_aref, $coordset_B_aref) = @_;
my ($lend_A, $rend_A) = sort {$a<=>$b} @$coordset_A_aref;
my ($lend_B, $rend_B) = sort {$a<=>$b} @$coordset_B_aref;
if ($lend_A <= $lend_B && $rend_B <= $rend_A) {
return(1); # true
}
else {
return(0); # false;
}
}
1;
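A worked example of the overlap arithmetic above (coordinates are illustrative):

# exon 10..20 vs. genomic segment 15..30:
# length = 20 - 10 + 1 = 11; diff1 = max(20 - 30, 0) = 0; diff2 = max(15 - 10, 0) = 5
my $overlap = &nucs_in_common(10, 20, 15, 30);   # 11 - 0 - 5 = 6, i.e. positions 15..20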
......@@ -9,19 +9,11 @@ It uses the following criteria:
* the above coding score is greatest when the ORF is scored in the 1st reading frame as compared to scores in the other 5 reading frames.
* if a candidate ORF is found fully encapsulated by the coordinates of another candidate ORF, the longer one is reported. However, a single transcript can report multiple ORFs (allowing for operons, chimeras, etc).
* optionally, the putative peptide has a match to a Pfam domain above the noise cutoff score.
* start codon positions are refined according to PWM-based feature scoring around candidate start codons.
The software is primarily maintained by Brian Haas at the Broad Institute and Alexie Papanicolaou at the Commonwealth Scientific and Industrial Research Organisation (CSIRO). It is integrated into other related software such as Trinity, PASA, EVidenceModeler, and Trinotate.
Full documentation is provided at: http://transdecoder.github.io
===========================
To build just TransDecoder:
Please make sure you read http://transdecoder.github.io before proceeding.
% make
See sample_data/ directory for example usage.
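For example, a typical two-step run (transcripts.fasta is a placeholder for your assembled transcripts):

% TransDecoder.LongOrfs -t transcripts.fasta
% TransDecoder.Predict -t transcripts.fasta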
......@@ -47,7 +47,6 @@ See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently suppo
Mitochondrial-Euascomycetes
Mitochondrial-Protozoans
=cut
......@@ -67,6 +66,7 @@ use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
use Pipeliner;
my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
......@@ -120,6 +120,18 @@ if ($genetic_code ne 'universal') {
main: {
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
unless (-d $workdir) {
mkdir($workdir) or die "Error, cannot mkdir $workdir.";
}
my $longorf_checkpoint = "$workdir/__longorfs.ok";
if (-e $longorf_checkpoint) {
print STDERR "-longorfs already completed, checkpoint identified at: $longorf_checkpoint\n";
exit(0);
}
my %gene_trans_map;
if ($gene_trans_map_file) {
open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file";
......@@ -132,13 +144,6 @@ main: {
}
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
unless (-d $workdir) {
mkdir($workdir) or die "Error, cannot mkdir $workdir.";
}
my $base_freqs_file = "$workdir/base_freqs.dat";
my $base_freqs_checkpoint = "$base_freqs_file.ok";
if (! -e $base_freqs_checkpoint) {
......@@ -317,6 +322,8 @@ main: {
close GFF;
&process_cmd("touch $longorf_checkpoint");
print STDERR "\n\n#################################\n"
. "### Done preparing long ORFs. ###\n"
. "#################################\n\n";
......
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;
use Carp;
use lib ("$FindBin::RealBin/PerlLib");
use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
use Pipeliner;
use DelimParser;
my $RETAIN_LONG_ORFS_MIN_LENGTH = 1000000; # so essentially, off by default
=pod
=head1 NAME
......@@ -14,8 +38,13 @@ Required:
Common options:
--retain_long_orfs <int> retain all ORFs found that are at least this many nucleotides long, even if no other evidence
marks them as coding (default: 900 bp => 300aa)
--retain_long_orfs_mode <string> 'dynamic' or 'strict' (default: dynamic)
In dynamic mode, sets range according to 1%FDR in random sequence of same GC content.
--retain_long_orfs_length <int> under 'strict' mode, retain all ORFs found that are at least this many nucleotides long, even if no other evidence
marks them as coding (default: 1000000, so effectively off by default)
--retain_pfam_hits <string> domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)
Any ORF with a pfam domain hit will be retained in the final output.
......@@ -23,21 +52,16 @@ Common options:
--retain_blastp_hits <string> blastp output in '-outfmt 6' format.
Any ORF with a blast match will be retained in the final output.
--single_best_orf Retain only the single best ORF per transcript.
(Best is defined as having (optionally pfam and/or blast support) and longest orf)
--cpu <int> Use multiple cores for cd-hit-est. (default=1)
--single_best_only Retain only the single best orf per transcript (prioritized by homology then orf length)
-G <string> genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia, ...)
--no_refine_starts start refinement identifies potential start codons for 5' partial ORFs using a PWM; the process is on by default, and this flag turns it off.
Advanced options
--train <string> FASTA file with ORFs to train Markov Mod for protein identification; otherwise
longest non-redundant ORFs used
-T <int> If no --train, top longest ORFs to train Markov Model (hexamer stats) (default: 500)
Note, 10x this value are first selected for use with cd-hit to remove redundancies,
Note, 10x this value are first selected for removing redundancies,
and then this -T value of longest ORFs are selected from the non-redundant set.
=cut
......@@ -69,33 +93,16 @@ See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently suppo
use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;
use lib ("$FindBin::RealBin/PerlLib");
use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
my ($cd_hit_est_exec) = &check_program('cd-hit-est');
my ($transcripts_file,$train_file);
my $top_ORFs_train = 500;
my $max_prot_length_for_training = 5000;
my $help;
......@@ -103,18 +110,55 @@ my $verbose;
my $search_pfam = "";
my ($reuse,$pfam_out);
my $RETAIN_LONG_ORFS = 900;
my $RETAIN_LONG_ORFS_MODE = 'dynamic';
#################################
# quantiles of random orf lengths, by GC content
#
# GC%    0.95    0.99    0.999    0.9999
# 80 1230.0 1728.0 2422.656 3240.7914
# 75 927.0 1257.0 1743.0 2278.6704
# 70 750.0 1002.0 1358.1 1681.77
# 65 645.0 829.59 1085.718 1383.5718
# 60 570.0 722.31 927.0 1158.231
# 55 513.0 633.0 796.983 950.8983
# 50 486.0 582.0 748.77 887.877
# 45 456.0 546.0 644.655 694.4931
# 40 431.7 501.0 589.794 612.2352
# 35 426.0 488.76 554.88 565.7904
# 30 406.5 469.5 510.9 518.19
# 25 404.4 448.56 465.39 467.739
# setting it to the 0.999 quantile mark:
my @GC_TO_MIN_LONG_ORF_LENGTH = ( [25, 465],
[30, 510],
[35, 555],
[40, 590],
[45, 645],
[50, 749],
[55, 797],
[60, 927],
[65, 1086],
[70, 1358],
[75, 1743],
[80, 2422] );
my $retain_pfam_hits_file;
my $retain_blastp_hits_file;
my $cpu = 1;
my $MPI_DEBUG = 1;
my $single_best_orf_flag = 0;
my $single_best_flag = 0;
my $genetic_code = "";
my $NO_REFINE_START_CODONS_FLAG = 0;
&GetOptions( 't=s' => \$transcripts_file,
'train:s' => \$train_file,
'h' => \$help,
'v' => \$verbose,
......@@ -124,7 +168,8 @@ my $genetic_code = "";
'search_pfam=s' => \$search_pfam,
'reuse' => \$reuse,
'retain_long_orfs=i' => \$RETAIN_LONG_ORFS,
'retain_long_orfs_mode=s' => \$RETAIN_LONG_ORFS_MODE,
'retain_long_orfs_length=i' => \$RETAIN_LONG_ORFS_MIN_LENGTH,
'debug' => \$MPI_DEBUG,
......@@ -132,10 +177,11 @@ my $genetic_code = "";
'retain_blastp_hits=s' => \$retain_blastp_hits_file,
'cpu=i' => \$cpu,
'single_best_orf' => \$single_best_orf_flag,
'single_best_only' => \$single_best_flag,
'G=s' => \$genetic_code,
'no_refine_starts' => \$NO_REFINE_START_CODONS_FLAG,
);
......@@ -157,6 +203,8 @@ if ($genetic_code) {
$genetic_code = " --genetic_code $genetic_code";
}
main: {
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
......@@ -164,6 +212,14 @@ main: {
die "Error, cannot find directory: $workdir, be sure to first run TransDecoder.LongOrfs before TransDecoder.Predict\n\n";
}
my $checkpoints_dir = "$workdir.__checkpoints";
if (! -d $checkpoints_dir) {
mkdir($checkpoints_dir);
}
my $pipeliner = new Pipeliner('-verbose' => 2);
$pipeliner->set_checkpoint_dir($checkpoints_dir);
my $prefix = "$workdir/longest_orfs";
my $cds_file = "$prefix.cds";
my $gff3_file = "$prefix.gff3";
......@@ -171,164 +227,89 @@ main: {
## Train a Markov model based on user-provided file or longest candidate CDS sequences, score all candidates, and select the final set.
my $top_cds_file;
if ($train_file) {
if (! -s $train_file) {
die "Error, cannot locate train file: $train_file";
}
$top_cds_file = $train_file;
}
else {
$top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
my $checkpoint = "$top_cds_file.ok";
if (! -e $checkpoint) {
my $top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
{
# to speed things up only check for redundancy up to x the number of entries we want
my $red_num = $top_ORFs_train * 10;
my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
&process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num > $red_num_cds_longest_file");
&process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -T $cpu -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
&process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr80 $top_ORFs_train > $top_cds_file");
&process_cmd("touch $checkpoint");
$pipeliner->add_commands(new Command("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num $max_prot_length_for_training > $red_num_cds_longest_file",
"get_longest_orfs.ok"));
$pipeliner->add_commands(new Command("$UTIL_DIR/exclude_similar_proteins.pl $red_num_cds_longest_file > $red_num_cds_longest_file.nr", "nr.ok"));
$pipeliner->add_commands(new Command("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr $top_ORFs_train > $top_cds_file",
"top_train_select.ok"));
$pipeliner->run();
}
}
# get hexamer scores
my $hexamer_scores_file = "$workdir/hexamer.scores";
my $hexamer_checkpoint = "$hexamer_scores_file.ok";
if (! -e $hexamer_checkpoint) {
my $base_freqs_file = "$workdir/base_freqs.dat";
my $cmd = "$UTIL_DIR/seq_n_baseprobs_to_logliklihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file";
&process_cmd($cmd);
&process_cmd("touch $hexamer_checkpoint");
if ($RETAIN_LONG_ORFS_MODE eq 'dynamic') {
$RETAIN_LONG_ORFS_MIN_LENGTH = &get_dynamic_retain_long_orf_length($base_freqs_file, \@GC_TO_MIN_LONG_ORF_LENGTH);
}
# score kmers for Markov model
$pipeliner->add_commands(new Command("$UTIL_DIR/seq_n_baseprobs_to_loglikelihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file",
"hexamer_scores.ok") );
# score all cds entries
my $cds_scores_file = "$cds_file.scores";
my $cds_scores_checkpoint = "$cds_scores_file.ok";
if (! -e $cds_scores_checkpoint) {
my $cmd = "$UTIL_DIR/score_CDS_liklihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file";
&process_cmd($cmd);
$pipeliner->add_commands(new Command("$UTIL_DIR/score_CDS_likelihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file",
"score_cdss.ok"));
&process_cmd("touch $cds_scores_checkpoint");
}
## Retain those that have pfam matches
my %has_pfam_hit;
## select orfs
my $select_cmd = "$UTIL_DIR/select_best_ORFs_per_transcript.pl --gff3_file $gff3_file --cds_scores $cds_scores_file "
. " --min_length_auto_accept $RETAIN_LONG_ORFS_MIN_LENGTH ";
if ($retain_pfam_hits_file) {
%has_pfam_hit = &parse_pfam_hits($retain_pfam_hits_file);
$select_cmd .= " --pfam_hits $retain_pfam_hits_file ";
}
my %has_blastp_hit;
if ($retain_blastp_hits_file) {
%has_blastp_hit = &parse_blastp_hits_file($retain_blastp_hits_file);
$select_cmd .= " --blast_hits $retain_blastp_hits_file ";
}
# get accs for best entries
my $acc_file = "$cds_file.scores.selected";
{
my %att_counter;
open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file";
open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores";
while (<$ifh>) {
chomp;
my ($acc, $orf_length, @scores) = split(/\t/);
my @ATTS;
my $score_1 = shift @scores;
my $max_score_other_frame = max(@scores);
my $keep_flag = 0;
if ($has_pfam_hit{$acc}) {
$keep_flag = 1;
push (@ATTS, "PFAM");
print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
}
if ($has_blastp_hit{$acc}) {
$keep_flag = 1;
push (@ATTS, "BLASTP");
print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
}
if ($orf_length >= $RETAIN_LONG_ORFS) {
$keep_flag = 1;
push (@ATTS, "LONGORF");
}
if ($score_1 > 0 && $score_1 > $max_score_other_frame) {
$keep_flag = 1;
push (@ATTS, "FRAMESCORE");
if ($single_best_flag) {
$select_cmd .= " --single_best_orf ";
}
$select_cmd .= " > $cds_file.best_candidates.gff3 ";
if ($keep_flag) {
print $ofh "$acc\n";
my $att_string = join("|", sort @ATTS);
$att_counter{$att_string}++;
}
}
close $ifh;
close $ofh;
$pipeliner->add_commands(new Command($select_cmd, "orf_select.ok"));
$pipeliner->run();
# report on the categories of the selected ORFs
print STDERR "\n#####################\nCounts of kept entries according to attributes:\n";
foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) {
my $count = $att_counter{$att};
print STDERR join("\t", $att, $count) . "\n";
}
print STDERR "########################\n\n\n";
}
# index the current gff file:
my $cmd = "$UTIL_DIR/index_gff3_files_by_isoform.pl $gff3_file";
&process_cmd($cmd);
# retrieve the best entries:
$cmd = "$UTIL_DIR/gene_list_to_gff.pl $acc_file $gff3_file.inx > $cds_file.best_candidates.gff3";
&process_cmd($cmd);
##############################
## Generate the final outputs.
##############################
my $final_output_prefix = basename($transcripts_file) . ".transdecoder";
{
# exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
$cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3";
my $target_final_file = "$cds_file.best_candidates.gff3";
my $target_final_file = "$cds_file.eclipsed_removed.gff3";
## start codon refinement
unless ($NO_REFINE_START_CODONS_FLAG) {
my $cmd = "$UTIL_DIR/train_start_PWM.pl --transcripts $transcripts_file --selected_orfs $top_cds_file --out_prefix $workdir/start_refinement";
$pipeliner->add_commands(new Command($cmd, "train_start_pwm.ok"));
&process_cmd($cmd);
my $revised_orf_gff3_file = "$target_final_file.revised_starts.gff3";
$cmd = "$UTIL_DIR/start_codon_refinement.pl --transcripts $transcripts_file --gff3_file $target_final_file > $revised_orf_gff3_file";
$pipeliner->add_commands(new Command($cmd, "revise_starts.ok"));
if ($single_best_orf_flag) {
$cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl ";
if ($retain_blastp_hits_file) {
$cmd .= " --blast_hits $retain_blastp_hits_file ";
}
if ($retain_pfam_hits_file) {
$cmd .= " --pfam_hits $retain_pfam_hits_file ";
}
$cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3";
$pipeliner->run();
&process_cmd($cmd);
$target_final_file = $revised_orf_gff3_file;
$target_final_file = "$cds_file.single_best_orf.gff3";
}
......@@ -336,11 +317,12 @@ main: {
## write final outputs:
{
## make a BED file for viewing in IGV
my $gff3_file = "$final_output_prefix.gff3";
my $bed_file = $gff3_file;
$bed_file =~ s/\.gff3$/\.bed/;
$cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
my $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
&process_cmd($cmd);
......@@ -357,9 +339,9 @@ main: {
$best_cds_file =~ s/\.pep$/\.cds/;
$cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file --seqType CDS $genetic_code > $best_cds_file";
&process_cmd($cmd);
}
print STDERR "transdecoder is finished. See output files $final_output_prefix.\*\n\n\n";
......@@ -450,3 +432,44 @@ sub check_program() {
}
return @paths;
}
####
sub get_dynamic_retain_long_orf_length {
my ($base_freqs_file, $GC_to_min_orf_length_aref) = @_;
my $pct_GC = -1;
open(my $fh, $base_freqs_file) or confess "Error, cannot open file: $base_freqs_file";
while (<$fh>) {
chomp;
my ($base, $count, $ratio) = split(/\t/);
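# assumes strand symmetry (%G == %C), so GC fraction = 2 x the C ratio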
if ($base eq 'C') {
$pct_GC = 2 * $ratio * 100;
last;
}
}
close $fh;
print STDERR "PCT_GC: $pct_GC\n";
unless ($pct_GC > 0) {
confess "Error, didn't parse percent C from file: $base_freqs_file";
}
foreach my $bin (@$GC_to_min_orf_length_aref) {
my ($gc, $min_len) = @$bin;
if ($pct_GC <= $gc) {
return($min_len);
}
}
# if we got here, then GC content must be crazy high!
# in this case, rely entirely on Markov model for coding determination.
return(1000000); #effectively infinity here.
}
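For example (values illustrative): a base_freqs.dat reporting a C ratio of 0.21 gives pct_GC = 2 * 0.21 * 100 = 42, which falls into the 45% GC bin above and returns a minimum long-ORF length of 645 nt.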
transdecoder (5.0.1-1) unstable; urgency=medium
* New upstream version (no need to repackage since binary files are now
removed upstream)
* Upstream changelog: cdhit has been removed and replaced by our own fast
method for removing redundancies
* Standards-Version: 4.1.3
* debhelper 11
* Add missing Depends: r-base-core, python
* d/rules: do not parse d/changelog
* Fix permissions
-- Andreas Tille <tille@debian.org> Mon, 19 Feb 2018 15:11:06 +0100
transdecoder (3.0.1+dfsg-1) unstable; urgency=medium
* Add Multi-Arch: foreign hint to transdecoder-doc
......
......@@ -4,10 +4,9 @@ Uploaders: Michael R. Crusoe <michael.crusoe@gmail.com>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 9),
cd-hit,
Build-Depends: debhelper (>= 11~),
liburi-perl
Standards-Version: 3.9.8
Standards-Version: 4.1.3
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/transdecoder.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/transdecoder.git
Homepage: https://transdecoder.github.io/
......@@ -16,8 +15,9 @@ Package: transdecoder
Architecture: all
Depends: ${misc:Depends},
${perl:Depends},
cd-hit,
liburi-perl
liburi-perl,
r-base-core,
python
Description: find coding regions within RNA transcript sequences
TransDecoder identifies candidate coding regions within transcript sequences,
such as those generated by de novo RNA-Seq transcript assembly using Trinity,
......
Author: Michael R. Crusoe <mcrusoe@msu.edu>
Last-Update: Fri, 13 Feb 2015 06:26:24 -0500
Description: Fix path to Debian location of cd-hit-est
--- transdecoder.orig/TransDecoder.Predict
+++ transdecoder/TransDecoder.Predict
@@ -90,7 +90,7 @@
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
-my ($cd_hit_est_exec) = &check_program('cd-hit-est');
+my ($cd_hit_est_exec) = &check_program('/usr/lib/cd-hit/cd-hit-est');
my ($transcripts_file,$train_file);
......@@ -4,7 +4,7 @@ Description: Fix whatis entry of manpage
--- a/TransDecoder.LongOrfs
+++ b/TransDecoder.LongOrfs
@@ -6,7 +6,7 @@
@@ -6,7 +6,7 @@ my $MIN_PROT_LENGTH = 100;
=head1 NAME
......@@ -15,7 +15,7 @@ Description: Fix whatis entry of manpage
--- a/TransDecoder.Predict
+++ b/TransDecoder.Predict
@@ -4,7 +4,7 @@
@@ -28,7 +28,7 @@ my $RETAIN_LONG_ORFS_MIN_LENGTH = 100000
=head1 NAME
......
fix-test
cd-hit-est-rename
fix-whatis
reproducible-sample_data.patch
#!/usr/bin/make -f
DH_VERBOSE := 1
DEBVERS := $(shell dpkg-parsechangelog --show-field=Version)
include /usr/share/dpkg/default.mk
%:
dh $@ --parallel
dh $@
override_dh_auto_clean:
cd ./sample_data && for script in ./*/cleanme.pl; do $$($$script); done
......@@ -23,16 +24,17 @@ override_dh_install:
override_dh_installman:
pod2man --section=1 --center="Transcriptome Protein Prediction" \
--release="$(DEBVERS)" TransDecoder.LongOrfs > \
--release="$(DEB_VERSION_UPSTREAM)" TransDecoder.LongOrfs > \
debian/TransDecoder.LongOrfs.1
pod2man --section=1 --center="Transcriptome Protein Prediction" \
--release="$(DEBVERS)" TransDecoder.Predict > \
--release="$(DEB_VERSION_UPSTREAM)" TransDecoder.Predict > \
debian/TransDecoder.Predict.1
dh_installman
override_dh_fixperms:
dh_fixperms
chmod -x debian/transdecoder/usr/lib/transdecoder/PerlLib/Fasta_retriever.pm
find debian -name Process_cmd.pm -exec chmod -x \{\} \;
override_dh_clean:
# generated man pages
......
version=3
version=4
opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
https://github.com/TransDecoder/Transdecoder/releases .*/archive/v?(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
DIRS=cufflinks_example pasa_example simple_transcriptome_target
DIRS=cufflinks_example pasa_example simple_transcriptome_target stringtie_example
test:
@for i in $(DIRS); do \
......