[submodule "TransDecoder.lrgTests"]
path = TransDecoder.lrgTests
url = https://github.com/TransDecoder/TransDecoder.lrgTests.git
[submodule "TransDecoder.wiki"]
path = TransDecoder.wiki
url = https://github.com/TransDecoder/TransDecoder.wiki.git
## v5.0.0 August 26, 2017
-algorithm updates: the frame[0] score must be > 0 and the maximum across the first 3 reading frames (instead of all 6); the orf with the highest frame[0] score is chosen, allowing for minimal overlap among selected predictions (see the sketch after this entry).
-option --single_best_only provides the single longest of the selected orfs per contig.
-long orfs unlikely to appear in random sequence are automatically selected as candidates with this minimal long orf length set dynamically according to GC content.
-orf score and blast or pfam info are propagated to the gff3 output
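A rough sketch of that frame-score test (illustrative Perl, not the actual TransDecoder source; the subroutine name is made up):

sub passes_frame_score_test {
    # Markov scores for the three forward-strand reading frames of one candidate ORF
    my ($frame0_score, $frame1_score, $frame2_score) = @_;
    my $max_other = ($frame1_score > $frame2_score) ? $frame1_score : $frame2_score;
    # keep the ORF only if the in-frame score is positive and beats the other forward frames
    return ($frame0_score > 0 && $frame0_score > $max_other) ? 1 : 0;
}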
## v4.1.0
-single best orf now selected by default. If more than the single best orf is wanted, use the --all_good_orfs parameter.
-start codon refinement is now done by default. To turn it off and get the original behavior of extending to the longest orf position, use parameter: --no_refine_starts
-cdhit has been removed and replaced by our own fast method for removing redundancies.
-selection of coding regions is strictly governed by Markov-based likelihood scores across reading frames. No auto-retention of long orfs by default, but can be activated by parameter: --retain_long_orfs_length
** all v4 releases pre-v4.1 were fairly quickly retracted due to bugs and insufficient benchmarking **
## v3.0.2 release Oct 31, 2016
minor bugfix release - when checking that required utilities are installed, an absolute path (leading '/') is no longer required
## v3.0.0 release April 26, 2016
TransDecoder v3.0.0 includes the following changes:
TransDecoder.LongOrfs now includes the parameter '--gene_trans_map' as a way to retain the gene identifier information. In the case of Cufflinks and Trinity, the gene identifiers will automatically be recognized and retained. For PASA and other inputs, it is necessary to provide the gene-to-transcript identifier mappings in order to generate isoform-clustered output files (gff3).
TransDecoder.Predict now includes the flag '--single_best_orf' to retain only the single 'best' ORF per transcript. ORFs are prioritized according to homology information (if given blast and pfam results) and by sequence length, with longer ORFs preferred.
Codon phase information is now included in the GFF3 output files.
The .mRNA files that were generated by default for genome-free TransDecoder runs are now deprecated; the .cds and .pep files are still provided.
The sample data sets include examples for running TransDecoder in a few different contexts, including starting from Trinity, PASA, or Cufflinks data.
More useful logging information is provided so it's clearer how many orfs are being retained and which are being eliminated along the way.
## 2016-03-11 v2.1 release
-added cpu parameter to TransDecoder.Predict
-retaining gene identifier information from cufflinks output
......
SHELL := /bin/bash
all:
cd ./transdecoder_plugins/ && $(MAKE) all
@echo Nothing to build. Run \'make test\' to run example executions on each of the sample data sets provided.
clean:
cd ./transdecoder_plugins/ && $(MAKE) clean
cd ./sample_data && ./cleanme.pl
cd ./sample_data && make clean
test:
cd ./sample_data/ && ./runMe.sh
cd ./sample_data/ && make test
#!/usr/bin/env perl
# classes for DelimParser::Reader and DelimParser::Writer
package DelimParser;
use strict;
use warnings;
use Carp;
####
sub new {
my ($packagename, $fh, $delimiter) = @_;
unless ($fh && $delimiter) {
confess "Error, need filehandle and delimiter params";
}
my $self = { _delim => $delimiter,
_fh => $fh,
# set below in _init()
_column_headers => [],
};
bless ($self, $packagename);
return($self);
}
####
sub get_fh {
my $self = shift;
return($self->{_fh});
}
####
sub get_delim {
my $self = shift;
return($self->{_delim});
}
####
sub get_column_headers {
my $self = shift;
return(@{$self->{_column_headers}});
}
####
sub set_column_headers {
my $self = shift;
my (@columns) = @_;
$self->{_column_headers} = \@columns;
return;
}
####
sub get_num_columns {
    my $self = shift;
    my @column_headers = $self->get_column_headers();
    return(scalar(@column_headers)); # number of columns, not the string length of the list
}
###
sub reconstruct_header_line {
my $self = shift;
my @column_headers = $self->get_column_headers();
my $header_line = join($self->get_delim(), @column_headers); # use the configured delimiter, not a hardcoded tab
return($header_line);
}
###
sub reconstruct_line_from_row {
my $self = shift;
my $row_href = shift;
unless ($row_href && ref $row_href) {
confess "Error, must set row_href as param";
}
my @column_headers = $self->get_column_headers();
my @vals;
foreach my $col_header (@column_headers) {
my $val = $row_href->{$col_header};
push (@vals, $val);
}
my $row_text = join($self->get_delim(), @vals); # use the configured delimiter, not a hardcoded tab
return($row_text);
}
##################################################
package DelimParser::Reader;
use strict;
use warnings;
use Carp;
use Data::Dumper;
our @ISA;
push (@ISA, 'DelimParser');
sub new {
my ($packagename) = shift;
my $self = $packagename->DelimParser::new(@_);
$self->_init();
return($self);
}
####
sub _init {
my $self = shift;
my $fh = $self->get_fh();
my $delim = $self->get_delim();
my $header_row = <$fh>;
unless (defined $header_row) {
    confess "Error, no header row read.";
}
chomp $header_row;
my @fields = split(/$delim/, $header_row);
$self->set_column_headers(@fields);
return;
}
####
sub get_row {
my $self = shift;
my $fh = $self->get_fh();
my $line = <$fh>;
unless ($line) {
return(undef); # eof
}
my $delim = $self->get_delim();
my @fields = split(/$delim/, $line);
chomp $fields[$#fields]; ## it's important that this is done after the delimiter splitting in case the last field is actually empty.
my @column_headers = $self->get_column_headers();
my $num_col = scalar (@column_headers);
my $num_fields = scalar(@fields);
if ($num_col != $num_fields) {
confess "Error, line: [$line] " . Dumper(\@fields) . " is lacking $num_col fields: " . Dumper(\@column_headers);
}
my %dict;
foreach my $colname (@column_headers) {
my $field = shift @fields;
$dict{$colname} = $field;
}
return(\%dict);
}
##################################################
package DelimParser::Writer;
use strict;
use warnings;
use Carp;
our @ISA;
push (@ISA, 'DelimParser');
sub new {
my ($packagename) = shift;
my ($ofh, $delim, $column_fields_aref, $FLAGS) = @_;
## FLAGS can be:
# NO_WRITE_HEADER|...
unless (ref $column_fields_aref eq 'ARRAY') {
confess "Error, need constructor params: ofh, delim, column_fields_aref";
}
my $self = $packagename->DelimParser::new($ofh, $delim);
$self->_initialize($column_fields_aref, $FLAGS);
return($self);
}
####
sub _initialize {
my $self = shift;
my $column_fields_aref = shift;
my $FLAGS = shift;
unless (ref $column_fields_aref eq 'ARRAY') {
confess "Error, require column_fields_aref as param";
}
my $ofh = $self->get_fh();
my $delim = $self->get_delim();
$self->set_column_headers(@$column_fields_aref);
unless (defined($FLAGS) && $FLAGS =~ /NO_WRITE_HEADER/) {
my $output_line = join($delim, @$column_fields_aref);
print $ofh "$output_line\n";
}
return;
}
####
sub write_row {
my $self = shift;
my $dict_href = shift;
unless (ref $dict_href eq "HASH") {
confess "Error, need dict_href as param";
}
my $num_dict_fields = scalar(keys %$dict_href);
my @column_headers = $self->get_column_headers();
my $delim = $self->get_delim();
my @out_fields;
for my $column_header (@column_headers) {
my $field = $dict_href->{$column_header};
unless (defined $field) {
confess "Error, missing value for required column field: $column_header";
}
if ($field =~ /$delim/) {
# don't allow any delimiters to contaminate the field value, otherwise it'll introduce offsets.
$field =~ s/$delim/ /g;
}
# also avoid newlines, which will also break the output formatting.
if ($field =~ /\n/) {
$field =~ s/\n/ /g;
}
push (@out_fields, $field);
}
my $outline = join($delim, @out_fields); # join with the configured delimiter (fields were sanitized against it above)
my $ofh = $self->get_fh();
print $ofh "$outline\n";
return;
}
1; #EOM
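A minimal usage sketch for DelimParser (illustrative only; the file names are placeholders):

use DelimParser;

open(my $ifh, '<', 'input.tsv') or die "Error, cannot open input.tsv: $!";
open(my $ofh, '>', 'output.tsv') or die "Error, cannot write output.tsv: $!";

my $reader = DelimParser::Reader->new($ifh, "\t");
# reuse the input header columns for the output
my $writer = DelimParser::Writer->new($ofh, "\t", [ $reader->get_column_headers() ]);

while (my $row = $reader->get_row()) {
    $writer->write_row($row);   # $row is a { column_name => value } hashref
}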
package PWM;
use strict;
use warnings;
use Carp;
my $PSEUDOCOUNT = 0.1;
srand(1); # reproducibility
###
sub new {
my $packagename = shift;
my $self = { pos_freqs => [],
pos_probs => [],
_pwm_built_flag => 0,
};
bless($self, $packagename);
return($self);
}
sub is_pwm_built {
my ($self) = @_;
return($self->{_pwm_built_flag});
}
sub has_pos_freqs {
my ($self) = @_;
if (@{$self->{pos_freqs}}) {
return(1);
}
else {
return(0);
}
}
sub add_feature_seq_to_pwm {
my ($self, $feature_seq) = @_;
$feature_seq = uc $feature_seq;
my $pwm_len = $self->get_pwm_length();
if ($pwm_len && length($feature_seq) != $pwm_len) {
confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched.";
}
if ($feature_seq =~ /[^GATC]/) {
print STDERR "Error, feature_seq: $feature_seq contains non-GATC chars... skipping\n";
return;
}
my @chars = split(//, $feature_seq);
for (my $i = 0; $i <= $#chars; $i++) {
my $char = $chars[$i];
$self->{pos_freqs}->[$i]->{$char}++;
}
return;
}
sub remove_feature_seq_from_pwm {
my ($self, $feature_seq) = @_;
if (! $self->has_pos_freqs()) {
confess "Error, pwm obj doesn't have position frequencies set";
}
$feature_seq = uc $feature_seq;
my $pwm_len = $self->get_pwm_length();
if ($pwm_len && length($feature_seq) != $pwm_len) {
confess "Error, pwm_len: $pwm_len and feature_seq len: " . length($feature_seq) . " are unmatched.";
}
if ($feature_seq =~ /[^GATC]/) {
print STDERR "Warning, feature_seq: $feature_seq contains non-GATC chars... skipping.\n";
return;
}
my @chars = split(//, $feature_seq);
for (my $i = 0; $i <= $#chars; $i++) {
my $char = $chars[$i];
$self->{pos_freqs}->[$i]->{$char}--;
}
return;
}
sub get_pwm_length {
my ($self) = @_;
my $pwm_length;
if ($self->is_pwm_built()) {
$pwm_length = $#{$self->{pos_probs}} + 1;
}
else {
$pwm_length = $#{$self->{pos_freqs}} + 1;
}
return($pwm_length);
}
sub simulate_feature {
my ($self) = @_;
if (! $self->is_pwm_built()) {
confess "Error, pwm not built yet";
}
my $probs_aref = $self->{pos_probs};
my @chars = qw(G A T C);
my $feature_seq = "";
my $pwm_length = $self->get_pwm_length();
for (my $i = 0; $i < $pwm_length; $i++) {
my $sum_prob = 0;
my $selected_flag = 0;
#print "rand val: $rand_val\n";
my $tot_prob = 0;
for (my $j = 0; $j <= $#chars; $j++) {
my $char = $chars[$j];
my $p = $probs_aref->[$i]->{$char};
$tot_prob += $p;
}
my $rand_val = rand($tot_prob); # tot_prob should be 1 or very close...
for (my $j = 0; $j <= $#chars; $j++) {
my $char = $chars[$j];
my $p = $probs_aref->[$i]->{$char};
#print "char: $char, p: $p\n";
$sum_prob += $p;
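# on the last char, clamp the cumulative probability to 1 so floating-point rounding cannot leave rand_val unmatched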
if ($j == $#chars) {
$sum_prob = 1;
}
if ($rand_val <= $sum_prob) {
# choose char
$feature_seq .= $char;
$selected_flag = 1;
last;
}
}
if (! $selected_flag) {
croak "Error, didn't select a random char for feature seq";
}
}
return($feature_seq);
}
####
sub write_pwm_file {
my ($self, $filename) = @_;
unless ($self->is_pwm_built()) {
croak "Error, pwm needs to be built first before writing to file";
}
open (my $ofh, ">$filename") or croak "Error, cannot write to file: $filename";
my $pos_probs_aref = $self->{pos_probs};
my $pwm_length = $self->get_pwm_length();
my @chars = qw(G A T C);
print $ofh join("\t", "pos", @chars) . "\n";
for (my $i = 0; $i < $pwm_length; $i++) {
my @vals = ($i);
foreach my $char (@chars) {
my $prob = $pos_probs_aref->[$i]->{$char} || 0;
push (@vals, $prob);
}
print $ofh join("\t", @vals) . "\n";
}
close $ofh;
return;
}
####
sub build_pwm {
my ($self) = @_;
my $pos_freqs_aref = $self->{pos_freqs};
for (my $i = 0; $i <= $#{$pos_freqs_aref}; $i++) {
my @chars = keys %{$pos_freqs_aref->[$i]};
my $sum = 0;
foreach my $char (@chars) {
my $val = $pos_freqs_aref->[$i]->{$char};
$sum += $val;
}
# now convert to relative freqs
@chars = qw(G A T C); # the complete set of chars we care about.
foreach my $char (@chars) {
my $val = $pos_freqs_aref->[$i]->{$char} || 0;
my $prob = sprintf("%.6f", ($val + $PSEUDOCOUNT) / ($sum + 4 * $PSEUDOCOUNT) );
$self->{pos_probs}->[$i]->{$char} = $prob;
}
}
$self->{_pwm_built_flag} = 1;
return;
}
sub score_pwm_using_base_freqs {
my ($self, $target_sequence, $base_freqs_href, %options ) = @_;
### Options can include:
###
### mask => [ coordA, coordB, coordC ], # pwm positions to ignore during scoring
### pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive
###
for my $key (keys %options) {
if (! grep {$_ eq $key} ('mask', 'pwm_range')) {
confess "Error, option $key is not recognized";
}
}
#print STDERR "target_seq: [$target_sequence]\n";
$target_sequence = uc $target_sequence;
unless ($target_sequence =~ /^[GATC]+$/) {
# can only score GATC-containing sequences.
return("NA");
}
if (! $self->is_pwm_built()) {
croak("pwm not built yet!");
}
my $pwm_length = $self->get_pwm_length();
if (length($target_sequence) != $pwm_length) {
croak "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length";
}
my %mask;
if (my $mask_positions_aref = $options{'mask'}) {
%mask = map { + $_ => 1 } @$mask_positions_aref;
}
my $motif_score = 0;
my @seqarray = split(//, $target_sequence);
my ($pwm_start, $pwm_end) = (0, $pwm_length-1);
if (my $pwm_range_aref = $options{'pwm_range'}) {
($pwm_start, $pwm_end) = @$pwm_range_aref;
}
for (my $i = $pwm_start; $i <= $pwm_end; $i++) {
if ($mask{$i}) {
next;
}
my $char = $seqarray[$i];
my $prob = $self->{pos_probs}->[$i]->{$char};
unless ($prob) {
return("NA");
}
my $prob_rand = $base_freqs_href->{$char};
unless ($prob_rand) {
die "Error, no non-zero probability value specified for char [$char] ";
}
my $loglikelihood = log($prob/$prob_rand);
$motif_score += $loglikelihood;
}
return($motif_score);
}
sub score_plus_minus_pwm {
my ($self, $target_sequence, $pwm_minus, %options) = @_;
### Options can include:
###
### mask => [ coordA, coordB, coordC ], # pwm positions to ignore during scoring
### pwm_range => [pwm_idx_start, pwm_idx_end], # start and end are inclusive
###
for my $key (keys %options) {
if (! grep {$_ eq $key} ('mask', 'pwm_range')) {
confess "Error, option $key is not recognized";
}
}
#print STDERR "target_seq: [$target_sequence]\n";
$target_sequence = uc $target_sequence;
unless ($target_sequence =~ /^[GATC]+$/) {
# can only score GATC-containing sequences.
return("NA");
}
if (! $self->is_pwm_built()) {
croak("pwm not built yet!");
}
my $pwm_length = $self->get_pwm_length();
if (length($target_sequence) != $pwm_length) {
croak "Error, len(target_sequence)=" . length($target_sequence) . " and pwm length = $pwm_length";
}
my %mask;
if (my $mask_positions_aref = $options{'mask'}) {
%mask = map { + $_ => 1 } @$mask_positions_aref;
}
my $motif_score = 0;
my @seqarray = split(//, $target_sequence);
my ($pwm_start, $pwm_end) = (0, $pwm_length-1);
if (my $pwm_range_aref = $options{'pwm_range'}) {
($pwm_start, $pwm_end) = @$pwm_range_aref;
}
for (my $i = $pwm_start; $i <= $pwm_end; $i++) {
if ($mask{$i}) {
#print STDERR "masking $i\n";
next;
}
my $char = $seqarray[$i];
my $prob = $self->{pos_probs}->[$i]->{$char};
unless ($prob) {
return("NA");
}
my $prob_rand = $pwm_minus->{pos_probs}->[$i]->{$char};
unless ($prob_rand) {
die "Error, no non-zero probability value specified for char [$char] ";
}
my $loglikelihood = log($prob/$prob_rand);
$motif_score += $loglikelihood;
}
return($motif_score);
}
####
sub load_pwm_from_file {
my ($self, $pwm_file) = @_;
open(my $fh, $pwm_file) or confess "Error, cannot open file: $pwm_file";
my $header = <$fh>;
chomp $header;
my @chars = split(/\t/, $header);
shift @chars;
while (<$fh>) {
chomp;
my @x = split(/\t/);
my $idx_pos = shift @x;
for (my $i = 0; $i <= $#chars; $i++) {
my $char = $chars[$i];
my $prob = $x[$i];
$self->{pos_probs}->[$idx_pos]->{$char} = $prob;
}
}
close $fh;
$self->{_pwm_built_flag} = 1;
return;
}
1; #EOM
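A minimal usage sketch for PWM (illustrative only; the training sequences and uniform background frequencies are made up):

use PWM;

my $pwm = PWM->new();
# train on aligned, fixed-length feature sequences (e.g., windows around start codons)
foreach my $seq ('ATGGC', 'ATGCC', 'ATGGA') {
    $pwm->add_feature_seq_to_pwm($seq);
}
$pwm->build_pwm();

# score a same-length candidate as a log-likelihood ratio against background base frequencies
my %base_freqs = (G => 0.25, A => 0.25, T => 0.25, C => 0.25);
my $score = $pwm->score_pwm_using_base_freqs('ATGGC', \%base_freqs);

# draw a random sequence from the trained position probabilities
my $sim_seq = $pwm->simulate_feature();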
package Pipeliner;
use strict;
use warnings;
use Carp;
use Cwd;
################################
## Verbose levels:
## 1: see CMD string
## 2: see stderr during process
my $VERBOSE = 0;
################################
####################
## Static methods:
####################
####
sub ensure_full_path {
my ($path, $ADD_GZ_FIFO_FLAG) = @_;
unless ($path =~ m|^/|) {
$path = cwd() . "/$path";
}
if ($ADD_GZ_FIFO_FLAG && $path =~ /\.gz$/) {
$path = "<(zcat $path)";
}
return($path);
}
####
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
die "Error, CMD: $cmd died with ret $ret";
}
return;
}
################
## Obj methods:
################
####
sub new {
my $packagename = shift;
my %params = @_;
if ($params{-verbose}) {
$VERBOSE = $params{-verbose};
}
my $cmds_log = $params{-cmds_log};
unless ($cmds_log) {
$cmds_log = "pipeliner.$$.cmds";
}
open (my $ofh, ">$cmds_log") or confess "Error, cannot write to $cmds_log";
my $self = {
cmd_objs => [],
checkpoint_dir => undef,
cmds_log_ofh => $ofh,
};
bless ($self, $packagename);
return($self);
}
sub add_commands {
my $self = shift;
my @cmds = @_;
foreach my $cmd (@cmds) {
unless (ref($cmd) =~ /Command/) {
confess "Error, need Command object as param";
}
my $checkpoint_file = $cmd->get_checkpoint_file();
if ($checkpoint_file !~ m|^/|) {
if (my $checkpoint_dir = $self->get_checkpoint_dir()) {
$checkpoint_file = "$checkpoint_dir/$checkpoint_file";
$cmd->reset_checkpoint_file($checkpoint_file);
}
}
push (@{$self->{cmd_objs}}, $cmd);
}
return $self;
}
sub set_checkpoint_dir {
my $self = shift;
my ($checkpoint_dir) = @_;
if (! -d $checkpoint_dir) {
confess "Error, cannot locate checkpointdir: $checkpoint_dir";
}
$self->{checkpoint_dir} = $checkpoint_dir;
}
sub get_checkpoint_dir {
my $self = shift;
return($self->{checkpoint_dir});
}
sub has_commands {
my $self = shift;
if ($self->_get_commands()) {
return(1);
}
else {
return(0);
}
}
sub run {
my $self = shift;
my $cmds_log_ofh = $self->{cmds_log_ofh};
foreach my $cmd_obj ($self->_get_commands()) {
my $cmdstr = $cmd_obj->get_cmdstr();
print $cmds_log_ofh "$cmdstr\n";
my $checkpoint_file = $cmd_obj->get_checkpoint_file();
if (-e $checkpoint_file) {
print STDERR "-- Skipping CMD: $cmdstr, checkpoint exists.\n" if $VERBOSE;
}
else {
print STDERR "* Running CMD: $cmdstr\n" if $VERBOSE;
my $tmp_stderr = "tmp.$$.stderr";
if (-e $tmp_stderr) {
unlink($tmp_stderr);
}
unless ($VERBOSE == 2) {
$cmdstr .= " 2>$tmp_stderr";
}
my $ret = system($cmdstr);
if ($ret) {
if (-e $tmp_stderr) {
system("cat $tmp_stderr");
unlink($tmp_stderr);
}
confess "Error, cmd: $cmdstr died with ret $ret";
}
else {
`touch $checkpoint_file`;
if ($?) {
confess "Error creating checkpoint file: $checkpoint_file";
}
}
if (-e $tmp_stderr) {
unlink($tmp_stderr);
}
}
}
# reset in case reusing the pipeline obj
$self->{cmd_objs} = []; # reinit
return;
}
sub _get_commands {
my $self = shift;
return(@{$self->{cmd_objs}});
}
package Command;
use strict;
use warnings;
use Carp;
sub new {
my $packagename = shift;
my ($cmdstr, $checkpoint_file) = @_;
unless ($cmdstr && $checkpoint_file) {
confess "Error, need cmdstr and checkpoint filename as params";
}
my $self = { cmdstr => $cmdstr,
checkpoint_file => $checkpoint_file,
};
bless ($self, $packagename);
return($self);
}
####
sub get_cmdstr {
my $self = shift;
return($self->{cmdstr});
}
####
sub get_checkpoint_file {
my $self = shift;
return($self->{checkpoint_file});
}
####
sub reset_checkpoint_file {
my $self = shift;
my $checkpoint_file = shift;
$self->{checkpoint_file} = $checkpoint_file;
}
1; #EOM
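A minimal usage sketch for Pipeliner and Command (illustrative only; the shell commands and checkpoint names are made up):

use Pipeliner;

my $pipeliner = new Pipeliner('-verbose' => 1);
mkdir("checkpoints") unless -d "checkpoints";
$pipeliner->set_checkpoint_dir("checkpoints");   # directory must exist before this call
$pipeliner->add_commands(new Command("sort words.txt > words.sorted", "sort.ok"));
$pipeliner->add_commands(new Command("uniq words.sorted > words.uniq", "uniq.ok"));
$pipeliner->run();   # commands whose checkpoint files already exist are skipped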
package Process_cmd;
use strict;
use warnings;
use Carp;
use Cwd;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(process_cmd ensure_full_path);
sub process_cmd {
my ($cmd) = @_;
print STDERR "CMD: $cmd\n";
my $ret = system($cmd);
if ($ret) {
confess "Error, cmd:\n$cmd\n died with ret ($ret)";
}
return;
}
sub ensure_full_path {
my ($path) = @_;
unless ($path =~ m|^/|) {
$path = cwd() . "/$path";
}
return($path);
}
1; #EOM
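A short usage sketch for Process_cmd (illustrative only):

use Process_cmd;

process_cmd("echo hello > hello.txt");        # logs "CMD: ..." to STDERR and confesses on nonzero exit
my $abs_path = ensure_full_path("hello.txt"); # prepends cwd() when the path is relative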
#!/usr/bin/env perl
####
sub nucs_in_common {
my ($e5, $e3, $g5, $g3) = @_;
($e5, $e3) = sort {$a<=>$b} ($e5, $e3);
($g5, $g3) = sort {$a<=>$b} ($g5, $g3);
unless (&coordsets_overlap([$e5,$e3], [$g5, $g3])) {
return(0);
}
my $length = abs ($e3 - $e5) + 1;
my $diff1 = ($e3 - $g3);
$diff1 = ($diff1 > 0) ? $diff1 : 0;
my $diff2 = ($g5 - $e5);
$diff2 = ($diff2 > 0) ? $diff2 : 0;
my $overlap_length = $length - $diff1 - $diff2;
return ($overlap_length);
}
####
sub coordsets_overlap {
my ($coordset_A_aref, $coordset_B_aref) = @_;
my ($lend_A, $rend_A) = sort {$a<=>$b} @$coordset_A_aref;
my ($lend_B, $rend_B) = sort {$a<=>$b} @$coordset_B_aref;
if ($lend_A <= $rend_B && $rend_A >= $lend_B) {
## yes, overlap
return (1);
}
else {
return (0);
}
}
####
sub coordset_A_encapsulates_B {
my ($coordset_A_aref, $coordset_B_aref) = @_;
my ($lend_A, $rend_A) = sort {$a<=>$b} @$coordset_A_aref;
my ($lend_B, $rend_B) = sort {$a<=>$b} @$coordset_B_aref;
if ($lend_A <= $lend_B && $rend_B <= $rend_A) {
return(1); # true
}
else {
return(0); # false;
}
}
1;
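A worked example of the overlap arithmetic above (coordinates are illustrative):

# exon 10..20 vs. genomic segment 15..30:
# length = 20 - 10 + 1 = 11; diff1 = max(20 - 30, 0) = 0; diff2 = max(15 - 10, 0) = 5
my $overlap = &nucs_in_common(10, 20, 15, 30);   # 11 - 0 - 5 = 6, i.e. positions 15..20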
......@@ -9,19 +9,11 @@ It uses the following criteria:
* the above coding score is greatest when the ORF is scored in the 1st reading frame as compared to scores in the other 5 reading frames.
* if a candidate ORF is found fully encapsulated by the coordinates of another candidate ORF, the longer one is reported. However, a single transcript can report multiple ORFs (allowing for operons, chimeras, etc).
* optionally, the putative peptide has a match to a Pfam domain above the noise cutoff score.
* start codon positions are refined according to PWM-based feature scoring around candidate start codons.
The software is primarily maintained by Brian Haas at the Broad Institute and Alexie Papanicolaou at the Commonwealth Scientific and Industrial Research Organisation (CSIRO). It is integrated into other related software such as Trinity, PASA, EVidenceModeler, and Trinotate.
Full documentation is provided at: http://transdecoder.github.io
===========================
To build just TransDecoder:
Please make sure you read http://transdecoder.github.io before proceeding.
% make
See sample_data/ directory for example usage.
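For example, a typical two-step run (transcripts.fasta is a placeholder for your assembled transcripts):

% TransDecoder.LongOrfs -t transcripts.fasta
% TransDecoder.Predict -t transcripts.fasta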
......@@ -47,7 +47,6 @@ See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently suppo
Mitochondrial-Euascomycetes
Mitochondrial-Protozoans
=cut
......@@ -67,6 +66,7 @@ use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
use Pipeliner;
my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
......@@ -120,6 +120,18 @@ if ($genetic_code ne 'universal') {
main: {
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
unless (-d $workdir) {
mkdir($workdir) or die "Error, cannot mkdir $workdir.";
}
my $longorf_checkpoint = "$workdir/__longorfs.ok";
if (-e $longorf_checkpoint) {
print STDERR "-longorfs already completed, checkpoint identified at: $longorf_checkpoint\n";
exit(0);
}
my %gene_trans_map;
if ($gene_trans_map_file) {
open(my $fh, $gene_trans_map_file) or die "Error, cannot open file $gene_trans_map_file";
......@@ -132,13 +144,6 @@ main: {
}
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
unless (-d $workdir) {
mkdir($workdir) or die "Error, cannot mkdir $workdir.";
}
my $base_freqs_file = "$workdir/base_freqs.dat";
my $base_freqs_checkpoint = "$base_freqs_file.ok";
if (! -e $base_freqs_checkpoint) {
......@@ -317,6 +322,8 @@ main: {
close GFF;
&process_cmd("touch $longorf_checkpoint");
print STDERR "\n\n#################################\n"
. "### Done preparing long ORFs. ###\n"
. "#################################\n\n";
......
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;
use Carp;
use lib ("$FindBin::RealBin/PerlLib");
use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
use Pipeliner;
use DelimParser;
my $RETAIN_LONG_ORFS_MIN_LENGTH = 1000000; # so essentially, off by default
=pod
=head1 NAME
......@@ -14,8 +38,13 @@ Required:
Common options:
--retain_long_orfs <int> retain all ORFs found that are at least this many nucleotides long, even if no other evidence
marks them as coding (default: 900 bp => 300aa)
--retain_long_orfs_mode <string> 'dynamic' or 'strict' (default: dynamic)
In dynamic mode, sets range according to 1%FDR in random sequence of same GC content.
--retain_long_orfs_length <int> under 'strict' mode, retain all ORFs found that are at least this many nucleotides long, even if no other evidence
marks them as coding (default: 1000000, so effectively off by default)
--retain_pfam_hits <string> domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)
Any ORF with a pfam domain hit will be retained in the final output.
......@@ -23,21 +52,16 @@ Common options:
--retain_blastp_hits <string> blastp output in '-outfmt 6' format.
Any ORF with a blast match will be retained in the final output.
--single_best_orf Retain only the single best ORF per transcript.
(Best is defined as having (optionally pfam and/or blast support) and longest orf)
--cpu <int> Use multiple cores for cd-hit-est. (default=1)
--single_best_only Retain only the single best orf per transcript (prioritized by homology then orf length)
-G <string> genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia, ...)
--no_refine_starts start refinement identifies potential start codons for 5' partial ORFs using a PWM; the process is on by default, and this flag turns it off.
Advanced options
--train <string> FASTA file with ORFs to train Markov Mod for protein identification; otherwise
longest non-redundant ORFs used
-T <int> If no --train, top longest ORFs to train Markov Model (hexamer stats) (default: 500)
Note, 10x this value are first selected for use with cd-hit to remove redundancies,
Note, 10x this value are first selected for removing redundancies,
and then this -T value of longest ORFs are selected from the non-redundant set.
=cut
......@@ -69,33 +93,16 @@ See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently suppo
use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;
use lib ("$FindBin::RealBin/PerlLib");
use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
my ($cd_hit_est_exec) = &check_program('cd-hit-est');
my ($transcripts_file,$train_file);
my $top_ORFs_train = 500;
my $max_prot_length_for_training = 5000;
my $help;
......@@ -103,18 +110,55 @@ my $verbose;
my $search_pfam = "";
my ($reuse,$pfam_out);
my $RETAIN_LONG_ORFS = 900;
my $RETAIN_LONG_ORFS_MODE = 'dynamic';
#################################
# quantiles of random orf lengths, by GC content
#
# GC%    0.95    0.99    0.999    0.9999
# 80 1230.0 1728.0 2422.656 3240.7914
# 75 927.0 1257.0 1743.0 2278.6704
# 70 750.0 1002.0 1358.1 1681.77
# 65 645.0 829.59 1085.718 1383.5718
# 60 570.0 722.31 927.0 1158.231
# 55 513.0 633.0 796.983 950.8983
# 50 486.0 582.0 748.77 887.877
# 45 456.0 546.0 644.655 694.4931
# 40 431.7 501.0 589.794 612.2352
# 35 426.0 488.76 554.88 565.7904
# 30 406.5 469.5 510.9 518.19
# 25 404.4 448.56 465.39 467.739
# setting it to the 0.999 quantile mark:
my @GC_TO_MIN_LONG_ORF_LENGTH = ( [25, 465],
[30, 510],
[35, 555],
[40, 590],
[45, 645],
[50, 749],
[55, 797],
[60, 927],
[65, 1086],
[70, 1358],
[75, 1743],
[80, 2422] );
my $retain_pfam_hits_file;
my $retain_blastp_hits_file;
my $cpu = 1;
my $MPI_DEBUG = 1;
my $single_best_orf_flag = 0;
my $single_best_flag = 0;
my $genetic_code = "";
my $NO_REFINE_START_CODONS_FLAG = 0;
&GetOptions( 't=s' => \$transcripts_file,
'train:s' => \$train_file,
'h' => \$help,
'v' => \$verbose,
......@@ -124,7 +168,8 @@ my $genetic_code = "";
'search_pfam=s' => \$search_pfam,
'reuse' => \$reuse,
'retain_long_orfs=i' => \$RETAIN_LONG_ORFS,
'retain_long_orfs_mode=s' => \$RETAIN_LONG_ORFS_MODE,
'retain_long_orfs_length=i' => \$RETAIN_LONG_ORFS_MIN_LENGTH,
'debug' => \$MPI_DEBUG,
......@@ -132,10 +177,11 @@ my $genetic_code = "";
'retain_blastp_hits=s' => \$retain_blastp_hits_file,
'cpu=i' => \$cpu,
'single_best_orf' => \$single_best_orf_flag,
'single_best_only' => \$single_best_flag,
'G=s' => \$genetic_code,
'no_refine_starts' => \$NO_REFINE_START_CODONS_FLAG,
);
......@@ -157,6 +203,8 @@ if ($genetic_code) {
$genetic_code = " --genetic_code $genetic_code";
}
main: {
my $workdir = basename($transcripts_file) . ".transdecoder_dir";
......@@ -164,6 +212,14 @@ main: {
die "Error, cannot find directory: $workdir, be sure to first run TransDecoder.LongOrfs before TransDecoder.Predict\n\n";
}
my $checkpoints_dir = "$workdir.__checkpoints";
if (! -d $checkpoints_dir) {
mkdir($checkpoints_dir);
}
my $pipeliner = new Pipeliner('-verbose' => 2);
$pipeliner->set_checkpoint_dir($checkpoints_dir);
my $prefix = "$workdir/longest_orfs";
my $cds_file = "$prefix.cds";
my $gff3_file = "$prefix.gff3";
......@@ -171,164 +227,89 @@ main: {
## Train a Markov model based on user-provided file or longest candidate CDS sequences, score all candidates, and select the final set.
my $top_cds_file;
if ($train_file) {
if (! -s $train_file) {
die "Error, cannot locate train file: $train_file";
}
$top_cds_file = $train_file;
}
else {
$top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
my $checkpoint = "$top_cds_file.ok";
if (! -e $checkpoint) {
my $top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
{
# to speed things up only check for redundancy up to x the number of entries we want
my $red_num = $top_ORFs_train * 10;
my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
&process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num > $red_num_cds_longest_file");
&process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -T $cpu -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
&process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr80 $top_ORFs_train > $top_cds_file");
&process_cmd("touch $checkpoint");
$pipeliner->add_commands(new Command("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num $max_prot_length_for_training > $red_num_cds_longest_file",
"get_longest_orfs.ok"));
$pipeliner->add_commands(new Command("$UTIL_DIR/exclude_similar_proteins.pl $red_num_cds_longest_file > $red_num_cds_longest_file.nr", "nr.ok"));
$pipeliner->add_commands(new Command("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr $top_ORFs_train > $top_cds_file",
"top_train_select.ok"));
$pipeliner->run();
}
}
# get hexamer scores
my $hexamer_scores_file = "$workdir/hexamer.scores";
my $hexamer_checkpoint = "$hexamer_scores_file.ok";
if (! -e $hexamer_checkpoint) {
my $base_freqs_file = "$workdir/base_freqs.dat";
my $cmd = "$UTIL_DIR/seq_n_baseprobs_to_logliklihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file";
&process_cmd($cmd);
&process_cmd("touch $hexamer_checkpoint");
if ($RETAIN_LONG_ORFS_MODE eq 'dynamic') {
$RETAIN_LONG_ORFS_MIN_LENGTH = &get_dynamic_retain_long_orf_length($base_freqs_file, \@GC_TO_MIN_LONG_ORF_LENGTH);
}
# score kmers for Markov model
$pipeliner->add_commands(new Command("$UTIL_DIR/seq_n_baseprobs_to_loglikelihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file",
"hexamer_scores.ok") );
# score all cds entries
my $cds_scores_file = "$cds_file.scores";
my $cds_scores_checkpoint = "$cds_scores_file.ok";
if (! -e $cds_scores_checkpoint) {
my $cmd = "$UTIL_DIR/score_CDS_liklihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file";
&process_cmd($cmd);
$pipeliner->add_commands(new Command("$UTIL_DIR/score_CDS_likelihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file",
"score_cdss.ok"));
&process_cmd("touch $cds_scores_checkpoint");
}
## Retain those that have pfam matches
my %has_pfam_hit;
## select orfs
my $select_cmd = "$UTIL_DIR/select_best_ORFs_per_transcript.pl --gff3_file $gff3_file --cds_scores $cds_scores_file "
. " --min_length_auto_accept $RETAIN_LONG_ORFS_MIN_LENGTH ";
if ($retain_pfam_hits_file) {
%has_pfam_hit = &parse_pfam_hits($retain_pfam_hits_file);
$select_cmd .= " --pfam_hits $retain_pfam_hits_file ";
}
my %has_blastp_hit;
if ($retain_blastp_hits_file) {
%has_blastp_hit = &parse_blastp_hits_file($retain_blastp_hits_file);
$select_cmd .= " --blast_hits $retain_blastp_hits_file ";
}
# get accs for best entries
my $acc_file = "$cds_file.scores.selected";
{
my %att_counter;
open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file";
open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores";
while (<$ifh>) {
chomp;
my ($acc, $orf_length, @scores) = split(/\t/);
my @ATTS;
my $score_1 = shift @scores;
my $max_score_other_frame = max(@scores);
my $keep_flag = 0;
if ($has_pfam_hit{$acc}) {
$keep_flag = 1;
push (@ATTS, "PFAM");
print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
}
if ($has_blastp_hit{$acc}) {
$keep_flag = 1;
push (@ATTS, "BLASTP");
print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
}
if ($orf_length >= $RETAIN_LONG_ORFS) {
$keep_flag = 1;
push (@ATTS, "LONGORF");
}
if ($score_1 > 0 && $score_1 > $max_score_other_frame) {
$keep_flag = 1;
push (@ATTS, "FRAMESCORE");
if ($single_best_flag) {
$select_cmd .= " --single_best_orf ";
}
$select_cmd .= " > $cds_file.best_candidates.gff3 ";
if ($keep_flag) {
print $ofh "$acc\n";
my $att_string = join("|", sort @ATTS);
$att_counter{$att_string}++;
}
}
close $ifh;
close $ofh;
$pipeliner->add_commands(new Command($select_cmd, "orf_select.ok"));
$pipeliner->run();
# report on the categories of the selected ORFs
print STDERR "\n#####################\nCounts of kept entries according to attributes:\n";
foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) {
my $count = $att_counter{$att};
print STDERR join("\t", $att, $count) . "\n";
}
print STDERR "########################\n\n\n";
}
# index the current gff file:
my $cmd = "$UTIL_DIR/index_gff3_files_by_isoform.pl $gff3_file";
&process_cmd($cmd);
# retrieve the best entries:
$cmd = "$UTIL_DIR/gene_list_to_gff.pl $acc_file $gff3_file.inx > $cds_file.best_candidates.gff3";
&process_cmd($cmd);
##############################
## Generate the final outputs.
##############################
my $final_output_prefix = basename($transcripts_file) . ".transdecoder";
{
# exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
$cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3";
my $target_final_file = "$cds_file.best_candidates.gff3";
my $target_final_file = "$cds_file.eclipsed_removed.gff3";
## start codon refinement
unless ($NO_REFINE_START_CODONS_FLAG) {
my $cmd = "$UTIL_DIR/train_start_PWM.pl --transcripts $transcripts_file --selected_orfs $top_cds_file --out_prefix $workdir/start_refinement";
$pipeliner->add_commands(new Command($cmd, "train_start_pwm.ok"));
&process_cmd($cmd);
my $revised_orf_gff3_file = "$target_final_file.revised_starts.gff3";
$cmd = "$UTIL_DIR/start_codon_refinement.pl --transcripts $transcripts_file --gff3_file $target_final_file > $revised_orf_gff3_file";
$pipeliner->add_commands(new Command($cmd, "revise_starts.ok"));
if ($single_best_orf_flag) {
$cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl ";
if ($retain_blastp_hits_file) {
$cmd .= " --blast_hits $retain_blastp_hits_file ";
}
if ($retain_pfam_hits_file) {
$cmd .= " --pfam_hits $retain_pfam_hits_file ";
}
$cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3";
$pipeliner->run();
&process_cmd($cmd);
$target_final_file = $revised_orf_gff3_file;
$target_final_file = "$cds_file.single_best_orf.gff3";
}
......@@ -336,11 +317,12 @@ main: {
## write final outputs:
{
## make a BED file for viewing in IGV
my $gff3_file = "$final_output_prefix.gff3";
my $bed_file = $gff3_file;
$bed_file =~ s/\.gff3$/\.bed/;
$cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
my $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
&process_cmd($cmd);
......@@ -357,9 +339,9 @@ main: {
$best_cds_file =~ s/\.pep$/\.cds/;
$cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file --seqType CDS $genetic_code > $best_cds_file";
&process_cmd($cmd);
}
print STDERR "transdecoder is finished. See output files $final_output_prefix.\*\n\n\n";
......@@ -450,3 +432,44 @@ sub check_program() {
}
return @paths;
}
####
sub get_dynamic_retain_long_orf_length {
my ($base_freqs_file, $GC_to_min_orf_length_aref) = @_;
my $pct_GC = -1;
open(my $fh, $base_freqs_file) or confess "Error, cannot open file: $base_freqs_file";
while (<$fh>) {
chomp;
my ($base, $count, $ratio) = split(/\t/);
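# assumes strand symmetry (%G == %C), so GC fraction = 2 x the C ratio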
if ($base eq 'C') {
$pct_GC = 2 * $ratio * 100;
last;
}
}
close $fh;
print STDERR "PCT_GC: $pct_GC\n";
unless ($pct_GC > 0) {
confess "Error, didn't parse percent C from file: $base_freqs_file";
}
foreach my $bin (@$GC_to_min_orf_length_aref) {
my ($gc, $min_len) = @$bin;
if ($pct_GC <= $gc) {
return($min_len);
}
}
# if we got here, then GC content must be crazy high!
# in this case, rely entirely on Markov model for coding determination.
return(1000000); #effectively infinity here.
}
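For example (values illustrative): a base_freqs.dat reporting a C ratio of 0.21 gives pct_GC = 2 * 0.21 * 100 = 42, which falls into the 45% GC bin above and returns a minimum long-ORF length of 645 nt.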
transdecoder (5.0.1-1) unstable; urgency=medium
* New upstream version (no need to repackage since binary files are now
removed upstream)
* Upstream changelog: cdhit has been removed and replaced by our own fast
method for removing redundancies
* Standards-Version: 4.1.3
* debhelper 11
* Add missing Depends: r-base-core, python
* d/rules: do not parse d/changelog
* Fix permissions
-- Andreas Tille <tille@debian.org> Mon, 19 Feb 2018 15:11:06 +0100
transdecoder (3.0.1+dfsg-1) unstable; urgency=medium
* Add Multi-Arch: foreign hint to transdecoder-doc
......
......@@ -4,10 +4,9 @@ Uploaders: Michael R. Crusoe <michael.crusoe@gmail.com>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 9),
cd-hit,
Build-Depends: debhelper (>= 11~),
liburi-perl
Standards-Version: 3.9.8
Standards-Version: 4.1.3
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/transdecoder.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/transdecoder.git
Homepage: https://transdecoder.github.io/
......@@ -16,8 +15,9 @@ Package: transdecoder
Architecture: all
Depends: ${misc:Depends},
${perl:Depends},
cd-hit,
liburi-perl
liburi-perl,
r-base-core,
python
Description: find coding regions within RNA transcript sequences
TransDecoder identifies candidate coding regions within transcript sequences,
such as those generated by de novo RNA-Seq transcript assembly using Trinity,
......
Author: Michael R. Crusoe <mcrusoe@msu.edu>
Last-Update: Fri, 13 Feb 2015 06:26:24 -0500
Description: Fix path to Debian location of cd-hit-est
--- transdecoder.orig/TransDecoder.Predict
+++ transdecoder/TransDecoder.Predict
@@ -90,7 +90,7 @@
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";
-my ($cd_hit_est_exec) = &check_program('cd-hit-est');
+my ($cd_hit_est_exec) = &check_program('/usr/lib/cd-hit/cd-hit-est');
my ($transcripts_file,$train_file);
......@@ -4,7 +4,7 @@ Description: Fix whatis entry of manpage
--- a/TransDecoder.LongOrfs
+++ b/TransDecoder.LongOrfs
@@ -6,7 +6,7 @@
@@ -6,7 +6,7 @@ my $MIN_PROT_LENGTH = 100;
=head1 NAME
......@@ -15,7 +15,7 @@ Description: Fix whatis entry of manpage
--- a/TransDecoder.Predict
+++ b/TransDecoder.Predict
@@ -4,7 +4,7 @@
@@ -28,7 +28,7 @@ my $RETAIN_LONG_ORFS_MIN_LENGTH = 100000
=head1 NAME
......
fix-test
cd-hit-est-rename
fix-whatis
reproducible-sample_data.patch
#!/usr/bin/make -f
DH_VERBOSE := 1
DEBVERS := $(shell dpkg-parsechangelog --show-field=Version)
include /usr/share/dpkg/default.mk
%:
dh $@ --parallel
dh $@
override_dh_auto_clean:
cd ./sample_data && for script in ./*/cleanme.pl; do $$($$script); done
......@@ -23,16 +24,17 @@ override_dh_install:
override_dh_installman:
pod2man --section=1 --center="Transcriptome Protein Prediction" \
--release="$(DEBVERS)" TransDecoder.LongOrfs > \
--release="$(DEB_VERSION_UPSTREAM)" TransDecoder.LongOrfs > \
debian/TransDecoder.LongOrfs.1
pod2man --section=1 --center="Transcriptome Protein Prediction" \
--release="$(DEBVERS)" TransDecoder.Predict > \
--release="$(DEB_VERSION_UPSTREAM)" TransDecoder.Predict > \
debian/TransDecoder.Predict.1
dh_installman
override_dh_fixperms:
dh_fixperms
chmod -x debian/transdecoder/usr/lib/transdecoder/PerlLib/Fasta_retriever.pm
find debian -name Process_cmd.pm -exec chmod -x \{\} \;
override_dh_clean:
# generated man pages
......
version=3
version=4
opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
https://github.com/TransDecoder/Transdecoder/releases .*/archive/v?(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
DIRS=cufflinks_example pasa_example simple_transcriptome_target
DIRS=cufflinks_example pasa_example simple_transcriptome_target stringtie_example
test:
@for i in $(DIRS); do \
......