Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,6 @@ META.yml
 MYMETA.yml
 nytprof.out
 pm_to_blib
-doc/prokka-*
 bug/
 db/cm/*.i1*
 db/kingdom/*/sprot.p*

--- a/.travis.yml
+++ b/.travis.yml
+# Travis CI configuration: smoke-tests the prokka command line under Perl 5.26.
+language: perl
+
+sudo: false
+
+perl:
+    - "5.26"
+    
+# Install the required Perl modules with cpanm (--notest skips their test
+# suites), then prepend $PWD/bin to PATH so the "prokka" commands below
+# resolve to a script in that directory.
+install:
+    - "cpanm --quiet --notest Time::Piece XML::Simple Digest::MD5 Bio::Perl"
+    - "export PATH=$PWD/bin:$PATH"
+
+# Each entry is run as a shell command by the CI service.
+script:
+    - "prokka --version"
+    - "prokka --help"
+    # Leading "!" inverts the exit status: this step passes only when prokka
+    # fails on the unknown option.
+    - "! prokka --doesnotexist"
+    - "prokka --depends"
+    - "prokka --setupdb"
+    - "prokka --listdb"
+    # Annotation run on the bundled test/plasmid.fna, writing into asm/.
+    - "prokka --cpus 2 --outdir asm --prefix asm test/plasmid.fna"
+    # grep exits non-zero when asm/asm.fna contains no '>' line, so this
+    # checks that at least one FASTA header was written.
+    - "grep '>' asm/asm.fna"
+    - "prokka --cleandb"
--- a/README.md
+++ b/README.md
--- a/bin/prokka
+++ b/bin/prokka
--- a/bin/prokka-abricate_to_fasta_db
+++ b/bin/prokka-abricate_to_fasta_db
+#!/usr/bin/env perl
+
+# Convert an abricate nucleotide FASTA database into a Prokka-style protein
+# FASTA database.  Every input record is translated to protein and its header
+# is rewritten; the result is printed to STDOUT.
+#
+# The script aborts (die) on the first record that
+#   * has an ID that does not split into at least three '~~~' fields,
+#   * translates to a protein with a stop codon before its last residue, or
+#   * translates to a protein sequence that was already emitted.
+
+# IN
+#>ncbi~~~A7J11_00131~~~A7J11_00131 bifunctional aminoglycoside N-acetyltransferase AAC(3)-Ib/aminoglycoside N-acetyltransferase AAC(6')-Ib''
+#ATGAGCATCATTGCAACCGTCAAGATCGGCCCTGACGAAATTTCAGCCATGAGGGCTGTG
+#CTCGATCTCTTCGGCAAAGAGTTTGAGGACATTCCAACCTACTCTGATCGCCAGCCGACC
+#AATGAGTATCTTGCCAATCTTCTGCACAGCGAGACGTTCATCGCGCTCGCTGCTTTTGAC
+
+# OUT
+#>Q92AT0 2.4.1.333~~~~~~1,2-beta-oligoglucan phosphorylase~~~COG3459
+#MTMLKEIKKADLSAAFYPSGELAWLKLKDIMLNQVIQNPLENRLSQIYVRAHVGDKIEIYPLLSRDAEVGFNENGVEYRGVVGPFRYSVQMHFHTRGWFYDVTVDGD
+
+use strict;
+use warnings;
+use Bio::SeqIO;
+use Data::Dumper;
+
+@ARGV or die "USAGE: $0".
+  ' $(dirname $(which abricate))/../db/ncbi/sequences'.
+  ' > $(dirname $(which prokka))/../db/kingdom/Bacteria/AMR';
+
+# Read FASTA from the files named on the command line, write FASTA to STDOUT.
+my $in  = Bio::SeqIO->new(-fh=>\*ARGV,   -format=>'fasta');
+my $out = Bio::SeqIO->new(-fh=>\*STDOUT, -format=>'fasta');
+
+# Protein sequence => number of times seen, used to reject duplicates.
+my %seen;
+
+while (my $seq = $in->next_seq) {
+  # The ID is expected to look like "db~~~gene~~~locus_tag" (see IN above);
+  # the first field is discarded.
+  my(undef,$gene,$locustag) = split m"~~~", $seq->id;
+
+  # Without a third field the output header would have no usable ID, so stop
+  # instead of silently writing a broken record.
+  defined $locustag
+    or die "Malformed sequence ID '", $seq->id,
+           "': expected db~~~gene~~~locus_tag\n";
+
+  # A gene name identical to the locus tag carries no information.
+  $gene = '' if $gene eq $locustag;
+
+  my $prot = $seq->translate;
+  die Dumper($prot) if $prot->seq =~ m/\*./; # check for stop codon in middle
+  die Dumper($prot) if $seen{$prot->seq}++;  # check for dupes
+
+  # Records without a description would otherwise put undef into the join.
+  my $product = $prot->desc;
+  $product = '' if not defined $product;
+
+  $prot->id($locustag);
+  # Header description becomes "~~~gene~~~product~~~": the first and last
+  # fields are left empty (the OUT example above shows an EC number and a COG
+  # id in those positions).
+  $prot->desc( join('~~~', '', $gene, $product, '') );
+  $out->write_seq($prot);
+}
+
--- a/bin/prokka-biocyc_to_fasta_db
+++ b/bin/prokka-biocyc_to_fasta_db
-#!/usr/bin/perl
+#!/usr/bin/env perl
 use strict;
 use warnings;
 use Bio::SeqIO;

--- a/bin/prokka-build_kingdom_dbs
+++ b/bin/prokka-build_kingdom_dbs
 #!/bin/bash

+# NOTE: this host (ftp.ebi.edu.au) now redirects to EBI UK
 URL="ftp://ftp.ebi.edu.au/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/"

 # where to put formatted database: /path/to/prokka/db/
@@ -30,6 +31,9 @@ for K in Viruses Archaea Bacteria ; do
    "$ROOTDIR/prokka-uniprot_to_fasta_db" | \
    makeblastdb -in - -dbtype prot -title "Prokka $K" -hash_index -out "$DIR/sprot"

+  # If you want a copy of the sprot file too (we do for git distribution)
+  blastdbcmd -db "$DIR/sprot" -entry all > "$DIR/sprot"
+    
  # If you want to use old BLAST:
  # formatdb -i stdin -p T -V T -t "Prokka $K" -n "$DIR/sprot"


--- a/bin/prokka-cdd_to_hmm
+++ b/bin/prokka-cdd_to_hmm
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
 use strict;
+use warnings;
 use Data::Dumper;
 use Bio::SeqIO;
 use Bio::AlignIO;
@@ -63,8 +64,19 @@ for my $cid (sort keys %desc) {
  my $faln = "$tempdir/$cid.FASTA";
 #  system("esl-reformat stockholm $tempdir/$cid.FASTA | grep -v lcl.consensus > $faln");
  print STDERR "$cid | $faln >> $lib.aln\n";
+  if (not -r $faln) {
+    print STDERR "WARNING! skipping missing file: $faln\n";
+    next;
+  }
  my $in = Bio::AlignIO->new(-file=>$faln, -format=>'fasta');
  while (my $aln = $in->next_aln) {
+#    $aln->set_displayname_flat(); # remove bioperl suffix: "/begin-end"
+#    my %seen = ('lcl|consensus'=>1);
+#    for my $seq ($aln->each_seq) {
+#      if ($seen{ $seq->id }++) {  # remove duplicates (why do they exist!?)
+#        $aln->remove_seq($seq);
+#      }
+#    }
    my $desc = join '~~~', ('', $desc{$cid}{gene}, $desc{$cid}{prod});
    print STDERR "$cid: $desc\n";
    $aln->id($cid);
@@ -73,6 +85,8 @@ for my $cid (sort keys %desc) {
  #    $aln->dblink("CDD:$cid");
    $out->write_aln($aln);
  }
+#  system("cat $faln $lib.aln");
+#  exit(-1);
 }

 system("hmmbuild $lib.hmm.ascii $lib.aln");

--- a/bin/prokka-clusters_to_hmm
+++ b/bin/prokka-clusters_to_hmm
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
 use strict;
+use warnings;
 use Data::Dumper;
 use File::Temp qw(tempdir);


--- a/bin/prokka-genbank_to_fasta_db
+++ b/bin/prokka-genbank_to_fasta_db
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
 use strict;
+use warnings;
 use Bio::SeqIO;
 use Bio::Tools::CodonTable;
 use Data::Dumper;

--- a/bin/prokka-genpept_to_fasta_db
+++ b/bin/prokka-genpept_to_fasta_db
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
 use strict;
+use warnings;
 use Bio::SeqIO;

 my(@Options, $verbose, $format, $hypo, $idtag, $sep, $blank, $pseudo, $minlen);

--- a/bin/prokka-hamap_to_hmm
+++ b/bin/prokka-hamap_to_hmm
-#!/usr/bin/perl
+#!/usr/bin/env perl
 use strict;
 use warnings;
 use Bio::SeqIO;
@@ -21,7 +21,7 @@ while (<$ali_fh>) {
  my @x = split m/\t/;
  next if $x[5] =~ m/UPF\d+|homolog|[@<\[]/;
  $x[5] =~ s/^probable\s+//gi;
-  $DESC{ $x[1] } = "$sep$x[4]$sep$x[5]";
+  $DESC{ $x[0] } = "$sep$x[4]$sep$x[5]";
 }
 printf STDERR "Accepted descriptions for %d families\n", scalar keys %DESC;

@@ -43,7 +43,7 @@ while (<$fam_fh>) {
  elsif (m/^AC\s+(\w+)/) {
    $AC = $1;
  }
-  elsif (m/EC=([\d.]+);/) {
+  elsif (m/EC=([\d.-]+);/) {
    $EC = $1;
  }
  elsif (m/^ Bacteria/) {

--- a/bin/prokka-uniprot_to_fasta_db
+++ b/bin/prokka-uniprot_to_fasta_db
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
 use strict;
+use warnings;
 use SWISS::Entry;
 use SWISS::KW;
 #use SWISS::OC;
@@ -54,8 +55,19 @@ while (<ARGV>)
  $gene = '' if $gene =~ m/\d{2}/ or $gene =~ m/\./;

  my $ec = ''; 
-#  my $prod = 'hypothetical protein'; 
  my $prod = ''; 
+  my $cog = '';
+
+  if (1) {
+    # [ 'eggNOG', 'COG4799', 'LUCA' ]
+    for my $dr ( @{ $entry->DRs->list } ) {
+#      print Dumper($dr);
+      if ($dr->[1] =~ m/^(COG\d+)$/) {
+        $cog = $1;
+        last;
+      }   
+    }
+  }

  if (1) {  
    for my $de ($entry->DEs->elements) {
@@ -67,6 +79,7 @@ while (<ARGV>)
      elsif ($de->type eq 'Full' and $de->category eq 'RecName') {
 	$prod = $de->text;
 	if ($prod =~ m/^UPF\d|^Uncharacterized protein|^ORF|^Protein /) {
+          next if ! $hypo;
 	  $prod = $HYPO;
 	}
      }
@@ -83,8 +96,8 @@ while (<ARGV>)
  $gene ||= $blank;
  $prod ||= $blank;
  
-  print STDERR join("\t", $entry->AC, $ec, $gene, $prod), "\n" if $verbose;
-  print ">", $entry->AC, " $ec$sep$gene$sep$prod\n", $entry->SQs->seq, "\n";
+  print STDERR join("\t", $entry->AC, $ec, $gene, $prod, $cog), "\n" if $verbose;
+  print ">", $entry->AC, " $ec$sep$gene$sep$prod$sep$cog\n", $entry->SQs->seq, "\n";
  
  $out++;

@@ -106,7 +119,7 @@ sub setOptions {
    {OPT=>"evidence=i",   VAR=>\$evlev, DEFAULT=>2, DESC=>"1=prot 2=mrna 3=homol 4=pred 5=unsure"},
    {OPT=>"fragments!",   VAR=>\$frag, DEFAULT=>0, DESC=>"Include 'DE Flags: Fragment;' entries"},
    {OPT=>"minlen=i",   VAR=>\$minlen, DEFAULT=>20, DESC=>"Minimum peptide length"},
-    {OPT=>"maxlen=i",   VAR=>\$maxlen, DEFAULT=>1E5, DESC=>"Minimum peptide length"},
+    {OPT=>"maxlen=i",   VAR=>\$maxlen, DEFAULT=>1E5, DESC=>"Maximum peptide length"},
    {OPT=>"term=s",   VAR=>\$term, DEFAULT=>'', DESC=>"Lineage must contain this term eg. 'Bacteria'"},
    {OPT=>"hypo!",   VAR=>\$hypo, DEFAULT=>0, DESC=>"Don't filter out hypothetical proteins"},
  );

--- a/db/cm/Bacteria
+++ b/db/cm/Bacteria
--- a/db/genus/Escherichia
+++ b/db/genus/Escherichia
--- a/db/hmm/HAMAP.hmm
+++ b/db/hmm/HAMAP.hmm
--- a/db/kingdom/Archaea/sprot
+++ b/db/kingdom/Archaea/sprot
--- a/db/kingdom/Bacteria/AMR
+++ b/db/kingdom/Bacteria/AMR
--- a/db/kingdom/Bacteria/IS
+++ b/db/kingdom/Bacteria/IS
--- a/db/kingdom/Bacteria/sprot
+++ b/db/kingdom/Bacteria/sprot