Commits on Source (9)
CC = g++ -Wall -ggdb
CC = g++ -pg
CC = g++
# without OpenMP
# default with OpenMP
# with OpenMP
# in command line:
# make openmp=yes
......@@ -14,6 +12,21 @@ else
CCFLAGS = -fopenmp
endif
#LDFLAGS = -static -lz -o
#LDFLAGS = /usr/lib/x86_64-linux-gnu/libz.a -o
# default with zlib
# without zlib
# in command line:
# make zlib=no
ifeq ($(zlib),no)
CCFLAGS +=
LDFLAGS += -o
else
CCFLAGS += -DWITH_ZLIB
LDFLAGS += -lz -o
endif
# support debugging
# in command line:
# make debug=yes
......@@ -28,9 +41,6 @@ ifdef MAX_SEQ
CCFLAGS += -DMAX_SEQ=$(MAX_SEQ)
endif
#LDFLAGS = -static -o
LDFLAGS += -o
PROGS = cd-hit cd-hit-est cd-hit-2d cd-hit-est-2d cd-hit-div cd-hit-454
# Propagate hardening flags
......
For cd-hit
How to compile?
Requirements
Since 4.8.1, cd-hit supports .gz format input files. This requires the zlib library. zlib should
be installed on most Linux systems, so cd-hit should compile without issue. If your system
doesn't have zlib, please install it first.
* On Ubuntu, to install zlib:
sudo apt install zlib1g-dev
* On CentOS, to install zlib:
sudo yum install zlib-devel
How to compile
1. Compile with multi-threading support (default): make
2. Compile without multi-threading support (if you are on a very old system): make openmp=no
3. Compile without zlib (if you cannot install zlib): make zlib=no
(a combined example is sketched below)
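For reference, here is a minimal build sketch combining the steps above; it assumes an Ubuntu/Debian system for the zlib line, so adjust or drop whatever does not apply:

# install the zlib headers once (Ubuntu/Debian; see the CentOS command above)
sudo apt install zlib1g-dev
# default build: OpenMP multi-threading and .gz support enabled
make
# alternative builds
make openmp=no            # compiler without OpenMP support
make zlib=no              # zlib not available
make MAX_SEQ=10000000     # raise the maximum sequence length (Makefile default is 655360)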
Having problems compiling?
Please contact the author.
For cd-hit-auxtools
......@@ -10,9 +24,16 @@ For cd-hit-auxtools
make
For psi-cd-hit
please download legacy BLAST (not BLAST+) and install the executables in your $PATH
Compile cd-hit on MacOS
To install CD-HIT on MacOS, first install gcc on your system.
To use Homebrew (https://brew.sh/), see https://brewformulas.org/gcc.
Then locate the path to your g++ executable (e.g. /usr/local/Cellar/gcc/6.3.0_1/bin/g++-6;
note: your g++ path is likely to be different), and use a command like this:
make CC=/usr/local/Cellar/gcc/6.3.0_1/bin/g++-6
For psi-cd-hit
please download BLAST+ (not legacy BLAST) and install the executables in your $PATH
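As a quick sanity check, you can confirm that the BLAST+ executables are reachable before running psi-cd-hit by printing their versions; which programs you actually need depends on the -prog option you choose:

blastp -version
blastn -version
psiblast -version
makeblastdb -version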
For more information, please visit http://cd-hit.org
......
#!/usr/bin/perl
# Convert CD-HIT .clstr output into a BLAST -m 8 style tab-delimited table.
# columns: query(non-rep) subject(rep) %identity alnln mis gap q_b q_e s_b s_e expect bits
# (mismatch count and bit score are rough estimates; gap and e-value are placeholders)
my $rep;
my @non_reps = ();
my @aln_info = ();
while($ll=<>){
if ($ll =~ /^>/ ) {
if (@non_reps) {
for ($i=0; $i<@non_reps; $i++) {
print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
}
}
$rep = "";
@non_reps = ();
@aln_info = ();
}
else {
chop($ll);
if ($ll =~ />(.+)\.\.\./ ) {
my $id = $1;
if ($ll =~ /\*$/) { $rep = $id}
else {
push(@non_reps, $id);
my @lls = split(/\s+/, $ll);
my ($a, $iden) = split(/\//, $lls[-1]);
chop($iden); ### removing % sign
my ($qb, $qe, $sb, $se) = split(/:/, $a);
my $alnln = $qe-$qb+1;
my $mis = int($alnln * (100-$iden) / 100);
my $gap = 0;
my $exp = 0;
my $bit = $alnln*3 - $mis*6; #### very rough
push(@aln_info, "$iden\t$alnln\t$mis\t$gap\t$qb\t$qe\t$sb\t$se\t$exp\t$bit");
}
}
#>Cluster 582
#0 6671aa, >gi|302514050|uid|51... *
#1 339aa, >SRS584717|scaffold|... at 2:332:4020:4356/89.32%
#2 182aa, >SRS584717|scaffold|... at 1:182:6490:6671/100.00%
#3 367aa, >SRS584717|scaffold|... at 1:332:4543:4874/90.66%
#4 109aa, >SRS584717|scaffold|... at 1:108:5782:5889/97.22%
}
}
if (@non_reps) {
for ($i=0; $i<@non_reps; $i++) {
print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
}
}
#query subject % alnln mis gap q_b q_e s_b s_e expect bits
##0 1 2 3 4 5 6 7 8 9 10 11
#mHE-SRS012902|scaffold|86.16 gnl|CDD|226997 47.62 42 17 2 164 201 210 250 5e-04 37.6
#mHE-SRS012902|scaffold|109.23 gnl|CDD|225183 47.46 236 122 1 1 236 475 708 1e-92 284
#mHE-SRS012902|scaffold|109.23 gnl|CDD|224055 44.35 239 130 2 1 239 332 567 2e-84 259
#mHE-SRS012902|scaffold|109.23 gnl|CDD|227321 39.50 238 140 3 1 238 324 557 9e-69 218
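A hypothetical invocation of the script above (the filename clstr2blastm8.pl is only illustrative, since the diff does not show the actual script name); it reads a .clstr file on standard input and writes the tab-delimited table to standard output:

perl clstr2blastm8.pl < seq.nr.clstr > seq.nr.m8.txt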
......@@ -35,11 +35,15 @@
#include<stdint.h>
#include<time.h>
#ifdef WITH_ZLIB
#include<zlib.h>
#endif
#include<valarray>
#include<vector>
#include<map>
#define CDHIT_VERSION "4.7"
#define CDHIT_VERSION "4.8.1"
#ifndef MAX_SEQ
#define MAX_SEQ 655360
......@@ -559,9 +563,17 @@ class SequenceDB
~SequenceDB(){ Clear(); }
void Read( const char *file, const Options & options );
void Readgz( const char *file, const Options & options );
void Read( const char *file, const char *file2, const Options & options );
void Readgz( const char *file, const char *file2, const Options & options );
void WriteClusters( const char *db, const char *newdb, const Options & options );
void WriteClustersgz( const char *db, const char *newdb, const Options & options );
void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
void WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
void WriteExtra1D( const Options & options );
void WriteExtra2D( SequenceDB & other, const Options & options );
void DivideSave( const char *db, const char *newdb, int n, const Options & options );
......
......@@ -13,18 +13,19 @@ char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artif
//
char contacts[] =
" Questions, bugs, contact Limin Fu at l2fu@ucsd.edu, or Weizhong Li at liwz@sdsc.edu\n"
" For updated versions and information, please visit: http://cd-hit.org\n\n"
" Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n"
" For updated versions and information, please visit: http://cd-hit.org\n"
" or https://github.com/weizhongli/cdhit\n\n"
" cd-hit web server is also available from http://cd-hit.org\n\n"
" If you find cd-hit useful, please kindly cite:\n\n";
char txt_option_i[] = "\tinput filename in fasta format, required\n";
char txt_option_i[] = "\tinput filename in fasta format, required, can be in .gz format\n";
char txt_option_j[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
\t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required, can be in .gz format\n";
char txt_option_i2[] = "\tinput filename for db2 in fasta format, required, can be in .gz format\n";
char txt_option_j2[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
......@@ -34,12 +35,12 @@ char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (
char txt_option_c[] =
"\tsequence identity threshold, default 0.9\n \
\tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
\tnumber of identical amino acids in alignment\n \
\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the full length of the shorter sequence\n";
char txt_option_G[] =
"\tuse global sequence identity, default 1\n \
\tif set to 0, then use local sequence identity, calculated as :\n \
\tnumber of identical amino acids in alignment\n \
\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the length of the alignment\n \
\tNOTE!!! don't use -G 0 unless you use alignment coverage controls\n \
\tsee options -aL, -AL, -aS, -AS\n";
......@@ -135,7 +136,8 @@ char txt_option_sc[] =
\tif set to 1, output clusters by decreasing size\n";
char txt_option_sf[] =
"\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
\tif set to 1, output sequences by decreasing cluster size\n";
\tif set to 1, output sequences by decreasing cluster size\n \
\tthis can be very slow if the input is in .gz format\n";
char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
......@@ -337,7 +339,7 @@ int print_usage_div (char *arg) {
char mytxt_option_c[] =
"\tsequence identity threshold, default 0.98\n \
\tthis is a \"global sequence identity\" calculated as :\n \
\tnumber of identical amino acids in alignment\n \
\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the full length of the shorter sequence + gaps\n";
char mytxt_option_b[] = "\tband_width of alignment, default 10\n";
char mytxt_option_n_est[] = "\tword_length, default 10, see user's guide for choosing it\n";
......
cd-hit (4.6.8-3) UNRELEASED; urgency=medium
cd-hit (4.8.1-1) unstable; urgency=medium
[ Jelmer Vernooij ]
* Use secure copyright file specification URI.
* Trim trailing whitespace.
-- Jelmer Vernooij <jelmer@debian.org> Sat, 20 Oct 2018 13:28:38 +0000
[ Andreas Tille ]
* New upstream version
* debhelper 12
* Standards-Version: 4.4.0
* Remove trailing whitespace in debian/rules
* Do not parse d/changelog
* Build-Depends: zlib1g-dev
-- Andreas Tille <tille@debian.org> Fri, 19 Jul 2019 19:34:25 +0200
cd-hit (4.6.8-2) unstable; urgency=medium
......
......@@ -4,9 +4,10 @@ Uploaders: Tim Booth <tbooth@ceh.ac.uk>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
help2man
Standards-Version: 4.2.1
Build-Depends: debhelper (>= 12~),
help2man,
zlib1g-dev
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/cd-hit
Vcs-Git: https://salsa.debian.org/med-team/cd-hit.git
Homepage: http://weizhong-lab.ucsd.edu/cd-hit/
......
......@@ -9,10 +9,9 @@
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
pkg := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
ver := $(shell dpkg-parsechangelog | sed -ne 's/^Version: \(\([0-9]\+\):\)\?\(.*\)-.*/\3/p')
include /usr/share/dpkg/default.mk
mandir=$(CURDIR)/debian/$(pkg)/usr/share/man/man1/
mandir=$(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1/
%:
dh $@
......@@ -23,54 +22,54 @@ override_dh_auto_build:
override_dh_installman:
mkdir -p $(mandir)
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences' \
$(CURDIR)/cd-hit > $(mandir)/cd-hit.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences, optimised for 454 data' \
$(CURDIR)/cd-hit-454 > $(mandir)/cd-hit-454.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences in db1 or db2 format' \
$(CURDIR)/cd-hit-2d > $(mandir)/cd-hit-2d.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='run CD-HIT algorithm on RNA/DNA sequences' \
$(CURDIR)/cd-hit-est > $(mandir)/cd-hit-est.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='run CD-HIT algorithm on RNA/DNA sequences in db1 or db2 format' \
$(CURDIR)/cd-hit-est-2d > $(mandir)/cd-hit-est-2d.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a big clustering job into pieces to run cd-hit-2d or cd-hit-est-2d jobs' \
$(CURDIR)/cd-hit-2d-para.pl > $(mandir)/cd-hit-2d-para.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a big clustering job into pieces to run cd-hit or cd-hit-est jobs' \
$(CURDIR)/cd-hit-para.pl > $(mandir)/cd-hit-para.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a FASTA file into pieces' \
$(CURDIR)/cd-hit-div.pl > $(mandir)/cd-hit-div.1
# psi-cd-hit.pl is throwing some errors which are fixed using sed
# ... at least in versions <= 4.6.4 - now help2man does not work at all
#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities' \
# $(CURDIR)/psi-cd-hit.pl | \
# sed -e '/^Name "main::.*" used only once:/d' \
# > $(mandir)/psi-cd-hit.1
# help2man does not work at all since version 4.6.4
#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
# $(CURDIR)/psi-cd-hit-2d.pl > $(mandir)/psi-cd-hit-2d.1
# FIXME: what is the difference between psi-cd-hit-2d.pl and psi-cd-hit-2d-g1.pl ?
# help2man does not work at all since version 4.6.4
#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
# $(CURDIR)/psi-cd-hit-2d-g1.pl > $(mandir)/psi-cd-hit-2d-g1.1
......@@ -87,7 +86,7 @@ override_dh_installman:
# psi-cd-hit-local.pl -> even throws several "used only once: possible typo" errors
override_dh_auto_install:
dh_auto_install -- PREFIX=debian/$(pkg)/usr/lib/cd-hit
dh_auto_install -- PREFIX=debian/$(DEB_SOURCE)/usr/lib/cd-hit
override_dh_compress:
dh_compress --exclude=.pdf
......@@ -13,21 +13,22 @@ our $opt_aS = 0.0; #
our $opt_aL = 0.0; #
our $circle = 0; #
our $opt_g = 1; ####################
our $blast_exe = "blastall -p blastp -m 8"; #########################
our $prof_exe = "blastpgp -m 8"; #
our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
our $prof_db = ""; #
our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
our $blast_exe = "blastp"; #########################
our $bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000"; # program
our $bl_para_u = ""; #
our $bl_STDIN = 1; #
our $keep_bl = 0; #
our $blast_prog= "blastp"; #
our $formatdb = "formatdb"; #########################
our $formatdb = "makeblastdb"; #########################
our $exec_mode = "local"; #######################
our $num_qsub = 1; #
our $para_no = 1; # compute
our $sh_file = ""; #
our $num_multi_seq = 50; #
our $batch_no_per_node = 100; #######################
our $reformat_seg = 50000;
our $restart_seg = 20000;
our $job = "";
......@@ -73,9 +74,7 @@ sub parse_para_etc {
elsif ($arg eq "-sl") { $skip_long = shift; }
## program
elsif ($arg eq "-prog") { $blast_prog= shift; }
elsif ($arg eq "-p") { $prof_para = shift; }
elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
elsif ($arg eq "-s") { $bl_para = shift; }
elsif ($arg eq "-s") { $bl_para_u = shift; }
elsif ($arg eq "-k") { $keep_bl = shift; }
elsif ($arg eq "-bs") { $bl_STDIN = shift; }
## compute
......@@ -103,38 +102,35 @@ sub parse_para_etc {
print_usage(); exit();
}
if ($blast_prog eq "blastn") {
$formatdb = "formatdb -p F";
$blast_exe = "blastall -p blastn -m 8";
}
elsif ($blast_prog eq "megablast") {
$blast_prog = "blastn"; #### back to blastn for blast parser type
$formatdb = "formatdb -p F";
$blast_exe = "megablast -H 100 -D 2 -m 8";
}
elsif ($blast_prog eq "blastpgp") {
$blast_exe = "blastpgp -m 8 -j 3";
}
#### for blast+
if ($bl_plus) {
$formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
if ($blast_prog eq "blastp") {
$formatdb = "makeblastdb -dbtype prot -max_file_sz 2GB";
$blast_exe = "blastp -outfmt 6";
$bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
if ($blast_prog eq "blastn") {
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
$bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "psiblast") {
$formatdb = "makeblastdb -dbtype prot -max_file_sz 2GB";
$blast_exe = "psiblast -outfmt 6 -num_iterations 3";
$bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "blastn") {
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 2GB";
$blast_exe = "blastn -task blastn -outfmt 6";
$bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
$bl_para = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "megablast") {
$blast_prog = "blastn"; #### back to blastn for blast parser type
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 2GB";
$blast_exe = "blastn -task megablast -outfmt 6";
$bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
$bl_para = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
$blast_prog= "blastn"; #### back to blastn for blast parser type
}
else {
print "Unknown blast program: $blast_prog\n";
print_usage(); exit();
}
elsif ($blast_prog eq "blastpgp") {
$blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
if ($bl_para_u) {
$bl_para = "$bl_para_u -num_threads $bl_threads";
}
}
......@@ -142,6 +138,13 @@ sub parse_para_etc {
$blast_exe = "$bl_path/$blast_exe";
$formatdb = "$bl_path/$formatdb";
}
my $bl_ver = `$blast_exe -version`;
if ($bl_ver =~ /blast/) {
print "BLAST version:\n$bl_ver\n\n";
}
else {
die "BLAST program not found: $blast_exe\n";
}
(-e $db_in) || die "No input";
($db_out) || die "No output";
......@@ -576,7 +579,7 @@ sub keep_strand_with_top_hsp {
}
########## END keep_strand_with_top_hsp
########## for blastpgp -j no (no>1)
########## for psiblast -j no (no>1)
########## keep hits from the last round
sub keep_hsp_of_last_round {
my $self = shift;
......@@ -691,7 +694,7 @@ sub process_blout_blastp_blastn {
my $len_rep = 0;
my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
if ($blast_prog eq "psiblast" ) {keep_hsp_of_last_round($bl); }
if ($g_iden == 0 ) { #### Local identity
keep_top_hsp($bl); #### local alignment, only the top HSP
......@@ -1084,11 +1087,7 @@ Options
long sequences will not be clustered anyway.
-sl 0 means no skipping
program:
-prog (blastp, blastn, megablast, blastpgp), default $blast_prog
-p profile search para, default
"$prof_para"
-dprof database for building PSSM, default using input
you can also use another database that is more comprehensive like NR80
-prog (blastp, blastn, megablast, psiblast), default $blast_prog
-s blast search para, default
"$bl_para"
-bs (1/0) default $bl_STDIN
......
......@@ -105,7 +105,7 @@ $CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5
$CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
$CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
EOD
......
#!/usr/bin/python
################################################################################
# NGS workflow by Weizhong Li, http://weizhongli-lab.org
################################################################################
queue_system = 'SGE'
########## local variables etc. Please edit
ENV={
'CD_HIT_dir' : '/home/oasis/data/NGS-ann-project/apps/cdhit.git',
'NGS_prog_trimmomatic' : '/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar',
}
########## computation resources for execution of jobs
NGS_executions = {}
NGS_executions['qsub_1'] = {
'type' : 'qsub-pe',
'cores_per_node' : 8,
'number_nodes' : 64,
'user' : 'weizhong', #### I will use command such as qstat -u weizhong to query submitted jobs
'command' : 'qsub',
'command_name_opt' : '-N',
'command_err_opt' : '-e',
'command_out_opt' : '-o',
'template' : '''#!/bin/bash
##$ -q RNA.q
#$ -v PATH
#$ -V
'''
}
NGS_executions['sh_1'] = {
'type' : 'sh',
'cores_per_node' : 8,
'number_nodes' : 1,
'template' : '''#!/bin/bash
'''
}
NGS_batch_jobs = {}
NGS_batch_jobs['qc'] = {
'CMD_opts' : ['100'],
'execution' : 'qsub_1', # where to execute
'cores_per_cmd' : 4, # number of threads used by command below
'no_parallel' : 1, # number of total jobs to run using command below
'command' : '''
java -jar $ENV.NGS_prog_trimmomatic PE -threads 4 -phred33 $DATA.0 $DATA.1 $SELF/R1.fq $SELF/R1-s.fq $SELF/R2.fq $SELF/R2-s.fq \\
SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$CMDOPTS.0 MAXINFO:80:0.5 1>$SELF/qc.stdout 2>$SELF/qc.stderr
perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R1.fq > $SELF/R1.fa &
perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R2.fq > $SELF/R2.fa &
wait
rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq
'''
}
NGS_batch_jobs['otu'] = {
'injobs' : ['qc'],
'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R2', '75'],
'execution' : 'qsub_1', # where to execute
'cores_per_cmd' : 2, # number of threads used by command below
'no_parallel' : 1, # number of total jobs to run using command below
'command' : '''
#### cluster at 100% PE
$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa -j $INJOBS.0/R2.fa -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.nr.log
#### cluster at 99% PE and SE for R1,R2
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -j $SELF/seq.nr.2 -o $SELF/seq.99 -op $SELF/seq.99.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.99.log
$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.3 -k $SELF/seq.nr.clstr \\
-i $SELF/seq.chimeric-clstr.R1.clstr -j $SELF/seq.chimeric-clstr.R2.clstr \\
-a $SELF/seq.99.clstr -f $SELF/seq.99 -g $SELF/seq.99.2 -o $SELF/seq.99f
$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.nr.clstr $SELF/seq.99f.clstr > $SELF/seq.99f-all.clstr
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.99f -j $SELF/seq.99f.2 -o $SELF/seq.97 -op $SELF/seq.97.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.log
$ENV.CD_HIT_dir/cd-hit-est-2d -i $SELF/seq.97 -j $SELF/seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o $SELF/seq.97.ref -op $SELF/seq.97.ref.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.ref.log
$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.99f-all.clstr $SELF/seq.97.clstr > $SELF/seq.97-all.clstr
$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < $SELF/seq.97.ref.clstr > $SELF/seq.97.reftop.clstr
$ENV.CD_HIT_dir/clstr_merge.pl $SELF/seq.97-all.clstr $SELF/seq.97.reftop.clstr > $SELF/OTU.clstr
rm -f $SELF/seq.chimeric-clstr.R1 $SELF/seq.chimeric-clstr.R1.log $SELF/seq.chimeric-clstr.R2 $SELF/seq.chimeric-clstr.R2.log
rm -f $SELF/seq.97.ref $SELF/seq.97.ref.2 $SELF/seq.97.ref.log
mv $SELF/seq.99f.log $SELF/chimeric-small-clusters-list.txt
'''
}
NGS_batch_jobs['otu-pooled'] = {
'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R2', '75'],
'execution' : 'qsub_1', # where to execute
'cores_per_cmd' : 2, # number of threads used by command below
'no_parallel' : 1, # number of total jobs to run using command below
'command' : '''
#### before running
#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
$ENV.CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log
$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log
$ENV.CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$ENV.CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
$ENV.CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
'''
}
......@@ -51,10 +51,17 @@ http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/t
We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
3. Modify NG-Omics-Miseq-16S.pl
Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
$CD_HIT_dir = "PATH_to_cd-hit";
$NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
3. Modify NG-Omics-Miseq-16S.py
Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.py, in the top few lines:
from
'CD_HIT_dir' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621',
'NGS_prog_trimmomatic' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar',
to
'CD_HIT_dir' : 'PATH_to_cd-hit_directory',
'NGS_prog_trimmomatic' : 'PATH_to_Trimmomatic/trimmomatic-0.32.jar', #### where you have installed Trimmomatic
4. Download reference dataset
The reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
......@@ -124,7 +131,7 @@ This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE
4. Run sequence QC and OTU clustering for each sample
In the working directory, run
PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
where: 150 and 100 are the effective lengths;
see the next section for suggestions on choosing the effective clustering read length
......@@ -138,13 +145,13 @@ This command will generate shell scripts for QC and for OTU for each sample.
The scripts will be in the WF-sh folder. First run all the qc.sample_name.sh scripts; after
these jobs have finished, run all the otu.sample_name.sh scripts (see the sketch below).
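For example, a minimal sketch of running the generated per-sample scripts one after another; the WF-sh path and the qc./otu. name patterns are taken from the description above, and you would instead submit the scripts to your queue if you work on a cluster:

for f in WF-sh/qc.*.sh;  do bash "$f"; done
for f in WF-sh/otu.*.sh; do bash "$f"; done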
NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
NG-Omics-WF.py https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
tool developed in our group. It is not fully released yet, since we need more time to document
this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples.
First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the
this tool. However, you can try to use NG-Omics-WF.py to automatically run all your samples.
First edit NG-Omics-Miseq-16S.py and modify cores_per_node around line #36 to match the
number of CPU cores of your computer, then run
nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
nohup PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
After the jobs have finished, the OTU results will be in the sample_name/otu folder; important files include
OTU.clstr: lists all clusters and sequences
......@@ -156,14 +163,14 @@ If you have multiple samples, you don't just want to stop here. It is important
to pool all samples together and re-run OTU clustering so that all samples can be
compared; run
PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled
PATH_to_cd-hit-dir/usecases/Miseq-16S/pool_samples.pl -s SAMPLE_file -o pooled
This will pool sequences from all samples. Hundreds of samples can be handled without problem.
6. Cluster pooled samples, run
PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
This command will generate a script WF-sh/otu-pooled.pooled.sh, which you can
run. When it has finished, the OTUs will be in the pooled directory:
......
......@@ -9,10 +9,13 @@ my $output = $opts{o};
$output = "pooled" unless ($output);
my $sample_in = $opts{s};
my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
my $file_list = $opts{f};
my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
@file_list = split(/,/, $file_list) if ($file_list);
my $job = $opts{j};
$job = "otu" unless ($job);
my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
my ($i, $j, $k, $cmd);
$cmd = `mkdir $output` unless (-e $output);
......@@ -62,7 +65,7 @@ foreach $i (@file_list) {
sub usage {
<<EOD;
$0 -s sample_file -o output_dir
$0 -s sample_file -o output_dir -j job -f list_files
-s sample data file, required unless -S is present
File format example
#Sample data file example, TAB or space delimited for following lines
......@@ -73,6 +76,8 @@ Sample_ID3 sample_data_0 sample_data_1
-S sample data from command line, required unless -s is present
format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
-j job name
-f list of files (delimited by ,) to pool (cat)
EOD
}
#!/usr/bin/perl
## =========================== NGS tools ==========================================
## NGS tools for metagenomic sequence analysis
## May also be used for other type NGS data analysis
##
## Weizhong Li, UCSD
## liwz@sdsc.edu
## http://weizhongli-lab.org/
## ================================================================================
use Getopt::Std;
getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
die usage() unless ($opts{j} and $opts{o});
my ($i, $j, $k, $cmd);
my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
my ($len, $lena, $lenb);
my $fasta = $opts{j};
my $output = $opts{o};
my %id_2_seq = ();
my $id = "";
my $ann;
open(TMP, $fasta) || die "can not open $fasta";
while($ll=<TMP>){
if ($ll =~ /^>/) {
chop($ll);
($id, $ann) = split(/\s+/, substr($ll,1), 2);
$ann =~ s/\s/_/g;
$id = "$id|$ann";
}
else {
$ll =~ s/U/T/g;
$ll =~ s/u/T/g;
$id_2_seq{$id} .= $ll;
}
}
close(TMP);
my @ids = keys %id_2_seq;
@ids = sort {length($id_2_seq{$b}) <=> length($id_2_seq{$a})} @ids; ### sort by decreasing sequence length
open(OUT, "> $output") || die "can not write to $output";
foreach $id (@ids) {
print OUT ">$id\n$id_2_seq{$id}";
}
close(OUT);
sub usage {
<<EOD;
This script formats a Silva FASTA file for CD-HIT-OTU-MiSeq. Download the Silva sequences from
https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva.fasta.gz
(or a similar file) and gunzip it.
Run this script as $0 -j SILVA_128_SSURef_Nr99_tax_silva.fasta -o silva_128_SSURef_processed.fasta
Options:
======================
-j path for SILVA_128_SSURef_Nr99_tax_silva.fasta
-o output FASTA file of formatted Silva reference DB
EOD
}
#!/usr/bin/perl
################################################################################
# NGS workflow by Weizhong Li, http://weizhongli-lab.org
################################################################################
########## local variables etc. Please edit
$CD_HIT_dir = "/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621";
$NGS_prog_trimmomatic = "/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar";
########## computation resources for execution of jobs
%NGS_executions = ();
$NGS_executions{"qsub_1"} = {
"type" => "qsub-pe",
"cores_per_node" => 8,
"number_nodes" => 64,
"user" => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
"command" => "qsub",
"command_name_opt" => "-N",
"command_err_opt" => "-e",
"command_out_opt" => "-o",
"template" => <<EOD,
#!/bin/sh
#PBS -v PATH
#PBS -V
#\$ -q RNA.q
#\$ -v PATH
#\$ -V
EOD
};
$NGS_executions{"sh_1"} = {
"type" => "sh",
"cores_per_node" => 8,
"number_nodes" => 1,
};
# $NGS_batch_jobs{"qc-pe"} = {
# "CMD_opts" => ["20"],
# "execution" => "qsub_1", # where to execute
# "cores_per_cmd" => 4, # number of threads used by command below
# "no_parallel" => 1, # number of total jobs to run using command below
# "command" => <<EOD,
# java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
# SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
#
# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
#
# wait
# rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
# EOD
# };
$NGS_batch_jobs{"qc"} = {
"CMD_opts" => ["20"],
"execution" => "qsub_1", # where to execute
"cores_per_cmd" => 4, # number of threads used by command below
"no_parallel" => 1, # number of total jobs to run using command below
"command" => <<EOD,
java -jar $NGS_prog_trimmomatic SE -threads 4 -phred33 \\DATA.0 \\SELF/R1.fq \\
SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
rm -f \\SELF/R1.fq
EOD
};
$NGS_batch_jobs{"clstr"} = {
"injobs" => ["qc"],
"CMD_opts" => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
"execution" => "qsub_1", # where to execute
"cores_per_cmd" => 2, # number of threads used by command below
"no_parallel" => 1, # number of total jobs to run using command below
"command" => <<EOD,
#### cluster at 100%
$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -o \\SELF/seq.nr -sf 1 -sc 1 -r 0 -c 1.00 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.nr.log
$CD_HIT_dir/usecases/miRNA-seq/filter-small-cluster.pl -i \\SELF/seq.nr.clstr -s \\SELF/seq.nr -o \\SELF/seq.nr-filtered.clstr -f \\SELF/seq.nr-filtered -c 1
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr-filtered -o \\SELF/seq.95 -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.95.log
$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr-filtered.clstr \\SELF/seq.95.clstr > \\SELF/seq.95-full.clstr
$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95 -i2 \\CMDOPTS.1 -o \\SELF/seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.ref.log
$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95 -i2 \\CMDOPTS.2 -o \\SELF/seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.spk.log
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.ref.clstr > \\SELF/seq.95.reftop.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.spk.clstr > \\SELF/seq.95.spktop.clstr
$CD_HIT_dir/clstr_merge.pl \\SELF/seq.95-full.clstr \\SELF/seq.95.reftop.clstr > \\SELF/tmp.clstr
$CD_HIT_dir/clstr_merge.pl \\SELF/tmp.clstr \\SELF/seq.95.spktop.clstr > \\SELF/miRNA.clstr
$CD_HIT_dir/clstr_sort_by.pl < \\SELF/miRNA.clstr > \\SELF/miRNA.clstr.s
mv \\SELF/miRNA.clstr.s \\SELF/miRNA.clstr
rm -f \\SELF/tmp.clstr
EOD
};
$NGS_batch_jobs{"clstr-pooled"} = {
"CMD_opts" => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
"execution" => "qsub_1", # where to execute
"cores_per_cmd" => 2, # number of threads used by command below
"no_parallel" => 1, # number of total jobs to run using command below
"command" => <<EOD,
#### concat seq.nr-filtered seq.nr-filtered.clstr
$CD_HIT_dir/cd-hit-est -i seq.nr-filtered -o seq.95 -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > seq.95.log
$CD_HIT_dir/clstr_rev.pl seq.nr-filtered.clstr seq.95.clstr > seq.95-full.clstr
$CD_HIT_dir/cd-hit-est-2d -i seq.95 -i2 \\CMDOPTS.1 -o seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.ref.log
$CD_HIT_dir/cd-hit-est-2d -i seq.95 -i2 \\CMDOPTS.2 -o seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.spk.log
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.ref.clstr > seq.95.reftop.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.spk.clstr > seq.95.spktop.clstr
$CD_HIT_dir/clstr_merge.pl seq.95-full.clstr seq.95.reftop.clstr > tmp.clstr
$CD_HIT_dir/clstr_merge.pl tmp.clstr seq.95.spktop.clstr > miRNA.clstr
$CD_HIT_dir/clstr_sort_by.pl < miRNA.clstr > miRNA.clstr.s
mv miRNA.clstr.s miRNA.clstr
$CD_HIT_dir/usecases/miRNA-seq/clstr_2_miRNA-table.pl -i miRNA.clstr -o miRNA.txt
EOD
};
##############################################################################################
########## END
1;
#!/usr/bin/perl
#
use Getopt::Std;
getopts("i:s:S:o:f:j:",\%opts);
my $input = $opts{i}; $input = "Cluster.clstr" unless $input;
my $output = $opts{o}; $output = "Cluster.txt" unless ($output);
my ($i, $j, $k, $str, $cmd, $ll);
my %count = ();
my %count_t = ();
my %count_s = ();
my %Cluster_2_ann = ();
# >4360486|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis
open(TMP, $input) || die "can not open $input";
my $Cluster=0;
while($ll=<TMP>){
if ($ll =~ /^>/) {
$Cluster++;
}
else {
chop($ll);
if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
my $id = $2;
if ($id =~ /^Sample\|([^\|]+)\|/) {
$sample_id = $1;
$sample_id{$sample_id}=1;
$count{$Cluster}{$sample_id}++;
$count_t{$Cluster}++;
$count_s{$sample_id}++;
}
else {
$Cluster_2_ann{$Cluster} .= "$id\t";
}
}
else {
die "format error $ll";
}
}
}
close(TMP);
my @sample_ids = sort keys %sample_id;
open(OUT1, "> $output") || die "can not write $output";
print OUT1 "Cluster";
foreach $sample_id (@sample_ids){
print OUT1 "\t$sample_id";
}
#print OUT1 "\tTotal\n";
print OUT1 "\tAnnotation\tSpike\n";
for ($i=1; $i<=$Cluster; $i++){
$ann = "None";
if ($Cluster_2_ann{$i}) { $ann = $Cluster_2_ann{$i}; }
print OUT1 "Cluster$i";
foreach $sample_id (@sample_ids){
$k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
print OUT1 "\t$k";
}
#print OUT1 "\t$count_t{$i}";
print OUT1 "\t$ann\n";
}
close(OUT1);
#!/usr/bin/perl
use Getopt::Std;
my $script_name = $0;
my $script_dir = $0;
$script_dir =~ s/[^\/]+$//;
chop($script_dir);
$script_dir = "./" unless ($script_dir);
getopts("i:s:o:f:c:",\%opts);
die usage() unless ($opts{i} and $opts{s} and $opts{o} and $opts{f});
my $input = $opts{i}; ## nr.clstr
my $fa = $opts{s}; ## R1.fa
my $output = $opts{o}; ## nr-filtered.clstr
my $output_fa = $opts{f}; ## nr-filtered
my $cutoff = $opts{c}; $cutoff = 1 unless ($cutoff);
my ($i, $j, $k, $str, $cmd, $ll);
open(TMP, $input) || die "can not open $input";
open(OUT, "> $output") || die "can not write to $output";
$no = 0;
$clstr = "";
$rep = "";
my %good_ids = ();
while($ll=<TMP>){
if ($ll =~ /^>/) {
if ($no > $cutoff) {
print OUT $clstr;
$good_ids{$rep}=1;
}
$clstr = $ll;
$no = 0;
}
else {
$clstr .= $ll;
chop($ll);
if ($ll =~ /\*$/) {
$rep = "";
if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
$rep = $2;
}
else {
die "format error $ll";
}
}
$no++;
}
}
if ($no > $cutoff) {
print OUT $clstr;
$good_ids{$rep}=1;
}
close(OUT);
close(TMP);
open(TMP, $fa) || die "can not open $fa";
open(OUT, ">$output_fa") || die "can not write to $output_fa";
my $flag = 0;
while($ll = <TMP>) {
if ($ll =~ /^>/) {
$gi = substr($ll,1);
chop($gi);
$gi =~ s/\s.+$//;
$flag = ( $good_ids{$gi} ) ? 1 : 0;
}
print OUT $ll if ($flag);
}
close(TMP);
close(OUT);
sub usage {
<<EOF
Usage:
$script_name -i seq.nr.clstr -s R1.fa -o output.clstr -f output.fa -c 1
Options:
-i input seq.nr.clstr
-s R1.fa
-o output.clstr
-f output.fa
-c abundance cutoff, default $cutoff
small clusters <= this size will be considered as noise and will be removed
EOF
}
###### END usage