Commits on Source (9)
CC = g++ -Wall -ggdb
CC = g++ -pg
CC = g++
# without OpenMP
# default with OpenMP
# with OpenMP
# in command line:
# make openmp=yes
......@@ -14,6 +12,21 @@ else
CCFLAGS = -fopenmp
endif
#LDFLAGS = -static -lz -o
#LDFLAGS = /usr/lib/x86_64-linux-gnu/libz.a -o
# default with zlib
# without zlib
# in command line:
# make zlib=no
ifeq ($(zlib),no)
CCFLAGS +=
LDFLAGS += -o
else
CCFLAGS += -DWITH_ZLIB
LDFLAGS += -lz -o
endif
# support debugging
# in command line:
# make debug=yes
......@@ -28,9 +41,6 @@ ifdef MAX_SEQ
CCFLAGS += -DMAX_SEQ=$(MAX_SEQ)
endif
#LDFLAGS = -static -o
LDFLAGS += -o
PROGS = cd-hit cd-hit-est cd-hit-2d cd-hit-est-2d cd-hit-div cd-hit-454
# Propagate hardening flags
......
For cd-hit
How to compile?
Requirements
Since 4.8.1, cd-hit supports .gz format input files. This requires the zlib library. zlib should
be installed on most Linux systems, so cd-hit should compile without issue. If your system
doesn't have zlib, please install it first.
* On Ubuntu, to install zlib:
sudo apt install zlib1g-dev
* On CentOS, to install zlib:
sudo yum install zlib-devel
How to compile
1. Compile with multi-threading support (default): make
2. Compile without multi-threading support (if you are on a very old system): make openmp=no
3. Compile without zlib (if you cannot install zlib): make zlib=no
(a combined example is sketched below)
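For reference, here is a minimal build sketch combining the steps above; it assumes an Ubuntu/Debian system for the zlib line, so adjust or drop whatever does not apply:

# install the zlib headers once (Ubuntu/Debian; see the CentOS command above)
sudo apt install zlib1g-dev
# default build: OpenMP multi-threading and .gz support enabled
make
# alternative builds
make openmp=no            # compiler without OpenMP support
make zlib=no              # zlib not available
make MAX_SEQ=10000000     # raise the maximum sequence length (Makefile default is 655360)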
Having problems compiling?
Please contact the author.
For cd-hit-auxtools
......@@ -10,9 +24,16 @@ For cd-hit-auxtools
make
For psi-cd-hit
please download legacy BLAST (not BLAST+) and install the executables in your $PATH
Compile cd-hit on MacOS
To install CD-HIT on MacOS, first install gcc on your system.
To use Homebrew (https://brew.sh/), see https://brewformulas.org/gcc.
Then locate the path to your g++ executable (e.g. /usr/local/Cellar/gcc/6.3.0_1/bin/g++-6;
note: your g++ path is likely to be different), and use a command like this:
make CC=/usr/local/Cellar/gcc/6.3.0_1/bin/g++-6
For psi-cd-hit
please download BLAST+ (not legacy BLAST) and install the executables in your $PATH
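As a quick sanity check, you can confirm that the BLAST+ executables are reachable before running psi-cd-hit by printing their versions; which programs you actually need depends on the -prog option you choose:

blastp -version
blastn -version
psiblast -version
makeblastdb -version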
For more information, please visit http://cd-hit.org
......
#!/usr/bin/perl
# Convert CD-HIT .clstr output into a BLAST -m 8 style tab-delimited table.
# columns: query(non-rep) subject(rep) %identity alnln mis gap q_b q_e s_b s_e expect bits
# (mismatch count and bit score are rough estimates; gap and e-value are placeholders)
my $rep;
my @non_reps = ();
my @aln_info = ();
while($ll=<>){
if ($ll =~ /^>/ ) {
if (@non_reps) {
for ($i=0; $i<@non_reps; $i++) {
print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
}
}
$rep = "";
@non_reps = ();
@aln_info = ();
}
else {
chop($ll);
if ($ll =~ />(.+)\.\.\./ ) {
my $id = $1;
if ($ll =~ /\*$/) { $rep = $id}
else {
push(@non_reps, $id);
my @lls = split(/\s+/, $ll);
my ($a, $iden) = split(/\//, $lls[-1]);
chop($iden); ### removing % sign
my ($qb, $qe, $sb, $se) = split(/:/, $a);
my $alnln = $qe-$qb+1;
my $mis = int($alnln * (100-$iden) / 100);
my $gap = 0;
my $exp = 0;
my $bit = $alnln*3 - $mis*6; #### very rough
push(@aln_info, "$iden\t$alnln\t$mis\t$gap\t$qb\t$qe\t$sb\t$se\t$exp\t$bit");
}
}
#>Cluster 582
#0 6671aa, >gi|302514050|uid|51... *
#1 339aa, >SRS584717|scaffold|... at 2:332:4020:4356/89.32%
#2 182aa, >SRS584717|scaffold|... at 1:182:6490:6671/100.00%
#3 367aa, >SRS584717|scaffold|... at 1:332:4543:4874/90.66%
#4 109aa, >SRS584717|scaffold|... at 1:108:5782:5889/97.22%
}
}
if (@non_reps) {
for ($i=0; $i<@non_reps; $i++) {
print "$non_reps[$i]\t$rep\t$aln_info[$i]\n";
}
}
#query subject % alnln mis gap q_b q_e s_b s_e expect bits
##0 1 2 3 4 5 6 7 8 9 10 11
#mHE-SRS012902|scaffold|86.16 gnl|CDD|226997 47.62 42 17 2 164 201 210 250 5e-04 37.6
#mHE-SRS012902|scaffold|109.23 gnl|CDD|225183 47.46 236 122 1 1 236 475 708 1e-92 284
#mHE-SRS012902|scaffold|109.23 gnl|CDD|224055 44.35 239 130 2 1 239 332 567 2e-84 259
#mHE-SRS012902|scaffold|109.23 gnl|CDD|227321 39.50 238 140 3 1 238 324 557 9e-69 218
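A hypothetical invocation of the script above (the filename clstr2blastm8.pl is only illustrative, since the diff does not show the actual script name); it reads a .clstr file on standard input and writes the tab-delimited table to standard output:

perl clstr2blastm8.pl < seq.nr.clstr > seq.nr.m8.txt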
......@@ -35,11 +35,15 @@
#include<stdint.h>
#include<time.h>
#ifdef WITH_ZLIB
#include<zlib.h>
#endif
#include<valarray>
#include<vector>
#include<map>
#define CDHIT_VERSION "4.7"
#define CDHIT_VERSION "4.8.1"
#ifndef MAX_SEQ
#define MAX_SEQ 655360
......@@ -559,9 +563,17 @@ class SequenceDB
~SequenceDB(){ Clear(); }
void Read( const char *file, const Options & options );
void Readgz( const char *file, const Options & options );
void Read( const char *file, const char *file2, const Options & options );
void Readgz( const char *file, const char *file2, const Options & options );
void WriteClusters( const char *db, const char *newdb, const Options & options );
void WriteClustersgz( const char *db, const char *newdb, const Options & options );
void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
void WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
void WriteExtra1D( const Options & options );
void WriteExtra2D( SequenceDB & other, const Options & options );
void DivideSave( const char *db, const char *newdb, int n, const Options & options );
......
......@@ -13,18 +13,19 @@ char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artif
//
char contacts[] =
" Questions, bugs, contact Limin Fu at l2fu@ucsd.edu, or Weizhong Li at liwz@sdsc.edu\n"
" For updated versions and information, please visit: http://cd-hit.org\n\n"
" Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n"
" For updated versions and information, please visit: http://cd-hit.org\n"
" or https://github.com/weizhongli/cdhit\n\n"
" cd-hit web server is also available from http://cd-hit.org\n\n"
" If you find cd-hit useful, please kindly cite:\n\n";
char txt_option_i[] = "\tinput filename in fasta format, required\n";
char txt_option_i[] = "\tinput filename in fasta format, required, can be in .gz format\n";
char txt_option_j[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
\t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required, can be in .gz format\n";
char txt_option_i2[] = "\tinput filename for db2 in fasta format, required, can be in .gz format\n";
char txt_option_j2[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
......@@ -34,12 +35,12 @@ char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (
char txt_option_c[] =
"\tsequence identity threshold, default 0.9\n \
\tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
\tnumber of identical amino acids in alignment\n \
\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the full length of the shorter sequence\n";
char txt_option_G[] =
"\tuse global sequence identity, default 1\n \
\tif set to 0, then use local sequence identity, calculated as :\n \
\tnumber of identical amino acids in alignment\n \
\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the length of the alignment\n \
\tNOTE!!! don't use -G 0 unless you use alignment coverage controls\n \
\tsee options -aL, -AL, -aS, -AS\n";
......@@ -135,7 +136,8 @@ char txt_option_sc[] =
\tif set to 1, output clusters by decreasing size\n";
char txt_option_sf[] =
"\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
\tif set to 1, output sequences by decreasing cluster size\n";
\tif set to 1, output sequences by decreasing cluster size\n \
\tthis can be very slow if the input is in .gz format\n";
char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
......@@ -337,7 +339,7 @@ int print_usage_div (char *arg) {
char mytxt_option_c[] =
"\tsequence identity threshold, default 0.98\n \
\tthis is a \"global sequence identity\" calculated as :\n \
\tnumber of identical amino acids in alignment\n \
\tnumber of identical amino acids or bases in alignment\n \
\tdivided by the full length of the shorter sequence + gaps\n";
char mytxt_option_b[] = "\tband_width of alignment, default 10\n";
char mytxt_option_n_est[] = "\tword_length, default 10, see user's guide for choosing it\n";
......
cd-hit (4.6.8-3) UNRELEASED; urgency=medium
cd-hit (4.8.1-1) unstable; urgency=medium
[ Jelmer Vernooij ]
* Use secure copyright file specification URI.
* Trim trailing whitespace.
-- Jelmer Vernooij <jelmer@debian.org> Sat, 20 Oct 2018 13:28:38 +0000
[ Andreas Tille ]
* New upstream version
* debhelper 12
* Standards-Version: 4.4.0
* Remove trailing whitespace in debian/rules
* Do not parse d/changelog
* Build-Depends: zlib1g-dev
-- Andreas Tille <tille@debian.org> Fri, 19 Jul 2019 19:34:25 +0200
cd-hit (4.6.8-2) unstable; urgency=medium
......
......@@ -4,9 +4,10 @@ Uploaders: Tim Booth <tbooth@ceh.ac.uk>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
help2man
Standards-Version: 4.2.1
Build-Depends: debhelper (>= 12~),
help2man,
zlib1g-dev
Standards-Version: 4.4.0
Vcs-Browser: https://salsa.debian.org/med-team/cd-hit
Vcs-Git: https://salsa.debian.org/med-team/cd-hit.git
Homepage: http://weizhong-lab.ucsd.edu/cd-hit/
......
......@@ -9,10 +9,9 @@
export DEB_BUILD_MAINT_OPTIONS = hardening=+all
pkg := $(shell dpkg-parsechangelog | sed -n 's/^Source: //p')
ver := $(shell dpkg-parsechangelog | sed -ne 's/^Version: \(\([0-9]\+\):\)\?\(.*\)-.*/\3/p')
include /usr/share/dpkg/default.mk
mandir=$(CURDIR)/debian/$(pkg)/usr/share/man/man1/
mandir=$(CURDIR)/debian/$(DEB_SOURCE)/usr/share/man/man1/
%:
dh $@
......@@ -23,54 +22,54 @@ override_dh_auto_build:
override_dh_installman:
mkdir -p $(mandir)
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences' \
$(CURDIR)/cd-hit > $(mandir)/cd-hit.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences, optimised for 454 data' \
$(CURDIR)/cd-hit-454 > $(mandir)/cd-hit-454.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='quickly group sequences in db1 or db2 format' \
$(CURDIR)/cd-hit-2d > $(mandir)/cd-hit-2d.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='run CD-HIT algorithm on RNA/DNA sequences' \
$(CURDIR)/cd-hit-est > $(mandir)/cd-hit-est.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='run CD-HIT algorithm on RNA/DNA sequences in db1 or db2 format' \
$(CURDIR)/cd-hit-est-2d > $(mandir)/cd-hit-est-2d.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a big clustering job into pieces to run cd-hit-2d or cd-hit-est-2d jobs' \
$(CURDIR)/cd-hit-2d-para.pl > $(mandir)/cd-hit-2d-para.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a big clustering job into pieces to run cd-hit or cd-hit-est jobs' \
$(CURDIR)/cd-hit-para.pl > $(mandir)/cd-hit-para.1
help2man --no-info --no-discard-stderr --version-string='$(ver)' \
help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
--name='divide a FASTA file into pieces' \
$(CURDIR)/cd-hit-div.pl > $(mandir)/cd-hit-div.1
# psi-cd-hit.pl is throwing some errors which are fixed using sed
# ... at least in versions <= 4.6.4 - now help2man does not work at all
#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities' \
# $(CURDIR)/psi-cd-hit.pl | \
# sed -e '/^Name "main::.*" used only once:/d' \
# > $(mandir)/psi-cd-hit.1
# help2man does not work at all since version 4.6.4
#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
# $(CURDIR)/psi-cd-hit-2d.pl > $(mandir)/psi-cd-hit-2d.1
# FIXME: what is the difference between psi-cd-hit-2d.pl and psi-cd-hit-2d-g1.pl ?
# help2man does not work at all since version 4.6.4
#help2man --no-info --no-discard-stderr --version-string='$(ver)' \
#help2man --no-info --no-discard-stderr --version-string='$(DEB_VERSION_UPSTREAM)' \
# --name='runs similar algorithm like CD-HIT but using BLAST to calculate similarities in db1 or db2 format' \
# $(CURDIR)/psi-cd-hit-2d-g1.pl > $(mandir)/psi-cd-hit-2d-g1.1
......@@ -87,7 +86,7 @@ override_dh_installman:
# psi-cd-hit-local.pl -> even throws several "used only once: possible typo" errors
override_dh_auto_install:
dh_auto_install -- PREFIX=debian/$(pkg)/usr/lib/cd-hit
dh_auto_install -- PREFIX=debian/$(DEB_SOURCE)/usr/lib/cd-hit
override_dh_compress:
dh_compress --exclude=.pdf
......@@ -13,21 +13,22 @@ our $opt_aS = 0.0; #
our $opt_aL = 0.0; #
our $circle = 0; #
our $opt_g = 1; ####################
our $blast_exe = "blastall -p blastp -m 8"; #########################
our $prof_exe = "blastpgp -m 8"; #
our $prof_para = "-j 3 -F T -e 0.001 -b 500 -v 500"; #
our $prof_db = ""; #
our $bl_para = "-F T -e 0.000001 -b 100000 -v 100000"; # program
our $blast_exe = "blastp"; #########################
our $bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000"; # program
our $bl_para_u = ""; #
our $bl_STDIN = 1; #
our $keep_bl = 0; #
our $blast_prog= "blastp"; #
our $formatdb = "formatdb"; #########################
our $formatdb = "makeblastdb"; #########################
our $exec_mode = "local"; #######################
our $num_qsub = 1; #
our $para_no = 1; # compute
our $sh_file = ""; #
our $num_multi_seq = 50; #
our $batch_no_per_node = 100; #######################
our $reformat_seg = 50000;
our $restart_seg = 20000;
our $job = "";
......@@ -73,9 +74,7 @@ sub parse_para_etc {
elsif ($arg eq "-sl") { $skip_long = shift; }
## program
elsif ($arg eq "-prog") { $blast_prog= shift; }
elsif ($arg eq "-p") { $prof_para = shift; }
elsif ($arg eq "-dprof") { $prof_db = shift; die "option -dprof no longer supported!";}
elsif ($arg eq "-s") { $bl_para = shift; }
elsif ($arg eq "-s") { $bl_para_u = shift; }
elsif ($arg eq "-k") { $keep_bl = shift; }
elsif ($arg eq "-bs") { $bl_STDIN = shift; }
## compute
......@@ -103,38 +102,35 @@ sub parse_para_etc {
print_usage(); exit();
}
if ($blast_prog eq "blastn") {
$formatdb = "formatdb -p F";
$blast_exe = "blastall -p blastn -m 8";
}
elsif ($blast_prog eq "megablast") {
$blast_prog = "blastn"; #### back to blastn for blast parser type
$formatdb = "formatdb -p F";
$blast_exe = "megablast -H 100 -D 2 -m 8";
}
elsif ($blast_prog eq "blastpgp") {
$blast_exe = "blastpgp -m 8 -j 3";
}
#### for blast+
if ($bl_plus) {
$formatdb = "makeblastdb -dbtype prot -max_file_sz 8GB";
if ($blast_prog eq "blastp") {
$formatdb = "makeblastdb -dbtype prot -max_file_sz 2GB";
$blast_exe = "blastp -outfmt 6";
$bl_para = "-seg yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
if ($blast_prog eq "blastn") {
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
$bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "psiblast") {
$formatdb = "makeblastdb -dbtype prot -max_file_sz 2GB";
$blast_exe = "psiblast -outfmt 6 -num_iterations 3";
$bl_para = "-seg yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "blastn") {
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 2GB";
$blast_exe = "blastn -task blastn -outfmt 6";
$bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
$bl_para = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
}
elsif ($blast_prog eq "megablast") {
$blast_prog = "blastn"; #### back to blastn for blast parser type
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 8GB";
$formatdb = "makeblastdb -dbtype nucl -max_file_sz 2GB";
$blast_exe = "blastn -task megablast -outfmt 6";
$bl_para = "-dust yes -evalue 0.000001 -num_alignments 100000 -num_threads $bl_threads"; # program
$bl_para = "-dust yes -evalue 0.000001 -max_target_seqs 100000 -num_threads $bl_threads"; # program
$blast_prog= "blastn"; #### back to blastn for blast parser type
}
else {
print "Unknown blast program: $blast_prog\n";
print_usage(); exit();
}
elsif ($blast_prog eq "blastpgp") {
$blast_exe = "psiblast -outfmt 6 -num_iterations 3 -num_threads $bl_threads";
if ($bl_para_u) {
$bl_para = "$bl_para_u -num_threads $bl_threads";
}
}
......@@ -142,6 +138,13 @@ sub parse_para_etc {
$blast_exe = "$bl_path/$blast_exe";
$formatdb = "$bl_path/$formatdb";
}
my $bl_ver = `$blast_exe -version`;
if ($bl_ver =~ /blast/) {
print "BLAST version:\n$bl_ver\n\n";
}
else {
die "BLAST program not found: $blast_exe\n";
}
(-e $db_in) || die "No input";
($db_out) || die "No output";
......@@ -576,7 +579,7 @@ sub keep_strand_with_top_hsp {
}
########## END keep_strand_with_top_hsp
########## for blastpgp -j no (no>1)
########## for psiblast -j no (no>1)
########## keep hits from the last round
sub keep_hsp_of_last_round {
my $self = shift;
......@@ -691,7 +694,7 @@ sub process_blout_blastp_blastn {
my $len_rep = 0;
my $bl = defined($blout) ? readblast_m8("", $blout) : readblast_m8_buffer();
if ($blast_prog eq "blastn") { keep_strand_with_top_hsp($bl); }
if (($blast_prog eq "blastpgp") and (not $prof_db)) {keep_hsp_of_last_round($bl); }
if ($blast_prog eq "psiblast" ) {keep_hsp_of_last_round($bl); }
if ($g_iden == 0 ) { #### Local identity
keep_top_hsp($bl); #### local alignment, only the top HSP
......@@ -1084,11 +1087,7 @@ Options
long sequences will not be clustered anyway.
-sl 0 means no skipping
program:
-prog (blastp, blastn, megablast, blastpgp), default $blast_prog
-p profile search para, default
"$prof_para"
-dprof database for building PSSM, default using input
you can also use another database that is more comprehensive like NR80
-prog (blastp, blastn, megablast, psiblast), default $blast_prog
-s blast search para, default
"$bl_para"
-bs (1/0) default $bl_STDIN
......
......@@ -105,7 +105,7 @@ $CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 \\CMDOPTS.4 -j2 \\CMDOPTS.5
$CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
$CD_HIT_dir/usecases/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
$CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
EOD
......
#!/usr/bin/python
################################################################################
# NGS workflow by Weizhong Li, http://weizhongli-lab.org
################################################################################
queue_system = 'SGE'
########## local variables etc. Please edit
ENV={
'CD_HIT_dir' : '/home/oasis/data/NGS-ann-project/apps/cdhit.git',
'NGS_prog_trimmomatic' : '/home/oasis/data/NGS-ann-project/apps/Trimmomatic/trimmomatic-0.32.jar',
}
########## computation resources for execution of jobs
NGS_executions = {}
NGS_executions['qsub_1'] = {
'type' : 'qsub-pe',
'cores_per_node' : 8,
'number_nodes' : 64,
'user' : 'weizhong', #### I will use command such as qstat -u weizhong to query submitted jobs
'command' : 'qsub',
'command_name_opt' : '-N',
'command_err_opt' : '-e',
'command_out_opt' : '-o',
'template' : '''#!/bin/bash
##$ -q RNA.q
#$ -v PATH
#$ -V
'''
}
NGS_executions['sh_1'] = {
'type' : 'sh',
'cores_per_node' : 8,
'number_nodes' : 1,
'template' : '''#!/bin/bash
'''
}
NGS_batch_jobs = {}
NGS_batch_jobs['qc'] = {
'CMD_opts' : ['100'],
'execution' : 'qsub_1', # where to execute
'cores_per_cmd' : 4, # number of threads used by command below
'no_parallel' : 1, # number of total jobs to run using command below
'command' : '''
java -jar $ENV.NGS_prog_trimmomatic PE -threads 4 -phred33 $DATA.0 $DATA.1 $SELF/R1.fq $SELF/R1-s.fq $SELF/R2.fq $SELF/R2-s.fq \\
SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:$CMDOPTS.0 MAXINFO:80:0.5 1>$SELF/qc.stdout 2>$SELF/qc.stderr
perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R1.fq > $SELF/R1.fa &
perl -e '$i=0; while(<>){ if (/^@/) {$i++; print ">Sample|$SAMPLE|$i ", substr($_,1); $a=<>; print $a; $a=<>; $a=<>;}}' < $SELF/R2.fq > $SELF/R2.fa &
wait
rm -f $SELF/R1.fq $SELF/R2.fq $SELF/R1-s.fq $SELF/R2-s.fq
'''
}
NGS_batch_jobs['otu'] = {
'injobs' : ['qc'],
'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R2', '75'],
'execution' : 'qsub_1', # where to execute
'cores_per_cmd' : 2, # number of threads used by command below
'no_parallel' : 1, # number of total jobs to run using command below
'command' : '''
#### cluster at 100% PE
$ENV.CD_HIT_dir/cd-hit-est -i $INJOBS.0/R1.fa -j $INJOBS.0/R2.fa -o $SELF/seq.nr -op $SELF/seq.nr.2 -sf 1 -sc 1 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 1.0 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.nr.log
#### cluster at 99% PE and SE for R1,R2
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -o $SELF/seq.chimeric-clstr.R1 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R1.log
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr.2 -o $SELF/seq.chimeric-clstr.R2 -r 0 -cx $CMDOPTS.6 -c 0.99 -n 10 -G 0 -b 1 -A 50 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.chimeric-clstr.R2.log
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.nr -j $SELF/seq.nr.2 -o $SELF/seq.99 -op $SELF/seq.99.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.99 -n 10 -G 1 -b 1 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.99.log
$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-chimeric-and-small.pl -c $CMDOPTS.3 -k $SELF/seq.nr.clstr \\
-i $SELF/seq.chimeric-clstr.R1.clstr -j $SELF/seq.chimeric-clstr.R2.clstr \\
-a $SELF/seq.99.clstr -f $SELF/seq.99 -g $SELF/seq.99.2 -o $SELF/seq.99f
$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.nr.clstr $SELF/seq.99f.clstr > $SELF/seq.99f-all.clstr
$ENV.CD_HIT_dir/cd-hit-est -i $SELF/seq.99f -j $SELF/seq.99f.2 -o $SELF/seq.97 -op $SELF/seq.97.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.log
$ENV.CD_HIT_dir/cd-hit-est-2d -i $SELF/seq.97 -j $SELF/seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o $SELF/seq.97.ref -op $SELF/seq.97.ref.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > $SELF/seq.97.ref.log
$ENV.CD_HIT_dir/clstr_rev.pl $SELF/seq.99f-all.clstr $SELF/seq.97.clstr > $SELF/seq.97-all.clstr
$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < $SELF/seq.97.ref.clstr > $SELF/seq.97.reftop.clstr
$ENV.CD_HIT_dir/clstr_merge.pl $SELF/seq.97-all.clstr $SELF/seq.97.reftop.clstr > $SELF/OTU.clstr
rm -f $SELF/seq.chimeric-clstr.R1 $SELF/seq.chimeric-clstr.R1.log $SELF/seq.chimeric-clstr.R2 $SELF/seq.chimeric-clstr.R2.log
rm -f $SELF/seq.97.ref $SELF/seq.97.ref.2 $SELF/seq.97.ref.log
mv $SELF/seq.99f.log $SELF/chimeric-small-clusters-list.txt
'''
}
NGS_batch_jobs['otu-pooled'] = {
'CMD_opts' : ['150', '100', '0.97', '0.0001', 'path_to_spliced_ref_db-R1', 'path_to_spliced_ref_db-R2', '75'],
'execution' : 'qsub_1', # where to execute
'cores_per_cmd' : 2, # number of threads used by command below
'no_parallel' : 1, # number of total jobs to run using command below
'command' : '''
#### before running
#### concat seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt
$ENV.CD_HIT_dir/cd-hit-est -i seq.99f -j seq.99f.2 -o seq.97 -op seq.97.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.log
$ENV.CD_HIT_dir/cd-hit-est-2d -i seq.97 -j seq.97.2 -i2 $CMDOPTS.4 -j2 $CMDOPTS.5 -o seq.97.ref -op seq.97.ref.2 -P 1 -r 0 \\
-cx $CMDOPTS.0 -cy $CMDOPTS.1 -c 0.97 -n 10 -G 1 -b 10 -T 1 -M 8000 -d 0 -p 1 > seq.97.ref.log
$ENV.CD_HIT_dir/clstr_rev.pl seq.99f-all.clstr seq.97.clstr > seq.97-all.clstr
$ENV.CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.97.ref.clstr > seq.97.reftop.clstr
$ENV.CD_HIT_dir/clstr_merge.pl seq.97-all.clstr seq.97.reftop.clstr > OTU.clstr
$ENV.CD_HIT_dir/usecases/Miseq-16S/clstr_2_OTU_table.pl -i OTU.clstr -o OTU.txt
rm -f seq.97.ref seq.97.ref.2 seq.97.ref.log
'''
}
......@@ -51,10 +51,17 @@ http://www.usadellab.org/cms/?page=trimmomatic or https://github.com/timflutre/t
We also have a copy at http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
3. Modify NG-Omics-Miseq-16S.pl
Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.pl, in the top few lines:
$CD_HIT_dir = "PATH_to_cd-hit";
$NGS_prog_trimmomatic = "PATH_to_trimmomatic/trimmomatic-0.32.jar"; #### where you have installed Trimmomatic
3. Modify NG-Omics-Miseq-16S.py
Please edit usecases/Miseq-16S/NG-Omics-Miseq-16S.py, in the top few lines:
from
'CD_HIT_dir' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621',
'NGS_prog_trimmomatic' : '/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar',
to
'CD_HIT_dir' : 'PATH_to_cd-hit_directory',
'NGS_prog_trimmomatic' : 'PATH_to_Trimmomatic/trimmomatic-0.32.jar', #### where you have installed Trimmomatic
4. Download reference dataset
The reference database can be downloaded from http://weizhongli-lab.org/download-data/cd-hit-otu-miseq/.
......@@ -124,7 +131,7 @@ This program will output spliced PE files gg_13_5-PE99.150-100-R1 and gg_13_5-PE
4. Run sequence QC and OTU clustering for each sample
In the working directory, run
PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
where: 150 and 100 are the effective lengths;
see the next section for suggestions on choosing the effective clustering read length
......@@ -138,13 +145,13 @@ This command will generate shell scripts for QC and for OTU for each sample.
The scripts will be in the WF-sh folder. First run all the qc.sample_name.sh scripts; after
these jobs have finished, run all the otu.sample_name.sh scripts (see the sketch below).
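For example, a minimal sketch of running the generated per-sample scripts one after another; the WF-sh path and the qc./otu. name patterns are taken from the description above, and you would instead submit the scripts to your queue if you work on a cluster:

for f in WF-sh/qc.*.sh;  do bash "$f"; done
for f in WF-sh/otu.*.sh; do bash "$f"; done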
NG-Omics-WF.pl https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
NG-Omics-WF.py https://github.com/weizhongli/ngomicswf is a very powerful workflow and pipeline
tool developed in our group. It is not fully released yet, since we need more time to document
this tool. However, you can try to use NG-Omics-WF.pl to automatically run all your samples.
First edit NG-Omics-Miseq-16S.pl and modify cores_per_node around line #36 to match the
this tool. However, you can try to use NG-Omics-WF.py to automatically run all your samples.
First edit NG-Omics-Miseq-16S.py and modify cores_per_node around line #36 to match the
number of CPU cores of your computer, then run
nohup PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
nohup PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -s SAMPLE_file -j otu -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 &
After the jobs have finished, the OTU results will be in the sample_name/otu folder; important files include
OTU.clstr: lists all clusters and sequences
......@@ -156,14 +163,14 @@ If you have multiple samples, you don't just want to stop here. It is important
to pool all samples together and re-run OTU clustering so that all samples can be
compared; run
PATH_to_cd-hit-dir/usecases/pool_samples.pl -s SAMPLE_file -o pooled
PATH_to_cd-hit-dir/usecases/Miseq-16S/pool_samples.pl -s SAMPLE_file -o pooled
This will pool sequences from all samples. Hundreds of samples can be handled without problem.
6. Cluster pooled samples, run
PATH_to_cd-hit-dir/usecases/NG-Omics-WF.pl -i PATH_to_cd-hit-dir/usecases/NG-Omics-Miseq-16S.pl -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-WF.py -i PATH_to_cd-hit-dir/usecases/Miseq-16S/NG-Omics-Miseq-16S.py -S pooled -j otu-pooled -T otu:150:100:0.97:0.0001:PATH_to-gg_13_5-PE99.150-100-R1:PATH_to-gg_13_5-PE99.150-100-R2:75 -J write-sh
This command will generate a script WF-sh/otu-pooled.pooled.sh, which you can
run. When it has finished, the OTUs will be in the pooled directory:
......
......@@ -9,10 +9,13 @@ my $output = $opts{o};
$output = "pooled" unless ($output);
my $sample_in = $opts{s};
my $sample_command_in = $opts{S}; #### ',' delimited samples, ':' delimited entries, e.g. sample1:R1.fq:R2.fq,sample2:R1.fq:R2.fq or sample1,sample2,sample3
my $file_list = $opts{f};
my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
@file_list = split(/,/, $file_list) if ($file_list);
my $job = $opts{j};
$job = "otu" unless ($job);
my @file_list = qw/seq.99f seq.99f.2 seq.99f-all.clstr chimeric-small-clusters-list.txt/;
my ($i, $j, $k, $cmd);
$cmd = `mkdir $output` unless (-e $output);
......@@ -62,7 +65,7 @@ foreach $i (@file_list) {
sub usage {
<<EOD;
$0 -s sample_file -o output_dir
$0 -s sample_file -o output_dir -j job -f list_files
-s sample data file, required unless -S is present
File format example
#Sample data file example, TAB or space delimited for following lines
......@@ -73,6 +76,8 @@ Sample_ID3 sample_data_0 sample_data_1
-S sample data from command line, required unless -s is present
format: Sample_ID1:sample_data_0:sample_data_0:sample_data_1,Sample_ID2:sample_data_0:sample_data_1
-j job name
-f list of files (delimited by ,) to pool (cat)
EOD
}
#!/usr/bin/perl
## =========================== NGS tools ==========================================
## NGS tools for metagenomic sequence analysis
## May also be used for other type NGS data analysis
##
## Weizhong Li, UCSD
## liwz@sdsc.edu
## http://weizhongli-lab.org/
## ================================================================================
use Getopt::Std;
getopts("i:j:o:r:e:p:q:c:d:N:t:u:d:M:T:S:",\%opts);
die usage() unless ($opts{j} and $opts{o});
my ($i, $j, $k, $cmd);
my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
my ($len, $lena, $lenb);
my $fasta = $opts{j};
my $output = $opts{o};
my %id_2_seq = ();
my $id = "";
my $ann;
open(TMP, $fasta) || die "can not open $fasta";
while($ll=<TMP>){
if ($ll =~ /^>/) {
chop($ll);
($id, $ann) = split(/\s+/, substr($ll,1), 2);
$ann =~ s/\s/_/g;
$id = "$id|$ann";
}
else {
$ll =~ s/U/T/g;
$ll =~ s/u/T/g;
$id_2_seq{$id} .= $ll;
}
}
close(TMP);
my @ids = keys %id_2_seq;
@ids = sort {length($id_2_seq{$b}) <=> length($id_2_seq{$a})} @ids; ### sort by decreasing sequence length
open(OUT, "> $output") || die "can not write to $output";
foreach $id (@ids) {
print OUT ">$id\n$id_2_seq{$id}";
}
close(OUT);
sub usage {
<<EOD;
This script formats a Silva FASTA file for CD-HIT-OTU-MiSeq. Download the Silva sequences from
https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva.fasta.gz
(or a similar file) and gunzip it.
Run this script as $0 -j SILVA_128_SSURef_Nr99_tax_silva.fasta -o silva_128_SSURef_processed.fasta
Options:
======================
-j path for SILVA_128_SSURef_Nr99_tax_silva.fasta
-o output FASTA file of formatted Silva reference DB
EOD
}
#!/usr/bin/perl
################################################################################
# NGS workflow by Weizhong Li, http://weizhongli-lab.org
################################################################################
########## local variables etc. Please edit
$CD_HIT_dir = "/home/oasis/gordon-data/NGS-ann-project-new/apps/cd-hit-v4.6.8-2017-0621";
$NGS_prog_trimmomatic = "/home/oasis/gordon-data/NGS-ann-project-new/apps/Trimmomatic/trimmomatic-0.32.jar";
########## computation resources for execution of jobs
%NGS_executions = ();
$NGS_executions{"qsub_1"} = {
"type" => "qsub-pe",
"cores_per_node" => 8,
"number_nodes" => 64,
"user" => "weizhong", #### I will use command such as qstat -u weizhong to query submitted jobs
"command" => "qsub",
"command_name_opt" => "-N",
"command_err_opt" => "-e",
"command_out_opt" => "-o",
"template" => <<EOD,
#!/bin/sh
#PBS -v PATH
#PBS -V
#\$ -q RNA.q
#\$ -v PATH
#\$ -V
EOD
};
$NGS_executions{"sh_1"} = {
"type" => "sh",
"cores_per_node" => 8,
"number_nodes" => 1,
};
# $NGS_batch_jobs{"qc-pe"} = {
# "CMD_opts" => ["20"],
# "execution" => "qsub_1", # where to execute
# "cores_per_cmd" => 4, # number of threads used by command below
# "no_parallel" => 1, # number of total jobs to run using command below
# "command" => <<EOD,
# java -jar $NGS_prog_trimmomatic PE -threads 4 -phred33 \\DATA.0 \\DATA.1 \\SELF/R1.fq \\SELF/R1-s.fq \\SELF/R2.fq \\SELF/R2-s.fq \\
# SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
#
# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
# perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R2.fq > \\SELF/R2.fa &
#
# wait
# rm -f \\SELF/R1.fq \\SELF/R2.fq \\SELF/R1-s.fq \\SELF/R2-s.fq
# EOD
# };
$NGS_batch_jobs{"qc"} = {
"CMD_opts" => ["20"],
"execution" => "qsub_1", # where to execute
"cores_per_cmd" => 4, # number of threads used by command below
"no_parallel" => 1, # number of total jobs to run using command below
"command" => <<EOD,
java -jar $NGS_prog_trimmomatic SE -threads 4 -phred33 \\DATA.0 \\SELF/R1.fq \\
SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:\\CMDOPTS.0 MAXINFO:80:0.5 1>\\SELF/qc.stdout 2>\\SELF/qc.stderr
perl -e '\$i=0; while(<>){ if (/^\@/) {\$i++; print ">Sample|\\SAMPLE|\$i ", substr(\$_,1); \$a=<>; print \$a; \$a=<>; \$a=<>;}}' < \\SELF/R1.fq > \\SELF/R1.fa &
rm -f \\SELF/R1.fq
EOD
};
$NGS_batch_jobs{"clstr"} = {
"injobs" => ["qc"],
"CMD_opts" => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
"execution" => "qsub_1", # where to execute
"cores_per_cmd" => 2, # number of threads used by command below
"no_parallel" => 1, # number of total jobs to run using command below
"command" => <<EOD,
#### cluster at 100%
$CD_HIT_dir/cd-hit-est -i \\INJOBS.0/R1.fa -o \\SELF/seq.nr -sf 1 -sc 1 -r 0 -c 1.00 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.nr.log
$CD_HIT_dir/usecases/miRNA-seq/filter-small-cluster.pl -i \\SELF/seq.nr.clstr -s \\SELF/seq.nr -o \\SELF/seq.nr-filtered.clstr -f \\SELF/seq.nr-filtered -c 1
$CD_HIT_dir/cd-hit-est -i \\SELF/seq.nr-filtered -o \\SELF/seq.95 -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > \\SELF/seq.95.log
$CD_HIT_dir/clstr_rev.pl \\SELF/seq.nr-filtered.clstr \\SELF/seq.95.clstr > \\SELF/seq.95-full.clstr
$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95 -i2 \\CMDOPTS.1 -o \\SELF/seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.ref.log
$CD_HIT_dir/cd-hit-est-2d -i \\SELF/seq.95 -i2 \\CMDOPTS.2 -o \\SELF/seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > \\SELF/seq.95.spk.log
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.ref.clstr > \\SELF/seq.95.reftop.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < \\SELF/seq.95.spk.clstr > \\SELF/seq.95.spktop.clstr
$CD_HIT_dir/clstr_merge.pl \\SELF/seq.95-full.clstr \\SELF/seq.95.reftop.clstr > \\SELF/tmp.clstr
$CD_HIT_dir/clstr_merge.pl \\SELF/tmp.clstr \\SELF/seq.95.spktop.clstr > \\SELF/miRNA.clstr
$CD_HIT_dir/clstr_sort_by.pl < \\SELF/miRNA.clstr > \\SELF/miRNA.clstr.s
mv \\SELF/miRNA.clstr.s \\SELF/miRNA.clstr
rm -f \\SELF/tmp.clstr
EOD
};
$NGS_batch_jobs{"clstr-pooled"} = {
"CMD_opts" => ["0.95", "path_to_miRbase", "path_to_spike-in_db"],
"execution" => "qsub_1", # where to execute
"cores_per_cmd" => 2, # number of threads used by command below
"no_parallel" => 1, # number of total jobs to run using command below
"command" => <<EOD,
#### concat seq.nr-filtered seq.nr-filtered.clstr
$CD_HIT_dir/cd-hit-est -i seq.nr-filtered -o seq.95 -r 0 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 1 -b 1 -T 4 -M 8000 > seq.95.log
$CD_HIT_dir/clstr_rev.pl seq.nr-filtered.clstr seq.95.clstr > seq.95-full.clstr
$CD_HIT_dir/cd-hit-est-2d -i seq.95 -i2 \\CMDOPTS.1 -o seq.95.ref -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.ref.log
$CD_HIT_dir/cd-hit-est-2d -i seq.95 -i2 \\CMDOPTS.2 -o seq.95.spk -r 1 -c \\CMDOPTS.0 -n 10 -p 1 -d 0 -G 0 -A 20 -b 1 -T 4 -M 8000 > seq.95.spk.log
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.ref.clstr > seq.95.reftop.clstr
$CD_HIT_dir/usecases/Miseq-16S/filter-nontop-ref.pl < seq.95.spk.clstr > seq.95.spktop.clstr
$CD_HIT_dir/clstr_merge.pl seq.95-full.clstr seq.95.reftop.clstr > tmp.clstr
$CD_HIT_dir/clstr_merge.pl tmp.clstr seq.95.spktop.clstr > miRNA.clstr
$CD_HIT_dir/clstr_sort_by.pl < miRNA.clstr > miRNA.clstr.s
mv miRNA.clstr.s miRNA.clstr
$CD_HIT_dir/usecases/miRNA-seq/clstr_2_miRNA-table.pl -i miRNA.clstr -o miRNA.txt
EOD
};
##############################################################################################
########## END
1;
#!/usr/bin/perl
#
use Getopt::Std;
getopts("i:s:S:o:f:j:",\%opts);
my $input = $opts{i}; $input = "Cluster.clstr" unless $input;
my $output = $opts{o}; $output = "Cluster.txt" unless ($output);
my ($i, $j, $k, $str, $cmd, $ll);
my %count = ();
my %count_t = ();
my %count_s = ();
my %Cluster_2_ann = ();
# >4360486|k__Bacteria;.p__Firmicutes;.c__Clostridia;.o__Clostridiales;.f__Lachnospiraceae;.g__Roseburia;.s__faecis
open(TMP, $input) || die "can not open $input";
my $Cluster=0;
while($ll=<TMP>){
if ($ll =~ /^>/) {
$Cluster++;
}
else {
chop($ll);
if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
my $id = $2;
if ($id =~ /^Sample\|([^\|]+)\|/) {
$sample_id = $1;
$sample_id{$sample_id}=1;
$count{$Cluster}{$sample_id}++;
$count_t{$Cluster}++;
$count_s{$sample_id}++;
}
else {
$Cluster_2_ann{$Cluster} .= "$id\t";
}
}
else {
die "format error $ll";
}
}
}
close(TMP);
my @sample_ids = sort keys %sample_id;
open(OUT1, "> $output") || die "can not write $output";
print OUT1 "Cluster";
foreach $sample_id (@sample_ids){
print OUT1 "\t$sample_id";
}
#print OUT1 "\tTotal\n";
print OUT1 "\tAnnotation\tSpike\n";
for ($i=1; $i<=$Cluster; $i++){
$ann = "None";
if ($Cluster_2_ann{$i}) { $ann = $Cluster_2_ann{$i}; }
print OUT1 "Cluster$i";
foreach $sample_id (@sample_ids){
$k = $count{$i}{$sample_id}? $count{$i}{$sample_id} : 0;
print OUT1 "\t$k";
}
#print OUT1 "\t$count_t{$i}";
print OUT1 "\t$ann\n";
}
close(OUT1);
#!/usr/bin/perl
use Getopt::Std;
my $script_name = $0;
my $script_dir = $0;
$script_dir =~ s/[^\/]+$//;
chop($script_dir);
$script_dir = "./" unless ($script_dir);
getopts("i:s:o:f:c:",\%opts);
die usage() unless ($opts{i} and $opts{s} and $opts{o} and $opts{f});
my $input = $opts{i}; ## nr.clstr
my $fa = $opts{s}; ## R1.fa
my $output = $opts{o}; ## nr-filtered.clstr
my $output_fa = $opts{f}; ## nr-filtered
my $cutoff = $opts{c}; $cutoff = 1 unless ($cutoff);
my ($i, $j, $k, $str, $cmd, $ll);
open(TMP, $input) || die "can not open $input";
open(OUT, "> $output") || die "can not write to $output";
$no = 0;
$clstr = "";
$rep = "";
my %good_ids = ();
while($ll=<TMP>){
if ($ll =~ /^>/) {
if ($no > $cutoff) {
print OUT $clstr;
$good_ids{$rep}=1;
}
$clstr = $ll;
$no = 0;
}
else {
$clstr .= $ll;
chop($ll);
if ($ll =~ /\*$/) {
$rep = "";
if ($ll =~ /\d+(aa|nt), >(.+)\.\.\./) {
$rep = $2;
}
else {
die "format error $ll";
}
}
$no++;
}
}
if ($no > $cutoff) {
print OUT $clstr;
$good_ids{$rep}=1;
}
close(OUT);
close(TMP);
open(TMP, $fa) || die "can not open $fa";
open(OUT, ">$output_fa") || die "can not write to $output_fa";
my $flag = 0;
while($ll = <TMP>) {
if ($ll =~ /^>/) {
$gi = substr($ll,1);
chop($gi);
$gi =~ s/\s.+$//;
$flag = ( $good_ids{$gi} ) ? 1 : 0;
}
print OUT $ll if ($flag);
}
close(TMP);
close(OUT);
sub usage {
<<EOF
Usage:
$script_name -i seq.nr.clstr -s R1.fa -o output.clstr -f output.fa -c 1
Options:
-i input seq.nr.clstr
-s R1.fa
-o output.clstr
-f output.fa
-c abundance cutoff, default $cutoff
small clusters <= this size will be considered as noise and will be removed
EOF
}
###### END usage