Commit 08d8190e authored by Sascha Steinbiss's avatar Sascha Steinbiss

New upstream version 4.6.8

parent 8a6f5379
#!/usr/bin/perl
use Storable;
use strict;
use Text::NSP::Measures::2D::Fisher::right;
my $clstr_file = shift;
my $anno_file = shift;
my $store_file = shift;
my @cls_list = ();
my @fun_list = ();
my $cur_cls = "";
my %cls2rep = ();
my @cur_anno = ();
open(TMP, $clstr_file) || die;
while(my $ll = <TMP>) { # read .clstr files
if ($ll =~ /^>/) { # the begin of a cluster
$cur_cls = $ll;
$cur_cls =~ s/^>(.*?)\s$/$1/;
# print "$cur_cls|\n";
}
else{
chop($ll);
if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\./) {
my @tmp = split(/\|\|/,$4);
if ($#tmp == 0){
@cur_anno = ();
}
else{
@cur_anno = split(/,/, pop(@tmp));
}
# print $cur_cls.$cur_anno[0]."|\n";
push(@cls_list, $cur_cls);
push(@fun_list, [@cur_anno]);
if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\.(.*)\*$/){
# print "$4\n";
$cls2rep{$cur_cls} = $4;
# print "$cur_cls\t$4\n";
}
}
}
}
#print join("\n", @cls_list[0..10]);
@cls_list = map {$cls2rep{$_}} @cls_list;
#print join("\n", @cls_list[0..10]);
#print "\n";
#foreach my $i (0..10){
# print join("\t",@{$fun_list[$i]});
# print "\n";
#}
#print join("\n", @fun_list[0..10]);
#exit(1);
my %cls_size = ();
my %cls_anno = ();
my %anno_size = ();
my $M = $#fun_list+1;
#print $#fun_list."\t".$M."\n";
#print $#cls_list."\t".$M."\n";
foreach my $i (0..$#fun_list){
$cls_size{$cls_list[$i]}++;
if ($#{$fun_list[$i]} >= 0) { # have annotation
foreach my $anno (@{$fun_list[$i]}){
# print "$i\t$cls_list[$i]\t$anno\n";
$anno_size{$anno}++;
$cls_anno{$cls_list[$i]}{$anno}++;
}
}
}
#while (my ($a,$b) = each %anno_size){
# print "$a\t$b\n";
#}
#print "COG0171\t".$anno_
my %resu = ();
while(my ($cls, $cls_s) = each %cls_size){
my @tmp = ();
# $resu{$cls} = [];
while (my ($anno,$anno_s) = each %{$cls_anno{$cls}}){
# print "$cls\t$cls_s\t$anno\t$anno_s\t$anno_size{$anno}";
# print "\n";
my $pvalue = calculateStatistic(n11=>$anno_s, n1p=>$cls_s, np1=>$anno_size{$anno}, npp=>$M);
# anno_term, anno_size, clsper, anno_total, backper, enrichment, pvalue
push @tmp, [$anno, $anno_s, $anno_s/$cls_s, $anno_size{$anno}, $anno_size{$anno}/$M, $anno_s*$M/($cls_s*$anno_size{$anno}), $pvalue];
# push $resu{$cls}, [sort{$a[0] <=> $b[0]} @tmp];
}
@tmp = sort {$$a[6] <=> $$b[6]} @tmp;
$resu{$cls} = [@tmp];
}
store \%resu, $store_file;
open(TMP, "> $anno_file") || die;
print TMP "ClsName\tClsSize\tAnno_term\tAnno_size\tClsPer\tAnno_total\tSeq_total\tBackPer\tEnrichment\tPvalue\n";
while(my ($cls, $info) = each %resu){
foreach my $a (@{$info}){ #[$pvalue, $enrichment, $anno_s, $anno]
print TMP join("\t",($cls, $cls_size{$cls}, $a->[0], $a->[1], $a->[2], $a->[3],
$M, $a->[4], $a->[5], $a->[6]))."\n";
# print "$cls\t".join("\t",@{$a})."\n";
}
# print "$cls\t$#{$info}\n";
}
close(TMP)
......@@ -14,9 +14,8 @@ For psi-cd-hit
please download legacy BLAST (not BLAST+) and install the executables in your $PATH
For more information, please visit http://cd-hit.org or please read the cdhit-users-guide.pdf.
Most up-to-date documents are available at http://weizhongli-lab.org/cd-hit/wiki/doku.php?id=cd-hit_user_guide.
For more information, please visit http://cd-hit.org
cd-hit was originally hosted at Google Code, some of the old releases are still available from https://code.google.com/p/cdhit/.
Most up-to-date documents are available at https://github.com/weizhongli/cdhit/wiki
cd-hit is also available as web server, visit http://cd-hit.org for web server address.
This diff is collapsed.
......@@ -39,7 +39,7 @@
#include<vector>
#include<map>
#define CDHIT_VERSION "4.6"
#define CDHIT_VERSION "4.7"
#ifndef MAX_SEQ
#define MAX_SEQ 655360
......@@ -280,6 +280,10 @@ struct Options
int frag_size;
int option_r;
int threads;
int PE_mode; // -P
int trim_len; // -cx
int trim_len_R2; // -cy
int align_pos; // -ap for alignment position
size_t max_entries;
size_t max_sequences;
......@@ -293,8 +297,14 @@ struct Options
bool backupFile;
string input;
string input_pe;
string input2;
string input2_pe;
string output;
string output_pe;
int sort_output; // -sc
int sort_outputf; // -sf
Options(){
backupFile = false;
......@@ -332,6 +342,12 @@ struct Options
frag_size = 0;
des_len = 20;
threads = 1;
PE_mode = 0;
trim_len = 0;
trim_len_R2 = 0;
align_pos = 0;
sort_output = 0;
sort_outputf = 0;
max_entries = 0;
max_sequences = 1<<20;
mem_limit = 100000000;
......@@ -358,6 +374,7 @@ struct Sequence
// length of the sequence:
int size;
int bufsize;
int size_R2; // size = size.R1 + size.R2 for back-to-back merged seq
//uint32_t stats;
......@@ -369,13 +386,9 @@ struct Sequence
int offset;
// stream offset of the description string in the database:
size_t des_begin;
// length of the description:
int des_length;
// length of the description in quality score part:
int des_length2;
// length of data in fasta file, including line wrapping:
int dat_length;
size_t des_begin, des_begin2;
// total record length
int tot_length, tot_length2;
char *identifier;
......@@ -389,6 +402,7 @@ struct Sequence
Sequence();
Sequence( const Sequence & other );
Sequence( const Sequence & other, const Sequence & other2, int mode );
~Sequence();
void Clear();
......@@ -403,6 +417,7 @@ struct Sequence
int Format();
void ConvertBases();
void trim(int trim_len);
void SwapIn();
void SwapOut();
......@@ -544,7 +559,9 @@ class SequenceDB
~SequenceDB(){ Clear(); }
void Read( const char *file, const Options & options );
void Read( const char *file, const char *file2, const Options & options );
void WriteClusters( const char *db, const char *newdb, const Options & options );
void WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options );
void WriteExtra1D( const Options & options );
void WriteExtra2D( SequenceDB & other, const Options & options );
void DivideSave( const char *db, const char *newdb, int n, const Options & options );
......@@ -590,6 +607,7 @@ int local_band_align( char query[], char ref[], int qlen, int rlen, ScoreMatrix
int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
int band_left, int band_center, int band_right, WorkingBuffer & buffer);
void strrev(char *p);
int print_usage_2d (char *arg);
int print_usage_est (char *arg);
int print_usage_div (char *arg);
......@@ -606,3 +624,7 @@ void update_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutof
int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est=false);
float current_time();
//some functions from very old cd-hit
int quick_sort_idx(int *a, int *idx, int lo0, int hi0 );
int quick_sort_idxr(int *a, int *idx, int lo0, int hi0 );
......@@ -49,6 +49,10 @@ int main(int argc, char **argv)
string db_in;
string db_in2;
string db_out;
string db_in_pe;
string db_in2_pe;
string db_out_pe;
options.cluster_thd = 0.95;
options.NAA = 10;
......@@ -67,8 +71,12 @@ int main(int argc, char **argv)
options.Validate();
db_in = options.input;
db_in_pe = options.input_pe;
db_in2 = options.input2;
db_in2_pe = options.input2_pe;
db_out = options.output;
db_out_pe = options.output_pe;
InitNAA( MAX_UAA );
options.NAAN = NAAN_array[options.NAA];
......@@ -80,10 +88,12 @@ int main(int argc, char **argv)
make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
}
seq_db.Read( db_in.c_str(), options );
if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
else {seq_db.Read( db_in.c_str(), options );}
cout << "total seq in db1: " << seq_db.sequences.size() << endl;
seq_db2.Read( db_in2.c_str(), options );
if ( options.PE_mode ) { seq_db2.Read( db_in2.c_str(), db_in2_pe.c_str(), options );}
else { seq_db2.Read( db_in2.c_str(), options );}
cout << "total seq in db2: " << seq_db2.sequences.size() << endl;
seq_db.SortDivide( options );
......@@ -93,6 +103,9 @@ int main(int argc, char **argv)
cout << "writing non-redundant sequences from db2" << endl;
seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options );
if ( options.PE_mode ) { seq_db2.WriteClusters( db_in2.c_str(), db_in2_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
else { seq_db2.WriteClusters( db_in2.c_str(), db_out.c_str(), options ); }
seq_db2.WriteExtra2D( seq_db, options );
cout << "program completed !" << endl << endl;
end_time = current_time();
......
......@@ -43,6 +43,8 @@ int main(int argc, char **argv)
{
string db_in;
string db_out;
string db_in_pe;
string db_out_pe;
options.cluster_thd = 0.95;
options.NAA = 10;
......@@ -60,8 +62,10 @@ int main(int argc, char **argv)
if (options.SetOptions( argc, argv, false, true ) == 0) print_usage_est(argv[0]);
options.Validate();
db_in = options.input;
db_out = options.output;
db_in = options.input;
db_in_pe = options.input_pe;
db_out = options.output;
db_out_pe = options.output_pe;
InitNAA( MAX_UAA );
seq_db.NAAN = NAAN_array[options.NAA];
......@@ -71,13 +75,16 @@ int main(int argc, char **argv)
make_comp_short_word_index(options.NAA, NAAN_array, Comp_AAN_idx);
}
seq_db.Read( db_in.c_str(), options );
if ( options.PE_mode ) {seq_db.Read( db_in.c_str(), db_in_pe.c_str(), options );}
else {seq_db.Read( db_in.c_str(), options );}
cout << "total seq: " << seq_db.sequences.size() << endl;
seq_db.SortDivide( options );
seq_db.DoClustering( options );
printf( "writing new database\n" );
seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options );
if ( options.PE_mode ) { seq_db.WriteClusters( db_in.c_str(), db_in_pe.c_str(), db_out.c_str(), db_out_pe.c_str(), options ); }
else { seq_db.WriteClusters( db_in.c_str(), db_out.c_str(), options ); }
// write a backup clstr file in case next step crashes
seq_db.WriteExtra1D( options );
......
......@@ -7,10 +7,9 @@ using namespace std;
// information
char cd_hit_ver[] = "\t\t====== CD-HIT version " CDHIT_VERSION " (built on " __DATE__ ") ======";
char cd_hit_ref1[] = "\"Clustering of highly homologous sequences to reduce thesize of large protein database\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2001) 17:282-283";
char cd_hit_ref2[] = "\"Tolerating some redundancy significantly speeds up clustering of large protein databases\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik. Bioinformatics, (2002) 18:77-82";
char cd_hit_ref3[] = "\"Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
char cd_hit_ref4[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
char cd_hit_ref1[] = "\"CD-HIT: a fast program for clustering and comparing large sets of protein or nucleotide sequences\", Weizhong Li & Adam Godzik. Bioinformatics, (2006) 22:1658-1659";
char cd_hit_ref2[] = "\"CD-HIT: accelerated for clustering the next generation sequencing data\", Limin Fu, Beifang Niu, Zhengwei Zhu, Sitao Wu & Weizhong Li. Bioinformatics, (2012) 28:3150-3152";
char cd_hit_ref3[] = "\"Beifang Niu, Limin Fu, Shulei Sun and Weizhong Li. Artificial and natural duplicates in pyrosequencing reads of metagenomic data. BMC Bioinformatics (2010) 11:187";
//
char contacts[] =
......@@ -20,9 +19,18 @@ char contacts[] =
" If you find cd-hit useful, please kindly cite:\n\n";
char txt_option_i[] = "\tinput filename in fasta format, required\n";
char txt_option_j[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i R1.fq -j R2.fq -o output_R1 -op output_R2 or\n \
\t -i R1.fa -j R2.fa -o output_R1 -op output_R2 \n";
char txt_option_i_2d[] = "\tinput filename for db1 in fasta format, required\n";
char txt_option_i2[] = "\tinput filename for db2 in fasta format, required\n";
char txt_option_j2[] =
"\tinput filename in fasta/fastq format for R2 reads if input are paired end (PE) files\n \
\t -i db1-R1.fq -j db1-R2.fq -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 or\n \
\t -i db1-R1.fa -j db1-R2.fa -i2 db2-R1.fq -j2 db2-R2.fq -o output_R1 -op output_R2 \n";
char txt_option_o[] = "\toutput filename, required\n";
char txt_option_op[] = "\toutput filename for R2 reads if input are paired end (PE) files\n";
char txt_option_c[] =
"\tsequence identity threshold, default 0.9\n \
\tthis is the default cd-hit's \"global sequence identity\" calculated as:\n \
......@@ -88,7 +96,21 @@ char txt_option_A[] =
char txt_option_B[] =
"\t1 or 0, default 0, by default, sequences are stored in RAM\n \
\tif set to 1, sequence are stored on hard drive\n \
\tit is recommended to use -B 1 for huge databases\n";
\t!! No longer supported !!\n";
char txt_option_P[] =
"\tinput paired end (PE) reads, default 0, single file\n \
\tif set to 1, please use -i R1 -j R2 to input both PE files\n";
char txt_option_cx[] =
"\tlength to keep after trimming the tail of sequence, default 0, not trimming\n \
\tif set to 50, the program only uses the first 50 letters of input sequence\n";
char txt_option_cy[] =
"\tlength to keep after trimming the tail of R2 sequence, default 0, not trimming\n \
\tif set to 50, the program only uses the first 50 letters of input R2 sequence\n \
\te.g. -cx 100 -cy 80 for paired end reads\n";
char txt_option_ap[] =
"\talignment position constrains, default 0, no constrain\n \
\tif set to 1, the program will force sequences to align at beginings\n \
\twhen set to 1, the program only does +/+ alignment\n";
char txt_option_uL[] =
"\tmaximum unmatched percentage for the longer sequence, default 1.0\n \
\tif set to 0.1, the unmatched region (excluding leading and tailing gaps)\n \
......@@ -108,6 +130,12 @@ char txt_option_r[] =
\tif set to 0, only +/+ strand alignment\n";
char txt_option_bak[] =
"\twrite backup cluster file (1 or 0, default 0)\n";
char txt_option_sc[] =
"\tsort clusters by size (number of sequences), default 0, output clusters by decreasing length\n \
\tif set to 1, output clusters by decreasing size\n";
char txt_option_sf[] =
"\tsort fasta/fastq by cluster size (number of sequences), default 0, no sorting\n \
\tif set to 1, output sequences by decreasing cluster size\n";
char txt_option_mask[] = "\tmasking letters (e.g. -mask NX, to mask out both 'N' and 'X')\n";
char txt_option_match[] = "\tmatching score, default 2 (1 for T-U and N-N)\n";
......@@ -145,6 +173,8 @@ int print_usage (char *arg) {
cout << " -B" << txt_option_B;
cout << " -p" << txt_option_p;
cout << " -g" << txt_option_g;
cout << " -sc"<< txt_option_sc;
cout << " -sf"<< txt_option_sf;
cout << " -bak" << txt_option_bak;
cout << " -h\tprint this help\n\n";
cout << contacts;
......@@ -190,7 +220,7 @@ int print_usage_2d (char *arg) {
cout << " Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
cout << " If you find cd-hit useful, please kindly cite:\n\n";
cout << " " << cd_hit_ref1 << "\n";
cout << " " << cd_hit_ref3 << "\n\n\n";
cout << " " << cd_hit_ref2 << "\n\n\n";
exit(1);
} // END print_usage_2d
......@@ -199,7 +229,9 @@ int print_usage_est (char *arg) {
cout << cd_hit_ver << "\n\n" ;
cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
cout << " -i" << txt_option_i;
cout << " -j" << txt_option_j;
cout << " -o" << txt_option_o;
cout << " -op" << txt_option_op;
cout << " -c" << txt_option_c;
cout << " -G" << txt_option_G;
cout << " -b" << txt_option_b;
......@@ -219,6 +251,10 @@ int print_usage_est (char *arg) {
cout << " -uS" << txt_option_uS;
cout << " -U" << txt_option_U;
cout << " -B" << txt_option_B;
cout << " -P" << txt_option_P;
cout << " -cx"<< txt_option_cx;
cout << " -cy"<< txt_option_cy;
cout << " -ap"<< txt_option_ap;
cout << " -p" << txt_option_p;
cout << " -g" << txt_option_g;
cout << " -r" << txt_option_r;
......@@ -228,10 +264,12 @@ int print_usage_est (char *arg) {
cout << " -gap" << txt_option_gap;
cout << " -gap-ext" << txt_option_gap_ext;
cout << " -bak" << txt_option_bak;
cout << " -sc"<< txt_option_sc;
cout << " -sf"<< txt_option_sf;
cout << " -h\tprint this help\n\n";
cout << contacts;
cout << " " << cd_hit_ref1 << "\n";
cout << " " << cd_hit_ref3 << "\n\n\n";
cout << " " << cd_hit_ref2 << "\n\n\n";
exit(1);
} // END print_usage_est
......@@ -241,7 +279,9 @@ int print_usage_est_2d (char *arg) {
cout << "Usage: "<< arg << " [Options] \n\nOptions\n\n";
cout << " -i" << txt_option_i_2d;
cout << " -i2"<< txt_option_i2;
cout << " -j, -j2"<< txt_option_j2;
cout << " -o" << txt_option_o;
cout << " -op" << txt_option_op;
cout << " -c" << txt_option_c;
cout << " -G" << txt_option_G;
cout << " -b" << txt_option_b;
......@@ -263,6 +303,9 @@ int print_usage_est_2d (char *arg) {
cout << " -uS" << txt_option_uS;
cout << " -U" << txt_option_U;
cout << " -B" << txt_option_B;
cout << " -P" << txt_option_P;
cout << " -cx"<< txt_option_cx;
cout << " -cy"<< txt_option_cy;
cout << " -p" << txt_option_p;
cout << " -g" << txt_option_g;
cout << " -r" << txt_option_r;
......@@ -275,7 +318,7 @@ int print_usage_est_2d (char *arg) {
cout << " -h\tprint this help\n\n";
cout << contacts;
cout << " " << cd_hit_ref1 << "\n";
cout << " " << cd_hit_ref3 << "\n\n\n";
cout << " " << cd_hit_ref2 << "\n\n\n";
exit(1);
} // END print_usage_est_2d
......@@ -326,8 +369,8 @@ int print_usage_454 (char *arg)
cout << " Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
cout << " If you find cd-hit useful, please kindly cite:\n\n";
cout << " " << cd_hit_ref1 << "\n";
cout << " " << cd_hit_ref3 << "\n";
cout << " " << cd_hit_ref4 << "\n\n\n";
cout << " " << cd_hit_ref2 << "\n";
cout << " " << cd_hit_ref3 << "\n\n\n";
exit(1);
}
......
#!/usr/bin/perl
use Storable;
use strict;
#my $sort_by_what = shift;
# $sort_by_what = "no" unless $sort_by_what;
my $clstr_file = shift;
my $store_file = shift;
my %clstr = (); # an array of hashes for all the cluster
my $rep_len = 0;
my $rep_acc = "";
my @cur_sequences = (); # array of hashes for all sequences in a cluster
my $ll = "";
my @record = ();
open(TMP, $clstr_file) || die;
while($ll = <TMP>) { # read .clstr files
if ($ll =~ /^>/) { # the begin of a cluster
if (scalar(@cur_sequences)) { # not the first cluster, therefore collect the information of last clstr
#@cur_sequences = sort {$$b{"seq_len"} <=> $$a{"seq_len"}} @cur_sequences;
@cur_sequences = sort {$$b[1] <=> $$a[1]} @cur_sequences;
@record = ($rep_acc, $rep_len, 1, [@cur_sequences], "");
$clstr{$rep_acc} = [@record];
}
@cur_sequences=();
}
else { # the sequence line
chop($ll);
if ($ll =~ /^(\d+)\s+(\d+)(aa|nt),\s+>(.+)\.\.\./) {
@record = ($4, $2, 0, [], "");
if ($ll =~ /\*$/) { # representative sequence or not
$rep_acc = $record[0];
$rep_len = $record[1];
$record[4] = "100%";
}
# elsif ($ll =~ / at (\d.+)$/ ) {
elsif ($ll =~ / at (.+\d.+)$/ ) {# because cd-hit-est have strand info
$record[4] = $1;
}
}
push(@cur_sequences, [@record]);
}
}
if (scalar(@cur_sequences)) {
#@cur_sequences = sort {$$b{"seq_len"} <=> $$a{"seq_len"}} @cur_sequences;
@cur_sequences = sort {$$b[1] <=> $$a[1]} @cur_sequences;
@record = ($rep_acc, $rep_len, 1, [@cur_sequences], "");
$clstr{$rep_acc} = [@record];
}
close(TMP);
if (-e $store_file){ # already have a cluster file
my %old_clstr = %{retrieve($store_file)};
foreach my $rep_acc (keys %clstr){
my $seqs = $clstr{$rep_acc}[3]; # $seqs a reference to the sequences;
my $tmp_size = scalar(@{$seqs}); # how many sequences in a top level cluster, each sequence should be a representative sequence for lower level cluster
#print "$rep_acc, $tmp_size\n";
my $i;
for $i (0..($tmp_size-1)){
my $seq = $$seqs[$i];
if ($old_clstr{$$seq[0]}){
$clstr{$rep_acc}[3][$i][3] = [@{$old_clstr{$$seq[0]}[3]}];
$clstr{$rep_acc}[3][$i][2] = 1;
}
}
}
}
store \%clstr, $store_file;
#~ my $size = scalar(keys %clstr);
#~ print "$size\n";
#~ my $acc = 'D8F4YGO02FSTQP|range|2:370|frame|2|len|123';
#~ my $temp = $clstr{$acc}[1];
#~ print "$temp\n";
#~ my $temp = scalar(@{$clstr{$acc}[3]});
#~ print "$temp\n";
#~ my $x;
#~ for $x (@{$clstr{$acc}[3]} ){
#~ my $tmp_1 = scalar(@{$x->[3]});
#~ print "$x->[2], $x->[4], $x->[0], $x->[1], $tmp_1\n";
#~ }
#!/usr/bin/perl
use Storable;
use strict;
my $input_file = shift;
my $output_file = shift;
my $sort_by_what = shift;
$sort_by_what = "no" unless $sort_by_what;
my @clstr = values %{retrieve($input_file)};
if ($sort_by_what eq "no") {
### Added by liwz sort by No. sequences instead of No. nodes
my %rep2size = ();
my $clstr_no = scalar(@clstr);
my ($i);
for ($i=0; $i<$clstr_no; $i++){
my $node_size = 0;
foreach my $seq1 (@{$clstr[$i][3]}) {
if ($$seq1[2]) { # can be futher expanded
foreach my $seq2(@{$$seq1[3]}) {
if ($$seq2[2]) { $node_size += scalar(@{$$seq2[3]}); }
else { $node_size++; }
}
}
else {
$node_size++;
}
}
$rep2size{ $clstr[$i][0] } = $node_size;
}
### END
#@clstr = sort {scalar(@{$b->[3]}) <=> scalar(@{$a->[3]})} @clstr;
@clstr = sort {$rep2size{$b->[0]} <=> $rep2size{$a->[0]}} @clstr;
}
elsif ($sort_by_what eq "len") {
@clstr = sort {$b->[1] <=> $a->[1]} @clstr;
}
elsif ($sort_by_what eq "des") {
@clstr = sort {$a->[0] cmp $b->[0]} @clstr;
}
store \@clstr, $output_file;
No preview for this file type
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/perl -w
################################################################################
######### PSI-cd-hit written by Weizhong Li at http://cd-hit.org
################################################################################
our $script_name = $0;
our $script_dir = $0;
$script_dir =~ s/[^\/]+$//;
$script_dir = "./" unless ($script_dir);
require "$script_dir/psi-cd-hit-local-old.pl";
parse_para_etc(@ARGV);
open_LOG();
our @seqs = ();
our @dess = ();
our @idens = ();
our @lens = ();
our @passeds = ();
our @NR_clstr_nos = ();
our @in_bg = ();
our @NR_idx = ();
our $NR_no = 0;
our $DB_no = 0;
our $DB_len = 0;
our $DB_len0 = 0;
our $DB_len_reduced = 0;
our $DB_len_reduced2 = 0; #### for write_restart etc purpose
our $opt_aL_upper_band = 0; #### sequences < this length will not be submitted unless reformatdb
our $opt_al_upper_bandi= 0;
our $opt_aL_lower_band = 0; #### sequences < this length don't need to be searched
my ($i, $j, $k, $i0, $j0, $k0, $ll);
read_db();
our $NR_passed = 0;
our $formatdb_no = $NR_no;;
@NR_idx = (0..($NR_no-1));
@NR_idx = sort { $lens[$b] <=> $lens[$a] } @NR_idx unless (-e $restart_in);
our $NR90_no = 0;
our @NR90_seq = ();
$i0 = 0;
if ( -e $restart_in) { $i0 = read_restart(); } ## restart after crash
elsif ($skip_long > 0) { #### skip very long seqs
for (; $i0<$NR_no; $i0++) {
$i = $NR_idx[$i0];
last if ($lens[$i] < $skip_long);
$NR_passed++;
$NR_clstr_nos[$i] = $NR90_no;
$idens[$i] = "*";
$passeds[$i] = 1;
$NR90_seq[$NR90_no] = [$i];
$NR90_no++;
$DB_len_reduced += $lens[$i];
}
}
#### set init opt_aL_uppper/lower_bands
if ( ($opt_aL > 0.3) ) {
die ("option -aL > 1.0") if ($opt_aL > 1.0);
####################
###################
##################
#################
################
############### <-upper band
############## <- seq below not submit, unless band change
#############
############
###########
########## <- lower band
######### <- seq below not in format db
########
#######
#####
####
###
##
#
my $total_jobs = $batch_no_per_node * $num_qsub * $para_no;
my $space = ($total_jobs > $restart_seg) ? $total_jobs : $restart_seg;
my $d1 = $i0+$space;
$d1 = ($NR_no-1) if ($d1 >= $NR_no-1);
$opt_aL_upper_band = $lens[ $NR_idx[$d1] ];
$opt_aL_lower_band = int($opt_aL_upper_band * $opt_aL);
$opt_aL_upper_bandi= $d1;
write_LOG("set opt_aL_band $opt_aL_upper_band($opt_aL_upper_bandi) $opt_aL_lower_band");
}
($DB_no, $DB_len) = blast_formatdb();
$DB_len0 = $DB_len;
$DB_len_reduced = 0;
$DB_len_reduced2 = 0;
for (; $i0<$NR_no; $i0++) {
$i = $NR_idx[$i0];
run_batch_blast3($i0) unless ($in_bg[$i] or (-e "$bl_dir/$i.out") or $passeds[$i]);
if ( not $passeds[$i] ) { # this is a new representative