Commit 9e6ec7b9 authored by Andreas Tille's avatar Andreas Tille

Imported Upstream version 4.6.5

parent 1e84e06d
CD-HIT ChangeLog
CD-HIT-V4.6.1 (2012-08-27):
Fix: a minor bug in handling masking letters;
Add: a few minor changes and fixings to conform to debian packaging rules.
CD-HIT-V4.6 (2012-04-25):
Add: better auto tuning (for -M 0) of word table size for both single and multiple threaded execution;
Fix: a minor bug for very long input sequences;
Add: compiling option to change MAX_SEQ.
CD-HIT-V4.5.8 (2012-03-24):
Add: handling of asterisk at the end of sequences;
Add: a few minor improvements in the parallelization;
Add: new options -mask and -bak.
CD-HIT-V4.5.7 (2011-12-16):
Add: minor improvements to the word table size handling;
Add: updated documentation file.
CD-HIT-V4.5.6 (2011-09-02):
Fix: -M for cd-hit-2d and cd-hit-est-2d;
Fix: handling of invalid input sequences.
CD-HIT-V4.5.5:
Fix: a minor bug in option handling for score setting (-match, -mismatch);
Fix: a minor bug in printing logs to stdout for cd-hit-2d and cd-hit-2d-est.
CD-HIT-V4.5.4:
Add: support for FASTQ file as input;
Add: improvement to sequence partition for parallelization;
Change: default identity threshold from 0.9 to 0.95 for EST.
CD-HIT-V4.5.3:
Add: cd-hit-454 program to the main package (cdhit-454.c++);
Add: options to change the scoring settings;
Add: options to control the length of unmatched region.
CD-HIT-V4.5.2:
MinorChange: default value of "-n" for DNA sequence from 8 to 10;
MinorFix: alignment locations and length;
CD-HIT-V4.5.1:
MinorChange: default value of "-r" from 0 to 1;
MinorFix: alignment length for "-G 0".
CD-HIT-V4.5:
Remove: multi-level counting;
Fix: support for "-F" option.
CD-HIT-V4.5-Beta3:
Change: default word table size for "-M 0";
Fix: global identity computation.
CD-HIT-V4.5-Beta2:
Fix: alignment position and identity.
CD-HIT-V4.5-Beta:
Change: local band alignment;
Change: filter threshold calculation;
Fix: best alignment band searching;
CD-HIT-V4.3:
Fix: a few bugs related to multi-level counting;
Change: implementation for -M option.
CD-HIT-V4.2.x:
Some bug fixings.
CD-HIT-V4.2:
Add: multi-level counting array to improve the speed.
CD-HIT-V4.1.1:
Change: improve estimating alignment band for sequences with low complexity.
CD-HIT-V4.1:
Fix: a bug in searching best alignment band;
Fix: a bug in handling 'N' for EST;
CD-HIT-V4.0:
New implementation with parallelization using OpenMP.
CC = g++ -Wall -ggdb
CC = g++ -pg
CC = g++
# without OpenMP
# with OpenMP
# in command line:
# make openmp=yes
ifeq ($(openmp),no)
CCFLAGS = -DNO_OPENMP
else
CCFLAGS = -fopenmp
endif
# support debugging
# in command line:
# make debug=yes
# make openmp=yes debug=yes
ifeq ($(debug),yes)
CCFLAGS += -ggdb
else
CCFLAGS += -O2
endif
ifdef MAX_SEQ
CCFLAGS += -DMAX_SEQ=$(MAX_SEQ)
endif
#LDFLAGS = -static -o
LDFLAGS += -o
PROGS = cd-hit cd-hit-est cd-hit-2d cd-hit-est-2d cd-hit-div cd-hit-454
# Propagate hardening flags
CCFLAGS := $(CPPFLAGS) $(CCFLAGS) $(CXXFLAGS)
.c++.o:
$(CC) $(CCFLAGS) -c $<
all: $(PROGS)
clean:
rm -f *.o $(PROGS)
# programs
cd-hit: cdhit-common.o cdhit-utility.o cdhit.o
$(CC) $(CCFLAGS) cdhit.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit
cd-hit-2d: cdhit-common.o cdhit-utility.o cdhit-2d.o
$(CC) $(CCFLAGS) cdhit-2d.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-2d
cd-hit-est: cdhit-common.o cdhit-utility.o cdhit-est.o
$(CC) $(CCFLAGS) cdhit-est.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-est
cd-hit-est-2d: cdhit-common.o cdhit-utility.o cdhit-est-2d.o
$(CC) $(CCFLAGS) cdhit-est-2d.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-est-2d
cd-hit-div: cdhit-common.o cdhit-utility.o cdhit-div.o
$(CC) $(CCFLAGS) cdhit-div.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-div
cd-hit-454: cdhit-common.o cdhit-utility.o cdhit-454.o
$(CC) $(CCFLAGS) cdhit-454.o cdhit-common.o cdhit-utility.o $(LDFLAGS) cd-hit-454
# objects
cdhit-common.o: cdhit-common.c++ cdhit-common.h
$(CC) $(CCFLAGS) cdhit-common.c++ -c
cdhit-utility.o: cdhit-utility.c++ cdhit-utility.h
$(CC) $(CCFLAGS) cdhit-utility.c++ -c
cdhit.o: cdhit.c++ cdhit-utility.h
$(CC) $(CCFLAGS) cdhit.c++ -c
cdhit-2d.o: cdhit-2d.c++ cdhit-utility.h
$(CC) $(CCFLAGS) cdhit-2d.c++ -c
cdhit-est.o: cdhit-est.c++ cdhit-utility.h
$(CC) $(CCFLAGS) cdhit-est.c++ -c
cdhit-est-2d.o: cdhit-est-2d.c++ cdhit-utility.h
$(CC) $(CCFLAGS) cdhit-est-2d.c++ -c
cdhit-div.o: cdhit-div.c++ cdhit-common.h
$(CC) $(CCFLAGS) cdhit-div.c++ -c
cdhit-454.o: cdhit-454.c++ cdhit-common.h
$(CC) $(CCFLAGS) cdhit-454.c++ -c
PREFIX ?= /usr/local/bin
install:
for prog in $(PROGS); do \
install -m 0755 $$prog $(PREFIX); \
done
install -m 0755 *.pl $(PREFIX);
For cd-hit
How to compile?
1. Compile with multi-threading support (default): make
2. Compile without multi-threading support (if you are on very old systems): make openmp=no
For cd-hit-auxtools
cd cd-hit-auxtools
make
For psi-cd-hit
please download legacy BLAST (not BLAST+) and install the executables in your $PATH
For more information, please visit http://cd-hit.org or please read the cdhit-users-guide.pdf.
Most up-to-date documents are available at http://weizhongli-lab.org/cd-hit/wiki/doku.php?id=cd-hit_user_guide.
cd-hit was originally hosted at Google Code, some of the old releases are still available from https://code.google.com/p/cdhit/.
cd-hit is also available as web server, visit http://cd-hit.org for web server address.
This diff is collapsed.
CC = g++
CFLAGS = -Wall -Wno-unused -I. -Imintlib
LFLAGS = -fPIC
UNAME = $(shell uname)
ifeq ($(UNAME), Linux)
CFLAGS += -DUNIX
endif
ifeq ($(UNAME), Darwin)
CFLAGS += -DUNIX -DMAC_OSX
endif
ifeq ($(debug),yes)
CFLAGS += -ggdb
else
CFLAGS += -O2
endif
#OBJECTS = mintlib/minString.o mintlib/minMap.o suffix_array.o
OBJECTS = mintlib/minString.o mintlib/minMap.o bioSequence.o
first: all
all: cd-hit-dup cd-hit-lap read-linker
.SUFFIXES: .c .obj .cpp .cc .cxx .C
.cxx.o:
$(CC) -c $(CFLAGS) -o $@ $<
cd-hit-dup: $(OBJECTS) cdhit-dup.o
$(CC) $(LFLAGS) $(OBJECTS) cdhit-dup.o -o cd-hit-dup
cd-hit-lap: $(OBJECTS) cdhit-lap.o
$(CC) $(LFLAGS) $(OBJECTS) cdhit-lap.o -o cd-hit-lap
read-linker: $(OBJECTS) read-linker.o
$(CC) $(LFLAGS) $(OBJECTS) read-linker.o -o read-linker
clean:
rm $(OBJECTS) cdhit-dup.o cdhit-lap.o read-linker.o
//=================================================================
// This file is a part of cdhit-dup.
// By Limin Fu (phoolimin@gmail.com, lmfu@ucsd.edu)
//=================================================================
#include <ctype.h>
#include "bioSequence.hxx"
using namespace Min;
using namespace Bio;
Min::String Bio::Sequence::GetDescription( int deslen )
{
String des = Description();
int i = 0;
if( des[i] == '>' || des[i] == '@' || des[i] == '+' ) i += 1;
if( des[i] == ' ' || des[i] == '\t' ) i += 1;
if( deslen == 0 || deslen > des.Size() ) deslen = des.Size();
while( i < deslen and ! isspace( des[i] ) ) i += 1;
des.Resize( i );
return des;
}
const char *dna_reverse_complimentary = "TBGDEFCHIJKLMNOPQRSAAVWXYZ";
void Bio::Sequence::ToReverseComplimentary()
{
int i, j;
int n = seq.Size();
int m = (n/2) + (n%2);
for(i=0, j=n-1; i<m; i++, j--){
char L = seq[i];
char R = seq[j];
seq[i] = dna_reverse_complimentary[ R - 'A' ];
seq[j] = dna_reverse_complimentary[ L - 'A' ];
}
if( qs.Size() != n ) return;
for(i=0, j=n-1; i<m; i++, j--){
char L = qs[i];
char R = qs[j];
qs[i] = R; qs[j] = L;
}
}
void Bio::Sequence::Print( FILE *fout, int width )
{
if( qs.Size() ){
assert( seq.Size() == qs.Size() );
if( des.Size() ){
fprintf( fout, "@%s\n", des.Data() );
}else{
fprintf( fout, "@%i\n", id );
}
fprintf( fout, "%s\n", seq.Data() );
if( des.Size() ){
fprintf( fout, "+%s\n", des.Data() );
}else{
fprintf( fout, "+%i\n", id );
}
fprintf( fout, "%s\n", qs.Data() );
}else{
if( des.Size() ){
fprintf( fout, ">%s\n", des.Data() );
}else{
fprintf( fout, ">%i\n", id );
}
fprintf( fout, "%s\n", seq.Data() );
}
}
#define BUFSIZE 1024
int Bio::SequenceCache::Update( int max, bool idonly )
{
char buf[ BUFSIZE+1 ];
char *res = NULL;
FILE *fin = inputFile;
String & ds = buffer.Description();
String & ss = buffer.SequenceData();
String & qs = buffer.QualityScore();
Clear();
if( fin == NULL ) return 0;
while( feof( fin ) == 0 ){
if( (res=fgets( buf, BUFSIZE, fin )) == NULL ) break;
if( getdes && (buf[0] == '>' || buf[0] == '@' || buf[0] == '+') ){
getdes = 0;
if( buf[0] == '+' ){
current = & qs;
}else{
if( ss.Size() ){
ss.ToUpper();
sequences.Append( new Sequence( buffer ) );
buffer.Reset();
if( sequences.Size() % 100000 == 0 )
printf( "Read %9i sequences ...\n", (int)sequences.Size() );
}
current = & ss;
ds = buf + 1;
while( ds[ds.Size()-1] != '\n' ){
if( (res=fgets( buf, BUFSIZE, fin )) == NULL ) break;
ds += buf;
}
ds.Chop();
if( idonly ){
int d = 0;
while( d < ds.Size() && isspace( ds[d] ) ==0 ) d += 1;
ds.Resize( d );
}
//printf( "%s\n", ds->Data() );
if( max > 0 && (int)sequences.Size() >= max ) return sequences.Size();
}
}else{
assert( current != NULL );
*current += buf;
while( current->Size() && isspace( (*current)[current->Size()-1] ) ) current->Chop();
getdes = current == & ss || (current == & qs && qs.Size() == ss.Size());
}
}
if( ss.Size() ){
sequences.Append( new Sequence( buffer ) );
buffer.Reset();
}
return sequences.Size();
}
bool Bio::SequenceList::ReadFastAQ( const Min::String & file, bool idonly, FILE *fout_log )
{
char buf[ BUFSIZE+1 ];
FILE *fin = fopen( file.Data(), "r" );
bool wasEmpty = sequences.Size() == 0;
char *res = NULL;
int max = 0, min = 0x7fffffff;
int getdes = 1;
Sequence one;
String & ds = one.Description();
String & ss = one.SequenceData();
String & qs = one.QualityScore();
String *data=NULL;
if( fin == NULL ) return 0;
while( feof( fin ) == 0 ){
if( (res=fgets( buf, BUFSIZE, fin )) == NULL ) break;
if( getdes && (buf[0] == '>' || buf[0] == '@' || buf[0] == '+') ){
getdes = 0;
if( buf[0] == '+' ){
data = & qs;
}else{
if( ss.Size() ){
ss.ToUpper();
if( ss.Size() > max ) max = ss.Size();
if( ss.Size() < min ) min = ss.Size();
sequences.Append( new Sequence( one ) );
one.Reset();
if( sequences.Size() % 100000 == 0 )
fprintf( fout_log, "Read %9i sequences ...\n", (int)sequences.Size() );
}
data = & ss;
ds = buf + 1;
while( ds[ds.Size()-1] != '\n' ){
if( (res=fgets( buf, BUFSIZE, fin )) == NULL ) break;
ds += buf;
}
ds.Chop();
if( idonly ){
int d = 0;
while( d < ds.Size() && isspace( ds[d] ) ==0 ) d += 1;
ds.Resize( d );
}
//printf( "%s\n", ds->Data() );
}
}else{
*data += buf;
while( data->Size() && isspace( (*data)[data->Size()-1] ) ) data->Chop();
getdes = data == & ss || (data == & qs && qs.Size() == ss.Size());
}
}
fclose( fin );
if( ss.Size() ){
if( ss.Size() > max ) max = ss.Size();
if( ss.Size() < min ) min = ss.Size();
sequences.Append( new Sequence( one ) );
}
if( wasEmpty ){
max_length = max;
min_length = min;
}else{
if( max > max_length ) max_length = max;
if( min < min_length ) min_length = min;
}
return true;
}
void Bio::SequenceList::SortByLength()
{
if( max_length == min_length ) return;
int i;
int N = sequences.Size();
int M = max_length - min_length + 1;
Min::Array<int> count( M, 0 ); // count for each size = max_len - i
Min::Array<int> accum( M, 0 ); // count for all size > max_len - i
Min::Array<int> offset( M, 0 ); // offset from accum[i] when filling sorting
Min::Array<Sequence*> sorting( N );
for (i=0; i<N; i++) count[ max_length - sequences[i]->Length() ] ++;
for (i=1; i<M; i++) accum[i] = accum[i-1] + count[i-1];
for (i=0; i<N; i++){
int len = max_length - sequences[i]->Length();
int id = accum[len] + offset[len];
//sequences[i].index = id;
sorting[id] = sequences[i];
offset[len] ++;
}
sequences.Swap( sorting );
}
int Bio::SequenceList::RemoveEmptySequences()
{
int i, M=0, N = sequences.Size();
for(i=0; i<N; i++){
if( sequences[i]->Length() == 0 ){
delete sequences[i];
continue;
}
sequences[M++] = sequences[i];
}
sequences.Resize( M );
}
void Bio::SequenceList::Shuffle()
{
int i, N = sequences.Size();
if( N <= 1 ) return;
for(i=N-1; i>=0; i--){
int rnd = (int)(i * (rand() / (1.0 + RAND_MAX)));
Sequence *seq = sequences[i];
sequences[i] = sequences[rnd];
sequences[rnd] = seq;
}
}
//=================================================================
// This file is a part of cdhit-dup.
// By Limin Fu (phoolimin@gmail.com, lmfu@ucsd.edu)
//=================================================================
#include <stdint.h>
#include <stdio.h>
#include "minArray.hxx"
#include "minString.hxx"
#define BEGIN_NS_BIO namespace Bio {
#define END_NS_BIO }
BEGIN_NS_BIO
class Sequence
{
uint32_t id;
Min::String des;
Min::String seq;
Min::String qs;
public:
Sequence(){ id = 0; }
#if 0
Sequence( const Sequence & other ){
id = other.id;
des = other.des;
seq = other.seq;
qs = other.qs;
}
#endif
void Copy( const Sequence & other ){
des = other.des;
seq = other.seq;
qs = other.qs;
}
int Length()const{ return seq.Size(); }
Min::String& Description(){ return des; }
Min::String& SequenceData(){ return seq; }
Min::String& QualityScore(){ return qs; }
Min::String GetDescription( int deslen = 0 );
void Reset(){ des.Reset(); seq.Reset(); qs.Reset(); }
void ToReverseComplimentary();
void Print( FILE *fout = stdout, int width = 0 );
};
class SequenceCache
{
Min::Array<Sequence*> sequences;
FILE *inputFile;
bool getdes;
Sequence buffer;
Min::String *current;
public:
SequenceCache( const Min::String & file ){
inputFile = fopen( file.Data(), "r" );
current = NULL;
getdes = true;
}
~SequenceCache(){
if( inputFile ) fclose( inputFile );
Clear();
}
void Clear(){
for(int i=0,n=sequences.Size(); i<n; i++) delete sequences[i];
sequences.Clear();
}
int Update( int max = 10000, bool idonly = false );
size_t Count()const{ return sequences.Size(); }
Sequence* operator[]( int i )const{ return sequences[i]; }
};
class SequenceList
{
Min::Array<Sequence*> sequences;
int max_length;
int min_length;
public:
SequenceList(){ max_length = min_length = 0; }
~SequenceList(){
Clear();
}
void Clear(){
for(int i=0,n=sequences.Size(); i<n; i++) delete sequences[i];
sequences.Clear();
}
bool ReadFastAQ( const Min::String & file, bool idonly=false, FILE *fout_log=stdout );
void SortByLength();
size_t Count()const{ return sequences.Size(); }
int MaxLength(){
int i, N = sequences.Size();
if( N == 0 ) return 0;
for(i=1, max_length = sequences[0]->Length(); i<N; i++){
if( sequences[i]->Length() > max_length ) max_length = sequences[i]->Length();
}
return max_length;
}
int MinLength(){
int i, N = sequences.Size();
if( N == 0 ) return 0;
for(i=1, min_length = sequences[0]->Length(); i<N; i++){
if( sequences[i]->Length() < min_length ) min_length = sequences[i]->Length();
}
return min_length;
}
int RemoveEmptySequences();
void Shuffle();
Sequence* operator[]( int i )const{ return sequences[i]; }
};
END_NS_BIO
#!/usr/bin/perl
my $script_name = $0;
my $script_dir = $0;
$script_dir =~ s/[^\/]+$//;
chop($script_dir);
$script_dir = "./" unless ($script_dir);
use Getopt::Std;
getopts("i:j:o:p:c:",\%opts);
die usage() unless ($opts{i} and $opts{j} and $opts{o} and $opts{p} and $opts{c} );
my ($i, $j, $k, $cmd);
my ($ll, $lla, $llb, $id, $ida, $idb, $seq, $seqa, $seqb, $qua, $quaa, $quab);
my ($len, $lena, $lenb);
my %FHZ=();
my $fastq1 = $opts{i};
my $fastq2 = $opts{j};
my $clstr_file = $opts{c};
my $outfile1 = $opts{o};
my $outfile2 = $opts{p};
my $format = input_test($fastq1); #fasta or fastq
my %rep_ids = ();
open(TMP, $clstr_file) || die "can not open $clstr_file";
while($ll = <TMP>){
if ($ll =~ /^>/) {
next;
}
else {
chop($ll);
if ($ll =~ /\*$/) {
if ($ll =~ /\s(\d+)(aa|nt), >(.+)\.\.\./) {
my $id = $3;
$rep_ids{$id} = 1;
}
}
}
}
close(TMP);
open(INPUTa, $fastq1) || die "can not open $fastq1"; open(OUTa, "> $outfile1") || die "can not write to $outfile1";
open(INPUTb, $fastq2) || die "can not open $fastq2"; open(OUTb, "> $outfile2") || die "can not write to $outfile2";
if ($format eq "fasta") {
my $flag;
while($lla = <INPUTa> and $llb=<INPUTb>) {
if ( ($lla =~ /^>/) and ($llb =~ /^>/) ) {
my $ida = substr($lla,1); chop($ida); $ida =~ s/\s.+$//;
my $idb = substr($llb,1); chop($idb); $idb =~ s/\s.+$//;
$flag = (($rep_ids{$ida}) or ($rep_ids{$idb})) ? 1 : 0;
}
if ($flag) {
print OUTa $lla;
print OUTb $llb;
}
}
}
else {
my $flag;
while($lla = <INPUTa> and $llb=<INPUTb>) {
if ( ($lla =~ /^@/) and ($llb =~ /^@/) )