Skip to content
Commits on Source (5)
......@@ -198,7 +198,83 @@ can be generated from a GI taxid dump:
### Custom database
TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
To build a custom database, you need to provide the following four files to `centrifuge-build`:
- `--conversion-table`: tab-separated file mapping sequence IDs to taxonomy IDs. Sequence IDs are the header up to the first space or second pipe (`|`).
- `--taxonomy-tree`: `\t|\t`-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree. When using NCBI taxonomy IDs, this will be the `nodes.dmp` from `ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz`.
- `--name-table`: '\t|\t'-separated file mapping taxonomy IDs to a name. A further column (typically column 4) must specify `scientific name`. When using NCBI taxonomy IDs, `names.dmp` is the appropriate file.
- reference sequences: The ID of the sequences are the header up to the first space or second pipe (`|`)
When using custom taxonomy IDs, use only positive integers greater than or equal to `1` and use `1` for the root of the tree.
#### More info on `--taxonomy-tree` and `--name-table`
The format of these files are based on `nodes.dmp` and `names.dmp` from the NCBI taxonomy database dump.
- Field terminator is `\t|\t`
- Row terminator is `\t|\n`
The `taxonomy-tree` / nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ..., no rank)
Further fields are ignored.
The `name-table` / names.dmp is the taxonomy names file:
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (scientific name, synonym, common name, ...)
`name class` **has** to be `scientific name` to be included in the build. All other lines are ignored
#### Example
*Conversion table `ex.conv`*:
Seq1 11
Seq2 12
Seq3 13
Seq4 11
*Taxonomy tree `ex.tree`*:
1 | 1 | root
10 | 1 | kingdom
11 | 10 | species
12 | 10 | species
13 | 1 | species
*Name table `ex.name`*:
1 | root | | scientific name |
10 | Bacteria | | scientific name |
11 | Bacterium A | | scientific name |
12 | Bacterium B | | scientific name |
13 | Some other species | | scientific name |
*Reference sequences `ex.fa`*:
>Seq1
AAAACGTACGA.....
>Seq2
AAAACGTACGA.....
>Seq3
AAAACGTACGA.....
>Seq4
AAAACGTACGA.....
To build the database, call
centrifuge-build --conversion-table ex.conv \
--taxonomy-tree ex.tree --name-table ex.name \
ex.fa ex
which results in three index files named `ex.1.cf`, `ex.2.cf` and `ex.3.cf`.
### Centrifuge classification output
......
......@@ -211,7 +211,84 @@ can be generated from a GI taxid dump:
### Custom database
TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
To build a custom database, you need to provide the following four files to `centrifuge-build`:
- `--conversion-table`: tab-separated file mapping sequence IDs to taxonomy IDs. Sequence IDs are the header up to the first space or second pipe (`|`).
- `--taxonomy-tree`: `\t|\t`-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree. When using NCBI taxonomy IDs, this will be the `nodes.dmp` from `ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz`.
- `--name-table`: '\t|\t'-separated file mapping taxonomy IDs to a name. A further column (typically column 4) must specify `scientific name`. When using NCBI taxonomy IDs, `names.dmp` is the appropriate file.
- reference sequences: The ID of the sequences are the header up to the first space or second pipe (`|`)
When using custom taxonomy IDs, use only positive integers greater than or equal to `1` and use `1` for the root of the tree.
#### More info on `--taxonomy-tree` and `--name-table`
The format of these files are based on `nodes.dmp` and `names.dmp` from the NCBI taxonomy database dump.
- Field terminator is `\t|\t`
- Row terminator is `\t|\n`
The `taxonomy-tree` / nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ..., no rank)
Further fields are ignored.
The `name-table` / names.dmp is the taxonomy names file:
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (scientific name, synonym, common name, ...)
`name class` **has** to be `scientific name` to be included in the build. All other lines are ignored
#### Example
*Conversion table `ex.conv`*:
Seq1 11
Seq2 12
Seq3 13
Seq4 11
*Taxonomy tree `ex.tree`*:
1 | 1 | root
10 | 1 | kingdom
11 | 10 | species
12 | 10 | species
13 | 1 | species
*Name table `ex.name`*:
1 | root | | scientific name |
10 | Bacteria | | scientific name |
11 | Bacterium A | | scientific name |
12 | Bacterium B | | scientific name |
13 | Some other species | | scientific name |
*Reference sequences `ex.fa`*:
>Seq1
AAAACGTACGA.....
>Seq2
AAAACGTACGA.....
>Seq3
AAAACGTACGA.....
>Seq4
AAAACGTACGA.....
To build the database, call
centrifuge-build --conversion-table ex.conv \
--taxonomy-tree ex.tree --name-table ex.name \
ex.fa ex
which results in three index files named `ex.1.cf`, `ex.2.cf` and `ex.3.cf`.
### Centrifuge classification output
......
......@@ -191,6 +191,7 @@ CENTRIFUGE_SCRIPT_LIST = centrifuge \
centrifuge-build \
centrifuge-inspect \
centrifuge-download \
centrifuge-kreport \
$(wildcard centrifuge-*.pl)
......
......@@ -2305,9 +2305,9 @@ void AlnSinkSam<index_t>::appendMate(
case READ_ID: appendReadID(o, rd.name); break;
case SEQ_ID: appendSeqID(o, rs, ebwt.tree()); break;
case SEQ: o.append((string(rd.patFw.toZBuf()) +
(rdo == NULL? "" : "N" + string(rdo->patFw.toZBuf()))).c_str()); break;
(rdo == NULL? "" : "_" + string(rdo->patFw.toZBuf()))).c_str()); break;
case QUAL: o.append((string(rd.qual.toZBuf()) +
(rdo == NULL? "" : "I" + string(rdo->qual.toZBuf()))).c_str()); break;
(rdo == NULL? "" : "_" + string(rdo->qual.toZBuf()))).c_str()); break;
case SEQ1: o.append(rd.patFw.toZBuf()); break;
case QUAL1: o.append(rd.qual.toZBuf()); break;
......
......@@ -116,6 +116,13 @@ my %read_compress = ();
my $cap_out = undef; # Filename for passthrough
my $no_unal = 0;
my $large_idx = 0;
# Variables handling the output format
my $outputFmtSam = 0 ;
my $tabFmtOptIdx = 0 ;
my $needReadSeq = 0 ;
my $removeSeqCols = 0 ;
# Remove whitespace
for my $i (0..$#bt2_args) {
$bt2_args[$i]=~ s/^\s+//; $bt2_args[$i] =~ s/\s+$//;
......@@ -179,6 +186,7 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
}
for my $rarg ("un-conc", "al-conc", "un", "al") {
if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
$needReadSeq = 1 ;
$bt2_args[$i] = undef;
if(scalar(@args) > 1 && $args[1] ne "") {
$read_fns{$rarg} = $args[1];
......@@ -193,7 +201,57 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
last;
}
}
if ($arg eq "--out-fmt" )
{
$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
$i++;
if ( $bt2_args[$i] eq "sam" )
{
$outputFmtSam = 1 ;
}
#$bt2_args[$i] = undef;
}
if ( $arg eq "--tab-fmt-cols" )
{
$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
$tabFmtOptIdx = $i + 1 ;
}
}
# Determine whether we need to add two extra columns for seq and qual to out-fmt
if ( $needReadSeq == 1 && ( $tabFmtOptIdx == 0 || $outputFmtSam == 1 ) )
{
my $i ;
my $needAdd = 1 ;
if ( $tabFmtOptIdx != 0 )
{
my @cols = split /,/, $bt2_args[ $tabFmtOptIdx ] ;
foreach my $f (@cols)
{
if ( $f eq "readSeq" )
{
$needAdd = 0 ;
last ;
}
}
}
else
{
push @bt2_args, "--tab-fmt-cols" ;
push @bt2_args, "readID,seqID,taxID,score,2ndBestScore,hitLength,queryLength,numMatches" ;
$tabFmtOptIdx = scalar( @bt2_args ) - 1 ;
}
if ( $needAdd )
{
$removeSeqCols = 1 ;
$bt2_args[ $tabFmtOptIdx ] .= ",readSeq,readQual" ;
}
}
# If the user asked us to redirect some reads to files, or to suppress
# unaligned reads, then we need to capture the output from Centrifuge and pass it
# through this wrapper.
......@@ -423,6 +481,17 @@ my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args);
# Possibly add read input on an anonymous pipe
$cmd = "$readpipe $cmd" if defined($readpipe);
# Strip the two trailing columns (readSeq/readQual) that the wrapper
# appended to the requested --tab-fmt-cols, so the output handed back
# to the user contains only the columns that were actually asked for.
sub RemoveSeqCols
{
my ($line) = @_ ;
my @fields = split /\t/, $line ;
pop @fields for 1 .. 2 ;
return join( "\t", @fields ) ;
}
Info("$cmd\n");
my $ret;
if(defined($cap_out)) {
......@@ -485,30 +554,87 @@ if(defined($cap_out)) {
}
}
}
my $seqIndex = -1 ;
my $qualIndex = -1 ;
my $readIdIndex = -1 ;
if ( $outputFmtSam == 0 )
{
my $outputHeader = <BT> ;
my @cols = split /\t/, $outputHeader ;
for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
{
if ( $cols[$i] =~ /readSeq/ )
{
$seqIndex = $i ;
}
elsif ( $cols[$i] =~ /readQual/ )
{
$qualIndex = $i ;
}
elsif ( $cols[$i] =~ /readID/ )
{
$readIdIndex = $i ;
}
}
if ( $seqIndex == -1 && scalar( keys %read_fhs) == 0 )
{
Error( "Must use readSeq in --tabFmtCols in order to output unaligned reads." ) ;
}
$outputHeader = RemoveSeqCols( $outputHeader )."\n" if ( $removeSeqCols == 1 ) ;
print {$ofh} $outputHeader ;
}
else
{
$seqIndex = 9 ;
$qualIndex = 10 ;
$readIdIndex = 0 ;
}
while(<BT>) {
chomp;
my $filt = 0;
unless(substr($_, 0, 1) eq "@") {
# If we are supposed to output certain reads to files...
my $tab1_i = index($_, "\t") + 1;
my $tab2_i = index($_, "\t", $tab1_i);
my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
my $unal = ($fl & 4) != 0;
#my $tab1_i = index($_, "\t") + 1;
#my $tab2_i = index($_, "\t", $tab1_i);
#my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
my $unal = 0 ;
if ( /unclassified/ )
{
$unal = 1 ;
}
$filt = 1 if $no_unal && $unal;
if($passthru) {
if(scalar(keys %read_fhs) == 0) {
if(scalar(keys %read_fhs) == 0 || $seqIndex == -1 ) {
# Next line is read with some whitespace escaped
my $l = <BT>;
# my $l = <BT>;
} else {
my $mate1 = (($fl & 64) != 0);
my $mate2 = (($fl & 128) != 0);
my $unp = !$mate1 && !$mate2;
my $pair = !$unp;
my @cols = split /\t/ ;
my $isPaired = 0 ;
my $pair = 0 ;
if ( $cols[$seqIndex] =~ /_/ )
{
$pair = 1 ;
}
my $unp = !$pair ;
# Next line is read with some whitespace escaped
my $l = <BT>;
chomp($l);
$l =~ s/%(..)/chr(hex($1))/eg;
#my $l = <BT>;
#chomp($l);
#$l =~ s/%(..)/chr(hex($1))/eg;
if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) {
my $l ;
if ( $qualIndex != -1 )
{
$l = "@".$cols[ $readIdIndex ]."\n".$cols[$seqIndex]."\n+\n".$cols[$qualIndex]."\n" ;
}
else
{
$l = ">".$cols[ $readIdIndex ]."\n".$cols[$seqIndex]."\n" ;
}
if($unal) {
# Failed to align
print {$read_fhs{un}} $l if defined($read_fhs{un});
......@@ -517,21 +643,42 @@ if(defined($cap_out)) {
print {$read_fhs{al}} $l if defined($read_fhs{al});
}
}
my $warnedAboutLength = 0 ;
if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) {
my $conc = (($fl & 2) != 0);
if ($conc && $mate1) {
print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"});
} elsif($conc && $mate2) {
print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"});
} elsif(!$conc && $mate1) {
print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"});
} elsif(!$conc && $mate2) {
print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"});
my @seq = split /_/, $cols[$seqIndex] ;
my @qual = ( substr( $cols[$qualIndex], 0, length( $seq[0] ) ), substr( $cols[$qualIndex], length( $seq[0] ) + 1 ) ) ;
my $l1 ;
my $l2 ;
if ( $qualIndex != -1 )
{
$l1 = "@".$cols[ $readIdIndex ]."\n".$seq[0]."\n+\n".$qual[0]."\n" ;
}
else
{
$l1 = ">".$cols[ $readIdIndex ]."\n".$seq[0]."\n" ;
}
if ( $qualIndex != -1 )
{
$l2 = "@".$cols[ $readIdIndex ]."\n".$seq[1]."\n+\n".$qual[1]."\n" ;
}
else
{
$l2 = ">".$cols[ $readIdIndex ]."\n".$seq[1]."\n" ;
}
if ( !$unal) {
print {$read_fhs{"al-conc"}{1}} $l1 if defined($read_fhs{"al-conc"});
print {$read_fhs{"al-conc"}{2}} $l2 if defined($read_fhs{"al-conc"});
} else {
print {$read_fhs{"un-conc"}{1}} $l1 if defined($read_fhs{"un-conc"});
print {$read_fhs{"un-conc"}{2}} $l2 if defined($read_fhs{"un-conc"});
}
}
}
}
}
$_ = RemoveSeqCols( $_ ) if ( $removeSeqCols == 1 ) ;
print {$ofh} "$_\n" if !$filt;
}
for my $k (@fhs_to_close) { close($k); }
......
......@@ -11,10 +11,10 @@ if hash rsync 2>/dev/null; then
DL_MODE="rsync"
elif hash wget 2>/dev/null; then
DL_PROG="wget -N --reject=index.html -qO"
DL_MODE="ftp"
DL_MODE="https"
else
DL_PROG="curl -s -o"
DL_MODE="ftp"
DL_MODE="https"
fi
export DL_PROG DL_MODE
......@@ -154,7 +154,7 @@ COMMON OPTIONS
WHEN USING database refseq OR genbank:
-d <domain> What domain to download. One or more of ${ALL_GENOMES// /, } (comma separated).
-a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'.
-a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
-c <refseq category> Only download genomes in the specified refseq category. Default: any.
-t <taxids> Only download the specified taxonomy IDs, comma separated. Default: any.
-r Download RNA sequences, too.
......@@ -262,8 +262,10 @@ SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20
FTP_PATH_FIELD2=21 ## Needed for wrongly formatted virus files - hopefully just a temporary fix
AWK_QUERY="\$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\" && \$$VERSION_STATUS_FIELD==\"latest\""
AWK_QUERY="\$$VERSION_STATUS_FIELD==\"latest\""
[[ "$ASSEMBLY_LEVEL" != "Any" ]] && AWK_QUERY="$AWK_QUERY && \$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\""
[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""
TAXID=${TAXID//,/|}
......@@ -320,11 +322,22 @@ for DOMAIN in $DOMAINS; do
N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
[[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
if [[ "$DOMAIN" == "viral" ]]; then
## Wrong columns in viral assembly summary files - the path is sometimes in field 20, sometimes 21
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD,$FTP_PATH_FIELD2" "$ASSEMBLY_SUMMARY_FILE" | \
sed 's/^\(.*\)\t\(ftp:.*\)\t.*/\1\t\2/;s/^\(.*\)\t.*\t\(ftp:.*\)/\1\t\2/' | \
sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
else
echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
fi
echo >&2
if [[ "$DOWNLOAD_RNA" == "1" && ! `echo $DOMAIN | egrep 'bacteria|viral|archaea'` ]]; then
echo "Downloadinging rna sequence files" >&2
cut -f $TAXID_FIELD,$FTP_PATH_FIELD "$ASSEMBLY_SUMMARY_FILE"| sed 's#\([^/]*\)$#\1/\1_rna.fna.gz#' |\
......
......@@ -9,12 +9,17 @@
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;
my ($centrifuge_index, $min_score, $min_length);
my $only_unique = 0;
my $show_zeros = 0;
my $is_cnts_table = 0;
my $PROG = "centrifuge-kreport";
my $CWD = dirname( abs_path( $0 ) ) ;
GetOptions(
"help" => \&display_help,
......@@ -71,9 +76,23 @@ if ($is_cnts_table) {
$seq_count += $count;
}
} else {
<>;
my $header = <>;
my @cols = split /\s+/, $header ;
my %headerMap ;
for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
{
$headerMap{ $cols[$i] } = $i ;
}
while (<>) {
my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
#my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
my @cols = split /\s+/ ;
my $seqID = $cols[ $headerMap{ "seqID" } ] ;
my $taxid = $cols[ $headerMap{ "taxID" } ] ;
my $score = $cols[ $headerMap{ "score" } ] ;
my $hitLength = $cols[ $headerMap{ "hitLength" } ] ;
my $queryLength = $cols[ $headerMap{ "queryLength" } ] ;
my $numMatches = $cols[ $headerMap{ "numMatches" } ] ;
next if $only_unique && $numMatches > 1;
next if defined $min_length && $hitLength < $min_length;
next if defined $min_score && $score < $min_score;
......@@ -150,7 +169,7 @@ sub dfs_summation {
sub load_taxonomy {
print STDERR "Loading names file ...\n";
open NAMES, "-|", "centrifuge-inspect --name-table $centrifuge_index"
open NAMES, "-|", "$CWD/centrifuge-inspect --name-table $centrifuge_index"
or die "$PROG: can't open names file: $!\n";
while (<NAMES>) {
chomp;
......@@ -162,7 +181,7 @@ sub load_taxonomy {
close NAMES;
print STDERR "Loading nodes file ...\n";
open NODES, "-|", "centrifuge-inspect --taxonomy-tree $centrifuge_index"
open NODES, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index"
or die "$PROG: can't open nodes file: $!\n";
while (<NODES>) {
chomp;
......
#!/usr/bin/env perl
# centrifuge-promote.pl: rewrite the taxonomy ids in a Centrifuge output
# file so that each hit is "promoted" to a requested taxonomy level,
# using the taxonomy tree stored inside the Centrifuge index.
use strict ;
use warnings ;
use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;
die "Usage: centrifuge-promote.pl centrifuge_index centrifuge_output level > output\n\n".
"Promote the taxonomy id to specified level in Centrifuge output.\n" if ( @ARGV == 0 ) ;
# Directory holding this script; sibling tools (centrifuge-inspect) are
# invoked from the same directory rather than relying on $PATH.
my $CWD = dirname( abs_path( $0 ) ) ;
# Go through the index to obtain the taxonomy tree
my %taxParent ;      # tax id -> parent tax id
my %taxIdToSeqId ;   # tax id -> sequence id (from the conversion table)
my %taxLevel ;       # tax id -> rank name (e.g. "species", "genus")
my $centrifuge_index = $ARGV[0] ;
# Read the taxonomy tree (nodes.dmp-style, "\t|\t"-separated fields:
# tax id, parent tax id, rank) by piping from centrifuge-inspect.
open FP1, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t\|\t/;
$taxParent{ $cols[0] } = $cols[1] ;
$taxLevel{ $cols[0] } = $cols[2] ;
}
close FP1 ;
# Read the seqID<TAB>taxID conversion table; note the map is keyed by
# tax id, so only one sequence id is kept per taxonomy id.
open FP1, "-|", "$CWD/centrifuge-inspect --conversion-table $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t/ ;
$taxIdToSeqId{ $cols[1] } = $cols[0] ;
}
close FP1 ;
# Go through the output of centrifuge
my $level = $ARGV[2] ;
# Walk up the taxonomy tree from $tid until a node whose rank equals the
# requested $level is found.  Returns that ancestor's taxonomy id, or 0
# when the id is unknown/invalid or the root is reached without a match.
sub PromoteTaxId
{
my ($tid) = @_ ;
while ( 1 )
{
return 0 if ( $tid <= 0 || !defined( $taxLevel{ $tid } ) ) ;
return $tid if ( $taxLevel{ $tid } eq $level ) ;
# The root (tax id 1) is its own parent; stop instead of looping forever.
return 0 if ( $tid <= 1 ) ;
$tid = $taxParent{ $tid } ;
}
}
# Emit the group of Centrifuge hit lines belonging to one read, with
# each hit's taxonomy id promoted to the requested level.  Hits that
# promote to the same taxonomy id are collapsed (first occurrence wins),
# and the last column of every surviving line is rewritten to hold the
# number of distinct promoted hits.
sub OutputPromotedLines
{
my @hits = @{ $_[0] } ;
return unless ( scalar( @hits ) > 0 ) ;
my %seenTaxId ;
my @promoted ;
foreach my $hit ( @hits )
{
my @fields = split /\t+/, $hit ;
my $tid = PromoteTaxId( $fields[2] ) ;
# Keep the original id when promotion failed or landed on the root.
$tid = $fields[2] if ( $tid <= 1 ) ;
my $levelName = $fields[1] ;
$levelName = $taxLevel{ $tid } if ( $tid >= 1 && defined $taxLevel{ $tid } ) ;
next if ( defined $seenTaxId{ $tid } ) ;
$seenTaxId{ $tid } = 1 ;
$fields[2] = $tid ;
$fields[1] = $levelName ;
push @promoted, join( "\t", @fields ) ;
}
# Second pass: now that the distinct-hit count is known, patch it into
# the final column of each line before printing.
my $numMatches = scalar( @promoted ) ;
foreach my $line ( @promoted )
{
my @fields = split /\t+/, $line ;
$fields[-1] = $numMatches ;
print join( "\t", @fields ), "\n" ;
}
}
# Stream the Centrifuge output file, grouping consecutive lines that
# share a read id (Centrifuge emits one line per match, grouped by
# read), and flush each group through OutputPromotedLines.
open FP1, $ARGV[1] ;
my $header = <FP1> ;
my $prevReadId = "" ;
my @lines ;
# The header line is passed through unchanged.
print $header ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t/ ;
if ( $cols[0] eq $prevReadId )
{
push @lines, $_ ;
}
else
{
# New read id: emit the previous read's accumulated hits, then
# start a fresh group with the current line.
$prevReadId = $cols[0] ;
OutputPromotedLines( \@lines ) ;
undef @lines ;
push @lines, $_ ;
}
}
# Flush the final read's group.
OutputPromotedLines( \@lines ) ;
close FP1 ;
......@@ -501,7 +501,7 @@ static void resetOptions() {
col_name_map["CIGAR"] = PLACEHOLDER;
col_name_map["RNEXT"] = SEQ_ID;
col_name_map["PNEXT"] = PLACEHOLDER_ZERO;
col_name_map["TLEN"] = PLACEHOLDER_ZERO;
col_name_map["TLEN"] = QUERY_LENGTH ; //PLACEHOLDER_ZERO;
col_name_map["SEQ"] = SEQ;
col_name_map["QUAL"] = QUAL;
......
......@@ -418,8 +418,11 @@ public:
if(!_tree_traverse) {
if(_hitMap.size() > (size_t)rp.khits)
{
reportUnclassified( sink ) ;
return 0;
}
}
uint8_t rank = 0;
while(_hitMap.size() > (size_t)rp.khits) {
......@@ -511,7 +514,10 @@ public:
}
}
if(!only_host_taxIDs && _hitMap.size() > (size_t)rp.khits)
{
reportUnclassified( sink ) ;
return 0;
}
#if 0
// boost up the score if the assignment is unique
......@@ -528,7 +534,7 @@ public:
max_score += (rdlen > 15 ? (rdlen - 15) * (rdlen - 15) : 0);
}
bool reported = false ;
for(size_t gi = 0; gi < _hitMap.size(); gi++) {
assert_gt(_hitMap[gi].score, 0);
HitCount<index_t>& hitCount = _hitMap[gi];
......@@ -555,7 +561,12 @@ public:
hitCount.readPositions,
isFw);
sink.report(0, &rs);
reported = true ;
}
if ( reported == false )
reportUnclassified( sink ) ;
return 0;
}
......@@ -1024,6 +1035,17 @@ private:
return idx;
}
void reportUnclassified( AlnSinkWrap<index_t>& sink )
{
AlnRes rs ;
EList<pair<uint32_t,uint32_t> > dummy ;
dummy.push_back( make_pair( 0, 0 ) ) ;
rs.init( 0, 0, string( "unclassified" ), 0, 0, 0, dummy, true ) ;
sink.report( 0, &rs ) ;
}
// compare BWTHits by size, ascending, first, then by length, descending
// TODO: move this operator into BWTHits if that is the standard way we would like to sort
// TODO: this ordering does not necessarily give the best results
......
centrifuge (1.0.3-1) unstable; urgency=medium
* New upstream release
-- Andreas Tille <tille@debian.org> Fri, 02 Mar 2018 19:59:50 +0100
centrifuge (1.0.3~beta-2) unstable; urgency=medium
* Move hisat2 and jellyfish to Build-Depends to make sure package builds
......
......@@ -6,11 +6,9 @@ Subject: Fix build with rank
classifier.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/classifier.h b/classifier.h
index f61a7f0..13b110b 100644
--- a/classifier.h
+++ b/classifier.h
@@ -425,7 +425,7 @@ public:
@@ -428,7 +428,7 @@ public:
while(_hitMap.size() > (size_t)rp.khits) {
_hitTaxCount.clear();
for(size_t i = 0; i < _hitMap.size(); i++) {
......
......@@ -6,11 +6,9 @@ Subject: Fix make install DESTDIR
Makefile | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/Makefile b/Makefile
index 8d66b3c..1889b09 100644
--- a/Makefile
+++ b/Makefile
@@ -412,20 +412,20 @@ prefix=/usr/local
@@ -413,20 +413,20 @@ prefix=/usr/local
.PHONY: install
install: all
......
......@@ -13,7 +13,7 @@ Description: Propagate hardening options
#CFLAGS = -fdiagnostics-color=always
ifeq (1,$(USE_SRA))
@@ -254,7 +254,8 @@ DEFS=-fno-strict-aliasing \
@@ -255,7 +255,8 @@ DEFS=-fno-strict-aliasing \
$(CFLAGS) \
$(PREF_DEF) \
$(MM_DEF) \
......
This diff is collapsed.
......@@ -52,7 +52,7 @@
<table width="100%"><tr><td>last updated:</td> <td align="right">12/06/2016</td></tr>
<tr>
<td>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria (compressed)</i></a>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Archaea (compressed)</i></a>
</td>
<td align="right" style="font-size: x-small">
<b>4.4 GB</b>
......@@ -60,7 +60,7 @@
</tr>
<tr>
<td>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Viruses, Human (compressed)</i></a>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Archaea, Viruses, Human (compressed)</i></a>
</td>
<td align="right" style="font-size: x-small">
<b>5.4 GB</b>
......@@ -68,7 +68,7 @@
</tr>
<tr>
<td>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Viruses, Human </i></a>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Archaea, Viruses, Human </i></a>
</td>
<td align="right" style="font-size: x-small">
<b>7.9 GB</b>
......
......@@ -11,6 +11,7 @@ KEEP_FILES?=0
get_ref_file_names = $(addprefix $(REFERENCE_SEQUENCES_DIR)/, $(addsuffix $(1), \
$(addprefix all-,$(COMPLETE_GENOMES)) \
$(addprefix all-,$(addsuffix -chromosome_level,$(CHROMOSOME_LEVEL_GENOMES))) \
$(addprefix all-,$(addsuffix -any_level,$(ANY_LEVEL_GENOMES))) \
$(addprefix mammalian-reference-,$(MAMMALIAN_TAXIDS)) \
$(addprefix all-compressed-,$(COMPLETE_GENOMES_COMPRESSED)) \
$(if $(INCLUDE_CONTAMINANTS),contaminants)))
......@@ -43,6 +44,10 @@ STANDARD TARGETS:
p+h+v As above, but with uncompressed bacterial genomes
p+v
v
Alternatively, a IDX_NAME and one or more genomes may be specified as
options to build a custom database.
......@@ -79,25 +84,37 @@ all:
IDX_NAME?=$(shell basename $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))))
INDICES=p+h+v p_compressed p_compressed+h+v refseq_microbial refseq_full nt
INDICES=p+h+v p+v v p p_compressed p_compressed+h+v refseq_microbial refseq_full nt
p+h+v: export COMPLETE_GENOMES:=archaea bacteria viral
p+h+v: export ANY_LEVEL_GENOMES:=viral
p+h+v: export COMPLETE_GENOMES:=archaea bacteria
p+h+v: export MAMMALIAN_TAXIDS:=9606
p+h+v: export INCLUDE_CONTAMINANTS:=1
p+h+v: export IDX_NAME:=p+h+v
p_compressed: export COMPLETE_GENOMES:=
p+v: export ANY_LEVEL_GENOMES:=viral
p+v: export COMPLETE_GENOMES:=archaea bacteria
p+v: export INCLUDE_CONTAMINANTS:=1
p+v: export IDX_NAME:=p+v
v: export ANY_LEVEL_GENOMES:=viral
v: export IDX_NAME:=v
p: export COMPLETE_GENOMES:=archaea bacteria
p: export IDX_NAME:=p
p_compressed: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
p_compressed: export IDX_NAME:=p_compressed
p_compressed+h+v: export COMPLETE_GENOMES:=viral
p_compressed+h+v: export ANY_LEVEL_GENOMES:=viral
p_compressed+h+v: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
p_compressed+h+v: export MAMMALIAN_TAXIDS:=9606
p_compressed+h+v: export INCLUDE_CONTAMINANTS:=1
p_compressed+h+v: export IDX_NAME:=p_compressed+h+v
refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa viral
refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa
refseq_microbial: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
refseq_microbial: export ANY_LEVEL_GENOMES:=viral
##refseq_microbial: export SMALL_GENOMES:=mitochondrion plasmid plastid # TODO
refseq_microbial: export MAMMALIAN_TAXIDS:=9606 10090
refseq_microbial: export INCLUDE_CONTAMINANTS:=1
......@@ -106,6 +123,7 @@ refseq_microbial: export CF_BUILD_OPTS+=--ftabchars 14
refseq_full: export COMPLETE_GENOMES:=archaea bacteria fungi invertebrate plant protozoa vertebrate_mammalian vertebrate_other viral
refseq_full: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
refseq_full: export ANY_LEVEL_GENOMES:=viral
refseq_full: export SMALL_GENOMES:=mitochondrion plasmid plastid
refseq_full: export MAMMALIAN_TAXIDS:=9606 10090
refseq_full: export INCLUDE_CONTAMINANTS:=1
......@@ -196,7 +214,7 @@ $(REFERENCE_SEQUENCES_DIR)/mammalian-reference-%.fna: | $(REFERENCE_SEQUENCES_DI
@[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuge-download -o $(TMP_DIR) -d "vertebrate_mammalian" -a "Chromosome" -t $* -c 'reference genome' -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX), $(notdir $@))
cat $(TMP_DIR)/vertebrate_mammalian/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/vertebrate_mammalian -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/vertebrate_mammalian ]] || mkdir -p $(DL_DIR)/vertebrate_mammalian
......@@ -219,12 +237,12 @@ else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
......@@ -233,12 +251,12 @@ else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
$(REFERENCE_SEQUENCES_DIR)/all-%-any_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Any" -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
......@@ -247,12 +265,24 @@ else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
mv $(TMP_DIR)/$*/* $(DL_DIR)/$*
else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/contaminants.fna: | $(REFERENCE_SEQUENCES_DIR)
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuge-download -o $(TMP_DIR) contaminants > $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
cat $(TMP_DIR)/contaminants/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/contaminants -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/contaminants ]] || mkdir -p $(DL_DIR)/contaminants
......