Skip to content
Commits on Source (5)
......@@ -198,7 +198,83 @@ can be generated from a GI taxid dump:
### Custom database
TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
To build a custom database, you need to provide the following four files to `centrifuge-build`:
- `--conversion-table`: tab-separated file mapping sequence IDs to taxonomy IDs. Sequence IDs are the header up to the first space or second pipe (`|`).
- `--taxonomy-tree`: `\t|\t`-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree. When using NCBI taxonomy IDs, this will be the `nodes.dmp` from `ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz`.
- `--name-table`: '\t|\t'-separated file mapping taxonomy IDs to a name. A further column (typically column 4) must specify `scientific name`. When using NCBI taxonomy IDs, `names.dmp` is the appropriate file.
- reference sequences: The ID of the sequences are the header up to the first space or second pipe (`|`)
When using custom taxonomy IDs, use only positive integers greater than or equal to `1` and use `1` for the root of the tree.
#### More info on `--taxonomy-tree` and `--name-table`
The format of these files are based on `nodes.dmp` and `names.dmp` from the NCBI taxonomy database dump.
- Field terminator is `\t|\t`
- Row terminator is `\t|\n`
The `taxonomy-tree` / nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ..., no rank)
Further fields are ignored.
The `name-table` / names.dmp is the taxonomy names file:
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (scientific name, synonym, common name, ...)
`name class` **has** to be `scientific name` to be included in the build. All other lines are ignored
#### Example
*Conversion table `ex.conv`*:
Seq1 11
Seq2 12
Seq3 13
Seq4 11
*Taxonomy tree `ex.tree`*:
1 | 1 | root
10 | 1 | kingdom
11 | 10 | species
12 | 10 | species
13 | 1 | species
*Name table `ex.name`*:
1 | root | | scientific name |
10 | Bacteria | | scientific name |
11 | Bacterium A | | scientific name |
12 | Bacterium B | | scientific name |
13 | Some other species | | scientific name |
*Reference sequences `ex.fa`*:
>Seq1
AAAACGTACGA.....
>Seq2
AAAACGTACGA.....
>Seq3
AAAACGTACGA.....
>Seq4
AAAACGTACGA.....
To build the database, call
centrifuge-build --conversion-table ex.conv \
--taxonomy-tree ex.tree --name-table ex.name \
ex.fa ex
which results in three index files named `ex.1.cf`, `ex.2.cf` and `ex.3.cf`.
### Centrifuge classification output
......
......@@ -211,7 +211,84 @@ can be generated from a GI taxid dump:
### Custom database
TODO: Add toy example for nodes.dmp, names.dmp and seqid2taxid.map
To build a custom database, you need to provide the following four files to `centrifuge-build`:
- `--conversion-table`: tab-separated file mapping sequence IDs to taxonomy IDs. Sequence IDs are the header up to the first space or second pipe (`|`).
- `--taxonomy-tree`: `\t|\t`-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree. When using NCBI taxonomy IDs, this will be the `nodes.dmp` from `ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz`.
- `--name-table`: '\t|\t'-separated file mapping taxonomy IDs to a name. A further column (typically column 4) must specify `scientific name`. When using NCBI taxonomy IDs, `names.dmp` is the appropriate file.
- reference sequences: The ID of the sequences are the header up to the first space or second pipe (`|`)
When using custom taxonomy IDs, use only positive integers greater than or equal to `1` and use `1` for the root of the tree.
#### More info on `--taxonomy-tree` and `--name-table`
The format of these files are based on `nodes.dmp` and `names.dmp` from the NCBI taxonomy database dump.
- Field terminator is `\t|\t`
- Row terminator is `\t|\n`
The `taxonomy-tree` / nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
fields:
tax_id -- node id in GenBank taxonomy database
parent tax_id -- parent node id in GenBank taxonomy database
rank -- rank of this node (superkingdom, kingdom, ..., no rank)
Further fields are ignored.
The `name-table` / names.dmp is the taxonomy names file:
tax_id -- the id of node associated with this name
name_txt -- name itself
unique name -- the unique variant of this name if name not unique
name class -- (scientific name, synonym, common name, ...)
`name class` **has** to be `scientific name` to be included in the build. All other lines are ignored
#### Example
*Conversion table `ex.conv`*:
Seq1 11
Seq2 12
Seq3 13
Seq4 11
*Taxonomy tree `ex.tree`*:
1 | 1 | root
10 | 1 | kingdom
11 | 10 | species
12 | 10 | species
13 | 1 | species
*Name table `ex.name`*:
1 | root | | scientific name |
10 | Bacteria | | scientific name |
11 | Bacterium A | | scientific name |
12 | Bacterium B | | scientific name |
13 | Some other species | | scientific name |
*Reference sequences `ex.fa`*:
>Seq1
AAAACGTACGA.....
>Seq2
AAAACGTACGA.....
>Seq3
AAAACGTACGA.....
>Seq4
AAAACGTACGA.....
To build the database, call
centrifuge-build --conversion-table ex.conv \
--taxonomy-tree ex.tree --name-table ex.name \
ex.fa ex
which results in three index files named `ex.1.cf`, `ex.2.cf` and `ex.3.cf`.
### Centrifuge classification output
......
......@@ -191,6 +191,7 @@ CENTRIFUGE_SCRIPT_LIST = centrifuge \
centrifuge-build \
centrifuge-inspect \
centrifuge-download \
centrifuge-kreport \
$(wildcard centrifuge-*.pl)
......
......@@ -2305,9 +2305,9 @@ void AlnSinkSam<index_t>::appendMate(
case READ_ID: appendReadID(o, rd.name); break;
case SEQ_ID: appendSeqID(o, rs, ebwt.tree()); break;
case SEQ: o.append((string(rd.patFw.toZBuf()) +
(rdo == NULL? "" : "N" + string(rdo->patFw.toZBuf()))).c_str()); break;
(rdo == NULL? "" : "_" + string(rdo->patFw.toZBuf()))).c_str()); break;
case QUAL: o.append((string(rd.qual.toZBuf()) +
(rdo == NULL? "" : "I" + string(rdo->qual.toZBuf()))).c_str()); break;
(rdo == NULL? "" : "_" + string(rdo->qual.toZBuf()))).c_str()); break;
case SEQ1: o.append(rd.patFw.toZBuf()); break;
case QUAL1: o.append(rd.qual.toZBuf()); break;
......
......@@ -116,6 +116,13 @@ my %read_compress = ();
my $cap_out = undef; # Filename for passthrough
my $no_unal = 0;
my $large_idx = 0;
# Variables handling the output format
my $outputFmtSam = 0 ;
my $tabFmtOptIdx = 0 ;
my $needReadSeq = 0 ;
my $removeSeqCols = 0 ;
# Remove whitespace
for my $i (0..$#bt2_args) {
$bt2_args[$i]=~ s/^\s+//; $bt2_args[$i] =~ s/\s+$//;
......@@ -179,6 +186,7 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
}
for my $rarg ("un-conc", "al-conc", "un", "al") {
if($arg =~ /^--${rarg}$/ || $arg =~ /^--${rarg}-gz$/ || $arg =~ /^--${rarg}-bz2$/) {
$needReadSeq = 1 ;
$bt2_args[$i] = undef;
if(scalar(@args) > 1 && $args[1] ne "") {
$read_fns{$rarg} = $args[1];
......@@ -193,7 +201,57 @@ for(my $i = 0; $i < scalar(@bt2_args); $i++) {
last;
}
}
if ($arg eq "--out-fmt" )
{
$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
$i++;
if ( $bt2_args[$i] eq "sam" )
{
$outputFmtSam = 1 ;
}
#$bt2_args[$i] = undef;
}
if ( $arg eq "--tab-fmt-cols" )
{
$i < scalar(@bt2_args)-1 || Fail("Argument expected in next token!\n");
$tabFmtOptIdx = $i + 1 ;
}
}
# Determine whether we need to add two extra columns for seq and qual to out-fmt
if ( $needReadSeq == 1 && ( $tabFmtOptIdx == 0 || $outputFmtSam == 1 ) )
{
my $i ;
my $needAdd = 1 ;
if ( $tabFmtOptIdx != 0 )
{
my @cols = split /,/, $bt2_args[ $tabFmtOptIdx ] ;
foreach my $f (@cols)
{
if ( $f eq "readSeq" )
{
$needAdd = 0 ;
last ;
}
}
}
else
{
push @bt2_args, "--tab-fmt-cols" ;
push @bt2_args, "readID,seqID,taxID,score,2ndBestScore,hitLength,queryLength,numMatches" ;
$tabFmtOptIdx = scalar( @bt2_args ) - 1 ;
}
if ( $needAdd )
{
$removeSeqCols = 1 ;
$bt2_args[ $tabFmtOptIdx ] .= ",readSeq,readQual" ;
}
}
# If the user asked us to redirect some reads to files, or to suppress
# unaligned reads, then we need to capture the output from Centrifuge and pass it
# through this wrapper.
......@@ -423,6 +481,17 @@ my $cmd = "$align_prog$debug_str --wrapper basic-0 ".join(" ", @bt2_args);
# Possibly add read input on an anonymous pipe
$cmd = "$readpipe $cmd" if defined($readpipe);
# Strip the two trailing columns (readSeq/readQual) that the wrapper
# appended to the requested --tab-fmt-cols, so the output handed back
# to the user contains only the columns that were actually asked for.
sub RemoveSeqCols
{
my ($line) = @_ ;
my @fields = split /\t/, $line ;
pop @fields for 1 .. 2 ;
return join( "\t", @fields ) ;
}
Info("$cmd\n");
my $ret;
if(defined($cap_out)) {
......@@ -485,30 +554,87 @@ if(defined($cap_out)) {
}
}
}
my $seqIndex = -1 ;
my $qualIndex = -1 ;
my $readIdIndex = -1 ;
if ( $outputFmtSam == 0 )
{
my $outputHeader = <BT> ;
my @cols = split /\t/, $outputHeader ;
for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
{
if ( $cols[$i] =~ /readSeq/ )
{
$seqIndex = $i ;
}
elsif ( $cols[$i] =~ /readQual/ )
{
$qualIndex = $i ;
}
elsif ( $cols[$i] =~ /readID/ )
{
$readIdIndex = $i ;
}
}
if ( $seqIndex == -1 && scalar( keys %read_fhs) == 0 )
{
Error( "Must use readSeq in --tabFmtCols in order to output unaligned reads." ) ;
}
$outputHeader = RemoveSeqCols( $outputHeader )."\n" if ( $removeSeqCols == 1 ) ;
print {$ofh} $outputHeader ;
}
else
{
$seqIndex = 9 ;
$qualIndex = 10 ;
$readIdIndex = 0 ;
}
while(<BT>) {
chomp;
my $filt = 0;
unless(substr($_, 0, 1) eq "@") {
# If we are supposed to output certain reads to files...
my $tab1_i = index($_, "\t") + 1;
my $tab2_i = index($_, "\t", $tab1_i);
my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
my $unal = ($fl & 4) != 0;
#my $tab1_i = index($_, "\t") + 1;
#my $tab2_i = index($_, "\t", $tab1_i);
#my $fl = substr($_, $tab1_i, $tab2_i - $tab1_i);
my $unal = 0 ;
if ( /unclassified/ )
{
$unal = 1 ;
}
$filt = 1 if $no_unal && $unal;
if($passthru) {
if(scalar(keys %read_fhs) == 0) {
if(scalar(keys %read_fhs) == 0 || $seqIndex == -1 ) {
# Next line is read with some whitespace escaped
my $l = <BT>;
# my $l = <BT>;
} else {
my $mate1 = (($fl & 64) != 0);
my $mate2 = (($fl & 128) != 0);
my $unp = !$mate1 && !$mate2;
my $pair = !$unp;
my @cols = split /\t/ ;
my $isPaired = 0 ;
my $pair = 0 ;
if ( $cols[$seqIndex] =~ /_/ )
{
$pair = 1 ;
}
my $unp = !$pair ;
# Next line is read with some whitespace escaped
my $l = <BT>;
chomp($l);
$l =~ s/%(..)/chr(hex($1))/eg;
#my $l = <BT>;
#chomp($l);
#$l =~ s/%(..)/chr(hex($1))/eg;
if((defined($read_fhs{un}) || defined($read_fhs{al})) && $unp) {
my $l ;
if ( $qualIndex != -1 )
{
$l = "@".$cols[ $readIdIndex ]."\n".$cols[$seqIndex]."\n+\n".$cols[$qualIndex]."\n" ;
}
else
{
$l = ">".$cols[ $readIdIndex ]."\n".$cols[$seqIndex]."\n" ;
}
if($unal) {
# Failed to align
print {$read_fhs{un}} $l if defined($read_fhs{un});
......@@ -517,21 +643,42 @@ if(defined($cap_out)) {
print {$read_fhs{al}} $l if defined($read_fhs{al});
}
}
my $warnedAboutLength = 0 ;
if((defined($read_fhs{"un-conc"}) || defined($read_fhs{"al-conc"})) && $pair) {
my $conc = (($fl & 2) != 0);
if ($conc && $mate1) {
print {$read_fhs{"al-conc"}{1}} $l if defined($read_fhs{"al-conc"});
} elsif($conc && $mate2) {
print {$read_fhs{"al-conc"}{2}} $l if defined($read_fhs{"al-conc"});
} elsif(!$conc && $mate1) {
print {$read_fhs{"un-conc"}{1}} $l if defined($read_fhs{"un-conc"});
} elsif(!$conc && $mate2) {
print {$read_fhs{"un-conc"}{2}} $l if defined($read_fhs{"un-conc"});
my @seq = split /_/, $cols[$seqIndex] ;
my @qual = ( substr( $cols[$qualIndex], 0, length( $seq[0] ) ), substr( $cols[$qualIndex], length( $seq[0] ) + 1 ) ) ;
my $l1 ;
my $l2 ;
if ( $qualIndex != -1 )
{
$l1 = "@".$cols[ $readIdIndex ]."\n".$seq[0]."\n+\n".$qual[0]."\n" ;
}
else
{
$l1 = ">".$cols[ $readIdIndex ]."\n".$seq[0]."\n" ;
}
if ( $qualIndex != -1 )
{
$l2 = "@".$cols[ $readIdIndex ]."\n".$seq[1]."\n+\n".$qual[1]."\n" ;
}
else
{
$l2 = ">".$cols[ $readIdIndex ]."\n".$seq[1]."\n" ;
}
if ( !$unal) {
print {$read_fhs{"al-conc"}{1}} $l1 if defined($read_fhs{"al-conc"});
print {$read_fhs{"al-conc"}{2}} $l2 if defined($read_fhs{"al-conc"});
} else {
print {$read_fhs{"un-conc"}{1}} $l1 if defined($read_fhs{"un-conc"});
print {$read_fhs{"un-conc"}{2}} $l2 if defined($read_fhs{"un-conc"});
}
}
}
}
}
$_ = RemoveSeqCols( $_ ) if ( $removeSeqCols == 1 ) ;
print {$ofh} "$_\n" if !$filt;
}
for my $k (@fhs_to_close) { close($k); }
......
......@@ -11,10 +11,10 @@ if hash rsync 2>/dev/null; then
DL_MODE="rsync"
elif hash wget 2>/dev/null; then
DL_PROG="wget -N --reject=index.html -qO"
DL_MODE="ftp"
DL_MODE="https"
else
DL_PROG="curl -s -o"
DL_MODE="ftp"
DL_MODE="https"
fi
export DL_PROG DL_MODE
......@@ -154,7 +154,7 @@ COMMON OPTIONS
WHEN USING database refseq OR genbank:
-d <domain> What domain to download. One or more of ${ALL_GENOMES// /, } (comma separated).
-a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'.
-a <assembly level> Only download genomes with the specified assembly level. Default: '$ASSEMBLY_LEVEL'. Use 'Any' for any assembly level.
-c <refseq category> Only download genomes in the specified refseq category. Default: any.
-t <taxids> Only download the specified taxonomy IDs, comma separated. Default: any.
-r Download RNA sequences, too.
......@@ -262,8 +262,10 @@ SPECIES_TAXID_FIELD=7
VERSION_STATUS_FIELD=11
ASSEMBLY_LEVEL_FIELD=12
FTP_PATH_FIELD=20
FTP_PATH_FIELD2=21 ## Needed for wrongly formatted virus files - hopefully just a temporary fix
AWK_QUERY="\$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\" && \$$VERSION_STATUS_FIELD==\"latest\""
AWK_QUERY="\$$VERSION_STATUS_FIELD==\"latest\""
[[ "$ASSEMBLY_LEVEL" != "Any" ]] && AWK_QUERY="$AWK_QUERY && \$$ASSEMBLY_LEVEL_FIELD==\"$ASSEMBLY_LEVEL\""
[[ "$REFSEQ_CATEGORY" != "" ]] && AWK_QUERY="$AWK_QUERY && \$$REFSEQ_CAT_FIELD==\"$REFSEQ_CATEGORY\""
TAXID=${TAXID//,/|}
......@@ -320,11 +322,22 @@ for DOMAIN in $DOMAINS; do
N_EXPECTED=`cat "$ASSEMBLY_SUMMARY_FILE" | wc -l`
[[ $N_EXPECTED -gt 0 ]] || { echo "Domain $DOMAIN has no genomes with specified filter." >&2; exit 1; }
if [[ "$DOMAIN" == "viral" ]]; then
## Wrong columns in viral assembly summary files - the path is sometimes in field 20, sometimes 21
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD,$FTP_PATH_FIELD2" "$ASSEMBLY_SUMMARY_FILE" | \
sed 's/^\(.*\)\t\(ftp:.*\)\t.*/\1\t\2/;s/^\(.*\)\t.*\t\(ftp:.*\)/\1\t\2/' | \
sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
else
echo "Downloading $N_EXPECTED $DOMAIN genomes at assembly level $ASSEMBLY_LEVEL ... (will take a while)" >&2
cut -f "$TAXID_FIELD,$FTP_PATH_FIELD" "$ASSEMBLY_SUMMARY_FILE" | sed 's#\([^/]*\)$#\1/\1_genomic.fna.gz#' |\
tr '\n' '\0' | xargs -0 -n1 -P $N_PROC bash -c 'download_n_process_nofail "$@"' _ | count $N_EXPECTED
fi
echo >&2
if [[ "$DOWNLOAD_RNA" == "1" && ! `echo $DOMAIN | egrep 'bacteria|viral|archaea'` ]]; then
echo "Downloadinging rna sequence files" >&2
cut -f $TAXID_FIELD,$FTP_PATH_FIELD "$ASSEMBLY_SUMMARY_FILE"| sed 's#\([^/]*\)$#\1/\1_rna.fna.gz#' |\
......
......@@ -9,12 +9,17 @@
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;
my ($centrifuge_index, $min_score, $min_length);
my $only_unique = 0;
my $show_zeros = 0;
my $is_cnts_table = 0;
my $PROG = "centrifuge-kreport";
my $CWD = dirname( abs_path( $0 ) ) ;
GetOptions(
"help" => \&display_help,
......@@ -71,9 +76,23 @@ if ($is_cnts_table) {
$seq_count += $count;
}
} else {
<>;
my $header = <>;
my @cols = split /\s+/, $header ;
my %headerMap ;
for ( my $i = 0 ; $i < scalar( @cols ) ; ++$i )
{
$headerMap{ $cols[$i] } = $i ;
}
while (<>) {
my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
#my (undef,$seqID,$taxid,$score, undef, $hitLength, $queryLength, $numMatches) = split /\t/;
my @cols = split /\s+/ ;
my $seqID = $cols[ $headerMap{ "seqID" } ] ;
my $taxid = $cols[ $headerMap{ "taxID" } ] ;
my $score = $cols[ $headerMap{ "score" } ] ;
my $hitLength = $cols[ $headerMap{ "hitLength" } ] ;
my $queryLength = $cols[ $headerMap{ "queryLength" } ] ;
my $numMatches = $cols[ $headerMap{ "numMatches" } ] ;
next if $only_unique && $numMatches > 1;
next if defined $min_length && $hitLength < $min_length;
next if defined $min_score && $score < $min_score;
......@@ -150,7 +169,7 @@ sub dfs_summation {
sub load_taxonomy {
print STDERR "Loading names file ...\n";
open NAMES, "-|", "centrifuge-inspect --name-table $centrifuge_index"
open NAMES, "-|", "$CWD/centrifuge-inspect --name-table $centrifuge_index"
or die "$PROG: can't open names file: $!\n";
while (<NAMES>) {
chomp;
......@@ -162,7 +181,7 @@ sub load_taxonomy {
close NAMES;
print STDERR "Loading nodes file ...\n";
open NODES, "-|", "centrifuge-inspect --taxonomy-tree $centrifuge_index"
open NODES, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index"
or die "$PROG: can't open nodes file: $!\n";
while (<NODES>) {
chomp;
......
#!/usr/bin/env perl
# centrifuge-promote.pl: rewrite the taxonomy ids in a Centrifuge output
# file so that each hit is "promoted" to a requested taxonomy level,
# using the taxonomy tree stored inside the Centrifuge index.
use strict ;
use warnings ;
use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;
die "Usage: centrifuge-promote.pl centrifuge_index centrifuge_output level > output\n\n".
"Promote the taxonomy id to specified level in Centrifuge output.\n" if ( @ARGV == 0 ) ;
# Directory holding this script; sibling tools (centrifuge-inspect) are
# invoked from the same directory rather than relying on $PATH.
my $CWD = dirname( abs_path( $0 ) ) ;
# Go through the index to obtain the taxonomy tree
my %taxParent ;      # tax id -> parent tax id
my %taxIdToSeqId ;   # tax id -> sequence id (from the conversion table)
my %taxLevel ;       # tax id -> rank name (e.g. "species", "genus")
my $centrifuge_index = $ARGV[0] ;
# Read the taxonomy tree (nodes.dmp-style, "\t|\t"-separated fields:
# tax id, parent tax id, rank) by piping from centrifuge-inspect.
open FP1, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t\|\t/;
$taxParent{ $cols[0] } = $cols[1] ;
$taxLevel{ $cols[0] } = $cols[2] ;
}
close FP1 ;
# Read the seqID<TAB>taxID conversion table; note the map is keyed by
# tax id, so only one sequence id is kept per taxonomy id.
open FP1, "-|", "$CWD/centrifuge-inspect --conversion-table $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t/ ;
$taxIdToSeqId{ $cols[1] } = $cols[0] ;
}
close FP1 ;
# Go through the output of centrifuge
my $level = $ARGV[2] ;
# Walk up the taxonomy tree from $tid until a node whose rank equals the
# requested $level is found.  Returns that ancestor's taxonomy id, or 0
# when the id is unknown/invalid or the root is reached without a match.
sub PromoteTaxId
{
my ($tid) = @_ ;
while ( 1 )
{
return 0 if ( $tid <= 0 || !defined( $taxLevel{ $tid } ) ) ;
return $tid if ( $taxLevel{ $tid } eq $level ) ;
# The root (tax id 1) is its own parent; stop instead of looping forever.
return 0 if ( $tid <= 1 ) ;
$tid = $taxParent{ $tid } ;
}
}
# Emit the group of Centrifuge hit lines belonging to one read, with
# each hit's taxonomy id promoted to the requested level.  Hits that
# promote to the same taxonomy id are collapsed (first occurrence wins),
# and the last column of every surviving line is rewritten to hold the
# number of distinct promoted hits.
sub OutputPromotedLines
{
my @hits = @{ $_[0] } ;
return unless ( scalar( @hits ) > 0 ) ;
my %seenTaxId ;
my @promoted ;
foreach my $hit ( @hits )
{
my @fields = split /\t+/, $hit ;
my $tid = PromoteTaxId( $fields[2] ) ;
# Keep the original id when promotion failed or landed on the root.
$tid = $fields[2] if ( $tid <= 1 ) ;
my $levelName = $fields[1] ;
$levelName = $taxLevel{ $tid } if ( $tid >= 1 && defined $taxLevel{ $tid } ) ;
next if ( defined $seenTaxId{ $tid } ) ;
$seenTaxId{ $tid } = 1 ;
$fields[2] = $tid ;
$fields[1] = $levelName ;
push @promoted, join( "\t", @fields ) ;
}
# Second pass: now that the distinct-hit count is known, patch it into
# the final column of each line before printing.
my $numMatches = scalar( @promoted ) ;
foreach my $line ( @promoted )
{
my @fields = split /\t+/, $line ;
$fields[-1] = $numMatches ;
print join( "\t", @fields ), "\n" ;
}
}
# Stream the Centrifuge output file, grouping consecutive lines that
# share a read id (Centrifuge emits one line per match, grouped by
# read), and flush each group through OutputPromotedLines.
open FP1, $ARGV[1] ;
my $header = <FP1> ;
my $prevReadId = "" ;
my @lines ;
# The header line is passed through unchanged.
print $header ;
while ( <FP1> )
{
chomp ;
my @cols = split /\t/ ;
if ( $cols[0] eq $prevReadId )
{
push @lines, $_ ;
}
else
{
# New read id: emit the previous read's accumulated hits, then
# start a fresh group with the current line.
$prevReadId = $cols[0] ;
OutputPromotedLines( \@lines ) ;
undef @lines ;
push @lines, $_ ;
}
}
# Flush the final read's group.
OutputPromotedLines( \@lines ) ;
close FP1 ;
......@@ -501,7 +501,7 @@ static void resetOptions() {
col_name_map["CIGAR"] = PLACEHOLDER;
col_name_map["RNEXT"] = SEQ_ID;
col_name_map["PNEXT"] = PLACEHOLDER_ZERO;
col_name_map["TLEN"] = PLACEHOLDER_ZERO;
col_name_map["TLEN"] = QUERY_LENGTH ; //PLACEHOLDER_ZERO;
col_name_map["SEQ"] = SEQ;
col_name_map["QUAL"] = QUAL;
......
......@@ -418,8 +418,11 @@ public:
if(!_tree_traverse) {
if(_hitMap.size() > (size_t)rp.khits)
{
reportUnclassified( sink ) ;
return 0;
}
}
uint8_t rank = 0;
while(_hitMap.size() > (size_t)rp.khits) {
......@@ -511,7 +514,10 @@ public:
}
}
if(!only_host_taxIDs && _hitMap.size() > (size_t)rp.khits)
{
reportUnclassified( sink ) ;
return 0;
}
#if 0
// boost up the score if the assignment is unique
......@@ -528,7 +534,7 @@ public:
max_score += (rdlen > 15 ? (rdlen - 15) * (rdlen - 15) : 0);
}
bool reported = false ;
for(size_t gi = 0; gi < _hitMap.size(); gi++) {
assert_gt(_hitMap[gi].score, 0);
HitCount<index_t>& hitCount = _hitMap[gi];
......@@ -555,7 +561,12 @@ public:
hitCount.readPositions,
isFw);
sink.report(0, &rs);
reported = true ;
}
if ( reported == false )
reportUnclassified( sink ) ;
return 0;
}
......@@ -1024,6 +1035,17 @@ private:
return idx;
}
void reportUnclassified( AlnSinkWrap<index_t>& sink )
{
AlnRes rs ;
EList<pair<uint32_t,uint32_t> > dummy ;
dummy.push_back( make_pair( 0, 0 ) ) ;
rs.init( 0, 0, string( "unclassified" ), 0, 0, 0, dummy, true ) ;
sink.report( 0, &rs ) ;
}
// compare BWTHits by size, ascending, first, then by length, descending
// TODO: move this operator into BWTHits if that is the standard way we would like to sort
// TODO: this ordering does not necessarily give the best results
......
centrifuge (1.0.3-1) unstable; urgency=medium
* New upstream release
-- Andreas Tille <tille@debian.org> Fri, 02 Mar 2018 19:59:50 +0100
centrifuge (1.0.3~beta-2) unstable; urgency=medium
* Move hisat2 and jellyfish to Build-Depends to make sure package builds
......
......@@ -6,11 +6,9 @@ Subject: Fix build with rank
classifier.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/classifier.h b/classifier.h
index f61a7f0..13b110b 100644
--- a/classifier.h
+++ b/classifier.h
@@ -425,7 +425,7 @@ public:
@@ -428,7 +428,7 @@ public:
while(_hitMap.size() > (size_t)rp.khits) {
_hitTaxCount.clear();
for(size_t i = 0; i < _hitMap.size(); i++) {
......
......@@ -6,11 +6,9 @@ Subject: Fix make install DESTDIR
Makefile | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/Makefile b/Makefile
index 8d66b3c..1889b09 100644
--- a/Makefile
+++ b/Makefile
@@ -412,20 +412,20 @@ prefix=/usr/local
@@ -413,20 +413,20 @@ prefix=/usr/local
.PHONY: install
install: all
......
......@@ -13,7 +13,7 @@ Description: Propagate hardening options
#CFLAGS = -fdiagnostics-color=always
ifeq (1,$(USE_SRA))
@@ -254,7 +254,8 @@ DEFS=-fno-strict-aliasing \
@@ -255,7 +255,8 @@ DEFS=-fno-strict-aliasing \
$(CFLAGS) \
$(PREF_DEF) \
$(MM_DEF) \
......
This diff is collapsed.
......@@ -52,7 +52,7 @@
<table width="100%"><tr><td>last updated:</td> <td align="right">12/06/2016</td></tr>
<tr>
<td>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria (compressed)</i></a>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Archaea (compressed)</i></a>
</td>
<td align="right" style="font-size: x-small">
<b>4.4 GB</b>
......@@ -60,7 +60,7 @@
</tr>
<tr>
<td>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Viruses, Human (compressed)</i></a>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Archaea, Viruses, Human (compressed)</i></a>
</td>
<td align="right" style="font-size: x-small">
<b>5.4 GB</b>
......@@ -68,7 +68,7 @@
</tr>
<tr>
<td>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Viruses, Human </i></a>
<a href="ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p+h+v.tar.gz"><i>&nbsp;&nbsp;&nbsp;Bacteria, Archaea, Viruses, Human </i></a>
</td>
<td align="right" style="font-size: x-small">
<b>7.9 GB</b>
......
......@@ -11,6 +11,7 @@ KEEP_FILES?=0
get_ref_file_names = $(addprefix $(REFERENCE_SEQUENCES_DIR)/, $(addsuffix $(1), \
$(addprefix all-,$(COMPLETE_GENOMES)) \
$(addprefix all-,$(addsuffix -chromosome_level,$(CHROMOSOME_LEVEL_GENOMES))) \
$(addprefix all-,$(addsuffix -any_level,$(ANY_LEVEL_GENOMES))) \
$(addprefix mammalian-reference-,$(MAMMALIAN_TAXIDS)) \
$(addprefix all-compressed-,$(COMPLETE_GENOMES_COMPRESSED)) \
$(if $(INCLUDE_CONTAMINANTS),contaminants)))
......@@ -43,6 +44,10 @@ STANDARD TARGETS:
p+h+v As above, but with uncompressed bacterial genomes
p+v
v
Alternatively, a IDX_NAME and one or more genomes may be specified as
options to build a custom database.
......@@ -79,25 +84,37 @@ all:
IDX_NAME?=$(shell basename $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))))
INDICES=p+h+v p_compressed p_compressed+h+v refseq_microbial refseq_full nt
INDICES=p+h+v p+v v p p_compressed p_compressed+h+v refseq_microbial refseq_full nt
p+h+v: export COMPLETE_GENOMES:=archaea bacteria viral
p+h+v: export ANY_LEVEL_GENOMES:=viral
p+h+v: export COMPLETE_GENOMES:=archaea bacteria
p+h+v: export MAMMALIAN_TAXIDS:=9606
p+h+v: export INCLUDE_CONTAMINANTS:=1
p+h+v: export IDX_NAME:=p+h+v
p_compressed: export COMPLETE_GENOMES:=
p+v: export ANY_LEVEL_GENOMES:=viral
p+v: export COMPLETE_GENOMES:=archaea bacteria
p+v: export INCLUDE_CONTAMINANTS:=1
p+v: export IDX_NAME:=p+v
v: export ANY_LEVEL_GENOMES:=viral
v: export IDX_NAME:=v
p: export COMPLETE_GENOMES:=archaea bacteria
p: export IDX_NAME:=p
p_compressed: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
p_compressed: export IDX_NAME:=p_compressed
p_compressed+h+v: export COMPLETE_GENOMES:=viral
p_compressed+h+v: export ANY_LEVEL_GENOMES:=viral
p_compressed+h+v: export COMPLETE_GENOMES_COMPRESSED:=archaea bacteria
p_compressed+h+v: export MAMMALIAN_TAXIDS:=9606
p_compressed+h+v: export INCLUDE_CONTAMINANTS:=1
p_compressed+h+v: export IDX_NAME:=p_compressed+h+v
refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa viral
refseq_microbial: export COMPLETE_GENOMES:=archaea bacteria fungi protozoa
refseq_microbial: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
refseq_microbial: export ANY_LEVEL_GENOMES:=viral
##refseq_microbial: export SMALL_GENOMES:=mitochondrion plasmid plastid # TODO
refseq_microbial: export MAMMALIAN_TAXIDS:=9606 10090
refseq_microbial: export INCLUDE_CONTAMINANTS:=1
......@@ -106,6 +123,7 @@ refseq_microbial: export CF_BUILD_OPTS+=--ftabchars 14
refseq_full: export COMPLETE_GENOMES:=archaea bacteria fungi invertebrate plant protozoa vertebrate_mammalian vertebrate_other viral
refseq_full: export CHROMOSOME_LEVEL_GENOMES:=$(COMPLETE_GENOMES)
refseq_full: export ANY_LEVEL_GENOMES:=viral
refseq_full: export SMALL_GENOMES:=mitochondrion plasmid plastid
refseq_full: export MAMMALIAN_TAXIDS:=9606 10090
refseq_full: export INCLUDE_CONTAMINANTS:=1
......@@ -196,7 +214,7 @@ $(REFERENCE_SEQUENCES_DIR)/mammalian-reference-%.fna: | $(REFERENCE_SEQUENCES_DI
@[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuge-download -o $(TMP_DIR) -d "vertebrate_mammalian" -a "Chromosome" -t $* -c 'reference genome' -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX), $(notdir $@))
cat $(TMP_DIR)/vertebrate_mammalian/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/vertebrate_mammalian -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/vertebrate_mammalian ]] || mkdir -p $(DL_DIR)/vertebrate_mammalian
......@@ -219,12 +237,12 @@ else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
......@@ -233,12 +251,12 @@ else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/all-%-chromosome_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
$(REFERENCE_SEQUENCES_DIR)/all-%-any_level.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Chromosome" -d "$*" -P $(THREADS) refseq > \
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -a "Any" -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
cat $(TMP_DIR)/$*/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
......@@ -247,12 +265,24 @@ else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/all-%.fna: | $(REFERENCE_SEQUENCES_DIR) .dustmasker-ok
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
@echo Downloading and dust-masking $*
centrifuge-download -o $(TMP_DIR) $(CF_DOWNLOAD_OPTS) -d "$*" -P $(THREADS) refseq > \
$(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
find $(TMP_DIR)/$* -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/$* ]] || mkdir -p $(DL_DIR)/$*
mv $(TMP_DIR)/$*/* $(DL_DIR)/$*
else
rm -rf $(TMP_DIR)
endif
$(REFERENCE_SEQUENCES_DIR)/contaminants.fna: | $(REFERENCE_SEQUENCES_DIR)
[[ -d $(TMP_DIR) ]] && rm -rf $(TMP_DIR); mkdir -p $(TMP_DIR)
centrifuge-download -o $(TMP_DIR) contaminants > $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@))
cat $(TMP_DIR)/contaminants/*.fna > $@.tmp && mv $@.tmp $@
find $(TMP_DIR)/contaminants -name "*.fna" | xargs cat > $@.tmp && mv $@.tmp $@
mv $(TMP_DIR)/$(patsubst %.fna,%$(TAXID_SUFFIX),$(notdir $@)) $(patsubst %.fna,%$(TAXID_SUFFIX),$@)
ifeq (1,$(KEEP_FILES))
[[ -d $(DL_DIR)/contaminants ]] || mkdir -p $(DL_DIR)/contaminants
......