New upstream version 1.6.2+dfsg

parent 9c61fb61
\documentclass[12pt]{report}
\usepackage{graphicx,fancyvrb,url,comment,longtable,color,ccaption}
\usepackage{graphicx,fancyvrb,url,comment,longtable,color,ccaption,hyperref}
\usepackage{amsmath}
\textwidth=6.6in
......@@ -36,9 +36,9 @@
\begin{center}
{\Huge\bf Subread/Rsubread Users Guide}\\
\vspace{1 cm}
{\centering\large Subread v1.6.1/Rsubread v1.29.1\\}
{\centering\large Subread v1.6.2/Rsubread v1.31.3\\}
\vspace{1 cm}
\centering 21 March 2018\\
\centering 15 May 2018\\
\vspace{5 cm}
\Large Wei Shi and Yang Liao\\
\vspace{1 cm}
......@@ -289,10 +289,6 @@ Up to 4,096 posssible alignments will be examined for each read pair and a maxim
Total number of matched bases (for genomic DNA-seq data) or mis-matched bases (for RNA-seq data) will be used to determine the best mapping in the final re-alignment step.
\section{Recommended aligner setting}
It is recommended to report uniquely mapped reads only when running \code{Subread} and \code{Subjunc} aligners since this will give the most accurate mapping result.
This is the default setting for the two aligners in both SourceForge {\Subread} and {\Rsubread} packages.
\chapter{Mapping reads generated by genomic DNA sequencing technologies}
\label{chapter:subread-dnaseq}
......@@ -374,6 +370,9 @@ The \code{subread-buildindex} function divides each reference sequence name (whi
The first substrings must be distinct for different reference sequences (otherwise the index cannot be built).
Note that the starting `\code{>}' character in the header line is not included in the first substrings.
Sequences of reference genomes can be downloaded from NCBI, UCSC or Ensembl databases.
For instance, the latest mouse reference genome GRCm38.p6 (as of writing) can be downloaded from the NCBI database via \href{ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz}{\color{blue}\url{ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz}}.
Table 1 describes the arguments used by the \code{subread-buildindex} program.
\newpage
......@@ -414,97 +413,100 @@ Table 2 describes the arguments used by {\Subread} aligner (also {\Subjunc} and
Arguments used in Bioconductor \code{Rsubread} package are included in parenthesis.\\
\begin{longtable}{|p{4cm}|p{12cm}|}
\begin{longtable}{|p{5.5cm}|p{10.5cm}|}
\multicolumn{2}{p{16cm}}{Table 2: Arguments used by the \code{subread-align}/\code{subjunc}/\code{sublong} programs included in the SourceForge {\Subread} package in alphabetical order.
Arguments in parenthesis in the first column are the equivalent arguments used in Bioconductor {\Rsubread} package.
Arguments used by \code{subread-align} only are marked with $^*$.
Arguments used by \code{subjunc} only are marked with $^{**}$.
Arguments used by \code{sublong} only are marked with $^{***}$.
\newline
($^{1}$\code{subread-align} arguments,
$^{2}$\code{subjunc} arguments and
$^{3}$\code{sublong} arguments)
\newline
}
\endfirsthead
\hline
Arguments & Description \\
\hline
-a $<string>$\newline (\code{useAnnotation, annot.inbuilt, annot.ext}) & Name of a gene annotation file that includes chromosomal coordinates of exons from each gene. GTF/GFF format by default. See -F option for supported formats. Users may use the inbuilt annotations included in this package (SAF format) for human and mouse data. Exon-exon junctions are inferred by connecting each pair of neighboring exons from the same gene. Gzipped file is accepted. \\
$^{1,2}$ -a $<string>$\newline (\code{useAnnotation, annot.inbuilt, annot.ext}) & Name of a gene annotation file that includes chromosomal coordinates of exons from each gene. GTF/GFF format by default. See -F option for supported formats. Users may use the inbuilt annotations included in this package (SAF format) for human and mouse data. Exon-exon junctions are inferred by connecting each pair of neighboring exons from the same gene. Gzipped file is accepted. \\
\hline
-A $<string>$\newline (\code{chrAliases}) & Name of a comma-delimited text file that includes aliases of chromosome names. This file should contain two columns. First column contains names of chromosomes included in the SAF or GTF annotation and second column contains corresponding names of chromosomes in the reference genome. No column headers should be provided. Also note that chromosome names are case sensitive. This file can be used to match chromosome names between the annotation and the reference genome.\\
$^{1,2}$ -A $<string>$\newline (\code{chrAliases}) & Name of a comma-delimited text file that includes aliases of chromosome names. This file should contain two columns. First column contains names of chromosomes included in the SAF or GTF annotation and second column contains corresponding names of chromosomes in the reference genome. No column headers should be provided. Also note that chromosome names are case sensitive. This file can be used to match chromosome names between the annotation and the reference genome.\\
\hline
-b \newline (\code{color2base=TRUE}) & Output base-space reads instead of color-space reads in mapping output for color space data (eg. LifTech SOLiD data). Note that the mapping itself will still be performed at color-space.\\
$^{1,2}$ -b \newline (\code{color2base=TRUE}) & Output base-space reads instead of color-space reads in mapping output for color space data (eg. LifTech SOLiD data). Note that the mapping itself will still be performed at color-space.\\
\hline
-B $<int>$ \newline (\code{nBestLocations}) & Specify the maximal number of equally-best mapping locations to be reported for a read. 1 by default. In the mapping output, the `NH' tag is used to indicate how many alignments are reported for the read and the `HI' tag is used for numbering the alignments reported for the same read. This option should be used together with the `$--$multiMapping' option. \\
$^{1,2}$ -B $<int>$ \newline (\code{nBestLocations}) & Specify the maximal number of equally-best mapping locations to be reported for a read. 1 by default. In the mapping output, the `NH' tag is used to indicate how many alignments are reported for the read and the `HI' tag is used for numbering the alignments reported for the same read. This option should be used together with the `$--$multiMapping' option. \\
\hline
-d $<int>$ \newline (\code{minFragLength}) & Specify the minimum fragment/template length, 50 by default. Note that if the two reads from the same pair do not satisfy the fragment length criteria, they will be mapped individually as if they were single-end reads.\\
$^{1,2}$ -d $<int>$ \newline (\code{minFragLength}) & Specify the minimum fragment/template length, 50 by default. Note that if the two reads from the same pair do not satisfy the fragment length criteria, they will be mapped individually as if they were single-end reads.\\
\hline
-D $<int>$ \newline (\code{maxFragLength}) & Specify the maximum fragment/template length, 600 by default.\\
$^{1,2}$ -D $<int>$ \newline (\code{maxFragLength}) & Specify the maximum fragment/template length, 600 by default.\\
\hline
-F $<string>$ \newline (\code{isGTF}) & Specify format of the provided annotation file. Acceptable formats include `GTF' (or compatible GFF format) and `SAF'. Default format in SourceForge {\Subread} is `GTF'. Default format in {\Rsubread} is `SAF'. \\
$^{1,2}$ -F $<string>$ \newline (\code{isGTF}) & Specify format of the provided annotation file. Acceptable formats include `GTF' (or compatible GFF format) and `SAF'. Default format in SourceForge {\Subread} is `GTF'. Default format in {\Rsubread} is `SAF'. \\
\hline
-i $<string> \newline (\code{index}) $ & Specify the base name of the index. The index used by \code{sublong} aligner must be a full index and has only one block, ie. `-F' and `-B' options must be specified when building index with \code{subread-buildindex}.\\
$^{1,2,3}$ -i $<string> \newline (\code{index}) $ & Specify the base name of the index. The index used by \code{sublong} aligner must be a full index and has only one block, ie. `-F' and `-B' options must be specified when building index with \code{subread-buildindex}.\\
\hline
-I $<int>$ \newline (\code{indels}) & Specify the number of INDEL bases allowed in the mapping. 5 by default. Indels of up to 200bp long can be detected.\\
$^{1,2}$ -I $<int>$ \newline (\code{indels}) & Specify the number of INDEL bases allowed in the mapping. 5 by default. Indels of up to 200bp long can be detected.\\
\hline
-m $<int>$ \newline (\code{TH1}) & Specify the consensus threshold, which is the minimal number of consensus subreads required for reporting a hit. The consensus subreads are those subreads which vote for the same location in the reference genome for the read. If pair-end read data are provided, at least one of the two reads from the same pair must satisfy this criteria. 3 by default. For \code{sublong}, this is the consensus threshold for mapping a readlet (1 by default). A readlet is a 100bp sequence extracted from a long read. \\
$^{1,2}$ -m $<int>$ \newline (\code{TH1}) & Specify the consensus threshold, which is the minimal number of consensus subreads required for reporting a hit. The consensus subreads are those subreads which vote for the same location in the reference genome for the read. If pair-end read data are provided, at least one of the two reads from the same pair must satisfy this criteria. 3 by default. For \code{sublong}, this is the consensus threshold for mapping a readlet (1 by default). A readlet is a 100bp sequence extracted from a long read. \\
\hline
-M $<int>$ \newline (\code{maxMismatches}) & Specify the maximum number of mis-matched bases allowed in the alignment. 3 by default. Mis-matches found in soft-clipped bases are not counted.\\
$^{1,2}$ -M $<int>$ \newline (\code{maxMismatches}) & Specify the maximum number of mis-matched bases allowed in the alignment. 3 by default. Mis-matches found in soft-clipped bases are not counted.\\
\hline
-n $<int>$ \newline (\code{nsubreads}) & Specify the number of subreads extracted from each read, 10 by default. For \code{sublong}, this is number of subreads (85 by default) extracted from each readlet. A readlet is a 100bp sequence extracted from a long read.\\
$^{1,2}$ -n $<int>$ \newline (\code{nsubreads}) & Specify the number of subreads extracted from each read, 10 by default. For \code{sublong}, this is number of subreads (85 by default) extracted from each readlet. A readlet is a 100bp sequence extracted from a long read.\\
\hline
-o $<string>$ \newline (\code{output\_file}) & Give the name of output file. The default output format is BAM. All reads are included in mapping output, including both mapped and unmapped reads, and they are in the same order as in the input file.\\
$^{1,2,3}$ -o $<string>$ \newline (\code{output\_file}) & Give the name of output file. The default output format is BAM. All reads are included in mapping output, including both mapped and unmapped reads, and they are in the same order as in the input file.\\
\hline
-p $<int>$ \newline (\code{TH2}) & Specify the minimum number of consensus subreads both reads from the same pair must have. This argument is only applicable for paired-end read data. The value of this argument should not be greater than that of `-m' option, so as to rescue those read pairs in which one read has a high mapping quality but the other does not. 1 by default.\\
$^{1,2}$ -p $<int>$ \newline (\code{TH2}) & Specify the minimum number of consensus subreads both reads from the same pair must have. This argument is only applicable for paired-end read data. The value of this argument should not be greater than that of `-m' option, so as to rescue those read pairs in which one read has a high mapping quality but the other does not. 1 by default.\\
\hline
-P $<3:6>$ \newline (\code{phredOffset}) & Specify the format of Phred scores used in the input data, '3' for phred+33 and '6' for phred+64. '3' by default. For \code{align} function in \Rsubread, the possible values are `33' (for phred+33) and `64' (for phred+64). `33' by default.\\
$^{1,2}$ -P $<3:6>$ \newline (\code{phredOffset}) & Specify the format of Phred scores used in the input data, '3' for phred+33 and '6' for phred+64. '3' by default. For \code{align} function in \Rsubread, the possible values are `33' (for phred+33) and `64' (for phred+64). `33' by default.\\
\hline
-r $<string>$ \newline (\code{readfile1}) & Give the name of an input file (multiple files are allowed to be provided to \code{align} and \code{subjunc} functions in {\Rsubread}). For paired-end read data, this gives the first read file and the other read file should be provided via the -R option. Supported input formats include FASTQ/FASTA (uncompressed or gzip compressed)(default), SAM and BAM.\\
$^{1,2,3}$ -r $<string>$ \newline (\code{readfile1}) & Give the name of an input file (multiple files are allowed to be provided to \code{align} and \code{subjunc} functions in {\Rsubread}). For paired-end read data, this gives the first read file and the other read file should be provided via the -R option. Supported input formats include FASTQ/FASTA (uncompressed or gzip compressed)(default), SAM and BAM.\\
\hline
-R $<string>$ \newline (\code{readfile2}) & Provide name of the second read file from paired-end data. The program will switch to paired-end read mapping mode if this file is provided. (multiple files are allowed to be provided to \code{align} and \code{subjunc} functions in {\Rsubread}).\\
$^{1,2}$ -R $<string>$ \newline (\code{readfile2}) & Provide name of the second read file from paired-end data. The program will switch to paired-end read mapping mode if this file is provided. (multiple files are allowed to be provided to \code{align} and \code{subjunc} functions in {\Rsubread}).\\
\hline
-S $<ff:fr:rf>$ \newline (\code{PE\_orientation}) & Specify the orientation of the two reads from the same pair. It has three possible values including `fr', `ff' and `'rf. Letter `f' denotes the forward strand and letter `r' the reverse strand. `fr' by default (ie. the first read in the pair is on the forward strand and the second read on the reverse strand).\\
$^{1,2}$ -S $<ff:fr:rf>$ \newline (\code{PE\_orientation}) & Specify the orientation of the two reads from the same pair. It has three possible values including `fr', `ff' and `'rf. Letter `f' denotes the forward strand and letter `r' the reverse strand. `fr' by default (ie. the first read in the pair is on the forward strand and the second read on the reverse strand).\\
\hline
$^*$ -t $<int>$ \newline (\code{type}) & Specify the type of input sequencing data. Possible values include \code{0}, denoting RNA-seq data, or \code{1}, denoting genomic DNA-seq data. User must specify the value. Character values including `rna' and `dna' can also be used in the {\R} function. For genomic DNA-seq data, the aligner takes into account both the number of matched bases and the number of mis-matched bases to determine the the best mapping location after applying the `seed-and-vote' approach for read mapping. For RNA-seq data, only the number of mis-matched bases is considered for determining the best mapping location. \\
$^1$ -t $<int>$ \newline (\code{type}) & Specify the type of input sequencing data. Possible values include \code{0}, denoting RNA-seq data, or \code{1}, denoting genomic DNA-seq data. User must specify the value. Character values including `rna' and `dna' can also be used in the {\R} function. For genomic DNA-seq data, the aligner takes into account both the number of matched bases and the number of mis-matched bases to determine the the best mapping location after applying the `seed-and-vote' approach for read mapping. For RNA-seq data, only the number of mis-matched bases is considered for determining the best mapping location. \\
\hline
-T $<int>$ \newline (\code{nthreads}) & Specify the number of threads/CPUs used for mapping. The value should be between 1 and 32. 1 by default.\\
$^{1,2,3}$ -T $<int>$ \newline (\code{nthreads}) & Specify the number of threads/CPUs used for mapping. The value should be between 1 and 32. 1 by default.\\
%\hline
%-u \newline (\code{unique=TRUE}) & Output uniquely mapped reads only. Reads that were found to have more than one best mapping location will not be reported.\\
\hline
$^{***}$-X $<int>$ & Specify the maximum number of mis-matched bases allowed in the mapping of each subread. 0 by default. This parameter is only applicable for \code{sublong}. \\
$^{2}$$--$allJunctions \newline (\code{reportAllJunctions} \newline \code{=TRUE}) & This option should be used with \code{subjunc} for detecting canonical exon-exon junctions (with `GT/AG' donor/receptor sites), non-canonical exon-exon junctions and structural variants (SVs) in RNA-seq data. detected junctions will be saved to a file with suffix name ``.junction.bed". Detected SV breakpoints will be saved to a file with suffix name ``.breakpoints.txt", which includes chromosomal coordinates of detected SV breakpoints and also number of supporting reads. In the read mapping output, each breakpoint-containing read will contain the following extra fields for the description of its secondary alignment: CC(Chr), CP(Position),CG(CIGAR) and CT(strand). The primary alignment (described in the main field) and secondary alignment give respectively the mapping results for the two segments from the same read that were seperated by the breakpoint. Note that each breakpoint-containing read occupies only one row in mapping output. The mapping output includes mapping results for all the reads.\\
\hline
$^{**}$$--$allJunctions \newline (\code{reportAllJunctions} \newline \code{=TRUE}) & This option should be used with \code{subjunc} for detecting canonical exon-exon junctions (with `GT/AG' donor/receptor sites), non-canonical exon-exon junctions and structural variants (SVs) in RNA-seq data. detected junctions will be saved to a file with suffix name ``.junction.bed". Detected SV breakpoints will be saved to a file with suffix name ``.breakpoints.txt", which includes chromosomal coordinates of detected SV breakpoints and also number of supporting reads. In the read mapping output, each breakpoint-containing read will contain the following extra fields for the description of its secondary alignment: CC(Chr), CP(Position),CG(CIGAR) and CT(strand). The primary alignment (described in the main field) and secondary alignment give respectively the mapping results for the two segments from the same read that were seperated by the breakpoint. Note that each breakpoint-containing read occupies only one row in mapping output. The mapping output includes mapping results for all the reads.\\
$^{1,2}$$--$BAMinput \newline (\code{input\_format="BAM"}) & Specify that the input read data are in BAM format.\\
\hline
$--$BAMinput \newline (\code{input\_format="BAM"}) & Specify that the input read data are in BAM format.\\
$^{1,2}$$--$complexIndels & Detect multiple short indels that occur concurrently in a small genomic region (these indels could be as close as 1bp apart).\\
\hline
$--$complexIndels & Detect multiple short indels that occur concurrently in a small genomic region (these indels could be as close as 1bp apart).\\
$^{1,2}$$--$DPGapExt $<int>$ \newline (\code{DP\_GapExtPenalty}) & Specify the penalty for extending the gap when performing the Smith-Waterman dynamic programming. 0 by default.\\
\hline
$--$DPGapExt $<int>$ \newline (\code{DP\_GapExtPenalty}) & Specify the penalty for extending the gap when performing the Smith-Waterman dynamic programming. 0 by default.\\
$^{1,2}$$--$DPGapOpen $<int>$ \newline (\code{DP\_GapOpenPenalty}) & Specify the penalty for opening a gap when applying the Smith-Waterman dynamic programming to detecting indels. -2 by default.\\
\hline
$--$DPGapOpen $<int>$ \newline (\code{DP\_GapOpenPenalty}) & Specify the penalty for opening a gap when applying the Smith-Waterman dynamic programming to detecting indels. -2 by default.\\
$^{1,2}$$--$DPMismatch $<int>$ \newline (\code{DP\_MismatchPenalty}) & Specify the penalty for mismatches when performing the Smith-Waterman dynamic programming. 0 by default.\\
\hline
$--$DPMismatch $<int>$ \newline (\code{DP\_MismatchPenalty}) & Specify the penalty for mismatches when performing the Smith-Waterman dynamic programming. 0 by default.\\
$^{1,2}$$--$DPMatch $<int>$ \newline (\code{DP\_MatchScore}) & Specify the score for the matched base when performing the Smith-Waterman dynamic programming. 2 by default.\\
\hline
$--$DPMatch $<int>$ \newline (\code{DP\_MatchScore}) & Specify the score for the matched base when performing the Smith-Waterman dynamic programming. 2 by default.\\
$^{1,2}$$--$gtfFeature $<string>$ \newline (\code{GTF.featureType}) & Specify the type of features that will be extracted from a GTF annotation. `exon' by default. Feature types can be found in the 3rd column of a GTF annotation.\\
\hline
$--$gtfFeature $<string>$ \newline (\code{GTF.featureType}) & Specify the type of features that will be extracted from a GTF annotation. `exon' by default. Feature types can be found in the 3rd column of a GTF annotation.\\
$^{1,2}$$--$gtfAttr $<string>$ \newline (\code{GTF.attrType}) & Specify the type of attributes in a GTF annotation that will be used to group features. `gene\_id' by default. Attributes can be found in the 9th column of a GTF annotation.\\
\hline
$--$gtfAttr $<string>$ \newline (\code{GTF.attrType}) & Specify the type of attributes in a GTF annotation that will be used to group features. `gene\_id' by default. Attributes can be found in the 9th column of a GTF annotation.\\
$^{1,2}$$--$keepReadOrder \newline (\code{keepReadOrder} \newline \code{=FALSE}) & Output reads in the same order as in the input read file. This option only applies to BAM output. Note that in the output, reads from the same pair are always placed next to each other no matter this option is provided or not. \\
\hline
$--$multiMapping \newline (\code{unique=FALSE}) & Multi-mapping reads will also be reported in the mapping output. Number of alignments reported for each multi-mapping read is determined by the `-B' option. If the total number of equally best mapping locations found for a read is greater than the number specified by `-B', then random mapping locations (total number of these locations is the same as `-B' value) will be selected. For example, if value of `-B' is 1, then one random location will be reported. \\
$^{1,2}$$--$multiMapping \newline (\code{unique=FALSE}) & Multi-mapping reads will also be reported in the mapping output. Number of alignments reported for each multi-mapping read is determined by the `-B' option. If the total number of equally best mapping locations found for a read is greater than the number specified by `-B', then random mapping locations (total number of these locations is the same as `-B' value) will be selected. For example, if value of `-B' is 1, then one random location will be reported. \\
\hline
$--$rg $<string>$ \newline (\code{readGroup}) & Add a $<tag:value>$ to the read group (RG) header in the mapping output. \\
$^{1,2}$$--$rg $<string>$ \newline (\code{readGroup}) & Add a $<tag:value>$ to the read group (RG) header in the mapping output. \\
\hline
$--$rg-id $<string>$ \newline (\code{readGroupID}) & Specify the read group ID. If specified, the read group ID will be added to the read group header field and also to each read in the mapping output. \\
$^{1,2}$$--$rg-id $<string>$ \newline (\code{readGroupID}) & Specify the read group ID. If specified, the read group ID will be added to the read group header field and also to each read in the mapping output. \\
\hline
$--$SAMinput \newline (\code{input\_format="SAM"}) & Specify that the input read data are in SAM format.\\
$^{1,2}$$--$SAMinput \newline (\code{input\_format="SAM"}) & Specify that the input read data are in SAM format.\\
\hline
$--$SAMoutput \newline (\code{output\_format="SAM"}) & Specify that mapping results are saved into a SAM format file. \\
$^{1,2,3}$$--$SAMoutput \newline (\code{output\_format="SAM"}) & Specify that mapping results are saved into a SAM format file. \\
\hline
$^*$$--$sv \newline (\code{detectSV=TRUE}) & This option should be used with \code{subread-align} for detecting structural variants (SVs) in genomic DNA sequencing data. Detected SV breakpoints will be saved to a file with suffix name ``.breakpoints.txt", which includes chromosomal coordinates of detected SV breakpoints and also number of supporting reads for each SV event. In the read mapping output, each breakpoint-containing read will contain the following extra fields for the description of its secondary alignment: CC(Chr), CP(Position),CG(CIGAR) and CT(strand). The primary alignment (described in the main field) and secondary alignment give respectively the mapping results for the two segments from the same read that were seperated by the breakpoint. Note that each breakpoint-containing read occupies only one row in mapping output. The mapping output includes mapping results for all the reads.\\
$^{1,2}$$--$sortReadsByCoordinates \newline (\code{sortReadsByCoordinates} \newline \code{=FALSE}) & If specified, reads will be sorted by their mapping coordinates in the mapping output. This option is applicable for BAM output only. A BAI index file will also be generated for each BAM file so the BAM files can be directly loaded into a genome browser such as IGB or IGV.\\
\hline
$--$trim5 $<int>$ \newline (\code{nTrim5}) & Trim off $<int>$ number of bases from 5' end of each read. 0 by default.\\
$^1$$--$sv \newline (\code{detectSV=TRUE}) & This option should be used with \code{subread-align} for detecting structural variants (SVs) in genomic DNA sequencing data. Detected SV breakpoints will be saved to a file with suffix name ``.breakpoints.txt", which includes chromosomal coordinates of detected SV breakpoints and also number of supporting reads for each SV event. In the read mapping output, each breakpoint-containing read will contain the following extra fields for the description of its secondary alignment: CC(Chr), CP(Position),CG(CIGAR) and CT(strand). The primary alignment (described in the main field) and secondary alignment give respectively the mapping results for the two segments from the same read that were seperated by the breakpoint. Note that each breakpoint-containing read occupies only one row in mapping output. The mapping output includes mapping results for all the reads.\\
\hline
$--$trim3 $<int>$ \newline (\code{nTrim3}) & Trim off $<int>$ number of bases from 3' end of each read. 0 by default.\\
$^{1,2}$$--$trim5 $<int>$ \newline (\code{nTrim5}) & Trim off $<int>$ number of bases from 5' end of each read. 0 by default.\\
\hline
-v & Output version of the program. \\
$^{1,2}$$--$trim3 $<int>$ \newline (\code{nTrim3}) & Trim off $<int>$ number of bases from 3' end of each read. 0 by default.\\
\hline
$^{1,2,3}$ -v & Output version of the program. \\
\hline
\end{longtable}
......@@ -549,8 +551,14 @@ Short indels detected from the read data will be saved to a text file (`.indel')
If `$--$sv' is specified when running \code{subread-align}, breakpoints detected from structural variant events will be output to a text file for each library as well (`.breakpoints.txt').
Screen output includes a brief mapping summary, including percentage of uniquely mapped reads, percentage of multi-mapping reads and percentage of unmapped reads.
\newpage
\section{Mapping of long reads}
We developed a new long-read aligner, called {\Sublong}, for the mapping of long reads that were generated by long-read sequencing technologies such as Nanopore and PacBio sequencers.
{\Sublong} is also based on the seed-and-vote mapping strategy.
Parameters of {\Sublong} program can be found in Table 2.
\newpage
\chapter{Mapping reads generated by RNA sequencing technologies}
......@@ -703,14 +711,6 @@ The {\featureCounts} program can be readily used for summarizing reads to miRNA
\chapter{Mapping long sequence reads}
We have developed a new long-read aligner called {\Sublong}, which is also based on the seed-and-vote mapping strategy.
Parameters of {\Sublong} program can be found in Table 2.
\chapter{Read summarization}
\section{Introduction}
......@@ -743,12 +743,11 @@ Here we describe the {\featureCounts} program, an efficient and accurate read qu
\subsection{Input data}
The data input to {\featureCounts} consists of (i) one or more files of aligned reads in either SAM or BAM format and (ii) a list of genomic features in either Gene Transfer Format (GTF) or General Feature Format (GFF) or Simplified Annotation Format (SAF). The format of input reads is automatically detected (SAM or BAM).
The data input to {\featureCounts} consists of (i) one or more files of aligned reads (short or long reads) in either SAM or BAM format and (ii) a list of genomic features in either Gene Transfer Format (GTF) or General Feature Format (GFF) or Simplified Annotation Format (SAF). The format of input reads is automatically detected (SAM or BAM).
For paired-end reads, if they were location-sorted in the input {\featureCounts} will automatically re-order the reads to place next to each other the reads from the same pair before counting them.
We also provide an utility program {\repair} to allow users to pair up the reads before feeding them to {\featureCounts}.
Note that name-sorted paired-end reads generated by other programs may include incorrectly paired reads due to for example multi-mapping issue.
If this is the case, {\featureCounts} will re-sort them.
If the input contains location-sorted paired-end reads, {\featureCounts} will automatically re-order the reads to place next to each other the reads from the same pair before counting them.
Sometimes name-sorted paired-end input reads are not compatible with featureCounts (due to for example reporting of multi-mapping results) and in this case {\featureCounts} will also automatically re-order them.
We provide an utility program {\repair} to allow users to pair up the reads before feeding them to {\featureCounts}.
Both read alignment and read counting should use the same reference genome. For each read, the BAM/SAM file gives the name of the reference chromosome or contig the read mapped to, the start position of the read on the chromosome or contig/scaffold, and the so-called CIGAR string giving the detailed alignment information including insertions and deletions and so on relative to the start position.
......@@ -804,7 +803,7 @@ In this case, {\featureCounts} can be instructed to count fragments rather than
{\featureCounts} automatically sorts reads by name if paired reads are not in consecutive positions in the SAM or BAM file, with minimal cost.
Users do not need to sort their paired reads before providing them to {\featureCounts}.
\subsection{Assign reads to features or meta-features}
\subsection{Assign reads to features and meta-features}
{\featureCounts} is a general-purpose read summarization function, which assigns mapped reads (RNA-seq reads or genomic DNA-seq reads) to genomic features or meta-features.
A feature is an interval (range of positions) on one of the reference sequences.
......@@ -824,6 +823,12 @@ When counting reads at meta-feature level, a hit is called for a meta-feature if
Note that if a read hits a meta-feature, it is always counted once no matter how many features in the meta-feature this read overalps with.
For instance, an exon-spanning read overlapping with more than one exon within the same gene only contributes 1 count to the gene.
When assigning reads to genes or exons, most reads can be successfully assigned without ambiguity.
However if reads are to be assigned to transcripts, due to the high overlap between transcripts from the same gene, many reads will be found to overlap more than one transcript and therefore cannot be uniquely assigned.
Specialized transcript-level quantification tools are recommended for counting reads to transcripts.
Such tools use model-based approaches to deconvolve reads overlapping with multiple transcripts.
\subsection{Count multi-mapping reads and multi-overlapping reads}
......@@ -957,7 +962,7 @@ read mapping that produced the provided SAM/BAM files. This optional argument ca
\hline
-R $<string>$ \newline (\code{reportReads}) & Output detailed read assignment results for each read (or fragment if paired end). The detailed assignment results can be saved in three different formats including \code{CORE}, \code{SAM} and \code{BAM} (note that these values are case sensitive). \newline When \code{CORE} format is specified, a tab-delimited file will be generated for each input file. Name of each generated file is the input file name added with `.featureCounts'. Each generated file contains four columns including read name, status (assigned or the reason if not assigned), number of targets and target list. A target is a feature or a meta-feature. Items in the target lists is separated by comma. If a read is not assigned, its number of targets will be set as -1. \newline When \code{SAM} or \code{BAM} format is specified, the detailed assignment results will be saved to SAM and BAM format files. Names of generated files are the input file names added with `.featureCounts.sam' or `.featureCounts.bam'. Three tags are used to describe read assignment results: XS, XN and XT. Tag XS gives the assignment status. Tag XN gives number of targets. Tag XT gives comma separated target list. \\
\hline
-s $<int>$ \newline (\code{isStrandSpecific}) & Indicate if strand-specific read counting should be performed. Acceptable values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default. For paired-end reads, strand of the first read is taken as the strand of the whole fragment. FLAG field is used to tell if a read is first or second read in a pair.\\
-s $<int or string>$ \newline (\code{isStrandSpecific}) & Indicate if strand-specific read counting should be performed. A single integer value (applied to all input files) or a string of comma-separated values (applied to each corresponding input file) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files). For paired-end reads, strand of the first read is taken as the strand of the whole fragment. FLAG field is used to tell if a read is first or second read in a pair. Value of \code{isStrandSpecific} parameter in {\Rsubread} {\featureCounts} is a vector which has a length of either {\code{1}}, or the same with the total number of input files provided. \\
\hline
-t $<string>$ \newline (\code{GTF.featureType}) & Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default.\\
\hline
......@@ -973,6 +978,8 @@ $--$countNonSplit \newline AlignmentsOnly \newline (\code{nonSplitOnly}) & If s
\hline
$--$donotsort \newline (\code{autosort}) & If specified, paired end reads will not be re-ordered even if reads from the same pair were found not to be next to each other in the input.\\
\hline
$--$extraAttributes $<string>$ \newline (\code{GTF.attrType.extra}) & Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. If more than one attribute type is provided they should be separated by comma (in {\Rsubread} {\featureCounts} its value is a character vector). \\
\hline
$--$fraction \newline (\code{fraction}) & Assign fractional counts to features. This option must be used together with `-M' or `-O' or both. When `-M' is specified, each reported alignment from a multi-mapping read (identified via `NH' tag) will carry a count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When `-O' is specified, each overlapping feature will receive a count of 1/y, where y is the total number of features overlapping with the read. When both `-M' and `-O' are specified, each alignment will carry a count of 1/(x*y).\\
\hline
$--$fracOverlap $<float>$ \newline (\code{fracOverlap}) & Minimum fraction of overlapping bases in a read that is required for read assignment. Value should be a float number in the range [0,1]. 0 by default. If paired end, number of overlapping bases is counted from both reads. Soft-clipped bases are counted when calculating total read length (but ignored when counting overlapping bases). Both this option and `--minOverlap' option need to be satisfied for read assignment. \\
......@@ -999,6 +1006,8 @@ $--$readExtension5 $<int>$ \newline (\code{readExtension5}) & Reads are extended
\hline
$--$readExtension3 $<int>$ \newline (\code{readExtension3}) & Reads are extended downstream by $<int>$ bases from their 3' end. 0 by default.\\
\hline
$--$Rpath $<string>$ \newline (\code{reportReadsPath}) & Specify a directory to save the detailed assignment results. If unspecified, the directory where counting results are saved is used. See `-R' option for obtaining detailed assignment results for reads.\\
\hline
$--$tmpDir $<string>$ \newline (\code{tmpDir}) & Directory under which intermediate files are saved (later removed). By default, intermediate files will be saved to the directory specified in `-o' argument (In \R, intermediate files are saved to the current working directory by default).\\
\hline
$--$verbose \newline (\code{verbose}) & Output verbose information for debugging such as unmatched chromosomes/contigs between reads and annotation.\\
......@@ -1200,6 +1209,10 @@ It takes only about half a minute to re-order a location-sorted BAM file includi
Compute the read coverage for each chromosomal location in the genome.
\section{flattenGTF}
Flatten features (eg. exons) provided in a GTF annotation and output the modified annotation to a SAF format annotation.
\section{promoterRegions}
This function is only implemented in {\Rsubread}. It generates a SAF format annotation that includes coordinates of promoter regions for each gene.
......
......@@ -52,6 +52,20 @@
#include "HelperFunctions.h"
char * get_short_fname(char * lname){
char * ret = lname;
int x1;
for(x1 = strlen(lname)-1; x1>=0; x1--){
if(lname [x1] == '/'){
ret = lname + x1 + 1;
break;
}
}
return ret;
}
// This assumes the first part of Cigar has differet strandness to the main part of the cigar.
// Pos is the LAST WANTED BASE location before the first strand jump (split by 'b' or 'n').
......
......@@ -75,4 +75,6 @@ int load_features_annotation(char * file_name, int file_type, char * gene_id_col
void * context, int do_add_feature(char * gene_name, char * transcript_id, char * chrome_name, unsigned int start, unsigned int end, int is_negative_strand, void * context) );
HashTable * load_alias_table(char * fname) ;
char * get_short_fname(char * lname);
#endif
#MACOS = -D MACOS
include makefile.version
CC_EXEC = gcc
OPT_LEVEL = 3
CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64 # -w
include makefile.version
-include ~/.R/DBPZ_debug_makefile
CCFLAGS = -mtune=core2 ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64 ${WARNING_LEVEL}
LDFLAGS = ${STATIC_MAKE} -pthread -lz -lm ${MACOS} -O${OPT_LEVEL} -DMAKE_FOR_EXON -D MAKE_STANDALONE
CC_EXEC = gcc
CC = ${CC_EXEC} ${CCFLAGS} -fmessage-length=0 -ggdb
......@@ -14,11 +17,11 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
all: detectionCall sublong repair txUnique featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # samMappedBases mergeVCF testZlib
all: detectionCall sublong repair txUnique featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount flattenGTF # samMappedBases mergeVCF testZlib
mkdir -p ../bin/utilities
mv longread-one/LRM longread-one/sublong
mv longread-one/sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv detectionCall repair coverageCount propmapped qualityScores removeDup subread-fullscan txUnique ../bin/utilities
mv detectionCall repair coverageCount propmapped qualityScores removeDup subread-fullscan txUnique flattenGTF ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
......@@ -34,6 +37,9 @@ sublong: longread-one/longread-mapping.c ${ALL_OBJECTS}
rm -f longread-one/*.o
cd longread-one && $(MAKE)
flattenGTF: flattenAnnotations.c ${ALL_OBJECTS}
${CC} -o flattenGTF flattenAnnotations.c ${ALL_OBJECTS} ${LDFLAGS}
detectionCall: detection-calls.c ${ALL_OBJECTS}
${CC} -o detectionCall detection-calls.c ${ALL_OBJECTS} ${LDFLAGS}
......
MACOS = -D MACOS
include makefile.version
CCFLAGS = -mtune=core2 ${MACOS} -O9 -w -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64
CCFLAGS = -mtune=core2 ${MACOS} -O9 -w -DMAKE_FOR_EXON -D MAKE_STANDALONE -D SUBREAD_VERSION=\"${SUBREAD_VERSION}\" -D_FILE_OFFSET_BITS=64 ${WARNING_LEVEL}
LDFLAGS = -pthread -lz -lm ${MACOS} -DMAKE_FOR_EXON -D MAKE_STANDALONE # -DREPORT_ALL_THE_BEST
CC = gcc ${CCFLAGS} ${STATIC_MAKE} -ggdb -fomit-frame-pointer -O3 -ffast-math -funroll-loops -mmmx -msse -msse2 -msse3 -fmessage-length=0
......@@ -11,11 +11,11 @@ ALL_OBJECTS=$(addsuffix .o, ${ALL_LIBS})
ALL_H=$(addsuffix .h, ${ALL_LIBS})
ALL_C=$(addsuffix .c, ${ALL_LIBS})
all: sublong repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount # globalReassembly testZlib
all: sublong repair featureCounts removeDup exactSNP subread-buildindex subindel subread-align subjunc qualityScores subread-fullscan propmapped coverageCount flattenGTF # globalReassembly testZlib
mkdir -p ../bin/utilities
mv longread-one/LRM longread-one/sublong
mv longread-one/sublong subread-align subjunc featureCounts subindel exactSNP subread-buildindex ../bin/
mv repair coverageCount subread-fullscan qualityScores removeDup propmapped ../bin/utilities
mv repair coverageCount subread-fullscan qualityScores removeDup propmapped flattenGTF ../bin/utilities
@echo
@echo "###########################################################"
@echo "# #"
......@@ -31,6 +31,9 @@ sublong: longread-one/longread-mapping.c ${ALL_OBJECTS}
rm -f longread-one/*.o
cd longread-one && $(MAKE)
flattenGTF: flattenAnnotations.c ${ALL_OBJECTS}
${CC} -o flattenGTF flattenAnnotations.c ${ALL_OBJECTS} ${LDFLAGS}
repair: read-repair.c ${ALL_OBJECTS}
${CC} -o repair read-repair.c ${ALL_OBJECTS} ${LDFLAGS}
......
......@@ -264,7 +264,7 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
while(!feof(tmp_fp))
{
int type_char = fgetc(tmp_fp);
int type_char = fgetc(tmp_fp), rlen=-1;
if(type_char == EOF ) break;
fseek(tmp_fp, -1 , SEEK_CUR);
......@@ -273,7 +273,11 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
{
VCF_temp_read_t SNP_rec;
fread(&SNP_rec, sizeof(SNP_rec),1 , tmp_fp);
rlen = fread(&SNP_rec, sizeof(SNP_rec),1 , tmp_fp);
if(rlen < 1){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
if(!(*SNP_bitmap_recorder))
{
(*SNP_bitmap_recorder)=malloc((reference_len/8)+2);
......@@ -292,10 +296,25 @@ int read_tmp_block(struct SNP_Calling_Parameters * parameters, FILE * tmp_fp, ch
char read[MAX_READ_LENGTH];
char qual[MAX_READ_LENGTH];
fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
fread(&read_len, sizeof(short), 1, tmp_fp);
fread(read, sizeof(char), read_len, tmp_fp);
int rlen = fread(qual, sizeof(char), read_len, tmp_fp);
int rlen = fread(&read_rec, sizeof(read_rec), 1, tmp_fp);
if(rlen < 1){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
rlen = fread(&read_len, sizeof(short), 1, tmp_fp);
if(rlen < 1){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
rlen = fread(read, sizeof(char), read_len, tmp_fp);
if(rlen < read_len){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
}
rlen = fread(qual, sizeof(char), read_len, tmp_fp);
if(rlen < read_len){
SUBREADputs("ERROR: the temporary file is broken.");
return -1;
......@@ -1803,9 +1822,9 @@ int main_snp_calling_test(int argc,char ** argv)
print_in_box(80,1,1,"exactSNP setting");
print_in_box(80,0,1,"");
print_in_box(80,0,0," Input file : %s (%s)", in_SAM_file, parameters.is_BAM_file_input?"BAM":"SAM");
print_in_box(80,0,0," Output file : %s", out_BED_file);
print_in_box(80,0,0," Reference genome : %s", in_FASTA_file);
print_in_box(80,0,0," Input file : %s (%s)", get_short_fname(in_SAM_file), parameters.is_BAM_file_input?"BAM":"SAM");
print_in_box(80,0,0," Output file : %s", get_short_fname(out_BED_file));
print_in_box(80,0,0," Reference genome : %s", get_short_fname(in_FASTA_file));
print_in_box(80,0,1,"");
print_in_box(80,0,0," Threads : %d", threads);
print_in_box(80,0,0," Min supporting reads : %d", parameters.min_supporting_read_number);
......@@ -1817,7 +1836,7 @@ int main_snp_calling_test(int argc,char ** argv)
print_in_box(80,0,0," P value upper bound : %.5f", parameters.cutoff_upper_bound);
print_in_box(80,0,0," Flanking windows size : %d", parameters.fisher_exact_testlen);
if(parameters.known_SNP_vcf[0])
print_in_box(80,0,0," Known SNP annotations : %s", parameters.known_SNP_vcf);
print_in_box(80,0,0," Known SNP annotations : %s", get_short_fname(parameters.known_SNP_vcf));
print_in_box(80,0,1,"");
print_in_box(80,2,1,"http://subread.sourceforge.net/");
......
/***************************************************************
The Subread software package is free software package:
you can redistribute it and/or modify it under the terms
of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License,
or (at your option) any later version.
Subread is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
Authors: Drs Yang Liao and Wei Shi
***************************************************************/
#include <stdio.h>
#include <ctype.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifdef WINDOWS
#define __i686__
#include <winsock2.h>
#else
#include <arpa/inet.h>
#endif
#include <unistd.h>
#include "subread.h"
#include "sublog.h"
#include "core.h"
#include "input-files.h"
#include "sorted-hashtable.h"
#include "core-indel.h"
#include "core-junction.h"
#define BASE_BLOCK_LENGTH_INDEX 15000000
#define BUCKET_SIZE_INDEX 10000
#define ntohll(x) ( ( (unsigned long long)(ntohl( (unsigned int )(((x) << 32) >> 32) ))<< 32) | ntohl( ((unsigned int)((x) >> 32)) ) )
#define htonll(x) ntohll(x)
char temp_file_prefix[MAX_FILE_NAME_LENGTH];
struct read_position_info
{
unsigned long long file_offset;
unsigned int chro_offset;
unsigned short flags;
unsigned short section_length;
char cigar[EXON_MAX_CIGAR_LEN];
};
// BASE_NUMBER_CURVE_POINTS should be (2+4*n) where n is an integer.
#define BASE_NUMBER_CURVE_POINTS 6
struct bucket_info
{
char chromosome_name[MAX_CHROMOSOME_NAME_LEN];
unsigned long long int L2index_offset;
unsigned int chromosome_offset;
unsigned int bases_in_bucket;
unsigned int reads_in_bucket;
unsigned short base_number_curve[BASE_NUMBER_CURVE_POINTS];
};
unsigned int calculate_read_span(char * cigar)
{
int xk0 = 0;
unsigned int x = 0, ret = 0;
while (1)
{
char nch = cigar[xk0];
if(!nch)break;
if(isdigit(nch))
x = x*10+nch-'0';
else
{
if(nch!='I') ret += x;
x=0;
}
xk0++;
}
return ret;
}
unsigned int transform_pillup_to_index(char * chromosome_name, unsigned int block_start, char * temp_file_name, FILE * L1index_fp, FILE * L2index_fp)
{
FILE * pile_fp = f_subr_open(temp_file_name,"rb");
if(!pile_fp) return 0;
int buckets_in_block = BASE_BLOCK_LENGTH_INDEX / BUCKET_SIZE_INDEX;
struct read_position_info ** buckets = malloc(sizeof(struct read_position_info *) * buckets_in_block);
unsigned int * bucket_counts = malloc(sizeof(int) * buckets_in_block);
unsigned int * bucket_sizes = malloc(sizeof(int) * buckets_in_block);
unsigned int * bucket_bases = malloc(sizeof(int) * buckets_in_block);
unsigned int * bucket_curve_bases = malloc(sizeof(int) * buckets_in_block * BASE_NUMBER_CURVE_POINTS);
unsigned int total_reads= 0;
memset(buckets, 0, sizeof(struct read_position_info *) * buckets_in_block);
memset(bucket_curve_bases, 0, sizeof(int) * buckets_in_block * BASE_NUMBER_CURVE_POINTS);
memset(bucket_counts, 0, sizeof(int) * buckets_in_block);
memset(bucket_sizes, 0, sizeof(int) * buckets_in_block);
while(!feof(pile_fp))
{
struct read_position_info record;
fread(&record, sizeof(record), 1, pile_fp);
unsigned int read_chro_offset = ntohl(record.chro_offset), window_pointer;
unsigned int read_span = calculate_read_span(record.cigar);
for(window_pointer = read_chro_offset; window_pointer < read_chro_offset + read_span; window_pointer += BUCKET_SIZE_INDEX )
{
if(window_pointer<block_start)continue;
int bucket_no = (window_pointer-block_start) / BUCKET_SIZE_INDEX;
if(bucket_no >=buckets_in_block) continue;
int bucket_curve_offset = (int)((window_pointer-block_start)/(BUCKET_SIZE_INDEX*1./BASE_NUMBER_CURVE_POINTS) - bucket_no * BASE_NUMBER_CURVE_POINTS);
struct read_position_info * bucket = buckets[bucket_no];
if(!bucket)
{
bucket = malloc(sizeof(struct read_position_info) * 10);
buckets[bucket_no] = bucket;
bucket_counts[bucket_no] = 0;
bucket_sizes[bucket_no] = 10;
}
if( bucket_counts[bucket_no] == bucket_sizes[bucket_no])
{
bucket = realloc(bucket, sizeof(struct read_position_info)*bucket_sizes[bucket_no]*1.5);
buckets[bucket_no] = bucket;
bucket_sizes[bucket_no] *= 1.5;
}
struct read_position_info * item = &(bucket[bucket_counts[bucket_no]]);
memcpy(item , &record , sizeof(struct read_position_info));
bucket_counts[bucket_no]++;
bucket_bases[bucket_no]+= ntohs(record.section_length);
bucket_curve_bases[bucket_no * BASE_NUMBER_CURVE_POINTS + bucket_curve_offset] += ntohs(record.section_length);
}
total_reads ++;
}
int xk1;
for(xk1 = 0; xk1 < buckets_in_block; xk1++)
{
unsigned long long int L2index_offset = ftello(L2index_fp);
struct bucket_info bucket_record;
strcpy(bucket_record.chromosome_name, chromosome_name);
bucket_record.chromosome_offset = htonl(block_start+xk1 * BUCKET_SIZE_INDEX);
bucket_record.reads_in_bucket = htonl(bucket_counts[xk1]);
bucket_record.bases_in_bucket = htonl(bucket_bases[xk1]);
bucket_record.L2index_offset = htonll(L2index_offset);
int xk2;
for(xk2 = xk1 * BASE_NUMBER_CURVE_POINTS; xk2< xk1 * BASE_NUMBER_CURVE_POINTS + BASE_NUMBER_CURVE_POINTS; xk2++)
{
if(ntohl(bucket_record.bases_in_bucket >0))
{
bucket_record.base_number_curve[xk2 - xk1 * BASE_NUMBER_CURVE_POINTS] = htons((unsigned short)(bucket_curve_bases[xk2] * 30000./ ntohl(bucket_record.bases_in_bucket)));
//printf("RV=%d; CV=%d; TOTAL=%d\n", bucket_curve_bases[xk2] , ntohs(bucket_record.base_number_curve[xk2 - xk1 * BASE_NUMBER_CURVE_POINTS]), ntohl( bucket_record.bases_in_bucket));
}
else bucket_record.base_number_curve[xk2 - xk1 * BASE_NUMBER_CURVE_POINTS] = 0;
}
fwrite(&bucket_record, sizeof(struct bucket_info), 1, L1index_fp);
fwrite(buckets[xk1] , sizeof(struct read_position_info) * bucket_counts[xk1],1, L2index_fp);
}
fclose(pile_fp);
unlink(temp_file_name);
free(buckets);
free(bucket_curve_bases);
free(bucket_counts);
free(bucket_sizes);
return total_reads;
}
int finalise_sam_index(HashTable * chromosome_size_table, char * output_file_prefix)
{
KeyValuePair * cursor;
char temp_file_name[MAX_FILE_NAME_LENGTH];
sprintf(temp_file_name, "%s.L1i",output_file_prefix);
FILE * L1index_fp = f_subr_open(temp_file_name,"wb");
sprintf(temp_file_name, "%s.L2i",output_file_prefix);
FILE * L2index_fp = f_subr_open(temp_file_name,"wb");
int bucket;
for(bucket=0; bucket<chromosome_size_table -> numOfBuckets; bucket++)
{
cursor = chromosome_size_table -> bucketArray[bucket];
while(1)
{
if(!cursor) break;
char * chro_name = (char *)cursor -> key;
unsigned int chro_max_length =cursor ->value - NULL - 1;
unsigned int chro_offset = 0;