Skip to content
Commits on Source (9)
# BBTools bioinformatics tools, including BBMap.
# Author: Brian Bushnell, Jon Rood
# Language: Java
# Author: Brian Bushnell, Jon Rood, Shijie Yao
# Language: Java, Bash
# Information about documentation is in /docs/readme.txt.
# Version 35.85
# Version 38.26
#!/bin/bash
#a_sample_mt in=<infile> out=<outfile>
usage(){
echo "
Written by Brian Bushnell
Last modified November 19, 2015
Last modified August 22, 2018
Description: Does nothing. Should be fast.
......@@ -12,7 +11,6 @@ Usage: a_sample_mt.sh in=<input file> out=<output file>
Input may be fasta or fastq, compressed or uncompressed.
Standard parameters:
in=<file> Primary input, or read 1 input.
in2=<file> Read 2 input if reads are in two files.
......@@ -30,11 +28,14 @@ None yet!
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -51,6 +52,7 @@ CP="$DIR""current/"
z="-Xmx4g"
z2="-Xms4g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -71,12 +73,25 @@ calcXmx () {
calcXmx "$@"
a_sample_mt() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $z -cp $CP jgi.A_SampleMT $@"
local CMD="java $EA $EOOM $z -cp $CP jgi.A_SampleMT $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#addadapters in=<infile> out=<outfile>
function usage(){
usage(){
echo "
Written by Brian Bushnell
Last modified February 17, 2015
......@@ -11,13 +10,14 @@ The input is a set of reads, paired or unpaired.
The output is those same reads with adapter sequence replacing some of the bases in some reads.
For paired reads, adapters are located in the same position in read1 and read2.
This is designed for benchmarking adapter-trimming software, and evaluating methodology.
randomreads.sh is better for paired reads, though, as it actually adds adapters at the correct location,
so that overlap may be used for adapter detection.
Usage: addadapters.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> adapters=<file>
in2 and out2 are for paired reads and are optional.
If input is paired and there is only one output file, it will be written interleaved.
Parameters:
ow=f (overwrite) Overwrites files that already exist.
int=f (interleaved) Determines whether INPUT file is considered interleaved.
......@@ -38,6 +38,7 @@ Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -53,6 +54,7 @@ CP="$DIR""current/"
z="-Xmx200m"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -67,11 +69,25 @@ calcXmx () {
calcXmx "$@"
function addadapters() {
if [[ $NERSC_HOST == genepool ]]; then
module load oracle-jdk/1.7_64bit
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $z -cp $CP jgi.AddAdapters $@"
local CMD="java $EA $EOOM $z -cp $CP jgi.AddAdapters $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
usage(){
echo "
Written by Brian Bushnell
Last modified August 9, 2018
Description: Looks at accessions to see how to compress them.
Usage: analyzeaccession.sh *accession2taxid.gz out=<output file>
Parameters:
lines=-1 If positive, stop after this many lines.
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
CP="$DIR""current/"
z="-Xmx400m"
z2="-Xms400m"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
usage
exit
fi
calcXmx () {
source "$DIR""/calcmem.sh"
parseXmx "$@"
}
calcXmx "$@"
a_sample_mt() {
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $EOOM $z -cp $CP tax.AnalyzeAccession $@"
echo $CMD >&2
eval $CMD
}
a_sample_mt "$@"
#!/bin/bash
usage(){
echo "
Written by Brian Bushnell
Last modified September 27, 2018
Description: Generates a prokaryotic gene model (.pkm) for gene calling.
Input is fasta and gff files.
The .pkm file may be used by CallGenes.
Usage: analyzegenes.sh in=x.fa gff=x.gff out=x.pgm
File parameters:
in=<file> A fasta file or comma-delimited list of fasta files.
gff=<file> A gff file or comma-delimited list. This is optional;
if present, it must match the number of fasta files.
If absent, a fasta file 'foo.fasta' will imply the
presence of 'foo.gff'.
out=<file> Output pgm file.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
CP="$DIR""current/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
usage
exit
fi
calcXmx () {
source "$DIR""/calcmem.sh"
parseXmx "$@"
}
calcXmx "$@"
function analyze() {
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $EOOM $z $z2 -cp $CP prok.AnalyzeGenes $@"
echo $CMD >&2
eval $CMD
}
analyze "$@"
#!/bin/bash
usage(){
echo "
Written by Brian Bushnell
Last modified July 24, 2018
Description: Error corrects reads and/or filters by depth, storing
kmer counts in a count-min sketch (a Bloom filter variant).
This uses a fixed amount of memory. The error-correction algorithm is taken
from Tadpole; with plenty of memory, the behavior is almost identical to
Tadpole. As the number of unique kmers in a dataset increases, the accuracy
decreases such that it will make fewer corrections. It is still capable
of making useful corrections far past the point where Tadpole would crash
by running out of memory, even with the prefilter flag. But if there is
sufficient memory to use Tadpole, then Tadpole is more desirable.
Because accuracy declines with an increasing number of unique kmers, it can
be useful with very large datasets to run this in 2 passes, with the first
pass for filtering only using a 2-bit filter with the flags tossjunk=t and
ecc=f (and possibly mincount=2 and hcf=0.4), and the second pass using a
4-bit filter for the actual error correction.
Usage: bbcms.sh in=<input file> out=<output> outb=<reads failing filters>
Example of use in error correction:
bbcms.sh in=reads.fq out=ecc.fq bits=4 hashes=3 k=31 merge
Example of use in depth filtering:
bbcms.sh in=reads.fq out=high.fq outb=low.fq k=31 mincount=2 ecc=f hcf=0.4
Error correction and depth filtering can be done simultaneously.
File parameters:
in=<file> Primary input, or read 1 input.
in2=<file> Read 2 input if reads are in two files.
out=<file> Primary read output.
out2=<file> Read 2 output if reads are in two files.
outb=<file> (outbad/outlow) Output for reads failing mincount.
outb2=<file> (outbad2/outlow2) Read 2 output if reads are in two files.
extra=<file> Additional comma-delimited files for generating kmer counts.
ref=<file> If ref is set, then only files in the ref list will be used
for kmer counts, and the input files will NOT be used for
counts; they will just be filtered or corrected.
overwrite=t (ow) Set to false to force the program to abort rather than
overwrite an existing file.
Hashing parameters:
k=31 Kmer length, currently 1-31.
hashes=3 Number of hashes per kmer. Higher generally reduces
false positives at the expense of speed; rapidly
diminishing returns above 4.
minprob=0.5 Ignore kmers with probability of being correct below this.
memmult=1.0 Fraction of free memory to use for Bloom filter. 1.0 should
generally work; if the program crashes with an out of memory
error, set this lower. You may be able to increase accuracy
by setting it slightly higher.
cells= Option to set the number of cells manually. By default this
will be autoset to use all available memory. The only reason
to set this is to ensure deterministic output.
seed=0 This will change the hash function used.
Depth filtering parameters:
mincount=0 If positive, reads with kmer counts below mincount will
be discarded (sent to outb).
hcf=1.0 (highcountfraction) Fraction of kmers that must be at least
mincount to pass.
requireboth=t Require both reads in a pair to pass in order to go to out.
When true, if either read has a count below mincount, both
reads in the pair will go to outb. When false, reads will
only go to outb if both fail.
tossjunk=f Remove reads or pairs with outermost kmer depth below 2.
(Suggested params for huge metagenomes: mincount=2 hcf=0.4 tossjunk=t)
Error correction parameters:
ecc=t Perform error correction.
bits= Bits used to store kmer counts; max count is 2^bits-1.
Supports 2, 4, 8, 16, or 32. 16 is best for high-depth data;
2 or 4 are for huge, low-depth metagenomes that saturate the
bloom filter otherwise. Generally 4 bits is recommended for
error-correction and 2 bits is recommended for filtering only.
ecco=f Error-correct paired reads by overlap prior to kmer-counting.
merge=t Merge paired reads by overlap prior to kmer-counting, and
again prior to correction. Output will still be unmerged.
smooth=3 Remove spikes from kmer counts due to hash collisions.
The number is the max width of peaks to be smoothed; range is
0-3 (3 is most aggressive; 0 disables smoothing).
This also affects tossjunk.
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
CP="$DIR""current/"
z="-Xmx4g"
z2="-Xms4g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
usage
exit
fi
calcXmx () {
source "$DIR""/calcmem.sh"
parseXmx "$@"
if [[ $set == 1 ]]; then
return
fi
freeRam 4000m 84
z="-Xmx${RAM}m"
z2="-Xms${RAM}m"
}
calcXmx "$@"
bloomfilter() {
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $EOOM $z $z2 -cp $CP bloom.BloomFilterCorrectorWrapper $@"
echo $CMD >&2
eval $CMD
}
bloomfilter "$@"
#!/bin/bash
#bbcountunique in=<infile> out=<outfile>
usage(){
echo "
Written by Brian Bushnell
Last modified February 17, 2015
Last modified August 1, 2016
Description: Generates a kmer uniqueness histogram, binned by file position.
There are 3 columns for single reads, 6 columns for paired:
......@@ -15,10 +14,9 @@ r2_first percent unique 1st kmer of read 2
r2_rand percent unique random kmer of read 2
pair percent unique concatenated kmer from read 1 and 2
Usage: bbcountunique.sh in=<input> out=<output>
Please read bbmap/docs/guides/CalcUniquenessGuide.txt for more information.
Optional parameters (and their defaults)
Usage: bbcountunique.sh in=<input> out=<output>
Input parameters:
in2=null Second input file for paired reads
......@@ -30,22 +28,25 @@ Output parameters:
out=<file> File for output stats
Processing parameters:
k=20 Kmer length (range 1-31).
k=25 Kmer length (range 1-31).
interval=25000 Print one line to the histogram per this many reads.
cumulative=f Show cumulative numbers rather than per-interval numbers.
percent=t Show percentages of unique reads.
count=f Show raw counts of unique reads.
printlastbin=f (plb) Print a line for the final undersized bin.
minprob=0 Ignore kmers with a probability of correctness below this (based on q-scores).
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -62,6 +63,7 @@ CP="$DIR""current/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -82,12 +84,25 @@ calcXmx () {
calcXmx "$@"
bbcountunique() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $z -cp $CP jgi.CalcUniqueness $@"
local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.CalcUniqueness $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#bbduk in=<file> out=<file> ref=<ref file>
usage(){
echo "
Written by Brian Bushnell
Last modified December 10, 2015
Last modified August 29, 2018
Description: Compares reads to the kmers in a reference dataset, optionally
allowing an edit distance. Splits the reads into two outputs - those that
match the reference, and those that don't. Can also trim (remove) the matching
parts of the reads rather than binning the reads.
Please read bbmap/docs/guides/BBDukGuide.txt for more information.
Usage: bbduk.sh in=<input file> out=<output file> ref=<contaminant files>
......@@ -17,11 +17,12 @@ Input may be stdin or a fasta or fastq file, compressed or uncompressed.
If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
fasta input, set in=stdin.fa.gz
Input parameters:
in=<file> Main input. in=stdin.fq will pipe from stdin.
in2=<file> Input for 2nd read of pairs in a different file.
ref=<file,file> Comma-delimited list of reference files.
In addition to filenames, you may also use the keywords:
adapters, artifacts, phix, lambda, pjet, mtst, kapa
literal=<seq,seq> Comma-delimited list of literal reference sequences.
touppercase=f (tuc) Change all bases upper-case.
interleaved=auto (int) t/f overrides interleaved autodetection.
......@@ -31,6 +32,7 @@ copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
possible unambiguous copies. Intended for short motifs
or adapter barcodes, as time/memory use is exponential.
samplerate=1 Set lower to only process a fraction of input reads.
samref=<file> Optional reference fasta for processing sam files.
Output parameters:
out=<file> (outnonmatch) Write reads here that do not contain
......@@ -38,8 +40,11 @@ out=<file> (outnonmatch) Write reads here that do not contain
to standard out.
out2=<file> (outnonmatch2) Use this to write 2nd read of pairs to a
different file.
outm=<file> (outmatch) Write reads here that contain kmers matching
the database.
outm=<file> (outmatch) Write reads here that fail filters. In default
kfilter mode, this means any read with a matching kmer.
In any mode, it also includes reads that fail filters such
as minlength, mingc, maxgc, entropy, etc. In other words,
it includes all reads that do not go to 'out'.
outm2=<file> (outmatch2) Use this to write 2nd read of pairs to a
different file.
outs=<file> (outsingle) Use this to write singleton reads whose mate
......@@ -73,17 +78,27 @@ qchist=<file> Count of bases with each quality value.
aqhist=<file> Histogram of average read quality.
bqhist=<file> Quality histogram designed for box plots.
lhist=<file> Read length histogram.
phist=<file> Polymer length histogram.
gchist=<file> Read GC content histogram.
gcbins=100 Number gchist bins. Set to 'auto' to use read length.
maxhistlen=6000 Set an upper bound for histogram lengths; higher uses
more memory. The default is 6000 for some histograms
and 80000 for others.
Histograms for sam files only (requires sam format 1.4 or higher):
Histograms for mapped sam/bam files only:
histbefore=t Calculate histograms from reads before processing.
ehist=<file> Errors-per-read histogram.
qahist=<file> Quality accuracy histogram of error rates versus quality
score.
indelhist=<file> Indel length histogram.
mhist=<file> Histogram of match, sub, del, and ins rates by read location.
mhist=<file> Histogram of match, sub, del, and ins rates by position.
idhist=<file> Histogram of read count versus percent identity.
idbins=100 Number idhist bins. Set to 'auto' to use read length.
varfile=<file> Ignore substitution errors listed in this file when
calculating error rates. Can be generated with
CallVariants.
vcf=<file> Ignore substitution errors listed in this VCF file
when calculating error rates.
Processing parameters:
k=27 Kmer length used for finding contaminants. Contaminants
......@@ -114,6 +129,8 @@ forbidn=f (fn) Forbids matching of read kmers containing N.
removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is
match (or either is trimmed shorter than minlen).
Set to false to require both.
trimfailures=f Instead of discarding failed reads, trim them to 1bp.
This makes the statistics a bit odd.
findbestmatch=f (fbm) If multiple matches, associate read with sequence
sharing most kmers. Reduces speed.
skipr1=f Don't do kmer-based operations on read 1.
......@@ -145,31 +162,39 @@ speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
Trimming/Filtering/Masking parameters:
Note - if neither ktrim nor kmask is set, the default behavior is kfilter.
All three are mutually exclusive.
Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter.
All kmer processing modes are mutually exclusive.
Reads only get sent to 'outm' purely based on kmer matches in kfilter mode.
ktrim=f Trim reads to remove bases matching reference kmers.
Values:
f (don't trim),
r (trim to the right),
l (trim to the left)
kmask=f Replace bases matching ref kmers with another symbol.
Allows any non-whitespace character other than t or f,
and processes short kmers on both ends. 'kmask=lc' will
kmask= Replace bases matching ref kmers with another symbol.
Allows any non-whitespace character, and processes short
kmers on both ends if mink is set. 'kmask=lc' will
convert masked bases to lowercase.
maskfullycovered=f (mfc) Only mask bases that are fully covered by kmers.
ksplit=f For single-ended reads only. Reads will be split into
pairs around the kmer. If the kmer is at the end of the
read, it will be trimmed instead. Singletons will go to
out, and pairs will go to outm. Do not use ksplit with
other operations such as quality-trimming or filtering.
mink=0 Look for shorter kmers at read tips down to this length,
when k-trimming or masking. 0 means disabled. Enabling
this will disable maskmiddle.
qtrim=f Trim read ends to remove bases with quality below trimq.
Performed AFTER looking for kmers.
Values:
Performed AFTER looking for kmers. Values:
rl (trim both ends),
f (neither end),
r (right end only),
l (left end only),
w (sliding window).
trimq=6 Regions with average quality BELOW this will be trimmed.
trimq=6 Regions with average quality BELOW this will be trimmed,
if qtrim is set to something other than f. Can be a
floating-point number like 7.3.
trimclip=f Trim soft-clipped bases from sam files.
minlength=10 (ml) Reads shorter than this after trimming will be
discarded. Pairs will be discarded if both are shorter.
mlf=0 (minlengthfraction) Reads shorter than this fraction of
......@@ -179,11 +204,8 @@ maxlength= Reads longer than this after trimming will be discarded.
minavgquality=0 (maq) Reads with average quality (after trimming) below
this will be discarded.
maqb=0 If positive, calculate maq from this many initial bases.
chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
or barcodes containing 'N' otherwise. A barcode must be
the last part of the read header.
barcodes= Comma-delimited list of barcodes or files of barcodes.
minbasequality=0 (mbq) Reads with any base below this quality (after
trimming) will be discarded.
maxns=-1 If non-negative, reads with more Ns than this
(after trimming) will be discarded.
mcb=0 (minconsecutivebases) Discard reads without at least
......@@ -212,6 +234,39 @@ restrictright=0 If positive, only look for kmer matches in the
rightmost X bases.
mingc=0 Discard reads with GC content below this.
maxgc=1 Discard reads with GC content above this.
gcpairs=t Use average GC of paired reads.
Also affects gchist.
tossjunk=f Discard reads with invalid characters as bases.
Header-parsing parameters - these require Illumina headers:
chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
or barcodes containing 'N' otherwise. A barcode must be
the last part of the read header. Values:
t: Remove reads with bad barcodes.
f: Ignore barcodes.
crash: Crash upon encountering bad barcodes.
barcodes= Comma-delimited list of barcodes or files of barcodes.
xmin=-1 If positive, discard reads with a lesser X coordinate.
ymin=-1 If positive, discard reads with a lesser Y coordinate.
xmax=-1 If positive, discard reads with a greater X coordinate.
ymax=-1 If positive, discard reads with a greater Y coordinate.
Polymer trimming:
trimpolya=0 If greater than 0, trim poly-A or poly-T tails of
at least this length on either end of reads.
trimpolygleft=0 If greater than 0, trim poly-G prefixes of at least this
length on the left end of reads. Does not trim poly-C.
trimpolygright=0 If greater than 0, trim poly-G tails of at least this
length on the right end of reads. Does not trim poly-C.
trimpolyg=0 This sets both left and right at once.
filterpolyg=0 If greater than 0, remove reads with a poly-G prefix of
at least this length (on the left).
Note: there are also equivalent poly-C flags.
Polymer tracking:
pratio=base,base 'pratio=G,C' will print the ratio of G to C polymers.
plen=20 Length of homopolymers to count.
Entropy/Complexity parameters:
entropy=-1 Set between 0 and 1 to filter reads with entropy below
......@@ -219,9 +274,17 @@ entropy=-1 Set between 0 and 1 to filter reads with entropy below
entropywindow=50 Calculate entropy using a sliding window of this length.
entropyk=5 Calculate entropy using kmers of this length.
minbasefrequency=0 Discard reads with a minimum base frequency below this.
entropymask=f Values:
f: Discard low-entropy sequences.
t: Mask low-entropy parts of sequences with N.
lc: Change low-entropy parts of sequences to lowercase.
entropymark=f Mark each base with its entropy value. This is on a scale
of 0-41 and is reported as quality scores, so the output
should be fastq or fasta+qual.
Cardinality estimation:
cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
cardinalityout=f (loglogout) Count unique kmers in output reads.
loglogk=31 Use this kmer length for counting.
loglogbuckets=1999 Use this many buckets for counting.
......@@ -231,12 +294,15 @@ Java Parameters:
the program's automatic memory detection. -Xmx20g will
specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an
out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
There is a changelog at /bbmap/docs/changelog_bbduk.txt
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -254,6 +320,7 @@ NATIVELIBDIR="$DIR""jni/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -274,14 +341,32 @@ calcXmx () {
calcXmx "$@"
bbduk() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module unload samtools
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module unload PrgEnv-intel
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
module load samtools
fi
local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDukF $@"
#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDukF $@"
local CMD="java $EA $z $z2 -cp $CP jgi.BBDukF $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#bbduk2 in=<file> out=<file> fref=<file>
usage(){
echo "
Written by Brian Bushnell
Last modified December 10, 2015
BBDuk2 is like BBDuk but can kfilter, kmask, and ktrim in a single pass.
It does not replace BBDuk, and is only provided to allow maximally efficient
pipeline integration when multiple steps will be performed. The syntax is
slightly different.
Description: Compares reads to the kmers in a reference dataset, optionally
allowing an edit distance. Splits the reads into two outputs - those that
match the reference, and those that don't. Can also trim (remove) the matching
parts of the reads rather than binning the reads.
Usage: bbduk2.sh in=<input file> out=<output file> fref=<contaminant files>
Input may be stdin or a fasta or fastq file, compressed or uncompressed.
If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
fasta input, set in=stdin.fa.gz
Input parameters:
in=<file> Main input. in=stdin.fq will pipe from stdin.
in2=<file> Input for 2nd read of pairs in a different file.
fref=<file,file> Comma-delimited list of fasta reference files for filtering.
rref=<file,file> Comma-delimited list of fasta reference files for right-trimming.
lref=<file,file> Comma-delimited list of fasta reference files for left-trimming.
mref=<file,file> Comma-delimited list of fasta reference files for masking.
fliteral=<seq,seq> Comma-delimited list of literal sequences for filtering.
rliteral=<seq,seq> Comma-delimited list of literal sequences for right-trimming.
lliteral=<seq,seq> Comma-delimited list of literal sequences for left-trimming.
mliteral=<seq,seq> Comma-delimited list of literal sequences for masking.
touppercase=f (tuc) Change all bases upper-case.
interleaved=auto (int) t/f overrides interleaved autodetection.
qin=auto Input quality offset: 33 (Sanger), 64, or auto.
reads=-1 If positive, quit after processing X reads or pairs.
copyundefined=f (cu) Process non-AGCT IUPAC reference bases by making all
possible unambiguous copies. Intended for short motifs
or adapter barcodes, as time/memory use is exponential.
Output parameters:
out=<file> (outnonmatch) Write reads here that do not contain
kmers matching the database. 'out=stdout.fq' will pipe
to standard out.
out2=<file> (outnonmatch2) Use this to write 2nd read of pairs to a
different file.
outm=<file> (outmatch) Write reads here that contain kmers matching
the database.
outm2=<file> (outmatch2) Use this to write 2nd read of pairs to a
different file.
outs=<file> (outsingle) Use this to write singleton reads whose mate
was trimmed shorter than minlen.
stats=<file> Write statistics about which contamininants were detected.
refstats=<file> Write statistics on a per-reference-file basis.
rpkm=<file> Write RPKM for each reference sequence (for RNA-seq).
dump=<file> Dump kmer tables to a file, in fasta format.
nzo=t Only write statistics about ref sequences with nonzero hits.
overwrite=t (ow) Grant permission to overwrite files.
showspeed=t (ss) 'f' suppresses display of processing speed.
ziplevel=2 (zl) Compression level; 1 (min) through 9 (max).
fastawrap=80 Length of lines in fasta output.
qout=auto Output quality offset: 33 (Sanger), 64, or auto.
statscolumns=3 (cols) Number of columns for stats output, 3 or 5.
5 includes base counts.
rename=f Rename reads to indicate which sequences they matched.
refnames=f Use names of reference files rather than scaffold IDs.
trd=f Truncate read and ref names at the first whitespace.
ordered=f Set to true to output reads in same order as input.
Histogram output parameters:
bhist=<file> Base composition histogram by position.
qhist=<file> Quality histogram by position.
qchist=<file> Count of bases with each quality value.
aqhist=<file> Histogram of average read quality.
bqhist=<file> Quality histogram designed for box plots.
lhist=<file> Read length histogram.
gchist=<file> Read GC content histogram.
gcbins=100 Number gchist bins. Set to 'auto' to use read length.
Histograms for sam files only (requires sam format 1.4 or higher):
ehist=<file> Errors-per-read histogram.
qahist=<file> Quality accuracy histogram of error rates versus quality
score.
indelhist=<file> Indel length histogram.
mhist=<file> Histogram of match, sub, del, and ins rates by read location.
idhist=<file> Histogram of read count versus percent identity.
idbins=100 Number idhist bins. Set to 'auto' to use read length.
Processing parameters:
k=27 Kmer length used for finding contaminants. Contaminants
shorter than k will not be found. k must be at least 1.
rcomp=t Look for reverse-complements of kmers in addition to
forward kmers.
maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to
increase sensitivity in the presence of errors.
minkmerhits=1 (mkh) Reads need at least this many matching kmers
to be considered as matching the reference.
hammingdistance=0 (hdist) Maximum Hamming distance for ref kmers (subs only).
Memory use is proportional to (3*K)^hdist.
qhdist=0 Hamming distance for query kmers; impacts speed, not memory.
editdistance=0 (edist) Maximum edit distance from ref kmers (subs
and indels). Memory use is proportional to (8*K)^edist.
hammingdistance2=0 (hdist2) Sets hdist for short kmers, when using mink.
qhdist2=0 Sets qhdist for short kmers, when using mink.
editdistance2=0 (edist2) Sets edist for short kmers, when using mink.
forbidn=f (fn) Forbids matching of read kmers containing N.
By default, these will match a reference 'A' if
hdist>0 or edist>0, to increase sensitivity.
removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is
match (or either is trimmed shorter than minlen).
Set to false to require both.
findbestmatch=f (fbm) If multiple matches, associate read with sequence
sharing most kmers. Reduces speed.
skipr1=f Don't do kmer-based operations on read 1.
skipr2=f Don't do kmer-based operations on read 2.
ecco=f For overlapping paired reads only. Performs error-
correction with BBMerge prior to kmer operations.
recalibrate=f (recal) Recalibrate quality scores. Requires calibration
matrices generated by CalcTrueQuality.
sam=<file,file> If recalibration is desired, and matrices have not already
been generated, BBDuk will create them from the sam file.
Speed and Memory parameters:
threads=auto (t) Set number of threads to use; default is number of
logical processors.
prealloc=f Preallocate memory in table. Allows faster table loading
and more efficient memory usage, for a large reference.
monitor=f Kill this process if it crashes. monitor=600,0.01 would
kill after 600 seconds under 1% usage.
minrskip=1 (mns) Force minimal skip interval when indexing reference
kmers. 1 means use all, 2 means use every other kmer, etc.
maxrskip=1 (mxs) Restrict maximal skip interval when indexing
reference kmers. Normally all are used for scaffolds<100kb,
but with longer scaffolds, up to maxrskip-1 are skipped.
rskip= Set both minrskip and maxrskip to the same value.
If not set, rskip will vary based on sequence length.
qskip=1 Skip query kmers to increase speed. 1 means use all.
speed=0 Ignore this fraction of kmer space (0-15 out of 16) in both
reads and reference. Increases speed and reduces memory.
Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
Trimming/Filtering/Masking parameters:
Note - for BBDuk2, kmer filtering, trimming, and masking are independent,
and all can be performed at the same time.
ktrim=f Trim reads to remove bases matching reference kmers.
Values:
f (don't trim),
r (trim to the right),
l (trim to the left)
kmask=f Replace bases matching ref kmers with another symbol.
Allows any non-whitespace character other than t or f,
and processes short kmers on both ends. 'kmask=lc' will
convert masked bases to lowercase.
mink=0 Look for shorter kmers at read tips down to this length,
when k-trimming or masking. 0 means disabled. Enabling
this will disable maskmiddle.
qtrim=f Trim read ends to remove bases with quality below trimq.
Performed AFTER looking for kmers.
Values:
rl (trim both ends),
f (neither end),
r (right end only),
l (left end only),
w (sliding window)
trimq=6 Regions with average quality BELOW this will be trimmed.
minlength=10 (ml) Reads shorter than this after trimming will be
discarded. Pairs will be discarded if both are shorter.
mlf=0 (minlengthfraction) Reads shorter than this fraction of
original length after trimming will be discarded.
maxlength= Reads longer than this after trimming will be discarded.
Pairs will be discarded only if both are longer.
minavgquality=0 (maq) Reads with average quality (after trimming) below
this will be discarded.
maqb=0 If positive, calculate maq from this many initial bases.
chastityfilter=f (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
barcodefilter=f Remove reads with unexpected barcodes if barcodes is set,
or barcodes containing 'N' otherwise. A barcode must be
the last part of the read header.
barcodes= Comma-delimited list of barcodes or files of barcodes.
maxns=-1 If non-negative, reads with more Ns than this
(after trimming) will be discarded.
mcb=0 (minconsecutivebases) Discard reads without at least
this many consecutive called bases.
ottm=f (outputtrimmedtomatch) Output reads trimmed to shorter
than minlength to outm rather than discarding.
tp=0 (trimpad) Trim this much extra around matching kmers.
tbo=f (trimbyoverlap) Trim adapters based on where paired
reads overlap.
strictoverlap=t Adjust sensitivity for trimbyoverlap mode.
minoverlap=14 Require this many bases of overlap for detection.
mininsert=50 Require insert size of at least this for overlap.
Should be reduced to 16 for small RNA sequencing.
tpe=f (trimpairsevenly) When kmer right-trimming, trim both
reads to the minimum length of either.
forcetrimleft=0 (ftl) If positive, trim bases to the left of this position
(exclusive, 0-based).
forcetrimright=0 (ftr) If positive, trim bases to the right of this position
(exclusive, 0-based).
forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
forcetrimmod=0 (ftm) If positive, right-trim length to be equal to zero,
modulo this number.
restrictleft=0 If positive, only look for kmer matches in the
leftmost X bases.
restrictright=0 If positive, only look for kmer matches in the
rightmost X bases.
mingc=0 Discard reads with GC content below this.
maxgc=1 Discard reads with GC content above this.
Entropy/Complexity parameters:
entropy=-1 Set between 0 and 1 to filter reads with entropy below
that value. Higher is more stringent.
entropywindow=50 Calculate entropy using a sliding window of this length.
entropyk=5 Calculate entropy using kmers of this length.
minbasefrequency=0 Discard reads with a minimum base frequency below this.
Cardinality estimation:
cardinality=f (loglog) Count unique kmers using the LogLog algorithm.
loglogk=31 Use this kmer length for counting.
loglogbuckets=1999 Use this many buckets for counting.
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding
the program's automatic memory detection. -Xmx20g will
specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
The max is typically 85% of physical memory.
There is a changelog at /bbmap/docs/changelog_bbduk.txt
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
CP="$DIR""current/"
NATIVELIBDIR="$DIR""jni/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
usage
exit
fi
calcXmx () {
source "$DIR""/calcmem.sh"
parseXmx "$@"
if [[ $set == 1 ]]; then
return
fi
freeRam 1400m 42
z="-Xmx${RAM}m"
z2="-Xms${RAM}m"
}
calcXmx "$@"
bbduk2() {
if [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module unload samtools
module load oracle-jdk/1.7_64bit
module load pigz
module load samtools
fi
local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDuk2 $@"
echo $CMD >&2
eval $CMD
}
bbduk2 "$@"
#!/bin/bash
#bbest in=<infile> out=<outfile>
function usage(){
usage(){
echo "
Written by Brian Bushnell
Last modified November 4, 2015
......@@ -23,6 +22,7 @@ Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -39,6 +39,7 @@ CP="$DIR""current/"
z="-Xmx120m"
z2="-Xms120m"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -53,14 +54,31 @@ calcXmx () {
calcXmx "$@"
function bbest() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module unload samtools
module load oracle-jdk/1.7_64bit
module load samtools
module load oracle-jdk/1.8_144_64bit
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module unload PrgEnv-intel
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
fi
local CMD="java $EA $z -cp $CP jgi.SamToEst $@"
local CMD="java $EA $EOOM $z -cp $CP jgi.SamToEst $@"
# echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#fakereads in=<infile> out=<outfile>
function usage(){
usage(){
echo "
Written by Brian Bushnell
Last modified February 17, 2015
......@@ -12,7 +11,6 @@ Usage: bbfakereads.sh in=<file> out=<outfile> out2=<outfile2>
Out2 is optional; if there is only one output file, it will be written interleaved.
Standard parameters:
ow=f (overwrite) Overwrites files that already exist.
zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
......@@ -36,17 +34,14 @@ addspace=t Set to false to omit the space before /1 and /2 of paired r
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
Supported input formats are fastq, fasta, fast+qual, scarf, and bread (BBMap's native format)
Supported output formats are fastq, fasta, fast+qual, bread
Supported compression formats are gz, zip, and bz2
To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -62,6 +57,7 @@ CP="$DIR""current/"
z="-Xmx600m"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -77,14 +73,25 @@ calcXmx () {
calcXmx "$@"
function fakereads() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module unload samtools
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
module load samtools
fi
local CMD="java $EA $z -cp $CP jgi.FakeReads $@"
local CMD="java $EA $EOOM $z -cp $CP jgi.FakeReads $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#bbmap in=<infile> out=<outfile>
usage(){
echo "
BBMap v35.x
BBMap
Written by Brian Bushnell, from Dec. 2010 - present
Last modified December 15, 2015
Last modified September 18, 2018
Description: Fast and accurate splice-aware read aligner.
Please read bbmap/docs/guides/BBMapGuide.txt for more information.
To index: bbmap.sh ref=<reference fasta>
To map: bbmap.sh in=<reads> out=<output sam>
......@@ -20,7 +20,6 @@ input and output files e.g. in=stdin.fa.gz will read gzipped fasta from
standard in; out=stdout.sam.gz will write gzipped sam.
Indexing Parameters (required when building the index):
nodisk=f Set to true to build index in memory and write nothing
to disk except output.
ref=<file> Specify the reference sequence. Only do this ONCE,
......@@ -40,7 +39,6 @@ usemodulo=f Throw away ~80% of kmers based on remainder modulo a
rebuild=f Force a rebuild of the index (ref= should be set).
Input Parameters:
build=1 Designate index to use. Corresponds to the number
specified when building the index.
in=<file> Primary reads input; required parameter.
......@@ -68,7 +66,6 @@ skipreads=0 Set to a number N to skip the first N reads (or pairs),
then map the rest.
Mapping Parameters:
fast=f This flag is a macro which sets other paramters to run
faster, at reduced sensitivity. Bad for RNA-seq.
slow=f This flag is a macro which sets other paramters to run
......@@ -77,6 +74,8 @@ maxindel=16000 Don't look for indels longer than this. Lower is faster.
Set to >=100k for RNAseq with long introns like mammals.
strictmaxindel=f When enabled, do not allow indels longer than 'maxindel'.
By default these are not sought, but may be found anyway.
tipsearch=100 Look this far for read-end deletions with anchors
shorter than K, using brute force.
minid=0.76 Approximate minimum alignment identity to look for.
Higher is faster and less sensitive.
minhits=1 Minimum number of seed hits required for candidate sites.
......@@ -114,18 +113,28 @@ rescuemismatches=32 Maximum mismatches allowed in a rescued read. Lower
is faster.
averagepairdist=100 (apd) Initial average distance between paired reads.
Varies dynamically; does not need to be specified.
deterministic=f Run in deterministic mode. In this case it is good
to set averagepairdist. BBMap is deterministic
without this flag if using single-ended reads,
or run singlethreaded.
bandwidthratio=0 (bwr) If above zero, restrict alignment band to this
fraction of read length. Faster but less accurate.
bandwidth=0 (bw) Set the bandwidth directly.
fraction of read length. Faster but less accurate.
usejni=f (jni) Do alignments faster, in C code. Requires
compiling the C code; details are in /jni/README.txt.
maxsites2=800 Don't analyze (or print) more than this many alignments
per read.
monitor=f Kill this process if CPU usage drops to zero for
a long time. monitor=600,0.01 would kill after 600
seconds under 1% usage.
ignorefrequentkmers=t (ifk) Discard low-information kmers that occur often.
excludefraction=0.03 (ef) Fraction of kmers to ignore. For example, 0.03
will ignore the most common 3% of kmers.
greedy=t Use a greedy algorithm to discard the least-useful
kmers on a per-read basis.
kfilter=0 If positive, potential mapping sites must have at
least this many consecutive exact matches.
Quality and Trimming Parameters:
Quality and Trimming Parameters:
qin=auto Set to 33 or 64 to specify input quality value ASCII
offset. 33 is Sanger, 64 is old Solexa.
qout=auto Set to 33 or 64 to specify output quality value ASCII
......@@ -143,11 +152,10 @@ ignorebadquality=f (ibq) Keep going, rather than crashing, if a read has
out-of-range quality values.
usequality=t Use quality scores when determining which read kmers
to use as seeds.
minaveragequality=0 (maq) Discard reads with average quality below this.
minaveragequality=0 (maq) Do not map reads with average quality below this.
maqb=0 If positive, calculate maq from this many initial bases.
Output Parameters:
out=<file> Write all reads to this file.
outu=<file> Write only unmapped reads to this file. Does not
include unmapped paired reads with a mapped mate.
......@@ -180,9 +188,19 @@ printunmappedcount=f Print the total number of unmapped reads and bases.
showprogress=0 If positive, print a '.' every X reads.
showprogress2=0 If positive, print the number of seconds since the
last progress update (instead of a '.').
renamebyinsert=f Renames reads based on their mapped insert size.
Bloom-Filtering Parameters (bloomfilter.sh is the standalone version).
bloom=f Use a Bloom filter to ignore reads not sharing kmers
with the reference. This uses more memory, but speeds
mapping when most reads don't match the reference.
bloomhashes=2 Number of hash functions.
bloomminhits=3 Number of consecutive hits to be considered matched.
bloomk=31 Bloom filter kmer length.
bloomserial=t Use the serialized Bloom filter for greater loading
speed, if available. If not, generate and write one.
Post-Filtering Parameters:
idfilter=0 Independant of minid; sets exact minimum identity
allowed for alignments to be printed. Range 0 to 1.
subfilter=-1 Ban alignments with more than this many substitutions.
......@@ -192,9 +210,10 @@ indelfilter=-1 Ban alignments with more than this many indels.
editfilter=-1 Ban alignments with more than this many edits.
inslenfilter=-1 Ban alignments with an insertion longer than this.
dellenfilter=-1 Ban alignments with a deletion longer than this.
nfilter=-1 Ban alignments with more than this many ns. This
includes nocall, noref, and off scaffold ends.
Sam flags and settings:
noheader=f Disable generation of header lines.
sam=1.4 Set to 1.4 to write Sam version 1.4 cigar strings,
with = and X, or 1.3 to use M.
......@@ -228,7 +247,6 @@ boundstag=f Write a tag indicating whether either read in the pair
notags=f Turn off all optional tags.
Histogram and statistics output parameters:
scafstats=<file> Statistics on how many reads mapped to which scaffold.
refstats=<file> Statistics on how many reads mapped to which reference
file; only for BBSplit.
......@@ -247,12 +265,12 @@ mhist=<file> Histogram of match, sub, del, and ins rates by
read location.
gchist=<file> Read GC content histogram.
gcbins=100 Number gchist bins. Set to 'auto' to use read length.
gcpairs=t Use average GC of paired reads.
idhist=<file> Histogram of read count versus percent identity.
idbins=100 Number idhist bins. Set to 'auto' to use read length.
statsfile=stderr Mapping statistics are printed here.
Coverage output parameters (these may reduce speed and use more RAM):
covstats=<file> Per-scaffold coverage info.
rpkm=<file> Per-scaffold RPKM/FPKM counts.
covhist=<file> Histogram of # occurrences of each depth level.
......@@ -270,6 +288,7 @@ physcov=f Calculate physical coverage for paired reads.
This includes the unsequenced bases.
delcoverage=t (delcov) Count bases covered by deletions as covered.
True is faster than false.
covk=0 If positive, calculate kmer coverage statistics.
Java Parameters:
-Xmx This will be passed to Java to set memory usage,
......@@ -279,14 +298,16 @@ Java Parameters:
physical memory. The human genome requires around 24g,
or 12g with the 'usemodulo' flag. The index uses
roughly 6 bytes per reference base.
-eoom This flag will cause the process to exit if an
out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
This list is not complete. For more information, please consult
$DIR""docs/readme.txt.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter
any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -304,6 +325,7 @@ NATIVELIBDIR="$DIR""jni/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -325,14 +347,32 @@ calcXmx "$@"
bbmap() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module unload samtools
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module unload PrgEnv-intel
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
module load samtools
fi
local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
local CMD="java $EA $z -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#This is a version of BBMap designed to final all sites above a given threshold,
#rather than the single best site.
#rather than the single best site. Syntax is the same as BBMap.
usage(){
bash "$DIR"bbmap.sh
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -24,6 +25,7 @@ NATIVELIBDIR="$DIR""jni/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -44,13 +46,31 @@ calcXmx () {
calcXmx "$@"
mapPacBioSkimmer() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module unload samtools
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module unload PrgEnv-intel
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
module load samtools
fi
#local CMD="java $EA $z -cp $CP align2.BBMapPacBioSkimmer build=1 overwrite=true minratio=0.40 fastareadlen=6000 ambig=all minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@"
local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBMapPacBioSkimmer build=1 overwrite=true minratio=0.40 fastareadlen=6000 ambig=all minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@"
echo $CMD >&2
eval $CMD
......
#!/bin/bash
#bbmask in=<infile> out=<outfile>
usage(){
echo "
Written by Brian Bushnell
Last modified May 19, 2015
Last modified October 17, 2017
Description: Masks sequences of low-complexity, or containing repeat kmers, or covered by mapped reads.
By default this program will mask using entropy with a window=80 and entropy=0.75
Please read bbmap/docs/guides/BBMaskGuide.txt for more information.
Usage: bbmask.sh in=<file> out=<file> sam=<file,file,...file>
Input may be stdin or a fasta or fastq file, raw or gzipped.
sam is optional, but may be a comma-delimited list of sam files to mask.
Sam files may also be used as arguments without sam=, so you can use *.sam for example.
If you pipe via stdin/stdout, please include the file type; e.g. for gzipped fasta input, set in=stdin.fa.gz
Input parameters:
in=<file> Input sequences to mask. 'in=stdin.fa' will pipe from standard in.
sam=<file,file> Comma-delimited list of sam files. Optional. Their mapped coordinates will be masked.
......@@ -56,11 +56,14 @@ unpigz=t Use pigz to decompress.
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -77,6 +80,7 @@ CP="$DIR""current/"
z="-Xmx1g"
z2="-Xms1g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -90,19 +94,32 @@ calcXmx () {
if [[ $set == 1 ]]; then
return
fi
freeRam 3200m 42
freeRam 3200m 84
z="-Xmx${RAM}m"
z2="-Xms${RAM}m"
}
calcXmx "$@"
bbmask() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $z -cp $CP jgi.BBMask $@"
local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.BBMask $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#merge in=<infile> out=<outfile>
function usage(){
usage(){
echo "
bbmerge-auto.sh is a wrapper for BBMerge that attempts to use all available
memory, instead of a fixed amount. This is for use with the Tadpole options
......@@ -13,6 +12,7 @@ For information about usage and parameters, please run bbmerge.sh.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -30,6 +30,7 @@ NATIVELIBDIR="$DIR""jni/"
z="-Xmx14g"
z2="-Xms14g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -50,12 +51,26 @@ calcXmx () {
calcXmx "$@"
function merge() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBMerge $@"
#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBMerge $@"
local CMD="java $EA $z $z2 -cp $CP jgi.BBMerge $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#merge in=<infile> out=<outfile>
function usage(){
usage(){
echo "
BBMerge v8.82
Written by Brian Bushnell and Jonathan Rood
Last modified October 27, 2015
Last modified August 23, 2018
Description: Merges paired reads into single reads by overlap detection.
With sufficient coverage, can also merge nonoverlapping reads by kmer extension.
With sufficient coverage, can merge nonoverlapping reads by kmer extension.
Kmer modes (Tadpole or Bloom Filter) require much more memory, and should
be used with the bbmerge-auto.sh script rather than bbmerge.sh.
Please read bbmap/docs/guides/BBMergeGuide.txt for more information.
Usage for interleaved files: bbmerge.sh in=<reads> out=<merged reads> outu=<unmerged reads>
Usage for paired files: bbmerge.sh in1=<read1> in2=<read2> out=<merged reads> outu1=<unmerged1> outu2=<unmerged2>
Input may be stdin or a fasta, fastq, or scarf file, raw or gzipped.
Input may be stdin or a file, fasta or fastq, raw or gzipped.
Input parameters:
in=null Primary input. 'in2' will specify a second file.
......@@ -22,7 +22,6 @@ interleaved=auto May be set to true or false to override autodetection of
whether the input file as interleaved.
reads=-1 Quit after this many read pairs (-1 means all).
Output parameters:
out=<file> File for merged reads. 'out2' will specify a second file.
outu=<file> File for unmerged reads. 'outu2' will specify a second file.
......@@ -38,7 +37,6 @@ ordered=f Output reads in same order as input.
mix=f Output both the merged (or mergable) and unmerged reads
in the same file (out=). Useful for ecco mode.
Trimming/Filtering parameters:
qtrim=f Trim read ends to remove bases with quality below minq.
Trims BEFORE merging.
......@@ -67,14 +65,21 @@ forcetrimright=0 (ftr) If nonzero, trim right bases of the read
forcetrimright2=0 (ftr2) If positive, trim this many bases on the right end.
forcetrimmod=5 (ftm) If positive, trim length to be equal to
zero modulo this number.
ooi=f Output only incorrectly merged reads, for testing.
trimpolya=t Trim trailing poly-A tail from adapter output. Only
affects outadapter. This also trims poly-A followed
by poly-G, which occurs on NextSeq.
Processing Parameters:
usejni=f (jni) Do overlapping in C code, which is faster. Requires
compiling the C code; details are in /jni/README.txt.
However, the jni path is currently disabled.
merge=t Create merged reads. If set to false, you can still
generate an insert histogram.
ecco=f Error-correct the overlapping part, but don't merge.
trimnonoverlapping=f (tno) Trim all non-overlapping portions, leaving only
consensus sequence. By default, only sequence to the
right of the overlap (adapter sequence) is trimmed.
useoverlap=t Attempt find the insert size using read overlap.
mininsert=35 Minimum insert size to merge reads.
mininsert0=35 Insert sizes less than this will not be considered.
......@@ -87,44 +92,49 @@ maxq=41 Cap output quality scores at this.
entropy=t Increase the minimum overlap requirement for low-
complexity reads.
efilter=6 Ban overlaps with over this many times the expected
number of errors. Lower is more strict.
pfilter=0.00002 Ban improbable overlaps. Higher is more strict. 0 will
number of errors. Lower is more strict. -1 disables.
pfilter=0.00004 Ban improbable overlaps. Higher is more strict. 0 will
disable the filter; 1 will allow only perfect overlaps.
kfilter=0 Ban overlaps that create kmers with count below
this value (0 disables). Does not seem to help.
lowercase=f Expect lowercase letters to signify adapter sequence.
this value (0 disables). If this is used minprob should
probably be set to 0. Requires good coverage.
ouq=f Calculate best overlap using quality values.
owq=t Calculate best overlap without using quality values.
usequality=t If disabled, quality values are completely ignored,
both for overlap detection and filtering. May be useful
for data with inaccurate quality values.
iupacton=f (itn) Change ambiguous IUPAC symbols to N.
adapter= Specify the adapter sequences used for these reads, if
known; this can be a fasta file or a literal sequence.
Read 1 and 2 can have adapters specified independently
with the adapter1 and adapter2 flags. adapter=default
will use a list of common adapter sequences.
Ratio Mode:
ratiomode=t Score overlaps based on the ratio of matching to
mismatching bases.
maxratio=0.09 Max error rate; higher increases merge rate.
ratiomargin=5.5 Lower increases merge rate; min is 1.
ratiooffset=0.55 Lower increases merge rate; min is 0.
ratiominoverlapreduction=3 This is the difference between minoverlap in
flat mode and minoverlap in ratio mode; generally,
minoverlap should be lower in ratio mode.
minsecondratio=0.1 Cutoff for second-best overlap ratio.
Normal Mode:
normalmode=f Original BBMerge algorithm. Faster, but lower overall
merge rate.
Flat Mode:
flatmode=f Score overlaps based on the total number of mismatching
bases only.
margin=2 The best overlap must have at least 'margin' fewer
mismatches than the second best.
mismatches=3 Do not allow more than this many mismatches.
requireratiomatch=f (rrm) Require the answer from normal mode and ratio mode
requireratiomatch=f (rrm) Require the answer from flat mode and ratio mode
to agree, reducing false positives if both are enabled.
trimonfailure=t (tof) If detecting insert size by overlap fails,
the reads will be trimmed and this will be re-attempted.
Ratio Mode:
ratiomode=t Newer algorithm. Slower, but higher merge rate.
Much better for long overlaps and high error rates.
maxratio=0.09 Max error rate; higher increases merge rate.
ratiomargin=5.5 Lower increases merge rate; min is 1.
ratiooffset=0.55 Lower increases merge rate; min is 0.
ratiominoverlapreduction=3 This is the difference between minoverlap in
normal mode and minoverlap in ratio mode; generally,
minoverlap should be lower in ratio mode.
*** Ratio Mode and Normal Mode may be used alone or simultaneously. ***
*** Ratio Mode is much more accurate and is now the default mode. ***
*** Ratio Mode and Flat Mode may be used alone or simultaneously. ***
*** Ratio Mode is usually more accurate and is the default mode. ***
Strictness (these are mutually exclusive macros that set other parameters):
......@@ -138,15 +148,29 @@ ultraloose=f (uloose) Increase FP and merging rate even more.
maxloose=f (xloose) Maximally decrease FP and merging rate.
fast=f Fastest possible mode; less accurate.
Tadpole Parameters (for read extension and error-correction):
*Note: These require more memory and should be run with bbmerge-auto.sh.*
k=31 Kmer length. 31 (or less) is fastest and uses the least
memory, but higher values may be more accurate.
60 tends to work well for 150bp reads.
extend=0 Extend reads to the right this much before merging.
Requires sufficient (>5x) kmer coverage.
extend2=0 Extend reads only after a failed merge attempt.
extend2=0 Extend reads this much only after a failed merge attempt,
or in rem/rsem mode.
iterations=1 (ei) Iteratively attempt to extend by extend2 distance
and merge up to this many times.
rem=f (requireextensionmatch) Do not merge if the predicted
insert size differs before and after extension.
However, if only the extended reads overlap, then that
insert will be used. Requires setting extend2.
rsem=f (requirestrictextensionmatch) Similar to rem but stricter.
Reads will only merge if the predicted insert size before
and after extension match. Requires setting extend2.
Enables the lowest possible false-positive rate.
ecctadpole=f (ecct) If reads fail to merge, error-correct with Tadpole
and try again. This happens prior to extend2.
reassemble=t If ecct is enabled, use Tadpole's reassemble mode for
error correction. Alternatives are pincer and tail.
removedeadends (shave) Remove kmers leading to dead ends.
removebubbles (rinse) Remove kmers in error bubbles.
mindepthseed=3 (mds) Minimum kmer depth to begin extension.
......@@ -157,7 +181,6 @@ branchlower=3 Max value of 2nd-greatest path depth to be considered low.
ibb=t Ignore backward branches when extending.
extra=<file> A file or comma-delimited list of files of reads to use
for kmer counting, but not for merging or output.
k=31 Kmer length (1-31 is fastest).
prealloc=f Pre-allocate memory rather than dynamically growing;
faster and more memory-efficient for large datasets.
A float fraction (0-1) may be specified, default 1.
......@@ -166,17 +189,33 @@ prefilter=0 If set to a positive integer, use a countmin sketch to
reduce memory usage.
minprob=0.5 Ignore kmers with overall probability of correctness
below this, to reduce memory usage.
minapproxoverlap=26 For rem mode, do not merge reads if the extended reads
indicate that the raw reads should have overlapped by
at least this much, but no overlap was found.
Bloom Filter Parameters (for kmer operations with less memory than Tadpole)
*Note: These require more memory and should be run with bbmerge-auto.sh.*
eccbloom=f (eccb) If reads fail to merge, error-correct with bbcms
and try again.
testmerge=f Test kmer counts around the read merge junctions. If
it appears that the merge created new errors, undo it.
This reduces the false-positive rate, but not as much as
rem or rsem.
Java Parameters:
-Xmx This will be passed to Java to set memory usage,
overriding the program's automatic memory detection.
For example, -Xmx400m will specify 400 MB RAM.
-eoom This flag will cause the process to exit if an
out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -192,7 +231,9 @@ CP="$DIR""current/"
NATIVELIBDIR="$DIR""jni/"
z="-Xmx1000m"
z2="-Xms1000m"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -207,12 +248,26 @@ calcXmx () {
calcXmx "$@"
function merge() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP jgi.BBMerge $@"
#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBMerge $@"
local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.BBMerge $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
#bbnorm in=<infile> out=<outfile>
usage(){
echo "
Written by Brian Bushnell
Last modified December 14, 2015
Last modified October 19, 2017
Description: Normalizes read depth based on kmer counts.
Can also error-correct, bin reads by kmer depth, and generate a kmer depth histogram.
However, Tadpole has superior error-correction to BBNorm.
Please read bbmap/docs/guides/BBNormGuide.txt for more information.
Usage: bbnorm.sh in=<input> out=<reads to keep> outt=<reads to toss> hist=<histogram output>
Input may be fasta or fastq, compressed or uncompressed.
'out' and 'hist' are both optional.
Optional parameters (and their defaults)
Input parameters:
in=null Primary input. Use in2 for paired reads in a second file
in2=null Second input file for paired reads in two files
......@@ -80,7 +75,7 @@ highthresh=12 (ht) Threshold for high kmer. A high kmer at this or above
lowthresh=3 (lt) Threshold for low kmer. Kmers at this and below are always considered errors.
Error correction parameters:
ecc=f Set to true to correct errors.
ecc=f Set to true to correct errors. NOTE: Tadpole is now preferred for ecc as it does a better job.
ecclimit=3 Correct up to this many errors per read. If more are detected, the read will remain unchanged.
errorcorrectratio=140 (ecr) Adjacent kmers with a depth ratio of at least this much between will be classified as an error.
echighthresh=22 (echt) Threshold for high kmer. A kmer at this or above may be considered non-error.
......@@ -109,8 +104,8 @@ histlen=1048576 Max kmer depth displayed in histogram. Also affects statist
Peak calling parameters:
peaks=<file> Write the peaks to this file. Default is stdout.
minHeight=2 (h) Ignore peaks shorter than this.
minVolume=2 (v) Ignore peaks with less area than this.
minWidth=2 (w) Ignore peaks narrower than this.
minVolume=5 (v) Ignore peaks with less area than this.
minWidth=3 (w) Ignore peaks narrower than this.
minPeak=2 (minp) Ignore peaks with an X-value below this.
maxPeak=BIG (maxp) Ignore peaks with an X-value above this.
maxPeakCount=8 (maxpc) Print up to this many peaks (prioritizing height).
......@@ -118,11 +113,14 @@ maxPeakCount=8 (maxpc) Print up to this many peaks (prioritizing height).
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
......@@ -139,6 +137,7 @@ CP="$DIR""current/"
z="-Xmx31g"
z2="-Xms31g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
......@@ -159,12 +158,25 @@ calcXmx () {
calcXmx "$@"
normalize() {
if [[ $NERSC_HOST == genepool ]]; then
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.7_64bit
module load oracle-jdk/1.8_144_64bit
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module load pigz
fi
local CMD="java $EA $z $z2 -cp $CP jgi.KmerNormalize bits=32 $@"
local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.KmerNormalize bits=32 $@"
echo $CMD >&2
eval $CMD
}
......
#!/bin/bash
usage(){
echo "
Written by Brian Bushnell
Last modified April 26, 2017
Description: Realigns mapped reads to a reference.
Usage: bbrealign.sh in=<file> ref=<file> out=<file>
Input may be a sorted or unsorted sam or bam file.
The reference should be fasta.
I/O parameters:
in=<file> Input reads.
out=<file> Output reads.
ref=<file> Reference fasta.
overwrite=f (ow) Set to false to force the program to abort rather than
overwrite an existing file.
Trimming parameters:
border=0 Trim at least this many bases on both ends of reads.
qtrim=r Quality-trim reads on this end
r: right, l: left, rl: both, f: don't quality-trim.
trimq=10 Quality-trim bases below this score.
Realignment parameters:
unclip=f Convert clip symbols from exceeding the ends of the
realignment zone into matches and substitutitions.
repadding=70 Pad alignment by this much on each end. Typically,
longer is more accurate for long indels, but greatly
reduces speed.
rerows=602 Use this many rows maximum for realignment. Reads longer
than this cannot be realigned.
recols=2000 Reads may not be aligned to reference seqments longer
than this. Needs to be at least read length plus
max deletion length plus twice padding.
msa= Select the aligner. Options:
MultiStateAligner11ts: Default.
MultiStateAligner9PacBio: Use for PacBio reads, or for
Illumina reads mapped to PacBio/Nanopore reads.
Sam-filtering parameters:
minpos= Ignore alignments not overlapping this range.
maxpos= Ignore alignments not overlapping this range.
minreadmapq=4 Ignore alignments with lower mapq.
contigs= Comma-delimited list of contig names to include. These
should have no spaces, or underscores instead of spaces.
secondary=f Include secondary alignments.
supplimentary=f Include supplimentary alignments.
invert=f Invert sam filters.
Java Parameters:
-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
#This block allows symlinked shellscripts to correctly set classpath.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
CP="$DIR""current/"
z="-Xmx4g"
z2="-Xms4g"
EA="-ea"
EOOM=""
set=0
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
usage
exit
fi
calcXmx () {
source "$DIR""/calcmem.sh"
parseXmx "$@"
if [[ $set == 1 ]]; then
return
fi
freeRam 4000m 84
z="-Xmx${RAM}m"
z2="-Xms${RAM}m"
}
calcXmx "$@"
bbrealign() {
if [[ $SHIFTER_RUNTIME == 1 ]]; then
#Ignore NERSC_HOST
shifter=1
elif [[ $NERSC_HOST == genepool ]]; then
module unload oracle-jdk
module load oracle-jdk/1.8_144_64bit
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == denovo ]]; then
module unload java
module load java/1.8.0_144
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
elif [[ $NERSC_HOST == cori ]]; then
module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
module unload java
module load java/1.8.0_144
module unload PrgEnv-intel
module load PrgEnv-gnu/7.1
module load samtools/1.4
module load pigz
fi
local CMD="java $EA $EOOM $z $z2 -cp $CP var2.Realign $@"
echo $CMD >&2
eval $CMD
}
bbrealign "$@"
#!/bin/bash
#For more information, please see rename.sh
#This exists for people who type bbrename.sh instead of rename.sh
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
"$DIR"rename.sh $@
#!/bin/bash
#For more information, please see sketch.sh
#This exists for people who type bbsketch.sh instead of sketch.sh
#I haven't decided which one will be the canonical version.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
cd "$(dirname "$DIR")"
DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")"
DIR="$(pwd)/"
popd > /dev/null
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
"$DIR"sketch.sh $@