Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille · Andreas Tille
--- a/README.md
+++ b/README.md
 # BBTools bioinformatics tools, including BBMap.
-# Author: Brian Bushnell, Jon Rood
-# Language: Java
+# Author: Brian Bushnell, Jon Rood, Shijie Yao
+# Language: Java, Bash
 # Information about documentation is in /docs/readme.txt.
-# Version 35.85
+
+# Version 38.26
--- a/a_sample_mt.sh
+++ b/a_sample_mt.sh
 #!/bin/bash
-#a_sample_mt in=<infile> out=<outfile>

 usage(){
 echo "
 Written by Brian Bushnell
-Last modified November 19, 2015
+Last modified August 22, 2018

 Description:  Does nothing.  Should be fast.

@@ -12,7 +11,6 @@ Usage:  a_sample_mt.sh in=<input file> out=<output file>

 Input may be fasta or fastq, compressed or uncompressed.

-
 Standard parameters:
 in=<file>       Primary input, or read 1 input.
 in2=<file>      Read 2 input if reads are in two files.
@@ -30,11 +28,14 @@ None yet!
 Java Parameters:
 -Xmx            This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
                -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom           This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da             Disable assertions.

 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -51,6 +52,7 @@ CP="$DIR""current/"
 z="-Xmx4g"
 z2="-Xms4g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -71,12 +73,25 @@ calcXmx () {
 calcXmx "$@"

 a_sample_mt() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java $EA $z -cp $CP jgi.A_SampleMT $@"
+	local CMD="java $EA $EOOM $z -cp $CP jgi.A_SampleMT $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/addadapters.sh
+++ b/addadapters.sh
 #!/bin/bash
-#addadapters in=<infile> out=<outfile>

-function usage(){
+usage(){
 echo "
 Written by Brian Bushnell
 Last modified February 17, 2015
@@ -11,13 +10,14 @@ The input is a set of reads, paired or unpaired.
 The output is those same reads with adapter sequence replacing some of the bases in some reads.
 For paired reads, adapters are located in the same position in read1 and read2.
 This is designed for benchmarking adapter-trimming software, and evaluating methodology.
+randomreads.sh is better for paired reads, though, as it actually adds adapters at the correct location,
+so that overlap may be used for adapter detection.

 Usage:  addadapters.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> adapters=<file>

 in2 and out2 are for paired reads and are optional.
 If input is paired and there is only one output file, it will be written interleaved.

-
 Parameters:
 ow=f                (overwrite) Overwrites files that already exist.
 int=f               (interleaved) Determines whether INPUT file is considered interleaved.
@@ -38,6 +38,7 @@ Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -53,6 +54,7 @@ CP="$DIR""current/"

 z="-Xmx200m"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -67,11 +69,25 @@ calcXmx () {
 calcXmx "$@"

 function addadapters() {
-	if [[ $NERSC_HOST == genepool ]]; then
-		module load oracle-jdk/1.7_64bit
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java $EA $z -cp $CP jgi.AddAdapters $@"
+	local CMD="java $EA $EOOM $z -cp $CP jgi.AddAdapters $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/analyzeaccession.sh
+++ b/analyzeaccession.sh
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified August 9, 2018
+
+Description:  Looks at accessions to see how to compress them.
+
+Usage:  analyzeaccession.sh *accession2taxid.gz out=<output file>
+
+Parameters:
+lines=-1        If positive, stop after this many lines.
+
+Java Parameters:
+-Xmx            This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+                -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom           This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da             Disable assertions.
+
+Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
+"
+}
+
+#This block allows symlinked shellscripts to correctly set classpath.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx400m"
+z2="-Xms400m"
+EA="-ea"
+EOOM=""
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+	usage
+	exit
+fi
+
+calcXmx () {
+	source "$DIR""/calcmem.sh"
+	parseXmx "$@"
+}
+calcXmx "$@"
+
+a_sample_mt() {
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	fi
+	local CMD="java $EA $EOOM $z -cp $CP tax.AnalyzeAccession $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+a_sample_mt "$@"
--- a/analyzegenes.sh
+++ b/analyzegenes.sh
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified September 27, 2018
+
+Description:  Generates a prokaryotic gene model (.pkm) for gene calling.
+Input is fasta and gff files.
+The .pkm file may be used by CallGenes.
+
+Usage:  analyzegenes.sh in=x.fa gff=x.gff out=x.pgm
+
+File parameters:
+in=<file>       A fasta file or comma-delimited list of fasta files.
+gff=<file>      A gff file or comma-delimited list.  This is optional;
+                if present, it must match the number of fasta files.
+                If absent, a fasta file 'foo.fasta' will imply the
+                presence of 'foo.gff'.
+out=<file>      Output pgm file.
+
+Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
+"
+}
+
+#This block allows symlinked shellscripts to correctly set classpath.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+EA="-ea"
+EOOM=""
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+	usage
+	exit
+fi
+
+calcXmx () {
+	source "$DIR""/calcmem.sh"
+	parseXmx "$@"
+}
+calcXmx "$@"
+
+function analyze() {
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	fi
+	local CMD="java $EA $EOOM $z $z2 -cp $CP prok.AnalyzeGenes $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+analyze "$@"
--- a/bbcms.sh
+++ b/bbcms.sh
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified July 24, 2018
+
+Description:  Error corrects reads and/or filters by depth, storing
+kmer counts in a count-min sketch (a Bloom filter variant).
+This uses a fixed amount of memory.  The error-correction algorithm is taken
+from Tadpole; with plenty of memory, the behavior is almost identical to 
+Tadpole.  As the number of unique kmers in a dataset increases, the accuracy 
+decreases such that it will make fewer corrections.  It is still capable
+of making useful corrections far past the point where Tadpole would crash
+by running out of memory, even with the prefilter flag.  But if there is
+sufficient memory to use Tadpole, then Tadpole is more desirable.
+
+Because accuracy declines with an increasing number of unique kmers, it can
+be useful with very large datasets to run this in 2 passes, with the first 
+pass for filtering only using a 2-bit filter with the flags tossjunk=t and 
+ecc=f (and possibly mincount=2 and hcf=0.4), and the second pass using a 
+4-bit filter for the actual error correction.
+
+Usage:  bbcms.sh in=<input file> out=<output> outb=<reads failing filters>
+
+Example of use in error correction:
+bbcms.sh in=reads.fq out=ecc.fq bits=4 hashes=3 k=31 merge
+
+Example of use in depth filtering:
+bbcms.sh in=reads.fq out=high.fq outb=low.fq k=31 mincount=2 ecc=f hcf=0.4
+
+Error correction and depth filtering can be done simultaneously.
+
+File parameters:
+in=<file>       Primary input, or read 1 input.
+in2=<file>      Read 2 input if reads are in two files.
+out=<file>      Primary read output.
+out2=<file>     Read 2 output if reads are in two files.
+outb=<file>     (outbad/outlow) Output for reads failing mincount.
+outb2=<file>    (outbad2/outlow2) Read 2 output if reads are in two files.
+extra=<file>    Additional comma-delimited files for generating kmer counts.
+ref=<file>      If ref is set, then only files in the ref list will be used
+                for kmer counts, and the input files will NOT be used for
+                counts; they will just be filtered or corrected.
+overwrite=t     (ow) Set to false to force the program to abort rather than
+                overwrite an existing file.
+
+Hashing parameters:
+k=31            Kmer length, currently 1-31.
+hashes=3        Number of hashes per kmer.  Higher generally reduces 
+                false positives at the expense of speed; rapidly
+                diminishing returns above 4.
+minprob=0.5     Ignore kmers with probability of being correct below this.
+memmult=1.0     Fraction of free memory to use for Bloom filter.  1.0 should
+                generally work; if the program crashes with an out of memory
+                error, set this lower.  You may be able to increase accuracy
+                by setting it slightly higher.
+cells=          Option to set the number of cells manually.  By default this
+                will be autoset to use all available memory.  The only reason
+                to set this is to ensure deterministic output.
+seed=0          This will change the hash function used.
+
+Depth filtering parameters:
+mincount=0      If positive, reads with kmer counts below mincount will
+                be discarded (sent to outb).
+hcf=1.0         (highcountfraction) Fraction of kmers that must be at least
+                mincount to pass.
+requireboth=t   Require both reads in a pair to pass in order to go to out.
+                When true, if either read has a count below mincount, both
+                reads in the pair will go to outb.  When false, reads will
+                only go to outb if both fail.
+tossjunk=f      Remove reads or pairs with outermost kmer depth below 2.
+(Suggested params for huge metagenomes: mincount=2 hcf=0.4 tossjunk=t)
+
+Error correction parameters:
+ecc=t           Perform error correction.
+bits=           Bits used to store kmer counts; max count is 2^bits-1.
+                Supports 2, 4, 8, 16, or 32.  16 is best for high-depth data;
+                2 or 4 are for huge, low-depth metagenomes that saturate the 
+                bloom filter otherwise.  Generally 4 bits is recommended for 
+                error-correction and 2 bits is recommended for filtering only.
+ecco=f          Error-correct paired reads by overlap prior to kmer-counting.
+merge=t         Merge paired reads by overlap prior to kmer-counting, and 
+                again prior to correction.  Output will still be unmerged.
+smooth=3        Remove spikes from kmer counts due to hash collisions.
+                The number is the max width of peaks to be smoothed; range is
+                0-3 (3 is most aggressive; 0 disables smoothing).
+                This also affects tossjunk.
+                
+
+Java Parameters:
+-Xmx            This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+                -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom           This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da             Disable assertions.
+
+Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
+"
+}
+
+#This block allows symlinked shellscripts to correctly set classpath.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx4g"
+z2="-Xms4g"
+EA="-ea"
+EOOM=""
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+	usage
+	exit
+fi
+
+calcXmx () {
+	source "$DIR""/calcmem.sh"
+	parseXmx "$@"
+	if [[ $set == 1 ]]; then
+		return
+	fi
+	freeRam 4000m 84
+	z="-Xmx${RAM}m"
+	z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+bloomfilter() {
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	fi
+	local CMD="java $EA $EOOM $z $z2 -cp $CP bloom.BloomFilterCorrectorWrapper $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+bloomfilter "$@"
--- a/bbcountunique.sh
+++ b/bbcountunique.sh
 #!/bin/bash
-#bbcountunique in=<infile> out=<outfile>

 usage(){
 echo "
 Written by Brian Bushnell
-Last modified February 17, 2015
+Last modified August 1, 2016

 Description:  Generates a kmer uniqueness histogram, binned by file position.
 There are 3 columns for single reads, 6 columns for paired:
@@ -15,10 +14,9 @@ r2_first     percent unique 1st kmer of read 2
 r2_rand      percent unique random kmer of read 2
 pair         percent unique concatenated kmer from read 1 and 2

-Usage:	bbcountunique.sh in=<input> out=<output>
-
+Please read bbmap/docs/guides/CalcUniquenessGuide.txt for more information.

-Optional parameters (and their defaults)
+Usage:	bbcountunique.sh in=<input> out=<output>

 Input parameters:
 in2=null            Second input file for paired reads
@@ -30,22 +28,25 @@ Output parameters:
 out=<file>          File for output stats

 Processing parameters:
-k=20                Kmer length (range 1-31).
+k=25                Kmer length (range 1-31).
 interval=25000      Print one line to the histogram per this many reads.
 cumulative=f        Show cumulative numbers rather than per-interval numbers.
 percent=t           Show percentages of unique reads.
 count=f             Show raw counts of unique reads.
 printlastbin=f      (plb) Print a line for the final undersized bin.
-
+minprob=0           Ignore kmers with a probability of correctness below this (based on q-scores).

 Java Parameters:
 -Xmx                This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom               This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.

 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -62,6 +63,7 @@ CP="$DIR""current/"
 z="-Xmx1g"
 z2="-Xms1g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -82,12 +84,25 @@ calcXmx () {
 calcXmx "$@"

 bbcountunique() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java $EA $z -cp $CP jgi.CalcUniqueness $@"
+	local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.CalcUniqueness $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbduk.sh
+++ b/bbduk.sh
 #!/bin/bash
-#bbduk in=<file> out=<file> ref=<ref file>

 usage(){
 echo "
 Written by Brian Bushnell
-Last modified December 10, 2015
+Last modified August 29, 2018

 Description:  Compares reads to the kmers in a reference dataset, optionally 
 allowing an edit distance. Splits the reads into two outputs - those that 
 match the reference, and those that don't. Can also trim (remove) the matching 
 parts of the reads rather than binning the reads.
+Please read bbmap/docs/guides/BBDukGuide.txt for more information.

 Usage:  bbduk.sh in=<input file> out=<output file> ref=<contaminant files>

@@ -17,11 +17,12 @@ Input may be stdin or a fasta or fastq file, compressed or uncompressed.
 If you pipe via stdin/stdout, please include the file type; e.g. for gzipped 
 fasta input, set in=stdin.fa.gz

-
 Input parameters:
 in=<file>           Main input. in=stdin.fq will pipe from stdin.
 in2=<file>          Input for 2nd read of pairs in a different file.
 ref=<file,file>     Comma-delimited list of reference files.
+                    In addition to filenames, you may also use the keywords:
+                    adapters, artifacts, phix, lambda, pjet, mtst, kapa
 literal=<seq,seq>   Comma-delimited list of literal reference sequences.
 touppercase=f       (tuc) Change all bases upper-case.
 interleaved=auto    (int) t/f overrides interleaved autodetection.
@@ -31,6 +32,7 @@ copyundefined=f     (cu) Process non-AGCT IUPAC reference bases by making all
                    possible unambiguous copies.  Intended for short motifs
                    or adapter barcodes, as time/memory use is exponential.
 samplerate=1        Set lower to only process a fraction of input reads.
+samref=<file>       Optional reference fasta for processing sam files.

 Output parameters:
 out=<file>          (outnonmatch) Write reads here that do not contain 
@@ -38,8 +40,11 @@ out=<file>          (outnonmatch) Write reads here that do not contain
                    to standard out.
 out2=<file>         (outnonmatch2) Use this to write 2nd read of pairs to a 
                    different file.
-outm=<file>         (outmatch) Write reads here that contain kmers matching
-                    the database.
+outm=<file>         (outmatch) Write reads here that fail filters.  In default
+                    kfilter mode, this means any read with a matching kmer.
+                    In any mode, it also includes reads that fail filters such
+                    as minlength, mingc, maxgc, entropy, etc.  In other words,
+                    it includes all reads that do not go to 'out'.
 outm2=<file>        (outmatch2) Use this to write 2nd read of pairs to a 
                    different file.
 outs=<file>         (outsingle) Use this to write singleton reads whose mate 
@@ -73,17 +78,27 @@ qchist=<file>       Count of bases with each quality value.
 aqhist=<file>       Histogram of average read quality.
 bqhist=<file>       Quality histogram designed for box plots.
 lhist=<file>        Read length histogram.
+phist=<file>        Polymer length histogram.
 gchist=<file>       Read GC content histogram.
 gcbins=100          Number gchist bins.  Set to 'auto' to use read length.
+maxhistlen=6000     Set an upper bound for histogram lengths; higher uses 
+                    more memory.  The default is 6000 for some histograms
+                    and 80000 for others.

-Histograms for sam files only (requires sam format 1.4 or higher):
+Histograms for mapped sam/bam files only:
+histbefore=t        Calculate histograms from reads before processing.
 ehist=<file>        Errors-per-read histogram.
 qahist=<file>       Quality accuracy histogram of error rates versus quality 
                    score.
 indelhist=<file>    Indel length histogram.
-mhist=<file>        Histogram of match, sub, del, and ins rates by read location.
+mhist=<file>        Histogram of match, sub, del, and ins rates by position.
 idhist=<file>       Histogram of read count versus percent identity.
 idbins=100          Number idhist bins.  Set to 'auto' to use read length.
+varfile=<file>      Ignore substitution errors listed in this file when 
+                    calculating error rates.  Can be generated with
+                    CallVariants.
+vcf=<file>          Ignore substitution errors listed in this VCF file 
+                    when calculating error rates.

 Processing parameters:
 k=27                Kmer length used for finding contaminants.  Contaminants 
@@ -114,6 +129,8 @@ forbidn=f           (fn) Forbids matching of read kmers containing N.
 removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is 
                    match (or either is trimmed shorter than minlen).  
                    Set to false to require both.
+trimfailures=f      Instead of discarding failed reads, trim them to 1bp.
+                    This makes the statistics a bit odd.
 findbestmatch=f     (fbm) If multiple matches, associate read with sequence 
                    sharing most kmers.  Reduces speed.
 skipr1=f            Don't do kmer-based operations on read 1.
@@ -145,31 +162,39 @@ speed=0             Ignore this fraction of kmer space (0-15 out of 16) in both
 Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.

 Trimming/Filtering/Masking parameters:
-Note - if neither ktrim nor kmask is set, the default behavior is kfilter.
-All three are mutually exclusive.
+Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter.
+All kmer processing modes are mutually exclusive.
+Reads only get sent to 'outm' purely based on kmer matches in kfilter mode.

 ktrim=f             Trim reads to remove bases matching reference kmers.
                    Values: 
                       f (don't trim), 
                       r (trim to the right), 
                       l (trim to the left)
-kmask=f             Replace bases matching ref kmers with another symbol.
-                    Allows any non-whitespace character other than t or f,
-                    and processes short kmers on both ends.  'kmask=lc' will
+kmask=              Replace bases matching ref kmers with another symbol.
+                    Allows any non-whitespace character, and processes short
+                    kmers on both ends if mink is set.  'kmask=lc' will
                    convert masked bases to lowercase.
 maskfullycovered=f  (mfc) Only mask bases that are fully covered by kmers.
+ksplit=f            For single-ended reads only.  Reads will be split into
+                    pairs around the kmer.  If the kmer is at the end of the
+                    read, it will be trimmed instead.  Singletons will go to
+                    out, and pairs will go to outm.  Do not use ksplit with
+                    other operations such as quality-trimming or filtering.
 mink=0              Look for shorter kmers at read tips down to this length, 
                    when k-trimming or masking.  0 means disabled.  Enabling
                    this will disable maskmiddle.
 qtrim=f             Trim read ends to remove bases with quality below trimq.
-                    Performed AFTER looking for kmers.
-                    Values: 
+                    Performed AFTER looking for kmers.  Values: 
                       rl (trim both ends), 
                       f (neither end), 
                       r (right end only), 
                       l (left end only),
                       w (sliding window).
-trimq=6             Regions with average quality BELOW this will be trimmed.
+trimq=6             Regions with average quality BELOW this will be trimmed,
+                    if qtrim is set to something other than f.  Can be a 
+                    floating-point number like 7.3.
+trimclip=f          Trim soft-clipped bases from sam files.
 minlength=10        (ml) Reads shorter than this after trimming will be 
                    discarded.  Pairs will be discarded if both are shorter.
 mlf=0               (minlengthfraction) Reads shorter than this fraction of 
@@ -179,11 +204,8 @@ maxlength=          Reads longer than this after trimming will be discarded.
 minavgquality=0     (maq) Reads with average quality (after trimming) below 
                    this will be discarded.
 maqb=0              If positive, calculate maq from this many initial bases.
-chastityfilter=f    (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
-barcodefilter=f     Remove reads with unexpected barcodes if barcodes is set,
-                    or barcodes containing 'N' otherwise.  A barcode must be
-                    the last part of the read header.
-barcodes=           Comma-delimited list of barcodes or files of barcodes.
+minbasequality=0    (mbq) Reads with any base below this quality (after 
+                    trimming) will be discarded.
 maxns=-1            If non-negative, reads with more Ns than this 
                    (after trimming) will be discarded.
 mcb=0               (minconsecutivebases) Discard reads without at least 
@@ -212,6 +234,39 @@ restrictright=0     If positive, only look for kmer matches in the
                    rightmost X bases.
 mingc=0             Discard reads with GC content below this.
 maxgc=1             Discard reads with GC content above this.
+gcpairs=t           Use average GC of paired reads.
+                    Also affects gchist.
+tossjunk=f          Discard reads with invalid characters as bases.
+
+Header-parsing parameters - these require Illumina headers:
+chastityfilter=f    (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
+barcodefilter=f     Remove reads with unexpected barcodes if barcodes is set,
+                    or barcodes containing 'N' otherwise.  A barcode must be
+                    the last part of the read header.  Values:
+                       t:     Remove reads with bad barcodes.
+                       f:     Ignore barcodes.
+                       crash: Crash upon encountering bad barcodes.
+barcodes=           Comma-delimited list of barcodes or files of barcodes.
+xmin=-1             If positive, discard reads with a lesser X coordinate.
+ymin=-1             If positive, discard reads with a lesser Y coordinate.
+xmax=-1             If positive, discard reads with a greater X coordinate.
+ymax=-1             If positive, discard reads with a greater Y coordinate.
+
+Polymer trimming:
+trimpolya=0         If greater than 0, trim poly-A or poly-T tails of
+                    at least this length on either end of reads.
+trimpolygleft=0     If greater than 0, trim poly-G prefixes of at least this
+                    length on the left end of reads.  Does not trim poly-C.
+trimpolygright=0    If greater than 0, trim poly-G tails of at least this 
+                    length on the right end of reads.  Does not trim poly-C.
+trimpolyg=0         This sets both left and right at once.
+filterpolyg=0       If greater than 0, remove reads with a poly-G prefix of
+                    at least this length (on the left).
+Note: there are also equivalent poly-C flags.
+
+Polymer tracking:
+pratio=base,base    'pratio=G,C' will print the ratio of G to C polymers.
+plen=20             Length of homopolymers to count.

 Entropy/Complexity parameters:
 entropy=-1          Set between 0 and 1 to filter reads with entropy below
@@ -219,9 +274,17 @@ entropy=-1          Set between 0 and 1 to filter reads with entropy below
 entropywindow=50    Calculate entropy using a sliding window of this length.
 entropyk=5          Calculate entropy using kmers of this length.
 minbasefrequency=0  Discard reads with a minimum base frequency below this.
+entropymask=f       Values:
+                       f:  Discard low-entropy sequences.
+                       t:  Mask low-entropy parts of sequences with N.
+                       lc: Change low-entropy parts of sequences to lowercase.
+entropymark=f       Mark each base with its entropy value.  This is on a scale
+                    of 0-41 and is reported as quality scores, so the output
+                    should be fastq or fasta+qual.

 Cardinality estimation:
 cardinality=f       (loglog) Count unique kmers using the LogLog algorithm.
+cardinalityout=f    (loglogout) Count unique kmers in output reads.
 loglogk=31          Use this kmer length for counting.
 loglogbuckets=1999  Use this many buckets for counting.

@@ -231,12 +294,15 @@ Java Parameters:
                    the program's automatic memory detection. -Xmx20g will 
                    specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  
                    The max is typically 85% of physical memory.
+-eoom               This flag will cause the process to exit if an 
+                    out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.

-There is a changelog at /bbmap/docs/changelog_bbduk.txt
 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "	
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -254,6 +320,7 @@ NATIVELIBDIR="$DIR""jni/"
 z="-Xmx1g"
 z2="-Xms1g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -274,14 +341,32 @@ calcXmx () {
 calcXmx "$@"

 bbduk() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module unload samtools
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module unload PrgEnv-intel
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
 		module load pigz
-		module load samtools
 	fi
-	local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDukF $@"
+	#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDukF $@"
+	local CMD="java $EA $z $z2 -cp $CP jgi.BBDukF $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbduk2.sh
+++ b/bbduk2.sh
-#!/bin/bash
-#bbduk2 in=<file> out=<file> fref=<file>
-
-usage(){
-echo "
-Written by Brian Bushnell
-Last modified December 10, 2015
-
-BBDuk2 is like BBDuk but can kfilter, kmask, and ktrim in a single pass.
-It does not replace BBDuk, and is only provided to allow maximally efficient
-pipeline integration when multiple steps will be performed.  The syntax is 
-slightly different.
-
-Description:  Compares reads to the kmers in a reference dataset, optionally 
-allowing an edit distance. Splits the reads into two outputs - those that 
-match the reference, and those that don't. Can also trim (remove) the matching 
-parts of the reads rather than binning the reads.
-
-Usage:	bbduk2.sh in=<input file> out=<output file> fref=<contaminant files>
-
-Input may be stdin or a fasta or fastq file, compressed or uncompressed.
-If you pipe via stdin/stdout, please include the file type; e.g. for gzipped 
-fasta input, set in=stdin.fa.gz
-
-
-Input parameters:
-in=<file>           Main input. in=stdin.fq will pipe from stdin.
-in2=<file>          Input for 2nd read of pairs in a different file.
-fref=<file,file>    Comma-delimited list of fasta reference files for filtering.
-rref=<file,file>    Comma-delimited list of fasta reference files for right-trimming.
-lref=<file,file>    Comma-delimited list of fasta reference files for left-trimming.
-mref=<file,file>    Comma-delimited list of fasta reference files for masking.
-fliteral=<seq,seq>  Comma-delimited list of literal sequences for filtering.
-rliteral=<seq,seq>  Comma-delimited list of literal sequences for right-trimming.
-lliteral=<seq,seq>  Comma-delimited list of literal sequences for left-trimming.
-mliteral=<seq,seq>  Comma-delimited list of literal sequences for masking.
-touppercase=f       (tuc) Change all bases upper-case.
-interleaved=auto    (int) t/f overrides interleaved autodetection.
-qin=auto            Input quality offset: 33 (Sanger), 64, or auto.
-reads=-1            If positive, quit after processing X reads or pairs.
-copyundefined=f     (cu) Process non-AGCT IUPAC reference bases by making all
-                    possible unambiguous copies.  Intended for short motifs
-                    or adapter barcodes, as time/memory use is exponential.
-
-Output parameters:
-out=<file>          (outnonmatch) Write reads here that do not contain 
-                    kmers matching the database.  'out=stdout.fq' will pipe 
-                    to standard out.
-out2=<file>         (outnonmatch2) Use this to write 2nd read of pairs to a 
-                    different file.
-outm=<file>         (outmatch) Write reads here that contain kmers matching
-                    the database.
-outm2=<file>        (outmatch2) Use this to write 2nd read of pairs to a 
-                    different file.
-outs=<file>         (outsingle) Use this to write singleton reads whose mate 
-                    was trimmed shorter than minlen.
-stats=<file>        Write statistics about which contamininants were detected.
-refstats=<file>     Write statistics on a per-reference-file basis.
-rpkm=<file>         Write RPKM for each reference sequence (for RNA-seq).
-dump=<file>         Dump kmer tables to a file, in fasta format.
-nzo=t               Only write statistics about ref sequences with nonzero hits.
-overwrite=t         (ow) Grant permission to overwrite files.
-showspeed=t         (ss) 'f' suppresses display of processing speed.
-ziplevel=2          (zl) Compression level; 1 (min) through 9 (max).
-fastawrap=80        Length of lines in fasta output.
-qout=auto           Output quality offset: 33 (Sanger), 64, or auto.
-statscolumns=3      (cols) Number of columns for stats output, 3 or 5.
-                    5 includes base counts.
-rename=f            Rename reads to indicate which sequences they matched.
-refnames=f          Use names of reference files rather than scaffold IDs.
-trd=f               Truncate read and ref names at the first whitespace.
-ordered=f           Set to true to output reads in same order as input.
-
-Histogram output parameters:
-bhist=<file>        Base composition histogram by position.
-qhist=<file>        Quality histogram by position.
-qchist=<file>       Count of bases with each quality value.
-aqhist=<file>       Histogram of average read quality.
-bqhist=<file>       Quality histogram designed for box plots.
-lhist=<file>        Read length histogram.
-gchist=<file>       Read GC content histogram.
-gcbins=100          Number gchist bins.  Set to 'auto' to use read length.
-
-Histograms for sam files only (requires sam format 1.4 or higher):
-
-ehist=<file>        Errors-per-read histogram.
-qahist=<file>       Quality accuracy histogram of error rates versus quality 
-                    score.
-indelhist=<file>    Indel length histogram.
-mhist=<file>        Histogram of match, sub, del, and ins rates by read location.
-idhist=<file>       Histogram of read count versus percent identity.
-idbins=100          Number idhist bins.  Set to 'auto' to use read length.
-
-Processing parameters:
-k=27                Kmer length used for finding contaminants.  Contaminants 
-                    shorter than k will not be found.  k must be at least 1.
-rcomp=t             Look for reverse-complements of kmers in addition to 
-                    forward kmers.
-maskmiddle=t        (mm) Treat the middle base of a kmer as a wildcard, to 
-                    increase sensitivity in the presence of errors.
-minkmerhits=1       (mkh) Reads need at least this many matching kmers 
-                    to be considered as matching the reference.
-hammingdistance=0   (hdist) Maximum Hamming distance for ref kmers (subs only).
-                    Memory use is proportional to (3*K)^hdist.
-qhdist=0            Hamming distance for query kmers; impacts speed, not memory.
-editdistance=0      (edist) Maximum edit distance from ref kmers (subs 
-                    and indels).  Memory use is proportional to (8*K)^edist.
-hammingdistance2=0  (hdist2) Sets hdist for short kmers, when using mink.
-qhdist2=0           Sets qhdist for short kmers, when using mink.
-editdistance2=0     (edist2) Sets edist for short kmers, when using mink.
-forbidn=f           (fn) Forbids matching of read kmers containing N.
-                    By default, these will match a reference 'A' if 
-                    hdist>0 or edist>0, to increase sensitivity.
-removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is 
-                    match (or either is trimmed shorter than minlen).  
-                    Set to false to require both.
-findbestmatch=f     (fbm) If multiple matches, associate read with sequence 
-                    sharing most kmers.  Reduces speed.
-skipr1=f            Don't do kmer-based operations on read 1.
-skipr2=f            Don't do kmer-based operations on read 2.
-ecco=f              For overlapping paired reads only.  Performs error-
-                    correction with BBMerge prior to kmer operations.
-recalibrate=f       (recal) Recalibrate quality scores.  Requires calibration
-                    matrices generated by CalcTrueQuality.
-sam=<file,file>     If recalibration is desired, and matrices have not already
-                    been generated, BBDuk will create them from the sam file.
-
-Speed and Memory parameters:
-threads=auto        (t) Set number of threads to use; default is number of 
-                    logical processors.
-prealloc=f          Preallocate memory in table.  Allows faster table loading 
-                    and more efficient memory usage, for a large reference.
-monitor=f           Kill this process if it crashes.  monitor=600,0.01 would 
-                    kill after 600 seconds under 1% usage.
-minrskip=1          (mns) Force minimal skip interval when indexing reference 
-                    kmers.  1 means use all, 2 means use every other kmer, etc.
-maxrskip=1          (mxs) Restrict maximal skip interval when indexing 
-                    reference kmers. Normally all are used for scaffolds<100kb, 
-                    but with longer scaffolds, up to maxrskip-1 are skipped.
-rskip=              Set both minrskip and maxrskip to the same value.
-                    If not set, rskip will vary based on sequence length.
-qskip=1             Skip query kmers to increase speed.  1 means use all.
-speed=0             Ignore this fraction of kmer space (0-15 out of 16) in both
-                    reads and reference.  Increases speed and reduces memory.
-Note: Do not use more than one of 'speed', 'qskip', and 'rskip'.
-
-Trimming/Filtering/Masking parameters:
-Note - for BBDuk2, kmer filtering, trimming, and masking are independent,
-and all can be performed at the same time.
-
-ktrim=f             Trim reads to remove bases matching reference kmers.
-                    Values: 
-                            f (don't trim), 
-                            r (trim to the right), 
-                            l (trim to the left)
-kmask=f             Replace bases matching ref kmers with another symbol.
-                    Allows any non-whitespace character other than t or f,
-                    and processes short kmers on both ends.  'kmask=lc' will
-                    convert masked bases to lowercase.
-mink=0              Look for shorter kmers at read tips down to this length, 
-                    when k-trimming or masking.  0 means disabled.  Enabling
-                    this will disable maskmiddle.
-qtrim=f             Trim read ends to remove bases with quality below trimq.
-                    Performed AFTER looking for kmers.
-                    Values: 
-                            rl (trim both ends), 
-                            f (neither end), 
-                            r (right end only), 
-                            l (left end only),
-                            w (sliding window)
-trimq=6             Regions with average quality BELOW this will be trimmed.
-minlength=10        (ml) Reads shorter than this after trimming will be 
-                    discarded.  Pairs will be discarded if both are shorter.
-mlf=0               (minlengthfraction) Reads shorter than this fraction of 
-                    original length after trimming will be discarded.
-maxlength=          Reads longer than this after trimming will be discarded.
-                    Pairs will be discarded only if both are longer.
-minavgquality=0     (maq) Reads with average quality (after trimming) below 
-                    this will be discarded.
-maqb=0              If positive, calculate maq from this many initial bases.
-chastityfilter=f    (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'.
-barcodefilter=f     Remove reads with unexpected barcodes if barcodes is set,
-                    or barcodes containing 'N' otherwise.  A barcode must be
-                    the last part of the read header.
-barcodes=           Comma-delimited list of barcodes or files of barcodes.
-maxns=-1            If non-negative, reads with more Ns than this 
-                    (after trimming) will be discarded.
-mcb=0               (minconsecutivebases) Discard reads without at least 
-                    this many consecutive called bases.
-ottm=f              (outputtrimmedtomatch) Output reads trimmed to shorter 
-                    than minlength to outm rather than discarding.
-tp=0                (trimpad) Trim this much extra around matching kmers.
-tbo=f               (trimbyoverlap) Trim adapters based on where paired 
-                    reads overlap.
-strictoverlap=t     Adjust sensitivity for trimbyoverlap mode.
-minoverlap=14       Require this many bases of overlap for detection.
-mininsert=50        Require insert size of at least this for overlap. 
-                    Should be reduced to 16 for small RNA sequencing.
-tpe=f               (trimpairsevenly) When kmer right-trimming, trim both 
-                    reads to the minimum length of either.
-forcetrimleft=0     (ftl) If positive, trim bases to the left of this position
-                    (exclusive, 0-based).
-forcetrimright=0    (ftr) If positive, trim bases to the right of this position
-                    (exclusive, 0-based).
-forcetrimright2=0   (ftr2) If positive, trim this many bases on the right end.
-forcetrimmod=0      (ftm) If positive, right-trim length to be equal to zero,
-                    modulo this number.
-restrictleft=0      If positive, only look for kmer matches in the 
-                    leftmost X bases.
-restrictright=0     If positive, only look for kmer matches in the 
-                    rightmost X bases.
-mingc=0             Discard reads with GC content below this.
-maxgc=1             Discard reads with GC content above this.
-
-Entropy/Complexity parameters:
-entropy=-1          Set between 0 and 1 to filter reads with entropy below
-                    that value.  Higher is more stringent.
-entropywindow=50    Calculate entropy using a sliding window of this length.
-entropyk=5          Calculate entropy using kmers of this length.
-minbasefrequency=0  Discard reads with a minimum base frequency below this.
-
-Cardinality estimation:
-cardinality=f           (loglog) Count unique kmers using the LogLog algorithm.
-loglogk=31              Use this kmer length for counting.
-loglogbuckets=1999      Use this many buckets for counting.
-
-Java Parameters:
-
-Xmx                This will be passed to Java to set memory usage, overriding 
-                    the program's automatic memory detection. -Xmx20g will 
-                    specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  
-                    The max is typically 85% of physical memory.
-
-There is a changelog at /bbmap/docs/changelog_bbduk.txt
-Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
-"	
-}
-
-pushd . > /dev/null
-DIR="${BASH_SOURCE[0]}"
-while [ -h "$DIR" ]; do
-  cd "$(dirname "$DIR")"
-  DIR="$(readlink "$(basename "$DIR")")"
-done
-cd "$(dirname "$DIR")"
-DIR="$(pwd)/"
-popd > /dev/null
-
-#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
-CP="$DIR""current/"
-NATIVELIBDIR="$DIR""jni/"
-
-z="-Xmx1g"
-z2="-Xms1g"
-EA="-ea"
-set=0
-
-if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
-	usage
-	exit
-fi
-
-calcXmx () {
-	source "$DIR""/calcmem.sh"
-	parseXmx "$@"
-	if [[ $set == 1 ]]; then
-		return
-	fi
-	freeRam 1400m 42
-	z="-Xmx${RAM}m"
-	z2="-Xms${RAM}m"
-}
-calcXmx "$@"
-
-bbduk2() {
-	if [[ $NERSC_HOST == genepool ]]; then
-		module unload oracle-jdk
-		module unload samtools
-		module load oracle-jdk/1.7_64bit
-		module load pigz
-		module load samtools
-	fi
-	local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBDuk2 $@"
-	echo $CMD >&2
-	eval $CMD
-}
-
-bbduk2 "$@"
--- a/bbest.sh
+++ b/bbest.sh
 #!/bin/bash
-#bbest in=<infile> out=<outfile>

-function usage(){
+usage(){
 echo "
 Written by Brian Bushnell
 Last modified November 4, 2015
@@ -23,6 +22,7 @@ Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -39,6 +39,7 @@ CP="$DIR""current/"
 z="-Xmx120m"
 z2="-Xms120m"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -53,14 +54,31 @@ calcXmx () {
 calcXmx "$@"

 function bbest() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module unload samtools
-		module load oracle-jdk/1.7_64bit
-		module load samtools
+		module load oracle-jdk/1.8_144_64bit
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module unload PrgEnv-intel
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
 		module load pigz
 	fi
-	local CMD="java $EA $z -cp $CP jgi.SamToEst $@"
+	local CMD="java $EA $EOOM $z -cp $CP jgi.SamToEst $@"
 #	echo $CMD >&2
 	eval $CMD
 }

--- a/bbfakereads.sh
+++ b/bbfakereads.sh
 #!/bin/bash
-#fakereads in=<infile> out=<outfile>

-function usage(){
+usage(){
 echo "
 Written by Brian Bushnell
 Last modified February 17, 2015
@@ -12,7 +11,6 @@ Usage:        bbfakereads.sh in=<file> out=<outfile> out2=<outfile2>

 Out2 is optional; if there is only one output file, it will be written interleaved.

-
 Standard parameters:
 ow=f                (overwrite) Overwrites files that already exist.
 zl=4                (ziplevel) Set compression level, 1 (low) to 9 (max).
@@ -36,17 +34,14 @@ addspace=t          Set to false to omit the  space before /1 and /2 of paired r
 Java Parameters:
 -Xmx                This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
-
-Supported input formats are fastq, fasta, fast+qual, scarf, and bread (BBMap's native format)
-Supported output formats are fastq, fasta, fast+qual, bread
-Supported compression formats are gz, zip, and bz2
-To read from stdin, set 'in=stdin'.  The format should be specified with an extension, like 'in=stdin.fq.gz'
-To write to stdout, set 'out=stdout'.  The format should be specified with an extension, like 'out=stdout.fasta'
+-eoom               This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.

 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -62,6 +57,7 @@ CP="$DIR""current/"

 z="-Xmx600m"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -77,14 +73,25 @@ calcXmx () {
 calcXmx "$@"

 function fakereads() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module unload samtools
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
-		module load samtools
 	fi
-	local CMD="java $EA $z -cp $CP jgi.FakeReads $@"
+	local CMD="java $EA $EOOM $z -cp $CP jgi.FakeReads $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbmap.sh
+++ b/bbmap.sh
 #!/bin/bash
-#bbmap in=<infile> out=<outfile>

 usage(){
 echo "
-BBMap v35.x
+BBMap
 Written by Brian Bushnell, from Dec. 2010 - present
-Last modified December 15, 2015
+Last modified September 18, 2018

 Description:  Fast and accurate splice-aware read aligner.
+Please read bbmap/docs/guides/BBMapGuide.txt for more information.

 To index:     bbmap.sh ref=<reference fasta>
 To map:       bbmap.sh in=<reads> out=<output sam>
@@ -20,7 +20,6 @@ input and output files e.g. in=stdin.fa.gz will read gzipped fasta from
 standard in; out=stdout.sam.gz will write gzipped sam.

 Indexing Parameters (required when building the index):
-
 nodisk=f                Set to true to build index in memory and write nothing 
                        to disk except output.
 ref=<file>              Specify the reference sequence.  Only do this ONCE, 
@@ -40,7 +39,6 @@ usemodulo=f             Throw away ~80% of kmers based on remainder modulo a
 rebuild=f               Force a rebuild of the index (ref= should be set).

 Input Parameters:
-
 build=1                 Designate index to use.  Corresponds to the number 
                        specified when building the index.
 in=<file>               Primary reads input; required parameter.
@@ -68,7 +66,6 @@ skipreads=0             Set to a number N to skip the first N reads (or pairs),
                        then map the rest.

 Mapping Parameters:
-
 fast=f                  This flag is a macro which sets other paramters to run 
                        faster, at reduced sensitivity.  Bad for RNA-seq.
 slow=f                  This flag is a macro which sets other paramters to run 
@@ -77,6 +74,8 @@ maxindel=16000          Don't look for indels longer than this. Lower is faster.
                        Set to >=100k for RNAseq with long introns like mammals.
 strictmaxindel=f        When enabled, do not allow indels longer than 'maxindel'.
                        By default these are not sought, but may be found anyway.
+tipsearch=100           Look this far for read-end deletions with anchors
+                        shorter than K, using brute force.
 minid=0.76              Approximate minimum alignment identity to look for. 
                        Higher is faster and less sensitive.
 minhits=1               Minimum number of seed hits required for candidate sites.
@@ -114,18 +113,28 @@ rescuemismatches=32     Maximum mismatches allowed in a rescued read.  Lower
                        is faster.
 averagepairdist=100     (apd) Initial average distance between paired reads.
                        Varies dynamically; does not need to be specified.
+deterministic=f         Run in deterministic mode.  In this case it is good
+                        to set averagepairdist.  BBMap is deterministic
+                        without this flag if using single-ended reads,
+                        or run singlethreaded.
 bandwidthratio=0        (bwr) If above zero, restrict alignment band to this 
                        fraction of read length.  Faster but less accurate.
+bandwidth=0             (bw) Set the bandwidth directly.
+                        fraction of read length.  Faster but less accurate.
 usejni=f                (jni) Do alignments faster, in C code.  Requires 
                        compiling the C code; details are in /jni/README.txt.
 maxsites2=800           Don't analyze (or print) more than this many alignments 
                        per read.
-monitor=f               Kill this process if CPU usage drops to zero for
-                        a long time.  monitor=600,0.01 would kill after 600
-                        seconds under 1% usage.
+ignorefrequentkmers=t   (ifk) Discard low-information kmers that occur often.
+excludefraction=0.03    (ef) Fraction of kmers to ignore.  For example, 0.03
+                        will ignore the most common 3% of kmers.
+greedy=t                Use a greedy algorithm to discard the least-useful
+                        kmers on a per-read basis.
+kfilter=0               If positive, potential mapping sites must have at
+                        least this many consecutive exact matches.

-Quality and Trimming Parameters:

+Quality and Trimming Parameters:
 qin=auto                Set to 33 or 64 to specify input quality value ASCII
                        offset. 33 is Sanger, 64 is old Solexa.
 qout=auto               Set to 33 or 64 to specify output quality value ASCII 
@@ -143,11 +152,10 @@ ignorebadquality=f      (ibq) Keep going, rather than crashing, if a read has
                        out-of-range quality values.
 usequality=t            Use quality scores when determining which read kmers
                        to use as seeds.
-minaveragequality=0     (maq) Discard reads with average quality below this.
+minaveragequality=0     (maq) Do not map reads with average quality below this.
 maqb=0                  If positive, calculate maq from this many initial bases.

 Output Parameters:
-
 out=<file>              Write all reads to this file.
 outu=<file>             Write only unmapped reads to this file.  Does not 
                        include unmapped paired reads with a mapped mate.
@@ -180,9 +188,19 @@ printunmappedcount=f    Print the total number of unmapped reads and bases.
 showprogress=0          If positive, print a '.' every X reads.
 showprogress2=0         If positive, print the number of seconds since the
                        last progress update (instead of a '.').
+renamebyinsert=f        Renames reads based on their mapped insert size.
+
+Bloom-Filtering Parameters (bloomfilter.sh is the standalone version).
+bloom=f                 Use a Bloom filter to ignore reads not sharing kmers
+                        with the reference.  This uses more memory, but speeds
+                        mapping when most reads don't match the reference.
+bloomhashes=2           Number of hash functions.
+bloomminhits=3          Number of consecutive hits to be considered matched.
+bloomk=31               Bloom filter kmer length.
+bloomserial=t           Use the serialized Bloom filter for greater loading
+                        speed, if available.  If not, generate and write one.

 Post-Filtering Parameters:
-
 idfilter=0              Independant of minid; sets exact minimum identity 
                        allowed for alignments to be printed.  Range 0 to 1.
 subfilter=-1            Ban alignments with more than this many substitutions.
@@ -192,9 +210,10 @@ indelfilter=-1          Ban alignments with more than this many indels.
 editfilter=-1           Ban alignments with more than this many edits.
 inslenfilter=-1         Ban alignments with an insertion longer than this.
 dellenfilter=-1         Ban alignments with a deletion longer than this.
+nfilter=-1              Ban alignments with more than this many ns.  This 
+                        includes nocall, noref, and off scaffold ends.

 Sam flags and settings:
-
 noheader=f              Disable generation of header lines.
 sam=1.4                 Set to 1.4 to write Sam version 1.4 cigar strings, 
                        with = and X, or 1.3 to use M.
@@ -228,7 +247,6 @@ boundstag=f             Write a tag indicating whether either read in the pair
 notags=f                Turn off all optional tags.

 Histogram and statistics output parameters:
-
 scafstats=<file>        Statistics on how many reads mapped to which scaffold.
 refstats=<file>         Statistics on how many reads mapped to which reference
                        file; only for BBSplit.
@@ -247,12 +265,12 @@ mhist=<file>            Histogram of match, sub, del, and ins rates by
                        read location.
 gchist=<file>           Read GC content histogram.
 gcbins=100              Number gchist bins.  Set to 'auto' to use read length.
+gcpairs=t               Use average GC of paired reads.
 idhist=<file>           Histogram of read count versus percent identity.
 idbins=100              Number idhist bins.  Set to 'auto' to use read length.
 statsfile=stderr        Mapping statistics are printed here.

 Coverage output parameters (these may reduce speed and use more RAM):
-
 covstats=<file>         Per-scaffold coverage info.
 rpkm=<file>             Per-scaffold RPKM/FPKM counts.
 covhist=<file>          Histogram of # occurrences of each depth level.
@@ -270,6 +288,7 @@ physcov=f               Calculate physical coverage for paired reads.
                        This includes the unsequenced bases.
 delcoverage=t           (delcov) Count bases covered by deletions as covered.
                        True is faster than false.
+covk=0                  If positive, calculate kmer coverage statistics.

 Java Parameters:
 -Xmx                    This will be passed to Java to set memory usage, 
@@ -279,14 +298,16 @@ Java Parameters:
                        physical memory.  The human genome requires around 24g,
                        or 12g with the 'usemodulo' flag.  The index uses 
                        roughly 6 bytes per reference base.
+-eoom                   This flag will cause the process to exit if an 
+                        out-of-memory exception occurs.  Requires Java 8u92+.
+-da                     Disable assertions.

-This list is not complete.  For more information, please consult 
-$DIR""docs/readme.txt. 
 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter 
 any problems, or post at: http://seqanswers.com/forums/showthread.php?t=41057
 "   
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -304,6 +325,7 @@ NATIVELIBDIR="$DIR""jni/"
 z="-Xmx1g"
 z2="-Xms1g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -325,14 +347,32 @@ calcXmx "$@"


 bbmap() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module unload samtools
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module unload PrgEnv-intel
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
 		module load pigz
-		module load samtools
 	fi
-	local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
+	#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
+	local CMD="java $EA $z -cp $CP align2.BBMap build=1 overwrite=true fastareadlen=500 $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbmapskimmer.sh
+++ b/bbmapskimmer.sh
 #!/bin/bash

 #This is a version of BBMap designed to final all sites above a given threshold,
-#rather than the single best site.
+#rather than the single best site.  Syntax is the same as BBMap.

 usage(){
 	bash "$DIR"bbmap.sh
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -24,6 +25,7 @@ NATIVELIBDIR="$DIR""jni/"
 z="-Xmx1g"
 z2="-Xms1g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -44,13 +46,31 @@ calcXmx () {
 calcXmx "$@"

 mapPacBioSkimmer() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module unload samtools
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module unload PrgEnv-intel
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
 		module load pigz
-		module load samtools
 	fi
+	#local CMD="java $EA $z -cp $CP align2.BBMapPacBioSkimmer build=1 overwrite=true minratio=0.40 fastareadlen=6000 ambig=all minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@"
 	local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP align2.BBMapPacBioSkimmer build=1 overwrite=true minratio=0.40 fastareadlen=6000 ambig=all minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@"
 	echo $CMD >&2
 	eval $CMD

--- a/bbmask.sh
+++ b/bbmask.sh
 #!/bin/bash
-#bbmask in=<infile> out=<outfile>

 usage(){
 echo "
 Written by Brian Bushnell
-Last modified May 19, 2015
+Last modified October 17, 2017

 Description:  Masks sequences of low-complexity, or containing repeat kmers, or covered by mapped reads.
 By default this program will mask using entropy with a window=80 and entropy=0.75
+Please read bbmap/docs/guides/BBMaskGuide.txt for more information.

 Usage:   bbmask.sh in=<file> out=<file> sam=<file,file,...file>

 Input may be stdin or a fasta or fastq file, raw or gzipped.
 sam is optional, but may be a comma-delimited list of sam files to mask.
+Sam files may also be used as arguments without sam=, so you can use *.sam for example.
 If you pipe via stdin/stdout, please include the file type; e.g. for gzipped fasta input, set in=stdin.fa.gz

-
 Input parameters:
 in=<file>           Input sequences to mask. 'in=stdin.fa' will pipe from standard in.
 sam=<file,file>     Comma-delimited list of sam files.  Optional.  Their mapped coordinates will be masked.
@@ -56,11 +56,14 @@ unpigz=t            Use pigz to decompress.
 Java Parameters:
 -Xmx                This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom               This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.

 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -77,6 +80,7 @@ CP="$DIR""current/"
 z="-Xmx1g"
 z2="-Xms1g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -90,19 +94,32 @@ calcXmx () {
 	if [[ $set == 1 ]]; then
 		return
 	fi
-	freeRam 3200m 42
+	freeRam 3200m 84
 	z="-Xmx${RAM}m"
 	z2="-Xms${RAM}m"
 }
 calcXmx "$@"

 bbmask() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java $EA $z -cp $CP jgi.BBMask $@"
+	local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.BBMask $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbmerge-auto.sh
+++ b/bbmerge-auto.sh
 #!/bin/bash
-#merge in=<infile> out=<outfile>

-function usage(){
+usage(){
 echo "
 bbmerge-auto.sh is a wrapper for BBMerge that attempts to use all available
 memory, instead of a fixed amount.  This is for use with the Tadpole options
@@ -13,6 +12,7 @@ For information about usage and parameters, please run bbmerge.sh.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -30,6 +30,7 @@ NATIVELIBDIR="$DIR""jni/"
 z="-Xmx14g"
 z2="-Xms14g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -50,12 +51,26 @@ calcXmx () {
 calcXmx "$@"

 function merge() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBMerge $@"
+	#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBMerge $@"
+	local CMD="java $EA $z $z2 -cp $CP jgi.BBMerge $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbmerge.sh
+++ b/bbmerge.sh
 #!/bin/bash
-#merge in=<infile> out=<outfile>

-function usage(){
+usage(){
 echo "
-BBMerge v8.82
 Written by Brian Bushnell and Jonathan Rood
-Last modified October 27, 2015
+Last modified August 23, 2018

 Description:  Merges paired reads into single reads by overlap detection.
-With sufficient coverage, can also merge nonoverlapping reads by kmer extension.
+With sufficient coverage, can merge nonoverlapping reads by kmer extension.
+Kmer modes (Tadpole or Bloom Filter) require much more memory, and should
+be used with the bbmerge-auto.sh script rather than bbmerge.sh.
+Please read bbmap/docs/guides/BBMergeGuide.txt for more information.

 Usage for interleaved files:	bbmerge.sh in=<reads> out=<merged reads> outu=<unmerged reads>
 Usage for paired files:     	bbmerge.sh in1=<read1> in2=<read2> out=<merged reads> outu1=<unmerged1> outu2=<unmerged2>

-Input may be stdin or a fasta, fastq, or scarf file, raw or gzipped.
-
+Input may be stdin or a file, fasta or fastq, raw or gzipped.

 Input parameters:
 in=null              Primary input. 'in2' will specify a second file.
@@ -22,7 +22,6 @@ interleaved=auto     May be set to true or false to override autodetection of
                     whether the input file as interleaved.
 reads=-1             Quit after this many read pairs (-1 means all).

-
 Output parameters:
 out=<file>           File for merged reads. 'out2' will specify a second file.
 outu=<file>          File for unmerged reads. 'outu2' will specify a second file.
@@ -38,7 +37,6 @@ ordered=f            Output reads in same order as input.
 mix=f                Output both the merged (or mergable) and unmerged reads
                     in the same file (out=).  Useful for ecco mode.

-
 Trimming/Filtering parameters:
 qtrim=f              Trim read ends to remove bases with quality below minq.
                     Trims BEFORE merging.
@@ -67,14 +65,21 @@ forcetrimright=0     (ftr) If nonzero, trim right bases of the read
 forcetrimright2=0    (ftr2) If positive, trim this many bases on the right end.
 forcetrimmod=5       (ftm) If positive, trim length to be equal to 
                     zero modulo this number.
-
+ooi=f                Output only incorrectly merged reads, for testing.
+trimpolya=t          Trim trailing poly-A tail from adapter output.  Only 
+                     affects outadapter.  This also trims poly-A followed
+                     by poly-G, which occurs on NextSeq.

 Processing Parameters:
 usejni=f             (jni) Do overlapping in C code, which is faster.  Requires
                     compiling the C code; details are in /jni/README.txt.
+                     However, the jni path is currently disabled.
 merge=t              Create merged reads.  If set to false, you can still 
                     generate an insert histogram.
 ecco=f               Error-correct the overlapping part, but don't merge.
+trimnonoverlapping=f (tno) Trim all non-overlapping portions, leaving only
+                     consensus sequence.  By default, only sequence to the 
+                     right of the overlap (adapter sequence) is trimmed.
 useoverlap=t         Attempt find the insert size using read overlap.
 mininsert=35         Minimum insert size to merge reads.
 mininsert0=35        Insert sizes less than this will not be considered.
@@ -87,44 +92,49 @@ maxq=41              Cap output quality scores at this.
 entropy=t            Increase the minimum overlap requirement for low-
                     complexity reads.
 efilter=6            Ban overlaps with over this many times the expected 
-                     number of errors.  Lower is more strict.
-pfilter=0.00002      Ban improbable overlaps.  Higher is more strict. 0 will
+                     number of errors.  Lower is more strict. -1 disables.
+pfilter=0.00004      Ban improbable overlaps.  Higher is more strict. 0 will
                     disable the filter; 1 will allow only perfect overlaps.
 kfilter=0            Ban overlaps that create kmers with count below
-                     this value (0 disables).  Does not seem to help.
-lowercase=f          Expect lowercase letters to signify adapter sequence.
+                     this value (0 disables).  If this is used minprob should
+                     probably be set to 0.  Requires good coverage.
 ouq=f                Calculate best overlap using quality values.
 owq=t                Calculate best overlap without using quality values.
 usequality=t         If disabled, quality values are completely ignored,
                     both for overlap detection and filtering.  May be useful
                     for data with inaccurate quality values.
 iupacton=f           (itn) Change ambiguous IUPAC symbols to N.
+adapter=             Specify the adapter sequences used for these reads, if
+                     known; this can be a fasta file or a literal sequence.
+                     Read 1 and 2 can have adapters specified independently
+                     with the adapter1 and adapter2 flags.  adapter=default
+                     will use a list of common adapter sequences.

+Ratio Mode: 
+ratiomode=t          Score overlaps based on the ratio of matching to 
+                     mismatching bases.
+maxratio=0.09        Max error rate; higher increases merge rate.
+ratiomargin=5.5      Lower increases merge rate; min is 1.
+ratiooffset=0.55     Lower increases merge rate; min is 0.
+ratiominoverlapreduction=3  This is the difference between minoverlap in 
+                     flat mode and minoverlap in ratio mode; generally, 
+                     minoverlap should be lower in ratio mode.
+minsecondratio=0.1   Cutoff for second-best overlap ratio.

-Normal Mode: 
-normalmode=f         Original BBMerge algorithm.  Faster, but lower overall
-                     merge rate.
+Flat Mode: 
+flatmode=f           Score overlaps based on the total number of mismatching
+                     bases only.
 margin=2             The best overlap must have at least 'margin' fewer 
                     mismatches than the second best.
 mismatches=3         Do not allow more than this many mismatches.
-requireratiomatch=f  (rrm) Require the answer from normal mode and ratio mode
+requireratiomatch=f  (rrm) Require the answer from flat mode and ratio mode
                     to agree, reducing false positives if both are enabled.
 trimonfailure=t      (tof) If detecting insert size by overlap fails,
                     the reads will be trimmed and this will be re-attempted.


-Ratio Mode: 
-ratiomode=t          Newer algorithm.  Slower, but higher merge rate.
-                     Much better for long overlaps and high error rates.
-maxratio=0.09        Max error rate; higher increases merge rate.
-ratiomargin=5.5      Lower increases merge rate; min is 1.
-ratiooffset=0.55     Lower increases merge rate; min is 0.
-ratiominoverlapreduction=3  This is the difference between minoverlap in 
-                     normal mode and minoverlap in ratio mode; generally, 
-                     minoverlap should be lower in ratio mode.
-
-*** Ratio Mode and Normal Mode may be used alone or simultaneously. ***
-*** Ratio Mode is much more accurate and is now the default mode. ***
+*** Ratio Mode and Flat Mode may be used alone or simultaneously. ***
+*** Ratio Mode is usually more accurate and is the default mode. ***


 Strictness (these are mutually exclusive macros that set other parameters):
@@ -138,15 +148,29 @@ ultraloose=f         (uloose) Increase FP and merging rate even more.
 maxloose=f           (xloose) Maximally decrease FP and merging rate.
 fast=f               Fastest possible mode; less accurate.

-
 Tadpole Parameters (for read extension and error-correction):
+*Note: These require more memory and should be run with bbmerge-auto.sh.*
+k=31                 Kmer length.  31 (or less) is fastest and uses the least
+                     memory, but higher values may be more accurate.  
+                     60 tends to work well for 150bp reads.
 extend=0             Extend reads to the right this much before merging.
                     Requires sufficient (>5x) kmer coverage.
-extend2=0            Extend reads only after a failed merge attempt.
+extend2=0            Extend reads this much only after a failed merge attempt,
+                     or in rem/rsem mode.
 iterations=1         (ei) Iteratively attempt to extend by extend2 distance
                     and merge up to this many times.
+rem=f                (requireextensionmatch) Do not merge if the predicted
+                     insert size differs before and after extension.
+                     However, if only the extended reads overlap, then that
+                     insert will be used.  Requires setting extend2.
+rsem=f               (requirestrictextensionmatch) Similar to rem but stricter.
+                     Reads will only merge if the predicted insert size before
+                     and after extension match.  Requires setting extend2.
+                     Enables the lowest possible false-positive rate.
 ecctadpole=f         (ecct) If reads fail to merge, error-correct with Tadpole
                     and try again.  This happens prior to extend2.
+reassemble=t         If ecct is enabled, use Tadpole's reassemble mode for 
+                     error correction.  Alternatives are pincer and tail.
 removedeadends       (shave) Remove kmers leading to dead ends.
 removebubbles        (rinse) Remove kmers in error bubbles.
 mindepthseed=3       (mds) Minimum kmer depth to begin extension.
@@ -157,7 +181,6 @@ branchlower=3        Max value of 2nd-greatest path depth to be considered low.
 ibb=t                Ignore backward branches when extending.
 extra=<file>         A file or comma-delimited list of files of reads to use
                     for kmer counting, but not for merging or output.
-k=31                 Kmer length (1-31 is fastest).
 prealloc=f           Pre-allocate memory rather than dynamically growing; 
                     faster and more memory-efficient for large datasets.  
                     A float fraction (0-1) may be specified, default 1.
@@ -166,17 +189,33 @@ prefilter=0          If set to a positive integer, use a countmin sketch to
                     reduce memory usage.
 minprob=0.5          Ignore kmers with overall probability of correctness 
                     below this, to reduce memory usage.
+minapproxoverlap=26  For rem mode, do not merge reads if the extended reads
+                     indicate that the raw reads should have overlapped by
+                     at least this much, but no overlap was found.
+

+Bloom Filter Parameters (for kmer operations with less memory than Tadpole)
+*Note: These require more memory and should be run with bbmerge-auto.sh.*
+eccbloom=f           (eccb) If reads fail to merge, error-correct with bbcms
+                     and try again.
+testmerge=f          Test kmer counts around the read merge junctions.  If
+                     it appears that the merge created new errors, undo it.
+                     This reduces the false-positive rate, but not as much as
+                     rem or rsem.

 Java Parameters:
 -Xmx                 This will be passed to Java to set memory usage, 
                     overriding the program's automatic memory detection.
                     For example, -Xmx400m will specify 400 MB RAM.
+-eoom                This flag will cause the process to exit if an 
+                     out-of-memory exception occurs.  Requires Java 8u92+.
+-da                  Disable assertions.

 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -192,7 +231,9 @@ CP="$DIR""current/"
 NATIVELIBDIR="$DIR""jni/"

 z="-Xmx1000m"
+z2="-Xms1000m"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -207,12 +248,26 @@ calcXmx () {
 calcXmx "$@"

 function merge() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z -cp $CP jgi.BBMerge $@"
+	#local CMD="java -Djava.library.path=$NATIVELIBDIR $EA $z $z2 -cp $CP jgi.BBMerge $@"
+	local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.BBMerge $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbnorm.sh
+++ b/bbnorm.sh
 #!/bin/bash
-#bbnorm in=<infile> out=<outfile>

 usage(){
 echo "
 Written by Brian Bushnell
-Last modified December 14, 2015
+Last modified October 19, 2017

 Description:  Normalizes read depth based on kmer counts.
 Can also error-correct, bin reads by kmer depth, and generate a kmer depth histogram.
+However, Tadpole has superior error-correction to BBNorm.
+Please read bbmap/docs/guides/BBNormGuide.txt for more information.

 Usage:     bbnorm.sh in=<input> out=<reads to keep> outt=<reads to toss> hist=<histogram output>

-Input may be fasta or fastq, compressed or uncompressed.
-'out' and 'hist' are both optional.
-
-
-Optional parameters (and their defaults)
-
 Input parameters:
 in=null             Primary input.  Use in2 for paired reads in a second file
 in2=null            Second input file for paired reads in two files
@@ -80,7 +75,7 @@ highthresh=12       (ht) Threshold for high kmer.  A high kmer at this or above
 lowthresh=3         (lt) Threshold for low kmer.  Kmers at this and below are always considered errors.

 Error correction parameters:
-ecc=f               Set to true to correct errors.
+ecc=f               Set to true to correct errors.  NOTE: Tadpole is now preferred for ecc as it does a better job.
 ecclimit=3          Correct up to this many errors per read.  If more are detected, the read will remain unchanged.
 errorcorrectratio=140  (ecr) Adjacent kmers with a depth ratio of at least this much between will be classified as an error.
 echighthresh=22     (echt) Threshold for high kmer.  A kmer at this or above may be considered non-error.
@@ -109,8 +104,8 @@ histlen=1048576     Max kmer depth displayed in histogram.  Also affects statist
 Peak calling parameters:
 peaks=<file>        Write the peaks to this file.  Default is stdout.
 minHeight=2         (h) Ignore peaks shorter than this.
-minVolume=2         (v) Ignore peaks with less area than this.
-minWidth=2          (w) Ignore peaks narrower than this.
+minVolume=5         (v) Ignore peaks with less area than this.
+minWidth=3          (w) Ignore peaks narrower than this.
 minPeak=2           (minp) Ignore peaks with an X-value below this.
 maxPeak=BIG         (maxp) Ignore peaks with an X-value above this.
 maxPeakCount=8      (maxpc) Print up to this many peaks (prioritizing height).
@@ -118,11 +113,14 @@ maxPeakCount=8      (maxpc) Print up to this many peaks (prioritizing height).
 Java Parameters:
 -Xmx                This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom               This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da                 Disable assertions.

 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
 "
 }

+#This block allows symlinked shellscripts to correctly set classpath.
 pushd . > /dev/null
 DIR="${BASH_SOURCE[0]}"
 while [ -h "$DIR" ]; do
@@ -139,6 +137,7 @@ CP="$DIR""current/"
 z="-Xmx31g"
 z2="-Xms31g"
 EA="-ea"
+EOOM=""
 set=0

 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
@@ -159,12 +158,25 @@ calcXmx () {
 calcXmx "$@"

 normalize() {
-	if [[ $NERSC_HOST == genepool ]]; then
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
 		module unload oracle-jdk
-		module load oracle-jdk/1.7_64bit
+		module load oracle-jdk/1.8_144_64bit
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
 		module load pigz
 	fi
-	local CMD="java $EA $z $z2 -cp $CP jgi.KmerNormalize bits=32 $@"
+	local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.KmerNormalize bits=32 $@"
 	echo $CMD >&2
 	eval $CMD
 }

--- a/bbrealign.sh
+++ b/bbrealign.sh
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified April 26, 2017
+
+Description:  Realigns mapped reads to a reference.
+
+Usage:  bbrealign.sh in=<file> ref=<file> out=<file>
+
+Input may be a sorted or unsorted sam or bam file.
+The reference should be fasta.
+
+I/O parameters:
+in=<file>       Input reads.
+out=<file>      Output reads.
+ref=<file>      Reference fasta.
+overwrite=f     (ow) Set to false to force the program to abort rather than
+                overwrite an existing file.
+
+Trimming parameters:
+border=0        Trim at least this many bases on both ends of reads.
+qtrim=r         Quality-trim reads on this end
+                   r: right, l: left, rl: both, f: don't quality-trim.
+trimq=10        Quality-trim bases below this score.
+
+Realignment parameters:
+unclip=f        Convert clip symbols from exceeding the ends of the
+                realignment zone into matches and substitutitions.
+repadding=70    Pad alignment by this much on each end.  Typically,
+                longer is more accurate for long indels, but greatly
+                reduces speed.
+rerows=602      Use this many rows maximum for realignment.  Reads longer
+                than this cannot be realigned.
+recols=2000     Reads may not be aligned to reference seqments longer 
+                than this.  Needs to be at least read length plus
+                max deletion length plus twice padding.
+msa=            Select the aligner.  Options:
+                   MultiStateAligner11ts:     Default.
+                   MultiStateAligner9PacBio:  Use for PacBio reads, or for
+                   Illumina reads mapped to PacBio/Nanopore reads.
+
+Sam-filtering parameters:
+minpos=         Ignore alignments not overlapping this range.
+maxpos=         Ignore alignments not overlapping this range.
+minreadmapq=4   Ignore alignments with lower mapq.
+contigs=        Comma-delimited list of contig names to include. These 
+                should have no spaces, or underscores instead of spaces.
+secondary=f     Include secondary alignments.
+supplimentary=f Include supplimentary alignments.
+invert=f        Invert sam filters.
+
+Java Parameters:
+-Xmx            This will be passed to Java to set memory usage, overriding the program's automatic memory detection.
+                -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  The max is typically 85% of physical memory.
+-eoom           This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
+-da             Disable assertions.
+
+Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
+"
+}
+
+#This block allows symlinked shellscripts to correctly set classpath.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx4g"
+z2="-Xms4g"
+EA="-ea"
+EOOM=""
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+	usage
+	exit
+fi
+
+calcXmx () {
+	source "$DIR""/calcmem.sh"
+	parseXmx "$@"
+	if [[ $set == 1 ]]; then
+		return
+	fi
+	freeRam 4000m 84
+	z="-Xmx${RAM}m"
+	z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+bbrealign() {
+	if [[ $SHIFTER_RUNTIME == 1 ]]; then
+		#Ignore NERSC_HOST
+		shifter=1
+	elif [[ $NERSC_HOST == genepool ]]; then
+		module unload oracle-jdk
+		module load oracle-jdk/1.8_144_64bit
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == denovo ]]; then
+		module unload java
+		module load java/1.8.0_144
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
+		module load pigz
+	elif [[ $NERSC_HOST == cori ]]; then
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/jgi
+		module use /global/common/software/m342/nersc-builds/denovo/Modules/usg
+		module unload java
+		module load java/1.8.0_144
+		module unload PrgEnv-intel
+		module load PrgEnv-gnu/7.1
+		module load samtools/1.4
+		module load pigz
+	fi
+	local CMD="java $EA $EOOM $z $z2 -cp $CP var2.Realign $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+bbrealign "$@"
--- a/bbrename.sh
+++ b/bbrename.sh
+#!/bin/bash
+
+#For more information, please see rename.sh
+#This exists for people who type bbrename.sh instead of rename.sh
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+
+"$DIR"rename.sh $@
--- a/bbsketch.sh
+++ b/bbsketch.sh
+#!/bin/bash
+
+#For more information, please see sketch.sh
+#This exists for people who type bbsketch.sh instead of sketch.sh
+#I haven't decided which one will be the canonical version.
+
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+
+"$DIR"sketch.sh $@