Commits on Source (4)
*.pyc
*.idx
*.swo
*.swp
bcbio/pipeline/version.py
bcbio_nextgen.egg-info/
build/
......
## 1.1.4 (3 April 2019)
- Move to Python 3.6. A python2 environment in the install runs programs that
  are not yet python3 compatible. The codebase remains compatible with python 2.7
  but will only be run and tested on python 3 for future releases.
- RNA-seq: fix a race condition when creating the pizzly cache.
- RNA-seq: Add Salmon to multiqc report.
- RNA-seq single-cell/DGE: Properly strip transcript versions from GENCODE GTFs.
- RNA-seq: Faster and more flexible rRNA biotype lookup.
- Move to R3.5.1, including updates to all CRAN and Bioconductor packages.
- tumor-only germline prioritization: provide more useful germline filtering
  based on the prioritization INFO tag (EPR) rather than the FILTER field.
- Install: do not require fabric for tool and data installs, making the full
  codebase compatible with python 3.
- variant: Filter out variants with missing ALT alleles output by GATK4.
- GATK: enable specification of spark specific parameters with `gatk-spark`
resources.
- RNA-seq single-cell/DGE: added `demultiplexed` option. If set to True, treat the
data as if it has already been demultiplexed into cells/wells.
- Multiple orders of magnitude faster templating with thousands of input files.
## 1.1.3 (29 January 2019)
- CNV: support background inputs for CNVkit, GATK4 CNV and seq2c. Allows
......
......@@ -3,7 +3,6 @@
from __future__ import print_function
import collections
import os
import itertools
import signal
import subprocess
import numpy
......@@ -11,6 +10,7 @@ import numpy
import pybedtools
import pysam
import toolz as tz
from six.moves import zip_longest
from bcbio import broad, utils
from bcbio.bam import ref
......@@ -48,7 +48,7 @@ def is_paired(bam_file):
"""
bam_file = objectstore.cl_input(bam_file)
cmd = ("set -o pipefail; "
"samtools view -h {bam_file} | head -50000 | "
"samtools view -h {bam_file} | head -300000 | "
"samtools view -S -f 1 /dev/stdin | head -1 | wc -l")
p = subprocess.Popen(cmd.format(**locals()), shell=True,
executable=do.find_bash(),
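For context, a rough pysam equivalent of the `is_paired` check above (a sketch, not the pipeline's implementation; `max_reads` mirrors the new 300000 read limit):

```python
import pysam

def looks_paired(bam_file, max_reads=300000):
    """Return True if any of the first max_reads alignments carry the paired flag (0x1)."""
    with pysam.AlignmentFile(bam_file, "rb", check_sq=False) as bam:
        for i, read in enumerate(bam):
            if i >= max_reads:
                break
            if read.is_paired:
                return True
    return False
```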
......@@ -291,7 +291,7 @@ def _check_bam_contigs(in_bam, ref_file, config):
extra_rcs = [x for x in ref_contigs if x not in bam_contigs]
problems = []
warnings = []
for bc, rc in itertools.izip_longest([x for x in bam_contigs if (x not in extra_bcs and
for bc, rc in zip_longest([x for x in bam_contigs if (x not in extra_bcs and
x not in allowed_outoforder)],
[x for x in ref_contigs if (x not in extra_rcs and
x not in allowed_outoforder)]):
......
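A minimal illustration of the `zip_longest` swap above, with made-up contig lists: the shorter list is padded with None, so ordering differences and missing contigs surface as mismatched pairs.

```python
from six.moves import zip_longest

bam_contigs = ["chr1", "chr2", "chr3"]
ref_contigs = ["chr1", "chr2", "chrX", "chr3"]
for bc, rc in zip_longest(bam_contigs, ref_contigs):
    if bc != rc:
        # here ('chr3', 'chrX') and (None, 'chr3') would be reported
        print("contig mismatch: %s vs %s" % (bc, rc))
```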
......@@ -198,8 +198,12 @@ def block_regions(callable_bed, in_bam, ref_file, data):
if len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0:
ref_regions.subtract(tx_nblock_bed, nonamecheck=True).merge(d=min_n_size).saveas(tx_callblock_bed)
else:
raise ValueError("No callable regions found from BAM file. Alignment regions might "
"not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
raise ValueError("No callable regions found in %s from BAM file %s. Some causes:\n "
" - Alignment regions do not overlap with regions found "
"in your `variant_regions` BED: %s\n"
" - There are no aligned reads in your BAM file that pass sanity checks "
" (mapping score > 1, non-duplicates, both ends of paired reads mapped)"
% (dd.get_sample_name(data), in_bam, dd.get_variant_regions(data)))
return callblock_bed, nblock_bed
def _write_bed_regions(data, final_regions, out_file, out_file_ref):
......
......@@ -7,6 +7,8 @@ from itertools import product
import os
import random
import sys
import toolz as tz
from collections import defaultdict
from Bio import SeqIO
from bcbio.distributed import objectstore
......@@ -115,6 +117,8 @@ def combine_pairs(input_files, force_single=False, full_name=False, separators=N
Adjusted to allow different input paths or extensions for matching files.
"""
PAIR_FILE_IDENTIFIERS = set(["1", "2", "3", "4"])
if len(input_files) > 1000:
return fast_combine_pairs(input_files, force_single, full_name, separators)
pairs = []
used = set([])
......@@ -180,6 +184,25 @@ def combine_pairs(input_files, force_single=False, full_name=False, separators=N
used.add(in_file)
return pairs
def fast_combine_pairs(files, force_single, full_name, separators):
"""
Assume files that need to be paired are within 10 entries of each other once the list is sorted.
"""
files = sort_filenames(files)
chunks = tz.sliding_window(10, files)
pairs = [combine_pairs(chunk, force_single, full_name, separators) for chunk in chunks]
pairs = [y for x in pairs for y in x]
longest = defaultdict(list)
# for each file, save the longest pair it is in
for pair in pairs:
for file in pair:
if len(longest[file]) < len(pair):
longest[file] = pair
# keep only unique pairs
longest = {tuple(sort_filenames(x)) for x in longest.values()}
# ensure filenames are R1 followed by R2
return [sort_filenames(list(x)) for x in longest]
def dif(a, b):
""" copy from http://stackoverflow.com/a/8545526 """
return [i for i in range(len(a)) if a[i] != b[i]]
......
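A usage sketch of the pairing entry point (file names are hypothetical). `combine_pairs` only dispatches to `fast_combine_pairs` above 1000 input files; the fast path sorts the names, pairs within a 10-entry sliding window and keeps the longest pair seen for each file.

```python
files = ["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz",
         "sampleB_R1.fastq.gz", "sampleB_R2.fastq.gz"]
pairs = combine_pairs(files)
# expected, assuming standard R1/R2 naming is recognized:
# [["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"],
#  ["sampleB_R1.fastq.gz", "sampleB_R2.fastq.gz"]]
```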
......@@ -72,7 +72,7 @@ def _clean_java_out(version_str):
Java will report information like _JAVA_OPTIONS environmental variables in the output.
"""
out = []
for line in version_str.split("\n"):
for line in version_str.decode().split("\n"):
if line.startswith("Picked up"):
pass
if line.find("setlocale") > 0:
......@@ -95,15 +95,13 @@ def get_gatk_version(gatk_jar=None, config=None):
with closing(subprocess.Popen(cl, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=True).stdout) as stdout:
out = _clean_java_out(stdout.read().strip())
# versions earlier than 2.4 do not have explicit version command,
# parse from error output from GATK
if out.find("ERROR") >= 0:
# Historical GATK version (2.4) and newer versions (4.1.0.0)
# have a flag in front of output version
version = out
flag = "The Genome Analysis Toolkit (GATK)"
for line in out.split("\n"):
if line.startswith(flag):
version = line.split(flag)[-1].split(",")[0].strip()
else:
version = out
if version.startswith("v"):
version = version[1:]
_check_for_bad_version(version, "GATK")
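A worked example of the banner parsing above, assuming a typical GATK4 version line:

```python
out = "The Genome Analysis Toolkit (GATK) v4.1.0.0, Compiled ..."
flag = "The Genome Analysis Toolkit (GATK)"
version = out
for line in out.split("\n"):
    if line.startswith(flag):
        version = line.split(flag)[-1].split(",")[0].strip()  # "v4.1.0.0"
if version.startswith("v"):
    version = version[1:]  # "4.1.0.0"
```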
......@@ -546,7 +544,7 @@ def gatk_cmd(name, jvm_opts, params, config=None):
if not gatk_cmd:
gatk_cmd = utils.which(name)
if gatk_cmd:
return "%s && export PATH=%s:$PATH && %s %s %s" % \
return "%s && export PATH=%s:\"$PATH\" && %s %s %s" % \
(utils.clear_java_home(), utils.get_java_binpath(gatk_cmd), gatk_cmd,
" ".join(jvm_opts), " ".join([str(x) for x in params]))
......@@ -554,7 +552,7 @@ def _gatk4_cmd(jvm_opts, params, data):
"""Retrieve unified command for GATK4, using 'gatk'. GATK3 is 'gatk3'.
"""
gatk_cmd = utils.which(os.path.join(os.path.dirname(os.path.realpath(sys.executable)), "gatk"))
return "%s && export PATH=%s:$PATH && gatk --java-options '%s' %s" % \
return "%s && export PATH=%s:\"$PATH\" && gatk --java-options '%s' %s" % \
(utils.clear_java_home(), utils.get_java_binpath(gatk_cmd),
" ".join(jvm_opts), " ".join([str(x) for x in params]))
......@@ -565,7 +563,7 @@ class PicardCmdRunner:
def run(self, subcmd, opts, memscale=None):
jvm_opts = get_picard_opts(self._config, memscale=memscale)
cmd = ["export", "PATH=%s:$PATH" % utils.get_java_binpath(), "&&"] + \
cmd = ["export", "PATH=%s:\"$PATH\"" % utils.get_java_binpath(), "&&"] + \
[self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \
["VALIDATION_STRINGENCY=SILENT"]
do.run(utils.clear_java_home() + " && " + " ".join(cmd), "Picard: %s" % subcmd)
......
......@@ -69,32 +69,23 @@ def _prepare_bam(bam_file, bed_file, config):
do.run(cmd.format(**locals()), "Remove blacklist regions from %s" % bam_file)
return out_file
def get_genome(data):
"""
get the effective length of the genome, falling back to the length of the genome
if the effective length is not precomputed
"""
from bcbio.chipseq import macs2
from bcbio.bam import fasta
genome = dd.get_genome_build(data)
loaded = macs2.HS
if genome in loaded:
return loaded[genome]
else:
return sum([x for x in fasta.sequence_length(dd.get_ref_file(data)).values()])
def _bam_coverage(name, bam_input, data):
"""Run bamCoverage from deeptools"""
cmd = ("{bam_coverage} -b {bam_input} -o {bw_output} "
"--binSize 20 --effectiveGenomeSize {size} "
"--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
size = int(get_genome(data))
size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
cores = dd.get_num_cores(data)
try:
bam_coverage = config_utils.get_program("bamCoverage", data)
except config_utils.CmdNotFound:
logger.info("No bamCoverage found, skipping bamCoverage.")
return None
resources = config_utils.get_resources("bamCoverage", data["config"])
if resources:
options = resources.get("options")
if options:
cmd += " %s" % " ".join([str(x) for x in options])
bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
if utils.file_exists(bw_output):
return bw_output
......
......@@ -8,12 +8,6 @@ from bcbio.pipeline import config_utils
from bcbio.pipeline import datadict as dd
from bcbio import bam
HS = {"hg19": 2.7e9,
"GRCh37": 2.7e9,
"hg38": 2.7e9,
"mm10": 1.87e9,
"dm3": 1.2e8}
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
"""
Run macs2 for chip and input samples avoiding
......@@ -28,7 +22,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, dat
return _get_output_files(out_dir)
macs2 = config_utils.get_program("macs2", config)
options = " ".join(resources.get("macs2", {}).get("options", ""))
genome_size = HS.get(genome_build, bam.fasta.total_sequence_length(dd.get_ref_file(data)))
genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
with utils.chdir(out_dir):
......
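Both the deeptools `bamCoverage` call and the macs2 genome size above now derive the genome size from the reference FASTA rather than the hard-coded `HS` table. A hypothetical sketch of what `bam.fasta.total_sequence_length` computes (the real helper lives in `bcbio.bam.fasta`):

```python
import pysam

def total_sequence_length(fasta_file):
    # Sum of all contig lengths in the indexed reference FASTA.
    fa = pysam.FastaFile(fasta_file)
    try:
        return sum(fa.lengths)
    finally:
        fa.close()
```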
......@@ -490,7 +490,7 @@ def _flatten_samples(samples, base_file, get_retriever):
cur_flat[flat_key] = flat_val
flat_data.append(cur_flat)
out = {}
for key in sorted(list(set(reduce(operator.add, [d.keys() for d in flat_data])))):
for key in sorted(list(set(reduce(operator.add, [list(d.keys()) for d in flat_data])))):
# Periods in keys cause issues with WDL and some CWL implementations
clean_key = key.replace(".", "_")
out[clean_key] = []
......
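Why the `list(d.keys())` wrapping is needed on Python 3 (a minimal illustration with made-up dictionaries):

```python
import operator
from functools import reduce

flat_data = [{"a": 1, "b": 2}, {"b": 3, "c": 4}]
# Python 3 dict.keys() returns a view object that does not support "+", so
# reducing over the raw views raises TypeError; wrapping each in list()
# restores the Python 2 behaviour.
keys = sorted(set(reduce(operator.add, [list(d.keys()) for d in flat_data])))
# -> ['a', 'b', 'c']
```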
......@@ -99,12 +99,13 @@ def _alignment(checkpoints):
cwlout(["config", "algorithm", "quality_format"], ["string", "null"]),
cwlout(["align_split"], ["string", "null"])])],
"bcbio-vc", ["grabix", "htslib", "biobambam", "atropos;env=python3",
"optitype", "razers3=3.5.0", "coincbc"], # HLA deps for general docker inclusion
"optitype;env=python2", "razers3=3.5.0", "coincbc"], # HLA deps for general docker inclusion
disk={"files": 1.5}),
s("process_alignment", "single-parallel" if checkpoints["align_split"] else "single-single",
[["alignment_rec"], ["process_alignment_rec"]], process_alignment_out,
"bcbio-vc", ["bwa", "bwakit", "grabix", "minimap2", "novoalign", "snap-aligner=1.0dev.97",
"sentieon", "samtools", "pysam>=0.13.0", "sambamba", "fgbio", "umis",
"sentieon;env=python2", "samtools", "pysam>=0.13.0", "sambamba", "fgbio",
"umis;env=python2",
"biobambam", "seqtk", "samblaster", "variantbam"],
disk={"files": 2})]
if checkpoints["align_split"]:
......@@ -138,7 +139,7 @@ def _variant_hla(checkpoints):
[["hla_rec"]],
[cwlout(["hla", "hlacaller"], ["string", "null"]),
cwlout(["hla", "call_file"], ["File", "null"])],
"bcbio-vc", ["optitype", "razers3=3.5.0", "coincbc"])]
"bcbio-vc", ["optitype;env=python2", "razers3=3.5.0", "coincbc"])]
return hla, [["hla", "call_file"]]
def _variant_vc(checkpoints):
......@@ -156,10 +157,10 @@ def _variant_vc(checkpoints):
[cwlout(["vrn_file_region"], ["File", "null"], [".tbi"]),
cwlout(["region_block"], {"type": "array", "items": "string"})],
"bcbio-vc", ["bcftools", "bedtools", "freebayes=1.1.0.46",
"gatk4", "vqsr_cnn", "deepvariant", "sentieon",
"htslib", "octopus", "picard", "platypus-variant", "pythonpy",
"samtools", "pysam>=0.13.0", "strelka", "vardict", "vardict-java",
"varscan", "moreutils", "vcfanno", "vcflib", "vt", "r=3.4.1", "r-base=3.4.1=h4fe35fd_8",
"gatk4", "vqsr_cnn", "deepvariant;env=dv", "sentieon;env=python2",
"htslib", "octopus", "picard", "platypus-variant;env=python2", "pythonpy",
"samtools", "pysam>=0.13.0", "strelka;env=python2", "vardict", "vardict-java",
"varscan", "moreutils", "vcfanno", "vcflib", "vt", "r=3.5.1", "r-base",
"perl"],
disk={"files": 2.0}),
s("concat_batch_variantcalls", "batch-merge",
......@@ -184,7 +185,7 @@ def _variant_vc(checkpoints):
cwlout(["validate", "fp"], ["File", "null"], [".tbi"]),
cwlout(["validate", "fn"], ["File", "null"], [".tbi"]),
cwlout("inherit", exclude=vc_rec_exclude)])],
"bcbio-vc", ["bcftools", "bedtools", "pythonpy", "gvcf-regions",
"bcbio-vc", ["bcftools", "bedtools", "pythonpy", "gvcf-regions;env=python2",
"htslib", "rtg-tools", "vcfanno"],
disk={"files": 1.5})]
batch_in = [["analysis"], ["genome_build"], ["align_bam"], ["vrn_file"],
......@@ -221,7 +222,9 @@ def _variant_vc(checkpoints):
batch_in += [["genome_resources", "variation", "train_hapmap"],
["genome_resources", "variation", "train_indels"]]
vc = [s("batch_for_variantcall", "multi-batch", batch_in,
[cwlout("batch_rec", "record")],
[cwlout("batch_rec", "record",
fields=[cwlout(["config", "algorithm", "variantcaller_order"], "int"),
cwlout("inherit")])],
"bcbio-vc",
disk={"files": 2.0}, cores=1,
unlist=[["config", "algorithm", "variantcaller"]], no_files=True),
......@@ -273,7 +276,7 @@ def _variant_jointvc():
s("run_jointvc", "batch-parallel",
[["jointvc_batch_rec"], ["region"]],
[cwlout(["vrn_file_region"], ["File", "null"], [".tbi"]), cwlout(["region"], "string")],
"bcbio-vc", ["gatk4", "gvcfgenotyper", "sentieon"],
"bcbio-vc", ["gatk4", "gvcfgenotyper", "sentieon;env=python2"],
disk={"files": 1.5}, cores=1),
s("concat_batch_variantcalls_jointvc", "batch-merge",
[["jointvc_batch_rec"], ["region"], ["vrn_file_region"]],
......@@ -387,7 +390,7 @@ def _postprocess_alignment(checkpoints):
cwlout(["depth", "coverage", "dist"], ["File", "null"]),
cwlout(["depth", "coverage", "thresholds"], ["File", "null"]),
cwlout(["align_bam"], ["File", "null"])],
"bcbio-vc", ["sambamba", "goleft", "bedtools", "htslib", "gatk4", "mosdepth", "sentieon"],
"bcbio-vc", ["sambamba", "goleft", "bedtools", "htslib", "gatk4", "mosdepth", "sentieon;env=python2"],
disk={"files": 3.0}),
s("combine_sample_regions", "multi-combined",
[["regions", "callable"], ["regions", "nblock"], ["metadata", "batch"],
......@@ -440,7 +443,7 @@ def variant(samples):
else:
align = [s("organize_noalign", "multi-parallel",
["files"],
[cwlout(["align_bam"], "File", [".bai"]),
[cwlout(["align_bam"], ["File", "null"], [".bai"]),
cwlout(["work_bam_plus", "disc"], ["File", "null"]),
cwlout(["work_bam_plus", "sr"], ["File", "null"]),
cwlout(["hla", "fastq"], ["File", "null"])],
......@@ -492,7 +495,7 @@ def _qc_workflow(checkpoints):
cwlout(["config", "algorithm", "qc"])])],
"bcbio-vc", ["bcftools", "bedtools", "fastqc=0.11.7=5", "goleft", "hts-nim-tools", "mosdepth",
"picard", "pythonpy", "qsignature", "qualimap", "sambamba",
"samtools", "preseq", "peddy", "verifybamid2"],
"samtools", "preseq", "peddy;env=python2", "verifybamid2"],
disk={"files": 2.0}),
s("multiqc_summary", "multi-combined",
[["qcout_rec"]],
......@@ -519,11 +522,12 @@ def _variant_sv(checkpoints):
cwlout(["svvalidate", "summary"], ["File", "null"]),
cwlout("inherit", exclude=[["align_bam"], ["work_bam_plus"],
["reference", "snpeff"]])])],
"bcbio-vc", ["bedtools", "cnvkit", "delly", "duphold", "extract-sv-reads",
"lumpy-sv", "manta", "break-point-inspector", "mosdepth", "samtools",
"smoove", "pysam>=0.13.0",
"seq2c", "simple_sv_annotation", "survivor", "svtools", "svtyper",
"r=3.4.1", "r-base=3.4.1=h4fe35fd_8", "xorg-libxt", "vawk"],
"bcbio-vc", ["bedtools", "cnvkit", "delly", "duphold", "extract-sv-reads", "gsort",
"lumpy-sv;env=python2", "manta;env=python2", "break-point-inspector", "mosdepth", "samtools",
"smoove;env=python2", "pysam>=0.13.0",
"seq2c", "simple_sv_annotation;env=python2", "survivor", "svtools;env=python2",
"svtyper;env=python2",
"r=3.5.1", "r-base", "xorg-libxt", "vawk;env=python2"],
disk={"files": 2.0})]
sv_batch_inputs = [["analysis"], ["genome_build"],
["work_bam_plus", "disc"], ["work_bam_plus", "sr"],
......@@ -619,7 +623,7 @@ def rnaseq(samples):
align = [s("process_alignment", "multi-parallel",
[["trim_rec"]],
[cwlout(["align_bam"], "File", [".bai"])],
"bcbio-rnaseq", ["star", "hisat2", "tophat", "samtools",
"bcbio-rnaseq", ["star", "hisat2", "tophat;env=python2", "samtools",
"sambamba", "seqtk"],
{"files": 1.5})]
if checkpoints.get("vc"):
......@@ -633,7 +637,7 @@ def rnaseq(samples):
cwlout(["quant", "hdf5"], "File"),
cwlout(["quant", "fusion"], "File")],
"bcbio-rnaseq", programs=["sailfish", "salmon", "kallisto>=0.43.1", "subread", "gffread",
"r=3.4.1", "r-base=3.4.1=h4fe35fd_8", "xorg-libxt", "r-wasabi"],
"r=3.5.1", "r-base", "xorg-libxt", "r-wasabi"],
disk={"files": 0.5})]
qc = [s("qc_to_rec", "multi-combined",
[["align_bam"], ["analysis"], ["reference", "fasta", "base"], dd.get_keys("gtf_file"),
......
......@@ -78,7 +78,7 @@ def _args_to_cromwell(args):
"""Convert input arguments into cromwell inputs for config and command line.
"""
default_config = {"slurm": {"timelimit": "1-00:00", "account": ""},
"sge": {"memtype": "mem_type", "pename": "smp"},
"sge": {"memtype": "mem_free", "pename": "smp"},
"lsf": {"walltime": "24:00", "account": ""},
"htcondor": {},
"torque": {"walltime": "24:00:00", "account": ""},
......@@ -340,7 +340,7 @@ HPC_CONFIGS = {
Int cpu = 1
Int memory_mb = 2048
String queue = "%(queue)s"
String pename = "%(pename}s"
String pename = "%(pename)s"
String memtype = "%(memtype)s"
%(docker_attrs)s
%(cwl_attrs)s
......@@ -348,8 +348,8 @@ HPC_CONFIGS = {
submit = \"\"\"
qsub -V -w w -j y -N ${job_name} -wd ${cwd} \
-o ${out} -e ${err} -q ${queue} \
-pe ${pename} ${cpu} ${"-l " + mem_type + "=" + memory_mb + "m"} \
/usr/bin/env bash ${script}
-pe ${pename} ${cpu} -l ${memtype}=${memory_mb}m \
${script}
\"\"\"
kill = "qdel ${job_id}"
check-alive = "qstat -j ${job_id}"
......
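The `%(pename}s` fix matters because Python's %-style formatting requires a closing parenthesis in mapped keys; a small illustration:

```python
good = 'String pename = "%(pename)s"' % {"pename": "smp"}
# -> String pename = "smp"

bad = 'String pename = "%(pename}s"'
# bad % {"pename": "smp"} raises ValueError: incomplete format key
```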
......@@ -61,10 +61,15 @@ def _chown_workdir(work_dir):
"""Ensure work directory files owned by original user.
Docker runs can leave root owned files making cleanup difficult.
Skips this if it fails, avoiding errors where we run remotely
and don't have docker locally.
"""
cmd = ("""docker run --rm -v %s:%s quay.io/bcbio/bcbio-base /bin/bash -c 'chown -R %s %s'""" %
(work_dir, work_dir, os.getuid(), work_dir))
try:
subprocess.check_call(cmd, shell=True)
except subprocess.CalledProcessError:
pass
def _remove_bcbiovm_path():
"""Avoid referencing minimal bcbio_nextgen in bcbio_vm installation.
......@@ -153,6 +158,9 @@ def _run_wes(args):
"""
main_file, json_file, project_name = _get_main_and_json(args.directory)
main_file = _pack_cwl(main_file)
if args.host and "stratus" in args.host:
_run_wes_stratus(args, main_file, json_file)
else:
opts = ["--no-wait"]
if args.host:
opts += ["--host", args.host]
......@@ -161,6 +169,54 @@ def _run_wes(args):
cmd = ["wes-client"] + opts + [main_file, json_file]
_run_tool(cmd)
def _run_wes_stratus(args, main_file, json_file):
"""Run WES on Illumina stratus endpoint server, which wes-client doesn't support.
https://stratus-docs.readme.io/docs/quick-start-4
"""
import requests
base_url = args.host
if not base_url.startswith("http"):
base_url = "https://%s" % base_url
with open(main_file) as in_handle:
r = requests.post("%s/v1/workflows" % base_url,
headers={"Content-Type": "application/json",
"Authorization": "Bearer %s" % args.auth},
data=in_handle.read())
print(r.status_code)
print(r.text)
def _estimate_runner_memory(json_file):
"""Estimate Java memory requirements based on number of samples.
A rough approach to selecting correct allocated memory for Cromwell.
"""
with open(json_file) as in_handle:
sinfo = json.load(in_handle)
num_parallel = 1
for key in ["config__algorithm__variantcaller", "description"]:
item_counts = []
n = 0
for val in (sinfo.get(key) or []):
n += 1
if val:
if isinstance(val, (list, tuple)):
item_counts.append(len(val))
else:
item_counts.append(1)
print(key, n, item_counts)
if n and item_counts:
num_parallel = n * max(item_counts)
break
if num_parallel < 25:
return "3g"
if num_parallel < 150:
return "6g"
elif num_parallel < 500:
return "12g"
else:
return "24g"
def _run_cromwell(args):
"""Run CWL with Cromwell.
"""
......@@ -177,7 +233,8 @@ def _run_cromwell(args):
with open(option_file, "w") as out_handle:
json.dump(cromwell_opts, out_handle)
cmd = ["cromwell", "-Xms1g", "-Xmx3g", "run", "--type", "CWL",
cmd = ["cromwell", "-Xms1g", "-Xmx%s" % _estimate_runner_memory(json_file),
"run", "--type", "CWL",
"-Dconfig.file=%s" % hpc.create_cromwell_config(args, work_dir, json_file)]
cmd += hpc.args_to_cromwell_cl(args)
cmd += ["--metadata-output", metadata_file, "--options", option_file,
......
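A worked example of the `_estimate_runner_memory` thresholds used for the Cromwell `-Xmx` flag above (sample and caller counts are hypothetical); `num_parallel` is the number of samples times the largest per-sample variant caller list:

```python
# 8 samples, 1 caller each    -> num_parallel = 8   -> "3g"
# 40 samples, 2 callers each  -> num_parallel = 80  -> "6g"
# 100 samples, 3 callers each -> num_parallel = 300 -> "12g"
# 600 samples, 1 caller each  -> num_parallel = 600 -> "24g"
```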
......@@ -154,7 +154,7 @@ def is_cwl_record(d):
if d.get("type") == "record":
return d
else:
recs = filter(lambda x: x is not None, [is_cwl_record(v) for v in d.values()])
recs = list(filter(lambda x: x is not None, [is_cwl_record(v) for v in d.values()]))
return recs[0] if recs else None
else:
return None
......@@ -254,7 +254,7 @@ def _nest_variable(v, check_records=False):
check_records -- avoid re-nesting a record input if it comes from a previous
step and is already nested, don't need to re-array.
"""
if (check_records and is_cwl_record(v) and v["id"].split("/") > 1 and
if (check_records and is_cwl_record(v) and len(v["id"].split("/")) > 1 and
v.get("type", {}).get("type") == "array"):
return v
else:
......
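Why `len()` was added in `_nest_variable`: Python 2 silently allowed comparing a list with an int (numeric types order before other types, so the test was always True), while Python 3 raises; a small illustration with a hypothetical CWL id:

```python
parts = "process_alignment/align_bam".split("/")  # ['process_alignment', 'align_bam']
# Python 2: `parts > 1` is always True (lists order after ints)
# Python 3: `parts > 1` raises TypeError
is_from_previous_step = len(parts) > 1  # the intended check: True
```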
......@@ -116,7 +116,7 @@ def _get_record_attrs(out_keys):
"""Check for records, a single key plus output attributes.
"""
if len(out_keys) == 1:
attr = out_keys.keys()[0]
attr = list(out_keys.keys())[0]
if out_keys[attr]:
return attr, out_keys[attr]
return None, None
......
......@@ -33,6 +33,9 @@ def run(data):
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
dd.get_sample_name(data), "hla",
"OptiType-HLA-A_B_C"))
# When running UMIs and hla typing we want to pick the original fastqs
if len(hlas) > len(SUPPORTED_HLAS):
hlas = [x for x in hlas if os.path.basename(x[1]).find("-cumi") == -1]
if len(hlas) == len(SUPPORTED_HLAS):
hla_fq = combine_hla_fqs(hlas, out_dir + "-input.fq", data)
if utils.file_exists(hla_fq):
......
......@@ -39,7 +39,7 @@ REMOTES = {
SUPPORTED_GENOMES = ["GRCh37", "hg19", "hg38", "hg38-noalt", "mm10", "mm9",
"rn6", "rn5", "canFam3", "dm3", "galGal4", "phix",
"pseudomonas_aeruginosa_ucbpp_pa14", "sacCer3", "TAIR10",
"WBcel235", "xenTro3", "GRCz10", "GRCz11", "Sscrofa11.1"]
"WBcel235", "xenTro3", "GRCz10", "GRCz11", "Sscrofa11.1", "BDGP6"]
TARBALL_DIRECTORIES = ["bwa", "rtg", "hisat2"]
SUPPORTED_INDEXES = TARBALL_DIRECTORIES + ["bbmap", "bowtie", "bowtie2", "minimap2", "novoalign", "twobit",
"snap", "star", "seq"]
......@@ -191,10 +191,10 @@ def _install_container_bcbio_system(datadir):
expose_file = os.path.join(datadir, "galaxy", "bcbio_system.yaml")
expose = set(["memory", "cores", "jvm_opts"])
with open(base_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
if os.path.exists(expose_file):
with open(expose_file) as in_handle:
expose_config = yaml.load(in_handle)
expose_config = yaml.safe_load(in_handle)
else:
expose_config = {"resources": {}}
for pname, vals in config["resources"].items():
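All of the `yaml.load` calls in this file move to `yaml.safe_load`; a brief sketch of the difference (the file name is illustrative):

```python
import yaml

with open("bcbio_system.yaml") as in_handle:
    # safe_load only constructs plain Python objects (dicts, lists, strings, ...),
    # avoiding arbitrary object construction and the PyYAML warning emitted by
    # yaml.load() when called without an explicit Loader.
    config = yaml.safe_load(in_handle)
```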
......@@ -214,21 +214,6 @@ def _get_conda_bin():
if os.path.exists(conda_bin):
return conda_bin
def _default_deploy_args(args):
"""Standard install arguments for CloudBioLinux.
Avoid using sudo and keep an installation isolated if running as the root user.
"""
return {"flavor": "ngs_pipeline_minimal",
"vm_provider": "novm",
"hostname": "localhost",
"fabricrc_overrides": {"edition": "minimal",
"use_sudo": False,
"keep_isolated": args.isolate or os.geteuid() == 0,
"conda_cmd": _get_conda_bin(),
"distribution": args.distribution or "__auto__",
"dist_name": "__auto__"}}
def _check_for_conda_problems():
"""Identify post-install conda problems and fix.
......@@ -236,7 +221,7 @@ def _check_for_conda_problems():
"""
conda_bin = _get_conda_bin()
channels = _get_conda_channels(conda_bin)
lib_dir = os.path.join(os.path.dirname(conda_bin), os.pardir, "iib")
lib_dir = os.path.join(os.path.dirname(conda_bin), os.pardir, "lib")
for l in ["libgomp.so.1", "libquadmath.so"]:
if not os.path.exists(os.path.join(lib_dir, l)):
subprocess.check_call([conda_bin, "install", "-f", "--yes"] + channels + ["libgcc-ng"])
......@@ -245,12 +230,13 @@ def _update_bcbiovm():
"""Update or install a local bcbiovm install with tools and dependencies.
"""
print("## CWL support with bcbio-vm")
conda_bin, env_name = _add_environment("bcbiovm", "python=2")
python_env = "python=3"
conda_bin, env_name = _add_environment("bcbiovm", python_env)
channels = _get_conda_channels(conda_bin)
base_cmd = [conda_bin, "install", "--yes", "--name", env_name] + channels
subprocess.check_call(base_cmd + ["bcbio-nextgen"])
subprocess.check_call(base_cmd + [python_env, "nomkl", "bcbio-nextgen"])
extra_uptodate = ["cromwell"]
subprocess.check_call(base_cmd + ["bcbio-nextgen-vm"] + extra_uptodate)
subprocess.check_call(base_cmd + [python_env, "bcbio-nextgen-vm"] + extra_uptodate)
def _get_envs(conda_bin):
info = json.loads(subprocess.check_output("{conda_bin} info --envs --json".format(**locals()), shell=True))
......@@ -272,7 +258,7 @@ def _get_conda_channels(conda_bin):
"""
channels = ["bioconda", "conda-forge"]
out = []
config = yaml.load(subprocess.check_output([conda_bin, "config", "--show"]))
config = yaml.safe_load(subprocess.check_output([conda_bin, "config", "--show"]))
for c in channels:
present = False
for orig_c in config.get("channels") or []:
......@@ -346,29 +332,23 @@ def get_gemini_dir(data=None):
def upgrade_bcbio_data(args, remotes):
"""Upgrade required genome data files in place.
"""
from fabric.api import env
if hasattr(args, "datadir") and args.datadir and os.path.exists(args.datadir):
data_dir = args.datadir
else:
data_dir = _get_data_dir()
s = _default_deploy_args(args)
s["actions"] = ["setup_biodata"]
tooldir = args.tooldir or get_defaults().get("tooldir")
if tooldir:
s["fabricrc_overrides"]["system_install"] = tooldir
s["fabricrc_overrides"]["data_files"] = data_dir
s["fabricrc_overrides"]["galaxy_home"] = os.path.join(data_dir, "galaxy")
galaxy_home = os.path.join(data_dir, "galaxy")
cbl = get_cloudbiolinux(remotes)
s["genomes"] = _get_biodata(cbl["biodata"], args)
tool_data_table_conf_file = os.path.join(cbl["dir"], "installed_files", "tool_data_table_conf.xml")
genome_opts = _get_biodata(cbl["biodata"], args)
sys.path.insert(0, cbl["dir"])
env.cores = args.cores
cbl_deploy = __import__("cloudbio.deploy", fromlist=["deploy"])
cbl_deploy.deploy(s)
_upgrade_genome_resources(s["fabricrc_overrides"]["galaxy_home"],
remotes["genome_resources"])
_upgrade_snpeff_data(s["fabricrc_overrides"]["galaxy_home"], args, remotes)
cbl_genomes = __import__("cloudbio.biodata.genomes", fromlist=["genomes"])
cbl_genomes.install_data_local(genome_opts, tooldir, data_dir, galaxy_home, tool_data_table_conf_file,
args.cores, ["ggd", "s3", "raw"])
_upgrade_genome_resources(galaxy_home, remotes["genome_resources"])
_upgrade_snpeff_data(galaxy_home, args, remotes)
if "vep" in args.datatarget:
_upgrade_vep_data(s["fabricrc_overrides"]["galaxy_home"], tooldir)
_upgrade_vep_data(galaxy_home, tooldir)
if "kraken" in args.datatarget:
_install_kraken_db(_get_data_dir(), args)
if args.cwl:
......@@ -404,8 +384,8 @@ def _upgrade_genome_resources(galaxy_dir, base_url):
local_file = os.path.join(os.path.dirname(ref_file), os.path.basename(remote_url))
if os.path.exists(local_file):
with open(local_file) as in_handle:
local_config = yaml.load(in_handle)
remote_config = yaml.load(r.text)
local_config = yaml.safe_load(in_handle)
remote_config = yaml.safe_load(r.text)
needs_update = remote_config["version"] > local_config.get("version", 0)
if needs_update:
shutil.move(local_file, local_file + ".old%s" % local_config.get("version", 0))
......@@ -430,7 +410,7 @@ def _upgrade_snpeff_data(galaxy_dir, args, remotes):
resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
if os.path.exists(resource_file):
with open(resource_file) as in_handle:
resources = yaml.load(in_handle)
resources = yaml.safe_load(in_handle)
snpeff_db, snpeff_base_dir = effects.get_db({"genome_resources": resources,
"reference": {"fasta": {"base": ref_file}}})
if snpeff_db:
......@@ -472,7 +452,7 @@ def _get_biodata(base_file, args):
"""Retrieve biodata genome targets customized by install parameters.
"""
with open(base_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
config["install_liftover"] = False
config["genome_indexes"] = args.aligners
ann_groups = config.pop("annotation_groups", {})
......@@ -503,22 +483,15 @@ def upgrade_thirdparty_tools(args, remotes):
Creates a manifest directory with installed programs on the system.
"""
s = {"fabricrc_overrides": {"system_install": args.tooldir,
"local_install": os.path.join(args.tooldir, "local_install"),
"distribution": args.distribution,
"conda_cmd": _get_conda_bin(),
"use_sudo": False,
"edition": "minimal"}}
s = _default_deploy_args(args)
s["actions"] = ["install_biolinux"]
s["fabricrc_overrides"]["system_install"] = args.tooldir
s["fabricrc_overrides"]["local_install"] = os.path.join(args.tooldir, "local_install")
if args.toolconf and os.path.exists(args.toolconf):
s["fabricrc_overrides"]["conda_yaml"] = args.toolconf
cbl = get_cloudbiolinux(remotes)
if args.toolconf and os.path.exists(args.toolconf):
package_yaml = args.toolconf
else:
package_yaml = os.path.join(cbl["dir"], "contrib", "flavor",
"ngs_pipeline_minimal", "packages-conda.yaml")
sys.path.insert(0, cbl["dir"])
cbl_deploy = __import__("cloudbio.deploy", fromlist=["deploy"])
cbl_deploy.deploy(s)
cbl_conda = __import__("cloudbio.package.conda", fromlist=["conda"])
cbl_conda.install_in(_get_conda_bin(), args.tooldir, package_yaml)
manifest_dir = os.path.join(_get_data_dir(), "manifest")
print("Creating manifest of installed packages in %s" % manifest_dir)
cbl_manifest = __import__("cloudbio.manifest", fromlist=["manifest"])
......@@ -571,7 +544,7 @@ def _update_manifest(manifest_file, name, version):
"""
if os.path.exists(manifest_file):
with open(manifest_file) as in_handle:
manifest = yaml.load(in_handle)
manifest = yaml.safe_load(in_handle)
else:
manifest = {}
manifest[name] = {"name": name, "version": version}
......@@ -585,7 +558,7 @@ def _update_system_file(system_file, name, new_kvs):
bak_file = system_file + ".bak%s" % datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
shutil.copyfile(system_file, bak_file)
with open(system_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
else:
utils.safe_makedir(os.path.dirname(system_file))
config = {}
......@@ -661,7 +634,7 @@ def save_install_defaults(args):
return
if utils.file_exists(install_config):
with open(install_config) as in_handle:
cur_config = yaml.load(in_handle)
cur_config = yaml.safe_load(in_handle)
else:
cur_config = {}
if args.tooldir:
......@@ -695,7 +668,7 @@ def add_install_defaults(args):
default_args = {}
else:
with open(install_config) as in_handle:
default_args = yaml.load(in_handle)
default_args = yaml.safe_load(in_handle)
# if we are upgrading to development, also upgrade the tools
if args.upgrade in ["development"] and (args.tooldir or "tooldir" in default_args):
args.tools = True
......@@ -760,7 +733,7 @@ def get_defaults():
if install_config is None or not utils.file_exists(install_config):
return {}
with open(install_config) as in_handle:
return yaml.load(in_handle)
return yaml.safe_load(in_handle)
def _check_toolplus(x):
"""Parse options for adding non-standard/commercial tools like GATK and MuTecT.
......
......@@ -149,7 +149,9 @@ def _sam_to_grouped_umi_cl(data, umi_consensus, tx_out_file):
cmd += "fgbio {jvm_opts} AnnotateBamWithUmis -i /dev/stdin -f {umi_consensus} -o {tx_out_file}"
# UMIs embedded in read name
else:
cmd += "umis bamtag - | samtools view -b > {tx_out_file}"
cmd += ("%s %s bamtag - | samtools view -b > {tx_out_file}" %
(utils.get_program_python("umis"),
config_utils.get_program("umis", data["config"])))
return cmd.format(**locals())
def _get_fgbio_jvm_opts(data, tmpdir, scale_factor=None):
......
......@@ -56,12 +56,15 @@ def organize_noalign(data):
data = utils.to_single_data(data[0])
work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
work_bam = os.path.join(work_dir, "%s-input.bam" % dd.get_sample_name(data))
if data.get("files"):
if data["files"][0].endswith(".cram"):
work_bam = cram.to_bam(data["files"][0], work_bam, data)
else:
assert data["files"][0].endswith(".bam"), data["files"][0]
utils.copy_plus(data["files"][0], work_bam)
bam.index(work_bam, data["config"])
else:
work_bam = None
data["align_bam"] = work_bam
return data
......
......@@ -134,7 +134,7 @@ def load_config(config_file):
"""Load YAML config file, replacing environmental variables.
"""
with open(config_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
config = _expand_paths(config)
if 'resources' not in config:
config['resources'] = {}
......
......@@ -70,6 +70,7 @@ LOOKUPS = {
"disease": {"keys": ["metadata", "disease"], "default": ""},
"hetcaller": {"keys": ["config", "algorithm", "hetcaller"]},
"variantcaller": {"keys": ['config', 'algorithm', 'variantcaller']},
"variantcaller_order": {"keys": ['config', 'algorithm', 'variantcaller_order'], "default": 0},
"svcaller": {"keys": ['config', 'algorithm', 'svcaller'], "default": [], "always_list": True},
"jointcaller": {"keys": ['config', 'algorithm', 'jointcaller']},
"hlacaller": {"keys": ['config', 'algorithm', 'hlacaller']},
......@@ -86,6 +87,7 @@ LOOKUPS = {
"novel_mirna_counts": {"keys": ["novel_mirna_counts"]},
"novel_isomir_counts": {"keys": ["novel_isomir_counts"]},
"combined_counts": {"keys": ["combined_counts"]},
"combined_histogram": {"keys": ["combined_histogram"]},
"annotated_combined_counts": {"keys": ["annotated_combined_counts"]},
"genome_context_files": {"keys": ["reference", "genome_context"], "default": [], "always_list": True},
"viral_files": {"keys": ["reference", "viral"], "default": [], "always_list": True},
......@@ -96,6 +98,7 @@ LOOKUPS = {
"express_fpkm": {"keys": ['express_fpkm']},
"express_tpm": {"keys": ['express_tpm']},
"express_counts": {"keys": ['express_counts']},
"histogram_counts": {"keys": ['histogram_counts']},
"isoform_to_gene": {"keys": ['isoform_to_gene']},
"fusion_mode": {"keys": ['config', 'algorithm', 'fusion_mode']},
"fusion_caller": {"keys": ['config', 'algorithm', 'fusion_caller']},
......@@ -176,6 +179,7 @@ LOOKUPS = {
"cellular_barcode_correction": {"keys": ["config", "algorithm",
"cellular_barcode_correction"],
"default": 1},
"demultiplexed": {"keys": ["config", "algorithm", "demultiplexed"]},
"kallisto_quant": {"keys": ["kallisto_quant"]},
"salmon_dir": {"keys": ["salmon_dir"]},
"salmon_fraglen_file": {"keys": ["salmon_fraglen_file"]},
......@@ -344,3 +348,27 @@ def get_keys(lookup):
"""
return tz.get_in((lookup, "keys"), LOOKUPS, None)
def update_summary_qc(data, key, base=None, secondary=None):
"""
Update summary_qc with a new section, keyed by `key`.
Files placed in summary_qc are propagated forward and made available to MultiQC.
"""
summary = get_summary_qc(data, {})
if base and secondary:
summary[key] = {"base": base, "secondary": secondary}
elif base:
summary[key] = {"base": base}
elif secondary:
summary[key] = {"secondary": secondary}
data = set_summary_qc(data, summary)
return data
def has_variantcalls(data):
"""
returns True if the data dictionary is configured for variant calling
"""
analysis = get_analysis(data).lower()
variant_pipeline = analysis.startswith(("standard", "variant", "variant2"))
variantcaller = get_variantcaller(data)
return variant_pipeline or variantcaller
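A usage sketch for `update_summary_qc` (the key and file names are hypothetical):

```python
# Attach a QC output so it is carried forward on the data dictionary and
# picked up by the MultiQC step later in the pipeline.
data = update_summary_qc(data, "umi",
                         base="sample1-umi-stats.yaml",
                         secondary=["sample1-umi-histogram.txt"])
```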