Commits on Source (5)
......@@ -28,3 +28,4 @@ cwl/*-workflow
.idea/
__pycache__
.coverage
.pytest_cache
......@@ -22,8 +22,9 @@ install:
- df -h
- export PATH=~/install/bcbio-vm/anaconda/bin:$PATH
- conda install --yes nomkl
- travis_wait 30 conda install --yes -c conda-forge -c bioconda bcbio-nextgen-vm bcbio-nextgen
- travis_wait conda install --yes -c conda-forge -c bioconda cwltool toil rabix-bunny
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen-vm
- conda install --yes -c conda-forge -c bioconda cwltool toil rabix-bunny
# Clean up space with external tools we don't need for tests
- conda clean --yes --tarballs --index-cache
- conda remove --yes --force qt
......@@ -43,14 +44,14 @@ script:
# Update to latest bcbio-nextgen code within the container
- bcbio_vm.py devel setup_install -i quay.io/bcbio/bcbio-vc
# -- Standard bcbio variant tests
- docker run -v `pwd`:`pwd` quay.io/bcbio/bcbio-vc bash -c "cd `pwd` && /usr/local/share/bcbio-nextgen/anaconda/bin/py.test tests/unit --cov=bcbio"
- py.test tests/bcbio_vm -v -m docker_multicore
- docker run -v `pwd`:`pwd` quay.io/bcbio/bcbio-vc bash -c "cd `pwd` && /usr/local/share/bcbio-nextgen/anaconda/bin/py.test -p no:cacheprovider tests/unit --cov=bcbio"
- py.test -p no:cacheprovider tests/bcbio_vm -v -m docker_multicore
# -- bcbio variant CWL tests
- py.test tests/bcbio_vm -v -s -m cwl_docker_joint
- py.test tests/bcbio_vm -v -s -m cwl_docker_somatic
- py.test -p no:cacheprovider tests/bcbio_vm -v -s -m cwl_docker_joint
- py.test -p no:cacheprovider tests/bcbio_vm -v -s -m cwl_docker_somatic
# -- platform integration
- sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
- py.test tests/bcbio_vm -v -s -m cwl_arvados
- py.test -p no:cacheprovider tests/bcbio_vm -v -s -m cwl_arvados
# -- Cleanup variant docker image
- docker ps -a -q | xargs --no-run-if-empty docker rm
- docker rmi -f quay.io/bcbio/bcbio-vc
......@@ -61,7 +62,7 @@ script:
- docker images
- df -h
- bcbio_vm.py devel setup_install -i quay.io/bcbio/bcbio-rnaseq
- py.test tests/bcbio_vm -v -s -m cwl_docker_rnaseq
- py.test -p no:cacheprovider tests/bcbio_vm -v -s -m cwl_docker_rnaseq
# -- Cleanup RNA-seq docker image
- docker ps -a -q | xargs --no-run-if-empty docker rm
- docker rmi -f quay.io/bcbio/bcbio-rnaseq
......
......@@ -12,9 +12,9 @@ RUN apt-get update && \
# bcbio-nextgen installation
mkdir -p /tmp/bcbio-nextgen-install && cd /tmp/bcbio-nextgen-install && \
wget --no-check-certificate \
https://raw.github.com/chapmanb/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py && \
https://raw.github.com/bcbio/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py && \
python bcbio_nextgen_install.py /usr/local/share/bcbio-nextgen \
--isolate --nodata -u development --tooldir=/usr/local && \
--isolate --minimize-disk --nodata -u development && \
git config --global url.https://github.com/.insteadOf git://github.com/ && \
/usr/local/share/bcbio-nextgen/anaconda/bin/conda install -y nomkl && \
/usr/local/share/bcbio-nextgen/anaconda/bin/bcbio_nextgen.py upgrade --isolate --tooldir=/usr/local --tools && \
......@@ -28,7 +28,7 @@ RUN apt-get update && \
# add user run script
wget --no-check-certificate -O createsetuser \
https://raw.github.com/chapmanb/bcbio-nextgen-vm/master/scripts/createsetuser && \
https://raw.github.com/bcbio/bcbio-nextgen-vm/master/scripts/createsetuser && \
chmod a+x createsetuser && mv createsetuser /sbin && \
# clean filesystem
......
## 1.0.9 (10 April 2018)
- Use smoove for lumpy variant calling and genotyping, replacing custom lumpyexpress
implementation: [validation](https://github.com/bcbio/bcbio_validations/tree/master/NA24385_sv#smoove-validation)
- Generalize exclusion of regions during variant calling with the new
`exclude_regions` target. Includes the previously available LCR and high depth
regions, in addition to removal of polyX and alternative contigs (see the
example configuration at the end of these release notes).
- Normalize allele frequency calculation and filtering for Strelka2 and MuTect2.
Thanks to Vlad Saveliev.
- CNVkit: enable specification of pre-built reference background cnn with
`background: cnv_reference`.
- CNVkit: handle projects with mixed CNVkit and non-CNVkit usage. Thanks to Luca
Beltrame.
- Improved Atropos trimming: better use of multicore parallelization in variant
and RNA-seq pipelines.
- Add support for polyG and polyX trimming in variant calling, for NovaSeq 3' end
cleanup and to generally avoid low-complexity reads.
- Structural variant: use SURVIVOR for validation comparisons.
- RNA-seq variant calling: use multiple cores for VarDict.
- Support miRge2.0 for alternative small RNA annotation. Users should
install the tool manually until it is available through bioconda.
- Add bamCoverage to the ChIP-seq pipeline to generate bigWig coverage files.
- GATK4: Correctly use GATK4 GatherVcfs when `tools_off: [gatk4]` is specified for
variant calling. Thanks to Luca Beltrame.
- variant: Default to `mark_duplicates: false` if alignment is turned off
(`aligner: false`).
- variant: Fix race condition when preparing BED files for coverage and
sv_regions. Thanks to Tristan Lubinski.
- Fix `noalt_calling` to correctly avoid parallelizing on non-standard
chromosomes without a variant regions file.
- Fix broken `kraken` command. Thanks to @choosehappy.
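As a rough sketch of how these options fit together, here is a hypothetical `algorithm`
section of a sample configuration. Aside from `exclude_regions`, `altcontigs`, `polyx`
adapters, `trim_reads: fastp` and `background: cnv_reference`, which appear in these notes
or in the accompanying code changes, the keyword values below are assumptions; check the
configuration documentation for the authoritative list.

```yaml
# Hypothetical configuration fragment -- values are illustrative only
algorithm:
  trim_reads: fastp                  # alternative trimmer alongside atropos
  adapters: [polyx]                  # polyG/polyX 3' end cleanup for NovaSeq reads
  exclude_regions: [lcr, highdepth, polyx, altcontigs]   # generalized exclusion targets (names assumed)
  background:
    cnv_reference: /path/to/background.cnn   # pre-built CNVkit reference background
```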
## 1.0.8 (5 February 2018)
- GATK4 is the new default GATK release used in bcbio when running HaplotypeCaller or
......
......@@ -10,8 +10,8 @@ provides a shared community resource that handles the data processing component
of sequencing analysis, providing researchers with more time to focus on the
downstream biology.
.. image:: https://travis-ci.org/chapmanb/bcbio-nextgen.png
:target: https://travis-ci.org/chapmanb/bcbio-nextgen
.. image:: https://travis-ci.org/bcbio/bcbio-nextgen.png
:target: https://travis-ci.org/bcbio/bcbio-nextgen
Features
--------
......@@ -55,7 +55,7 @@ Quick start
1. `Install`_ ``bcbio-nextgen`` with all tool dependencies and data files::
wget https://raw.github.com/chapmanb/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py
wget https://raw.github.com/bcbio/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py
python bcbio_nextgen_install.py /usr/local/share/bcbio --tooldir=/usr/local \
--genomes GRCh37 --aligners bwa --aligners bowtie2
......@@ -74,8 +74,8 @@ Quick start
cd project1/work
bcbio_nextgen.py ../config/project1.yaml -n 8
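For reference, ``project1.yaml`` is the sample description file referenced above; a minimal
hypothetical sketch following the linked ``bcbio_sample.yaml`` template (sample name, input
files and algorithm choices are placeholders)::

    upload:
      dir: ../final
    details:
      - description: Sample1
        files: [sample1_1.fq.gz, sample1_2.fq.gz]
        analysis: variant2
        genome_build: GRCh37
        algorithm:
          aligner: bwa
          variantcaller: gatk-haplotype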
.. _system configuration file: https://github.com/chapmanb/bcbio-nextgen/blob/master/config/bcbio_system.yaml
.. _sample description file: https://github.com/chapmanb/bcbio-nextgen/blob/master/config/bcbio_sample.yaml
.. _system configuration file: https://github.com/bcbio/bcbio-nextgen/blob/master/config/bcbio_system.yaml
.. _sample description file: https://github.com/bcbio/bcbio-nextgen/blob/master/config/bcbio_sample.yaml
.. _Automatically create a processing description: https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#automated-sample-configuration
.. _Install: https://bcbio-nextgen.readthedocs.org/en/latest/contents/installation.html#automated
.. _configuration options: https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html
......@@ -88,7 +88,7 @@ See the `full documentation`_ and `longer analysis-based articles
and discussion on the `biovalidation mailing list`_.
.. _full documentation: https://bcbio-nextgen.readthedocs.org
.. _GitHub: https://github.com/chapmanb/bcbio-nextgen/issues
.. _GitHub: https://github.com/bcbio/bcbio-nextgen/issues
.. _biovalidation mailing list: https://groups.google.com/d/forum/biovalidation
Contributors
......
......@@ -266,7 +266,7 @@ def _check_sample(in_bam, rgnames):
if len(msgs) > 0:
raise ValueError("Problems with pre-aligned input BAM file: %s\n" % (in_bam)
+ "\n".join(msgs) +
"\nSetting `bam_clean: picard` or `bam_clean: fixrg`\n"
"\nSetting `bam_clean: fixrg`\n"
"in the configuration can often fix this issue.")
if warnings:
print("*** Potential problems in input BAM compared to reference:\n%s\n" %
......@@ -301,7 +301,7 @@ def _check_bam_contigs(in_bam, ref_file, config):
warnings.append("Extra reference chromosomes: %s" % rc)
if problems:
raise ValueError("Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
"Setting `bam_clean: picard` in the configuration can often fix this issue."
"Setting `bam_clean: remove_extracontigs` in the configuration can often fix this issue."
% "\n".join(problems))
if warnings:
print("*** Potential problems in input BAM compared to reference:\n%s\n" %
......
......@@ -32,18 +32,17 @@ def sample_callable_bed(bam_file, ref_file, data):
"""
from bcbio.heterogeneity import chromhacks
CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
noalt_calling = "noalt_calling" in dd.get_tools_on(data)
noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
def callable_chrom_filter(r):
"""Filter to callable region, potentially limiting by chromosomes.
"""
return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
config = data["config"]
out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
with shared.bedtools_tmpdir({"config": config}):
with shared.bedtools_tmpdir(data):
callable_bed, depth_files = coverage.calculate(bam_file, data)
input_regions_bed = config["algorithm"].get("variant_regions", None)
input_regions_bed = dd.get_variant_regions(data)
if not utils.file_uptodate(out_file, callable_bed):
with file_transaction(config, out_file) as tx_out_file:
with file_transaction(data, out_file) as tx_out_file:
callable_regions = pybedtools.BedTool(callable_bed)
filter_regions = callable_regions.filter(callable_chrom_filter)
if input_regions_bed:
......@@ -101,11 +100,11 @@ def _combine_regions(all_regions, ref_regions):
bed_lines = ["%s\t%s\t%s" % (c, s, e) for (c, s, e) in all_intervals]
return pybedtools.BedTool("\n".join(bed_lines), from_string=True)
def _add_config_regions(nblock_regions, ref_regions, config):
def _add_config_regions(nblock_regions, ref_regions, data):
"""Add additional nblock regions based on configured regions to call.
Identifies user-defined regions that we should not be analyzing.
"""
input_regions_bed = config["algorithm"].get("variant_regions", None)
input_regions_bed = dd.get_variant_regions(data)
if input_regions_bed:
input_regions = pybedtools.BedTool(input_regions_bed)
# work around problem with single region not subtracted correctly.
......@@ -119,9 +118,13 @@ def _add_config_regions(nblock_regions, ref_regions, config):
"excludes all genomic regions. Do the chromosome names "
"in the BED file match your genome (chr1 vs 1)?" % input_regions_bed)
all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions)
return all_intervals.merge()
else:
return nblock_regions
all_intervals = nblock_regions
if "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data):
from bcbio.heterogeneity import chromhacks
remove_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom))
all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions)
return all_intervals.merge()
class NBlockRegionPicker:
"""Choose nblock regions reasonably spaced across chromosomes.
......@@ -179,15 +182,14 @@ def block_regions(callable_bed, in_bam, ref_file, data):
Identifies islands of callable regions, surrounded by regions
with no read support, that can be analyzed independently.
"""
config = data["config"]
min_n_size = int(config["algorithm"].get("nomap_split_size", 250))
with shared.bedtools_tmpdir({"config": config}):
min_n_size = int(data["config"]["algorithm"].get("nomap_split_size", 250))
with shared.bedtools_tmpdir(data):
nblock_bed = "%s-nblocks.bed" % utils.splitext_plus(callable_bed)[0]
callblock_bed = "%s-callableblocks.bed" % utils.splitext_plus(callable_bed)[0]
if not utils.file_uptodate(nblock_bed, callable_bed):
ref_regions = get_ref_bedtool(ref_file, config)
ref_regions = get_ref_bedtool(ref_file, data["config"])
nblock_regions = _get_nblock_regions(callable_bed, min_n_size, ref_regions)
nblock_regions = _add_config_regions(nblock_regions, ref_regions, config)
nblock_regions = _add_config_regions(nblock_regions, ref_regions, data)
with file_transaction(data, nblock_bed, callblock_bed) as (tx_nblock_bed, tx_callblock_bed):
nblock_regions.filter(lambda r: len(r) > min_n_size).saveas(tx_nblock_bed)
if len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0:
......@@ -195,7 +197,7 @@ def block_regions(callable_bed, in_bam, ref_file, data):
else:
raise ValueError("No callable regions found from BAM file. Alignment regions might "
"not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
return callblock_bed, nblock_bed, callable_bed
return callblock_bed, nblock_bed
def _write_bed_regions(data, final_regions, out_file, out_file_ref):
ref_file = tz.get_in(["reference", "fasta", "base"], data)
......
......@@ -40,7 +40,10 @@ def _trim_adapters(fastq_files, out_dir, data):
MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)
"""
to_trim = _get_sequences_to_trim(data["config"], SUPPORTED_ADAPTERS)
out_files, report_file = _atropos_trim(fastq_files, to_trim, out_dir, data)
if dd.get_trim_reads(data) == "fastp":
out_files, report_file = _fastp_trim(fastq_files, to_trim, out_dir, data)
else:
out_files, report_file = _atropos_trim(fastq_files, to_trim, out_dir, data)
# quality_format = _get_quality_format(data["config"])
# out_files = replace_directory(append_stem(fastq_files, "_%s.trimmed" % name), out_dir)
# log_file = "%s_log_cutadapt.txt" % splitext_plus(out_files[0])[0]
......@@ -52,6 +55,8 @@ def _trim_adapters(fastq_files, out_dir, data):
# open(log_file, 'w').write(content)
return out_files
# ## Atropos trimming
def _atropos_trim(fastq_files, adapters, out_dir, data):
"""Perform multicore trimming with atropos.
"""
......@@ -63,16 +68,24 @@ def _atropos_trim(fastq_files, adapters, out_dir, data):
tx_report_file, tx_out1 = tx_out[:2]
if len(tx_out) > 2:
tx_out2 = tx_out[2]
adapters_args = " ".join(["-a %s" % a for a in adapters])
# polyX trimming, anchored to the 3' ends of reads
if "polyx" in dd.get_adapters(data):
adapters += ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
adapters_args = " ".join(["-a '%s'" % a for a in adapters])
adapters_args += " --overlap 8" # Avoid very short internal matches (default is 3)
adapters_args += " --no-default-adapters --no-cache-adapters" # Prevent GitHub queries and saving pickles
aligner_args = "--aligner adapter"
if len(fastq_files) == 1:
cores = dd.get_num_cores(data)
input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
output_args = "-o >(bgzip --threads %s -c > {tx_out1})".format(**locals())
output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
else:
assert len(fastq_files) == 2, fastq_files
adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
cores = max(1, dd.get_num_cores(data) // 2)
adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
"-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
sample_name = dd.get_sample_name(data)
report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
......@@ -81,16 +94,51 @@ def _atropos_trim(fastq_files, adapters, out_dir, data):
config_utils.get_resources("atropos", data["config"]).get("options", []))
extra_opts = []
for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
("--minimum-length", ["-m "], str(dd.get_min_read_length(data))),
("--nextseq-trim", [], "25")]:
if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
extra_opts.append("%s=%s" % (k, v))
extra_opts = " ".join(extra_opts)
thread_args = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
thread_args = ("--threads %s" % cores if cores > 1 else "")
cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
"{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
return out_files, report_file
# ## fastp trimming
def _fastp_trim(fastq_files, adapters, out_dir, data):
"""Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
"""
report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
for x in fastq_files]
if not utils.file_exists(out_files[0]):
with file_transaction(data, *[report_file] + out_files) as tx_out:
tx_report = tx_out[0]
tx_out_files = tx_out[1:]
cmd = ["fastp", "--thread", dd.get_num_cores(data)]
if dd.get_quality_format(data).lower() == "illumina":
cmd += ["--phred64"]
for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
if i == 0:
cmd += ["-i", inf, "-o", outf]
else:
cmd += ["-I", inf, "-O", outf]
cmd += ["--trim_poly_g", "--poly_g_min_len", "8",
"--cut_by_quality3", "--cut_mean_quality", "5",
"--length_required", str(dd.get_min_read_length(data)),
"--disable_quality_filtering"]
if "polyx" in dd.get_adapters(data):
cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
for a in adapters:
cmd += ["--adapter_sequence", a]
if not adapters:
cmd += ["--disable_adapter_trimming"]
cmd += ["--json", report_file, "--report_title", dd.get_sample_name(data)]
do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
return out_files, report_file
def _get_sequences_to_trim(config, builtin):
builtin_adapters = _get_builtin_adapters(config, builtin)
polya = builtin_adapters.get("polya", [None])[0]
......
......@@ -26,7 +26,7 @@ def get_default_jvm_opts(tmp_dir=None, parallel_gc=False):
Avoids issues with multiple spun up Java processes running into out of memory errors.
Parallel GC can use a lot of cores on big machines and primarily helps improve task latency
and responsiveness, which are not needed for batch jobs.
https://github.com/chapmanb/bcbio-nextgen/issues/532#issuecomment-50989027
https://github.com/bcbio/bcbio-nextgen/issues/532#issuecomment-50989027
https://wiki.csiro.au/pages/viewpage.action?pageId=545034311
http://stackoverflow.com/questions/9738911/javas-serial-garbage-collector-performing-far-better-than-other-garbage-collect
However, serial GC causes issues with Spark local runs so we use parallel for those cases:
......
import os
import sys
import toolz as tz
from bcbio import utils
from bcbio import bam
from bcbio.pipeline import config_utils
import bcbio.pipeline.datadict as dd
from bcbio.ngsalign import bowtie2, bwa
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.log import logger
def clean_chipseq_alignment(data):
aligner = dd.get_aligner(data)
data["raw_bam"] = dd.get_work_bam(data)
data["align_bam"] = dd.get_work_bam(data)
if aligner:
if aligner == "bowtie2":
filterer = bowtie2.filter_multimappers
......@@ -19,4 +26,47 @@ def clean_chipseq_alignment(data):
else:
logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
"If BAM is not cleaned for peak calling, can result in downstream errors.")
# lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
if encode_bed:
data["work_bam"] = _prepare_bam(data["work_bam"], encode_bed, data['config'])
bam.index(data["work_bam"], data['config'])
data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
return [[data]]
def _prepare_bam(bam_file, bed_file, config):
"""Remove regions from bed files"""
if not bam_file or not bed_file:
return bam_file
out_file = utils.append_stem(bam_file, '_filter')
bedtools = config_utils.get_program("bedtools", config)
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out:
cmd = "{bedtools} subtract -nonamecheck -A -a {bam_file} -b {bed_file} > {tx_out}"
do.run(cmd.format(**locals()), "Clean %s" % bam_file)
return out_file
def get_genome(genome):
from bcbio.chipseq import macs2
loaded = macs2.HS
if genome in loaded:
return loaded[genome]
def _bam_coverage(name, bam_input, data):
"""Run bamCoverage from deeptools"""
cmd = ("{bam_coverage} -b {bam_input} -o {bw_output} "
"--binSize 20 --effectiveGenomeSize {size} "
"--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
size = int(get_genome(dd.get_genome_build(data)))
cores = dd.get_num_cores(data)
try:
bam_coverage = config_utils.get_program("bamCoverage", data)
except config_utils.CmdNotFound:
logger.info("No bamCoverage found, skipping bamCoverage.")
return None
bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
if utils.file_exists(bw_output):
return bw_output
with file_transaction(bw_output) as out_tx:
do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
return bw_output
......@@ -8,11 +8,11 @@ from bcbio.provenance import do
from bcbio.pipeline import config_utils
from bcbio import bam
HS = {"hg19": "2.7e9",
"GRCh37": "2.7e9",
"hg38": "2.7e9",
"mm10": "1.87e9",
"dm3": "1.2e8"}
HS = {"hg19": 2.7e9,
"GRCh37": 2.7e9,
"hg38": 2.7e9,
"mm10": 1.87e9,
"dm3": 1.2e8}
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, config):
"""
......
......@@ -44,38 +44,19 @@ def peakcall_prepare(data, run_parallel):
def calling(data):
"""Main function to parallelize peak calling."""
chip_bam = dd.get_work_bam(data)
chip_bam = data.get("work_bam")
input_bam = data.get("work_bam_input", None)
caller_fn = get_callers()[data["peak_fn"]]
name = dd.get_sample_name(data)
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
# lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
if encode_bed:
chip_bam = _prepare_bam(chip_bam, encode_bed, data['config'])
data["work_bam_filter"] = chip_bam
input_bam = _prepare_bam(input_bam, encode_bed, data['config'])
data["input_bam_filter"] = input_bam
out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
dd.get_chip_method(data), data["resources"], data["config"])
greylistdir = greylisting(data)
data.update({"peaks_files": out_files})
# data["input_bam_filter"] = input_bam
if greylistdir:
data["greylist"] = greylistdir
return [[data]]
data["input_bam_filter"] = input_bam
def _prepare_bam(bam_file, bed_file, config):
"""Remove regions from bed files"""
if not bam_file or not bed_file:
return bam_file
out_file = utils.append_stem(bam_file, '_filter')
bedtools = config_utils.get_program("bedtools", config)
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out:
cmd = "{bedtools} subtract -nonamecheck -A -a {bam_file} -b {bed_file} > {tx_out}"
do.run(cmd.format(**locals()), "Clean %s" % bam_file)
return out_file
def _sync(original, processed):
"""
......@@ -87,7 +68,7 @@ def _sync(original, processed):
original_sample[0]["peaks_files"] = {}
for process_sample in processed:
if dd.get_sample_name(original_sample[0]) == dd.get_sample_name(process_sample[0]):
for key in ["peaks_file", "work_bam_filter", "input_bam_filter"]:
for key in ["peaks_files"]:
if process_sample[0].get(key):
original_sample[0][key] = process_sample[0][key]
return original
......@@ -100,7 +81,7 @@ def _check(sample, data):
return None
for origin in data:
if dd.get_batch(sample) in (dd.get_batches(origin[0]) or []) and dd.get_phenotype(origin[0]) == "input":
sample["work_bam_input"] = dd.get_work_bam(origin[0])
sample["work_bam_input"] = origin[0].get("work_bam")
return [sample]
return [sample]
......@@ -150,3 +131,4 @@ def greylisting(data):
% dd.get_sample_name(data))
return None
return greylistdir
......@@ -759,6 +759,7 @@ def _directory_tarball(dirname):
tarball_dir = os.path.join(extra_tarball, tarball_dir)
tarball = os.path.join(base_dir, "%s-wf.tar.gz" % (tarball_dir.replace(os.path.sep, "--")))
if not utils.file_exists(tarball):
print("Preparing CWL input tarball: %s" % tarball)
with utils.chdir(base_dir):
with tarfile.open(tarball, "w:gz") as tar:
tar.add(tarball_dir)
......
......@@ -12,8 +12,9 @@ import tarfile
import toolz as tz
from bcbio import utils
from bcbio import bam, utils
from bcbio.pipeline import datadict as dd
from bcbio.variation import vcfutils
def to_rec(samples, default_keys=None):
"""Convert inputs into CWL records, useful for single item parallelization.
......@@ -190,3 +191,70 @@ def samples_to_records(samples, default_keys=None):
data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
out.append(data)
return out
def assign_complex_to_samples(items):
"""Assign complex inputs like variants and align outputs to samples.
Handles list inputs to record conversion where we have inputs from multiple
locations and need to ensure they are properly assigned to samples in many
environments.
The unpleasant approach here is to use standard file naming to match
with samples so this can work in environments where we don't download/stream
the input files (for space/time savings).
"""
extract_fns = {("variants", "samples"): _get_vcf_samples,
("align_bam",): _get_bam_samples}
complex = {k: {} for k in extract_fns.keys()}
for data in items:
for k in complex:
v = tz.get_in(k, data)
if v is not None:
for s in extract_fns[k](v, items):
if s:
complex[k][s] = v
out = []
for data in items:
for k in complex:
newv = tz.get_in([k, dd.get_sample_name(data)], complex)
if newv:
data = tz.update_in(data, k, lambda x: newv)
out.append(data)
return out
def _get_vcf_samples(calls, items):
have_full_file = False
all_samples = set([])
sample_matches = False
for f in utils.flatten(calls):
if have_full_file:
cur = set(vcfutils.get_samples(f))
if cur:
if not all_samples:
all_samples = cur
else:
all_samples &= set(cur)
else:
for data in items:
for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
if os.path.basename(f).startswith(("%s-" % test_name,
"%s." % test_name)):
# Prefer matches to single samples (gVCF) over joint batches
if i == 0:
sample_matches = True
if sample_matches and i > 0:
continue
else:
all_samples.add(dd.get_sample_name(data))
return list(all_samples)
def _get_bam_samples(f, items):
have_full_file = False
if have_full_file:
return [bam.sample_name(f)]
else:
for data in items:
if os.path.basename(f).startswith(("%s-" % dd.get_sample_name(data),
"%s." % dd.get_sample_name(data))):
return [dd.get_sample_name(data)]
return []
......@@ -69,7 +69,7 @@ def et(name, parallel, inputs, outputs, expression):
ExpressionTool = collections.namedtuple("ExpressionTool", "name inputs outputs expression parallel")
return ExpressionTool(name, inputs, outputs, expression, parallel)
def cwlout(key, valtype=None, extensions=None, fields=None):
def cwlout(key, valtype=None, extensions=None, fields=None, exclude=None):
"""Definition of an output variable, defining the type and associated secondary files.
"""
out = {"id": key}
......@@ -79,6 +79,8 @@ def cwlout(key, valtype=None, extensions=None, fields=None):
out["fields"] = fields
if extensions:
out["secondaryFiles"] = extensions
if exclude:
out["exclude"] = exclude
return out
def _alignment(checkpoints):
......@@ -88,7 +90,7 @@ def _alignment(checkpoints):
fields=[cwlout(["files"], ["null", {"type": "array", "items": "File"}], [".gbi"]),
cwlout(["config", "algorithm", "quality_format"], ["string", "null"]),
cwlout(["align_split"], ["string", "null"])])],
"bcbio-vc", ["grabix", "htslib", "biobambam"],
"bcbio-vc", ["grabix", "htslib", "biobambam", "atropos;env=python3"],
disk={"files": 1.5}),
s("process_alignment", "single-parallel" if checkpoints["align_split"] else "single-single",
[["alignment_rec"], ["process_alignment_rec"]],
......@@ -154,7 +156,7 @@ def _variant_vc(checkpoints):
vc_wf += [s("postprocess_variants", "batch-single",
[["batch_rec"], ["vrn_file"]],
[cwlout(["vrn_file"], "File", [".tbi"])],
"bcbio-vc", ["snpeff=4.3i"], disk={"files": 0.5})]
"bcbio-vc", ["snpeff=4.3.1t"], disk={"files": 0.5})]
vc_wf += [s("compare_to_rm", "batch-single",
[["batch_rec"], ["vrn_file"]],
[cwlout("vc_rec", "record",
......@@ -163,7 +165,9 @@ def _variant_vc(checkpoints):
cwlout(["validate", "tp"], ["File", "null"], [".tbi"]),
cwlout(["validate", "fp"], ["File", "null"], [".tbi"]),
cwlout(["validate", "fn"], ["File", "null"], [".tbi"]),
cwlout("inherit")])],
cwlout("inherit", exclude=[["align_bam"], ["reference", "twobit"],
["reference", "snpeff"], ["reference", "rtg"],
["genome_resources", "variation"]])])],
"bcbio-vc", ["bcftools", "bedtools", "pythonpy", "gvcf-regions",
"htslib", "rtg-tools", "vcfanno"],
disk={"files": 1.5})]
......@@ -173,6 +177,8 @@ def _variant_vc(checkpoints):
["metadata", "batch"], ["metadata", "phenotype"],
["regions", "sample_callable"], ["config", "algorithm", "variantcaller"],
["config", "algorithm", "coverage_interval"],
["config", "algorithm", "effects"],
["config", "algorithm", "exclude_regions"],
["config", "algorithm", "variant_regions"],
["config", "algorithm", "validate"], ["config", "algorithm", "validate_regions"],
["config", "algorithm", "tools_on"],
......@@ -180,6 +186,8 @@ def _variant_vc(checkpoints):
["reference", "fasta", "base"], ["reference", "twobit"],
["reference", "rtg"], ["reference", "genome_context"],
["genome_resources", "variation", "cosmic"], ["genome_resources", "variation", "dbsnp"],
["genome_resources", "variation", "lcr"], ["genome_resources", "variation", "polyx"],
["genome_resources", "variation", "encode_blacklist"],
["genome_resources", "aliases", "ensembl"], ["genome_resources", "aliases", "human"],
["genome_resources", "aliases", "snpeff"], ["reference", "snpeff", "genome_build"]],
[cwlout("batch_rec", "record")],
......@@ -221,7 +229,7 @@ def _variant_jointvc():
s("postprocess_variants", "batch-single",
[["jointvc_batch_rec"], ["vrn_file_joint"]],
[cwlout(["vrn_file_joint"], "File", [".tbi"])],
"bcbio-vc", ["snpeff=4.3i"],
"bcbio-vc", ["snpeff=4.3.1t"],
disk={"files": 1.5}),
s("finalize_jointvc", "batch-single",
[["jointvc_batch_rec"], ["vrn_file_joint"]],
......@@ -241,14 +249,12 @@ def _variant_checkpoints(samples):
"""Check sample configuration to identify required steps in analysis.
"""
checkpoints = {}
checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))) and dd.get_batch(d)
for d in samples])
checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
# Currently always have alignment on until expression tool widely supported
checkpoints["align"] = True
#checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
not dd.get_aligner(d))
for d in samples])
......@@ -268,6 +274,8 @@ def variant(samples):
["rgnames", "lane"], ["rgnames", "rg"], ["rgnames", "lb"],
["reference", "aligner", "indexes"],
["config", "algorithm", "aligner"],
["config", "algorithm", "trim_reads"],
["config", "algorithm", "adapters"],
["config", "algorithm", "bam_clean"],
["config", "algorithm", "mark_duplicates"]],
[cwlout("alignment_rec", "record")],
......@@ -277,13 +285,13 @@ def variant(samples):
[["align_split"], ["process_alignment_rec"],
["work_bam"], ["config", "algorithm", "quality_format"]])]
else:
align = [et("organize_align_bam", "multi-parallel",
["files"],
[cwlout(["align_bam"], ["File", "null"], [".bai"]),
cwlout(["work_bam_plus", "disc"], "null"),
cwlout(["work_bam_plus", "sr"], "null"),
cwlout(["hla", "fastq"], "null")],
"""${return {"align_bam": inputs.files[0]}}""")]
align = [s("organize_noalign", "multi-parallel",
["files"],
[cwlout(["align_bam"], "File", [".bai"]),
cwlout(["work_bam_plus", "disc"], ["File", "null"]),
cwlout(["work_bam_plus", "sr"], ["File", "null"]),
cwlout(["hla", "fastq"], ["File", "null"])],
"bcbio-vc", cores=1, no_files=True)]
align += [s("prep_samples_to_rec", "multi-combined",
[["config", "algorithm", "coverage"],
["config", "algorithm", "variant_regions"],
......@@ -305,6 +313,7 @@ def variant(samples):
s("postprocess_alignment_to_rec", "multi-combined",
[["align_bam"],
["config", "algorithm", "coverage_interval"],
["config", "algorithm", "exclude_regions"],
["config", "algorithm", "variant_regions"],
["config", "algorithm", "variant_regions_merged"],
["config", "algorithm", "variant_regions_orig"],
......@@ -316,6 +325,8 @@ def variant(samples):
["config", "algorithm", "tools_on"],
["genome_resources", "rnaseq", "gene_bed"],
["genome_resources", "variation", "dbsnp"],
["genome_resources", "variation", "lcr"], ["genome_resources", "variation", "polyx"],
["genome_resources", "variation", "encode_blacklist"],
["reference", "twobit"],
["reference", "fasta", "base"]],
[cwlout("postprocess_alignment_rec", "record")],
......@@ -368,7 +379,7 @@ def _qc_workflow(checkpoints):
qc_inputs = \
[["align_bam"], ["analysis"], ["reference", "fasta", "base"],
["config", "algorithm", "tools_on"], ["config", "algorithm", "tools_off"],
["genome_build"], ["config", "algorithm", "qc"],
["genome_build"], ["config", "algorithm", "qc"], ["metadata", "batch"],
["config", "algorithm", "coverage_interval"],
["depth", "variant_regions", "regions"], ["depth", "variant_regions", "dist"],
["depth", "samtools", "stats"], ["depth", "samtools", "idxstats"],
......@@ -414,11 +425,12 @@ def _variant_sv(checkpoints):
[cwlout("sv_rec", "record",
fields=[cwlout(["sv", "variantcaller"], ["string", "null"]),
cwlout(["sv", "vrn_file"], ["File", "null"], [".tbi"]),
cwlout(["svvalidate", "summary"], ["File", "null"]),
cwlout("inherit")])],
"bcbio-vc", ["bedtools", "cnvkit", "delly", "extract-sv-reads",
"lumpy-sv", "manta", "break-point-inspector", "mosdepth", "samtools",
"pysam>=0.13.0",
"seq2c", "simple_sv_annotation", "svtools", "svtyper",
"smoove", "pysam>=0.13.0",
"seq2c", "simple_sv_annotation", "survivor", "svtools", "svtyper",
"r=3.4.1", "vawk"],
disk={"files": 2.0})]
steps = [s("calculate_sv_bins", "multi-combined",
......@@ -426,11 +438,14 @@ def _variant_sv(checkpoints):
["metadata", "batch"], ["metadata", "phenotype"],
["config", "algorithm", "callable_regions"],
["config", "algorithm", "coverage_interval"],
["config", "algorithm", "exclude_regions"],
["config", "algorithm", "sv_regions"],
["config", "algorithm", "variant_regions"],
["config", "algorithm", "variant_regions_merged"],
["config", "algorithm", "svcaller"],
["depth", "variant_regions", "regions"],
["genome_resources", "variation", "lcr"], ["genome_resources", "variation", "polyx"],
["genome_resources", "variation", "encode_blacklist"],
["genome_resources", "rnaseq", "gene_bed"]],
[cwlout("sv_bin_rec", "record",
fields=[cwlout(["regions", "bins", "target"], ["File", "null"]),
......@@ -451,6 +466,7 @@ def _variant_sv(checkpoints):
[["sv_rawcoverage_rec"]],
[cwlout("sv_coverage_rec", "record",
fields=[cwlout(["depth", "bins", "normalized"], ["File", "null"]),
cwlout(["depth", "bins", "background"], ["File", "null"]),
cwlout("inherit")])],
"bcbio-vc", ["cnvkit"],
disk={"files": 1.5}),
......@@ -459,6 +475,7 @@ def _variant_sv(checkpoints):
["work_bam_plus", "disc"], ["work_bam_plus", "sr"],
["config", "algorithm", "tools_on"],
["config", "algorithm", "tools_off"],
["config", "algorithm", "svvalidate"], ["regions", "sample_callable"],
["sv_coverage_rec"]],
[cwlout("sv_batch_rec", "record")],
"bcbio-vc",
......@@ -466,9 +483,11 @@ def _variant_sv(checkpoints):
w("svcall", "multi-parallel", sv, []),
s("summarize_sv", "multi-combined",
[["sv_rec"]],
[cwlout(["sv", "calls"], {"type": "array", "items": ["File", "null"]})],
[cwlout(["sv", "calls"], {"type": "array", "items": ["File", "null"]}),
cwlout(["svvalidate", "grading_summary"], ["File", "null"]),
cwlout(["svvalidate", "grading_plots"], {"type": "array", "items": ["File", "null"]})],
"bcbio-vc", disk={"files": 1.0}, cores=1)]
final_outputs = [["sv", "calls"]]
final_outputs = [["sv", "calls"], ["svvalidate", "grading_summary"]]
return steps, final_outputs
def rnaseq(samples):
......
......@@ -64,6 +64,7 @@ def _run_cwltool(args):
main_file, json_file, project_name = _get_main_and_json(args.directory)
work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cwltool_work"))
tmp_dir = utils.safe_makedir(os.path.join(work_dir, "tmpcwl"))
log_file = os.path.join(work_dir, "%s-cwltool.log" % project_name)
os.environ["TMPDIR"] = tmp_dir
flags = ["--tmpdir-prefix", tmp_dir, "--tmp-outdir-prefix", tmp_dir]
if args.no_container:
......@@ -71,7 +72,7 @@ def _run_cwltool(args):
flags += ["--no-container", "--preserve-environment", "PATH", "--preserve-environment", "HOME"]
cmd = ["cwltool"] + flags + args.toolargs + ["--", main_file, json_file]
with utils.chdir(work_dir):
_run_tool(cmd, not args.no_container, work_dir)
_run_tool(cmd, not args.no_container, work_dir, log_file=log_file)
def _run_arvados(args):
"""Run CWL on Arvados.
......
......@@ -233,9 +233,11 @@ def _flatten_nested_input(v):
new_type = x["items"]
elif isinstance(x, basestring) and x == "null":
want_null = True
else:
new_type = x
if want_null:
if not isinstance(new_type, (list, tuple)):
new_type = [new_type]
new_type = [new_type] if new_type is not None else []
for toadd in ["null", "string"]:
if toadd not in new_type:
new_type.append(toadd)
......@@ -343,10 +345,12 @@ def _create_record(name, field_defs, step_name, inputs, unlist, file_vs, std_vs,
fields = []
inherit = []
inherit_all = False
inherit_exclude = []
for fdef in field_defs:
if not fdef.get("type"):
if fdef["id"] == "inherit":
inherit_all = True
inherit_exclude = fdef.get("exclude", [])
else:
inherit.append(fdef["id"])
else:
......@@ -354,7 +358,7 @@ def _create_record(name, field_defs, step_name, inputs, unlist, file_vs, std_vs,
"type": fdef["type"]}
fields.append(_add_secondary_to_rec_field(fdef, cur))
if inherit_all:
fields.extend(_infer_record_outputs(inputs, unlist, file_vs, std_vs, parallel))
fields.extend(_infer_record_outputs(inputs, unlist, file_vs, std_vs, parallel, exclude=inherit_exclude))
elif inherit:
fields.extend(_infer_record_outputs(inputs, unlist, file_vs, std_vs, parallel, inherit))
else:
......@@ -373,13 +377,15 @@ def _add_secondary_to_rec_field(orig, cur):
cur["secondaryFiles"] = orig.get("secondaryFiles")
return cur
def _infer_record_outputs(inputs, unlist, file_vs, std_vs, parallel, to_include=None):
def _infer_record_outputs(inputs, unlist, file_vs, std_vs, parallel, to_include=None,
exclude=None):
"""Infer the outputs of a record from the original inputs
"""
fields = []
unlist = set([_get_string_vid(x) for x in unlist])
input_vids = set([_get_string_vid(v) for v in _handle_special_inputs(inputs, file_vs)])
to_include = set([_get_string_vid(x) for x in to_include]) if to_include else None
to_exclude = tuple(set([_get_string_vid(x) for x in exclude])) if exclude else None
added = set([])
for raw_v in std_vs + [v for v in file_vs if get_base_id(v["id"]) in input_vids]:
# unpack record inside this record and un-nested inputs to avoid double nested
......@@ -392,13 +398,14 @@ def _infer_record_outputs(inputs, unlist, file_vs, std_vs, parallel, to_include=
for orig_v in nested_vs:
if (get_base_id(orig_v["id"]) not in added
and (not to_include or get_base_id(orig_v["id"]) in to_include)):
cur_v = {}
cur_v["name"] = get_base_id(orig_v["id"])
cur_v["type"] = orig_v["type"]
if cur_v["name"] in unlist:
cur_v = _flatten_nested_input(cur_v)
fields.append(_add_secondary_to_rec_field(orig_v, cur_v))
added.add(get_base_id(orig_v["id"]))
if to_exclude is None or not get_base_id(orig_v["id"]).startswith(to_exclude):
cur_v = {}
cur_v["name"] = get_base_id(orig_v["id"])
cur_v["type"] = orig_v["type"]
if cur_v["name"] in unlist:
cur_v = _flatten_nested_input(cur_v)
fields.append(_add_secondary_to_rec_field(orig_v, cur_v))
added.add(get_base_id(orig_v["id"]))
return fields
def _create_variable(orig_v, step, variables):
......
......@@ -9,7 +9,7 @@ from bcbio.cwl import create as cwl_create
from bcbio.cwl import cwlutils
from bcbio.rnaseq import (sailfish, rapmap, salmon, umi, kallisto, spikein)
from bcbio.ngsalign import alignprep
from bcbio.pipeline import (archive, disambiguate, qcsummary, region, sample,
from bcbio.pipeline import (archive, alignment, disambiguate, qcsummary, region, sample,
main, shared, variation, run_info, rnaseq)
from bcbio.qc import multiqc, qsignature
from bcbio.structural import regions as svregions
......@@ -122,6 +122,10 @@ def alignment_to_rec(*args):
"rgnames__lane", "rgnames__rg", "rgnames__lb"]
return cwlutils.to_rec_single(*args, default_keys=default_keys)
@utils.map_wrap
def organize_noalign(*args):
return alignment.organize_noalign(args)
@utils.map_wrap
def postprocess_alignment_to_rec(*args):
default_keys = ["config__algorithm__coverage_interval", "config__algorithm__seq2c_bed_ready",
......
......@@ -26,7 +26,8 @@ def _get_calls(data, cnv_only=False):
def get_variants(data):
"""Retrieve set of variant calls to use for heterogeneity analysis.
"""
supported = ["vardict", "vardict-java", "vardict-perl", "strelka2", "mutect2", "freebayes", "mutect"]
supported = ["precalled", "vardict", "vardict-java", "vardict-perl",
"strelka2", "mutect2", "freebayes", "mutect"]
out = []
for v in data.get("variants", []):
if v["variantcaller"] in supported:
......
......@@ -29,10 +29,10 @@ from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
REMOTES = {
"requirements": "https://raw.githubusercontent.com/chapmanb/bcbio-nextgen/master/requirements-conda.txt",
"gitrepo": "https://github.com/chapmanb/bcbio-nextgen.git",
"requirements": "https://raw.githubusercontent.com/bcbio/bcbio-nextgen/master/requirements-conda.txt",
"gitrepo": "https://github.com/bcbio/bcbio-nextgen.git",
"cloudbiolinux": "https://github.com/chapmanb/cloudbiolinux/archive/master.tar.gz",
"genome_resources": "https://raw.github.com/chapmanb/bcbio-nextgen/master/config/genomes/%s-resources.yaml",
"genome_resources": "https://raw.github.com/bcbio/bcbio-nextgen/master/config/genomes/%s-resources.yaml",
"snpeff_dl_url": ("http://downloads.sourceforge.net/project/snpeff/databases/v{snpeff_ver}/"
"snpEff_v{snpeff_ver}_{genome}.zip")}
SUPPORTED_GENOMES = ["GRCh37", "hg19", "hg38", "hg38-noalt", "mm10", "mm9",
......@@ -222,9 +222,10 @@ def _check_for_conda_problems():
"""
conda_bin = _get_conda_bin()
lib_dir = os.path.join(os.path.dirname(conda_bin), os.pardir, "lib")
if not os.path.exists(os.path.join(lib_dir, "libquadmath.so")):
subprocess.check_call([conda_bin, "install", "-f",
"--yes", "-c", "bioconda", "-c", "conda-forge", "libgcc-ng"])
for l in ["libgomp.so.1", "libquadmath.so"]:
if not os.path.exists(os.path.join(lib_dir, l)):
subprocess.check_call([conda_bin, "install", "-f",
"--yes", "-c", "bioconda", "-c", "conda-forge", "libgcc-ng"])
def _update_conda_packages():
"""If installed in an anaconda directory, upgrade conda packages.
......