## 1.1.9 (in progress)
- Fix for getting the VEP cache.
- Support Picard's new syntax for ReorderSam (REFERENCE -> SEQUENCE_DICTIONARY).
- Remove mitochondrial reads from ChIP/ATAC-seq calling.
- Add documentation describing ATAC-seq outputs.
- Add ENCODE library complexity metrics for ATAC/ChIP-seq to MultiQC report
(see https://www.encodeproject.org/data-standards/terms/#library for a description of the metrics)
- Add STAR sample-specific 2-pass alignment. This helps assign a moderate number of additional
reads to genes by reusing splice junctions detected in the first pass. Thanks
to @naumenko-sa for the initial implementation and push to get this going.
- Index transcriptomes only once for pseudo/quasi aligner tools. This fixes race conditions
where parallel processes could try to build the same index simultaneously.
- Add --buildversion option for tracking which version of a gene build was used. This is used
during `bcbio_setup_genome.py`. The suggested format is source_version, for example Ensembl_94,
EnsemblMetazoa_25 or FlyBase_26 (an example invocation appears after this list).
- Sort MACS2 bedgraph files before compressing. Thanks to @LMannarino for the suggestion.
- Check for the reserved field `sample` in RNA-seq metadata and quit with a useful error message.
Thanks to @marypiper for suggesting this.
- Split ATAC-seq BAM files into nucleosome-free and mono/di/tri nucleosome files, so we can call
peaks on them separately.
- Call peaks on NF/MN/DN/TN regions separately for each caller during ATAC-seq.
- Allow viral contamination to be assayed on non-tumor/normal samples.
- Ensure EBV coverage is calculated when run on genomes with it included as a contig.
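As referenced in the `--buildversion` entry above, a sketch of an invocation during genome
setup (the `-f/-g/-i/-n/-b` flags follow the usage shown in the bcbio documentation; the file
and build names here are placeholders):

    bcbio_setup_genome.py -f genome.fa -g annotation.gtf -i bowtie2 star seq \
        -n Celegans -b WBcel235 --buildversion WormBase_34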
## 1.1.8 (28 October 2019)
- Add `antibody` configuration option. Setting a specific antibody for ChIP-seq will use appropriate
settings for that antibody. See the documentation for supported antibodies.
- Add `use_lowfreq_filter` for forcing vardict to report variants with low allelic frequency,
......
@@ -403,11 +403,11 @@ def merge(bamfiles, out_bam, config):
return out_bam
def sort(in_bam, config, order="coordinate", out_dir=None):
def sort(in_bam, config, order="coordinate", out_dir=None, force=False):
"""Sort a BAM file, skipping if already present.
"""
assert is_bam(in_bam), "%s is not a BAM file" % in_bam
if bam_already_sorted(in_bam, config, order):
if not force and bam_already_sorted(in_bam, config, order):
return in_bam
sort_stem = _get_sort_stem(in_bam, order, out_dir)
@@ -579,3 +579,24 @@ def remove_duplicates(in_bam, data):
do.run(cmd, message)
index(out_bam, dd.get_config(data))
return out_bam
def count_alignments(in_bam, data, filter=None):
"""
count alignments in a BAM file passing a given filter. valid filter
strings are available in the sambamba documentation:
https://github.com/biod/sambamba/wiki/%5Bsambamba-view%5D-Filter-expression-syntax
"""
sambamba = config_utils.get_program("sambamba", dd.get_config(data))
num_cores = dd.get_num_cores(data)
if not filter:
filter_string = ""
message = f"Counting alignments in {in_bam}."
else:
filter_string = f"--filter '{filter}'"
message = f"Counting alignments in {in_bam} matching {filter}."
cmd = f"{sambamba} view -c --nthreads {num_cores} -f bam {filter_string} {in_bam}"
logger.info(message)
result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return int(result.stdout.decode().strip())
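A usage sketch for count_alignments (hypothetical `work_bam`/`data` values; the filter string
uses the sambamba filter-expression syntax linked in the docstring):

total = count_alignments(work_bam, data)
usable = count_alignments(work_bam, data, filter="not duplicate and not unmapped")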
@@ -98,9 +98,10 @@ def picard_reorder(picard, in_bam, ref_file, out_file):
if not file_exists(out_file):
with tx_tmpdir(picard._config) as tmp_dir:
with file_transaction(picard._config, out_file) as tx_out_file:
dict_file = "%s.dict" % os.path.splitext(ref_file)[0]
opts = [("INPUT", in_bam),
("OUTPUT", tx_out_file),
("REFERENCE", ref_file),
("SEQUENCE_DICTIONARY", dict_file),
("ALLOW_INCOMPLETE_DICT_CONCORDANCE", "true"),
("TMP_DIR", tmp_dir)]
picard.run("ReorderSam", opts)
......
import os
import shutil
import subprocess
import sys
import toolz as tz
@@ -10,17 +11,29 @@ from bcbio.ngsalign import bowtie2, bwa
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.log import logger
from bcbio.heterogeneity.chromhacks import get_mitochondrial_chroms
from bcbio.heterogeneity.chromhacks import (get_mitochondrial_chroms,
get_nonmitochondrial_chroms)
from bcbio.chipseq import atac
def clean_chipseq_alignment(data):
# lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
method = dd.get_chip_method(data)
if method == "atac":
data = shift_ATAC(data)
work_bam = dd.get_work_bam(data)
work_bam = bam.sort(work_bam, dd.get_config(data))
bam.index(work_bam, dd.get_config(data))
clean_bam = remove_nonassembled_chrom(work_bam, data)
clean_bam = remove_mitochondrial_reads(clean_bam, data)
data = atac.calculate_complexity_metrics(clean_bam, data)
if not dd.get_keep_multimapped(data):
clean_bam = remove_multimappers(clean_bam, data)
if not dd.get_keep_duplicates(data):
clean_bam = bam.remove_duplicates(clean_bam, data)
data["work_bam"] = clean_bam
# for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
if method == "atac":
data = atac.split_ATAC(data)
encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
if encode_bed:
data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
@@ -35,6 +48,26 @@ def clean_chipseq_alignment(data):
dd.get_work_bam(data), data)
return [[data]]
def remove_mitochondrial_reads(bam_file, data):
mito = get_mitochondrial_chroms(data)
if not mito:
logger.info("Mitochondrial chromosome not identified, skipping removal of "
f"mitochondrial reads from {bam_file}.")
return bam_file
nonmito = get_nonmitochondrial_chroms(data)
mito_bam = os.path.splitext(bam_file)[0] + "-noMito.bam"
if utils.file_exists(mito_bam):
return mito_bam
samtools = config_utils.get_program("samtools", dd.get_config(data))
nonmito_flag = " ".join(nonmito)
num_cores = dd.get_num_cores(data)
with file_transaction(mito_bam) as tx_out_bam:
cmd = (f"{samtools} view -bh -@ {num_cores} {bam_file} {nonmito_flag} "
f"> {tx_out_bam}")
message = f"Removing mitochondrial reads on {','.join(mito)} from {bam_file}."
do.run(cmd, message)
return mito_bam
def remove_multimappers(bam_file, data):
aligner = dd.get_aligner(data)
if aligner:
@@ -50,13 +83,13 @@ def remove_multimappers(bam_file, data):
unique_bam = bam_file
logger.warn("When a BAM file is given as input, bcbio skips removal of "
"multimappers.")
logger.warn("ChIP/ATAC-seq usually requires duplicate marking, but it was disabled.")
return unique_bam
def remove_nonassembled_chrom(bam_file, data):
"""Remove non-assembled contigs from the BAM file"""
ref_file = dd.get_ref_file(data)
config = dd.get_config(data)
bam.index(bam_file, config)
fai = "%s.fai" % ref_file
chrom = []
with open(fai) as inh:
@@ -143,31 +176,35 @@ def _normalized_bam_coverage(name, bam_input, data):
def _compute_deeptools_matrix(data):
pass
def extract_NF_regions(data):
def shift_ATAC(data):
"""
extract the nucleosome free regions from the work_bam. These regions will
be < 100 bases
shift the ATAC-seq alignments to account for the Tn5 insertion offset
(+4 bp on the plus strand, -5 bp on the minus strand)
"""
MAX_FRAG_LENGTH = 100
sieve = config_utils.get_program("alignmentSieve", data)
work_bam = dd.get_work_bam(data)
num_cores = dd.get_num_cores(data)
out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
log_file = os.path.splitext(work_bam)[0] + "-NF.log"
if file_exists(out_file):
data["NF_bam"] = out_file
out_file = os.path.splitext(work_bam)[0] + "-shifted.bam"
log_file = os.path.splitext(work_bam)[0] + "-shifted.log"
if utils.file_exists(out_file):
data["work_bam"] = out_file
return data
unsorted_bam = os.path.splitext(out_file)[0] + ".unsorted.bam"
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out_file, \
file_transaction(log_file) as tx_log_file:
tx_unsorted_bam = tx_out_file + ".unsorted"
tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
cmd = (
f"{sieve} --bam ${work_bam} --outFile {tx_unsorted_bam} --ATACshift "
f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} --ATACshift "
f"--numberOfProcessors {num_cores} --maxFragmentLength 0 "
f"--minFragmentLength 0 "
f"--minMappingQuality 10 "
f"--filterMetrics {tx_log_file} ")
do.run(cmd, "Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
tx_out_file = bam.sort(tx_unsorted_bam)
data["NF_bam"] = out_file
do.run(cmd, f"Shifting ATAC-seq alignments in {work_bam} to {tx_unsorted_file}.")
# shifting can cause the file to become unsorted
sorted_file = bam.sort(tx_unsorted_file, dd.get_config(data), force=True)
shutil.move(sorted_file, tx_out_file)
bam.index(out_file, dd.get_config(data))
data["work_bam"] = out_file
return data
import os
import toolz as tz
from collections import namedtuple
from bcbio import utils
from bcbio.pipeline import datadict as dd
from bcbio.pipeline import config_utils
from bcbio.log import logger
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio import bam
# ranges taken from Buenrostro, Nat. Methods 10, 1213–1218 (2013).
ATACRange = namedtuple('ATACRange', ['label', 'min', 'max'])
ATACRanges = {"NF": ATACRange("NF", 0, 100),
"MN": ATACRange("MN", 180, 247),
"DN": ATACRange("DN", 315, 473),
"TN": ATACRange("TN", 558, 615)}
def calculate_complexity_metrics(work_bam, data):
"""
the work_bam should have duplicates marked but not removed;
mitochondrial reads should be removed
"""
bedtools = config_utils.get_program("bedtools", dd.get_config(data))
work_dir = dd.get_work_dir(data)
metrics_dir = os.path.join(work_dir, "metrics", "atac")
utils.safe_makedir(metrics_dir)
metrics_file = os.path.join(metrics_dir,
f"{dd.get_sample_name(data)}-atac-metrics.csv")
if utils.file_exists(metrics_file):
data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
return data
# BAM file must be sorted by read name
work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
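# The pipeline below collapses each read pair to its fragment coordinates
# (bamtobed -bedpe | awk | sort | uniq -c), then tallies mt = total read pairs,
# m0 = distinct fragment positions, m1 = positions seen exactly once and
# m2 = positions seen exactly twice.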
with file_transaction(metrics_file) as tx_metrics_file:
with open(tx_metrics_file, "w") as out_handle:
out_handle.write("mt,m0,m1,m2\n")
cmd = (f"{bedtools} bamtobed -bedpe -i {work_bam} | "
"awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
"sort | "
"uniq -c | "
"awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
"($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
f"{tx_metrics_file}")
message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
do.run(cmd, message)
data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
return data
def calculate_encode_complexity_metrics(data):
metrics_file = tz.get_in(['atac', 'complexity_metrics_file'], data, None)
if not metrics_file:
return {}
else:
with open(metrics_file) as in_handle:
header = next(in_handle).strip().split(",")
values = next(in_handle).strip().split(",")
raw_metrics = {h: int(v) for h, v in zip(header, values)}
metrics = {"PBC1": raw_metrics["m1"] / raw_metrics["m0"],
"NRF": raw_metrics["m0"] / raw_metrics["mt"]}
if raw_metrics["m2"] == 0:
PBC2 = 0
else:
PBC2 = raw_metrics["m1"] / raw_metrics["m2"]
metrics["PBC2"] = PBC2
return metrics
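As a quick sanity check of the ENCODE formulas above, a worked example with made-up counts:

# made-up counts: mt=120 total pairs, m0=100 distinct, m1=90 seen once, m2=5 seen twice
# NRF  = m0/mt = 100/120 ~ 0.83
# PBC1 = m1/m0 = 90/100  = 0.90
# PBC2 = m1/m2 = 90/5    = 18.0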
def split_ATAC(data, bam_file=None):
"""
splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
on the estimated insert sizes. Uses the current working BAM file if no BAM
file is supplied.
"""
sambamba = config_utils.get_program("sambamba", data)
num_cores = dd.get_num_cores(data)
base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
bam_file = bam_file if bam_file else dd.get_work_bam(data)
out_stem = os.path.splitext(bam_file)[0]
split_files = {}
for arange in ATACRanges.values():
out_file = f"{out_stem}-{arange.label}.bam"
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out_file:
cmd = base_cmd +\
f'-F "template_length > {arange.min} and template_length < {arange.max}" ' +\
f'{bam_file} > {tx_out_file}'
message = f'Splitting {arange.label} regions from {bam_file}.'
do.run(cmd, message)
split_files[arange.label] = out_file
data = tz.assoc_in(data, ['atac', 'align'], split_files)
return data
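A minimal usage sketch (assuming a populated bcbio `data` dict; the keys match what
split_ATAC sets above):

data = split_ATAC(data)
nf_bam = tz.get_in(("atac", "align", "NF"), data)  # nucleosome-free BAM
mn_bam = tz.get_in(("atac", "align", "MN"), data)  # mononucleosome BAM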
@@ -9,6 +9,7 @@ from bcbio.pipeline import datadict as dd
from bcbio import bam
from bcbio.chipseq import antibodies
from bcbio.log import logger
from bcbio.distributed.transaction import file_transaction
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
"""
@@ -20,7 +21,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
macs2_file = os.path.join(out_dir, name + "_peaks.xls")
if utils.file_exists(out_file):
_compress_bdg_files(out_dir)
_compress_and_sort_bdg_files(out_dir, data)
return _get_output_files(out_dir)
macs2 = config_utils.get_program("macs2", config)
antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
@@ -40,13 +41,13 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
do.run(cmd.format(**locals()), "macs2 for %s" % name)
utils.move_safe(macs2_file, out_file)
except subprocess.CalledProcessError:
raise RuntimeWarning("macs2 terminated with an error.\n"
raise RuntimeWarning("macs2 terminated with an error. "
"Please, check the message and report "
"error if it is related to bcbio.\n"
"error if it is related to bcbio. "
"You can add specific options for the sample "
"setting resources as explained in docs: "
"https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
_compress_bdg_files(out_dir)
_compress_and_sort_bdg_files(out_dir, data)
return _get_output_files(out_dir)
def _get_output_files(out_dir):
@@ -61,10 +62,16 @@ def _get_output_files(out_dir):
break
return {"main": peaks, "macs2": fns}
def _compress_bdg_files(out_dir):
def _compress_and_sort_bdg_files(out_dir, data):
for fn in glob.glob(os.path.join(out_dir, "*bdg")):
cmd = "gzip %s" % fn
do.run(cmd, "compress bdg file: %s" % fn)
out_file = fn + ".gz"
if utils.file_exists(out_file):
continue
bedtools = config_utils.get_program("bedtools", data)
with file_transaction(out_file) as tx_out_file:
cmd = f"{bedtools} sort -i {fn} | bgzip -c > {tx_out_file}"
message = f"Compressing and sorting {fn}."
do.run(cmd, message)
def _macs2_cmd(data):
"""Main command for macs2 tool."""
......
@@ -12,7 +12,7 @@ from bcbio.pipeline import config_utils
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.distributed.transaction import file_transaction
from bcbio.chipseq import atac
def get_callers():
"""Get functions related to each caller"""
@@ -44,18 +44,28 @@ def peakcall_prepare(data, run_parallel):
def calling(data):
"""Main function to parallelize peak calling."""
method = dd.get_chip_method(data)
caller_fn = get_callers()[data["peak_fn"]]
if method == "chip":
chip_bam = data.get("work_bam")
input_bam = data.get("work_bam_input", None)
caller_fn = get_callers()[data["peak_fn"]]
name = dd.get_sample_name(data)
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
dd.get_chip_method(data), data["resources"], data)
greylistdir = greylisting(data)
data.update({"peaks_files": out_files})
# data["input_bam_filter"] = input_bam
if greylistdir:
data["greylist"] = greylistdir
if method == "atac":
for fraction in atac.ATACRanges.keys():
chip_bam = tz.get_in(("atac", "align", fraction), data)
logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
name = dd.get_sample_name(data) + f"-{fraction}"
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
dd.get_chip_method(data), data["resources"], data)
data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
return [[data]]
def _sync(original, processed):
......
@@ -58,3 +58,16 @@ def get_mitochondrial_chroms(data):
ref_file = dd.get_ref_file(data)
mito = [c.name for c in ref.file_contigs(ref_file) if is_mitochondrial(c.name)]
return mito
def get_nonmitochondrial_chroms(data):
ref_file = dd.get_ref_file(data)
nonmito = [c.name for c in ref.file_contigs(ref_file) if not is_mitochondrial(c.name)]
return nonmito
def get_EBV(data):
EBVCONTIGS = ["chrEBV", "EBV"]
ref_file = dd.get_ref_file(data)
for c in ref.file_contigs(ref_file):
if c.name in EBVCONTIGS:
return c.name
return False
@@ -44,7 +44,7 @@ def align_bam(in_bam, ref_file, names, align_dir, data):
tx_out_prefix = os.path.splitext(tx_out_file)[0]
prefix1 = "%s-in1" % tx_out_prefix
cmd = ("unset JAVA_HOME && "
"{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
"{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
"| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
"| {bwa_cmd} | ")
cmd = cmd.format(**locals()) + tobam_cl
......
@@ -62,6 +62,54 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
ref_file = os.path.dirname(ref_file)
with file_transaction(data, align_dir) as tx_align_dir:
tx_1pass_dir = tx_align_dir + "1pass"
tx_star_dirnames = _get_star_dirnames(tx_1pass_dir, data, names)
tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
safe_makedir(tx_1pass_dir)
safe_makedir(tx_out_dir)
cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
"--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
"--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
"--outStd BAM_Unsorted {srna_opts} "
"--limitOutSJcollapsed 2000000 "
"--outSAMtype BAM Unsorted "
"--outSAMmapqUnique 60 "
"--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
cmd += _read_group_option(names)
if dd.get_fusion_caller(data):
if "arriba" in dd.get_fusion_caller(data):
cmd += (
"--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
"--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
"--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
"--alignSJstitchMismatchNmax 5 -1 5 5 "
"--chimSegmentReadGapMax 3 ")
else:
cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
"--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
"--chimScoreSeparation 5 ")
if "oncofuse" in dd.get_fusion_caller(data):
cmd += "--chimOutType Junctions "
else:
cmd += "--chimOutType WithinBAM "
strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
"unstranded").lower()
if strandedness == "unstranded" and not srna:
cmd += " --outSAMstrandField intronMotif "
if not srna:
cmd += " --quantMode TranscriptomeSAM "
resources = config_utils.get_resources("star", data["config"])
if resources.get("options", []):
cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
cmd += " > {tx_final_out} "
run_message = "Running 1st pass of STAR aligner on %s and %s" % (fastq_file, ref_file)
do.run(cmd.format(**locals()), run_message, None)
sjfile = get_splicejunction_file(tx_out_dir, data)
sjflag = f"--sjdbFileChrStartEnd {sjfile}" if sjfile else ""
tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
safe_makedir(tx_align_dir)
@@ -71,6 +119,7 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
"--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
"--outStd BAM_Unsorted {srna_opts} "
"--limitOutSJcollapsed 2000000 "
"{sjflag} "
"--outSAMtype BAM Unsorted "
"--outSAMmapqUnique 60 "
"--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
@@ -104,7 +153,7 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
cmd += " > {tx_final_out} "
run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
run_message = "Running 2nd pass of STAR aligner on %s and %s" % (fastq_file, ref_file)
do.run(cmd.format(**locals()), run_message, None)
data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
......
@@ -11,7 +11,8 @@ from bcbio.distributed.transaction import file_transaction
from bcbio.log import logger
def fast_rnaseq(samples, run_parallel):
samples = run_parallel("run_salmon_index", [samples])
to_index = determine_indexes_to_make(samples)
run_parallel("run_salmon_index", [to_index])
samples = run_parallel("run_salmon_reads", samples)
samples = run_parallel("run_counts_spikein", samples)
samples = spikein.combine_spikein(samples)
@@ -205,6 +206,7 @@ def quantitate_expression_parallel(samples, run_parallel):
take advantage of the threaded run_parallel environment
"""
data = samples[0][0]
to_index = determine_indexes_to_make(samples)
samples = run_parallel("generate_transcript_counts", samples)
if "cufflinks" in dd.get_expression_caller(data):
samples = run_parallel("run_cufflinks", samples)
@@ -213,13 +215,14 @@
if ("kallisto" in dd.get_expression_caller(data) or
dd.get_fusion_mode(data) or
"pizzly" in dd.get_fusion_caller(data, [])):
samples = run_parallel("run_kallisto_index", [samples])
run_parallel("run_kallisto_index", [to_index])
samples = run_parallel("run_kallisto_rnaseq", samples)
if "sailfish" in dd.get_expression_caller(data):
samples = run_parallel("run_sailfish_index", [samples])
run_parallel("run_sailfish_index", [to_index])
samples = run_parallel("run_sailfish", samples)
# always run salmon
run_parallel("run_salmon_index", [to_index])
if dd.get_quantify_genome_alignments(data):
if dd.get_aligner(data).lower() != "star":
if dd.get_genome_build(data) == "hg38":
@@ -496,3 +499,25 @@ def combine_files(samples):
data = dd.set_tx2gene(data, tx2gene_file)
updated_samples.append([data])
return updated_samples
def determine_indexes_to_make(samples):
"""
return one representative sample per distinct transcriptome index, ensuring we
only make each index once
"""
samples = [to_single_data(x) for x in samples]
indexes = set()
tomake = []
for data in samples:
out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
out_stem = os.path.join(out_dir, dd.get_genome_build(data))
if dd.get_disambiguate(data):
out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
combined_file = out_stem + ".fa"
if combined_file not in indexes:
tomake.append(data)
indexes.add(combined_file)
return tomake
@@ -11,6 +11,7 @@ import itertools
import operator
import os
import string
import sys
import six
import toolz as tz
@@ -331,6 +332,11 @@ def _clean_metadata(data):
if "metadata" not in data:
data["metadata"] = {}
data["metadata"]["batch"] = "%s-joint" % dd.get_sample_name(data)
analysis = dd.get_analysis(data).lower()
if tz.get_in(("metadata", "sample"), data) and analysis == "rna-seq":
logger.error("'sample' is a reserved keyword in metadata for RNA-seq. Please "
"rename this column to something else and rerun.")
sys.exit(1)
return data
def _clean_algorithm(data):
......
@@ -147,7 +147,8 @@ def process_alignment(data, alt_input=None):
if sort_method and sort_method != "coordinate":
raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
% sort_method)
out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
ref_file = dd.get_ref_file(data)
out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"],
data)
elif bamclean == "fixrg":
out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
......
@@ -34,6 +34,7 @@ from bcbio.variation import bedutils
from bcbio.qc.variant import get_active_vcinfo
from bcbio.upload import get_all_upload_paths_from_sample
from bcbio.variation import coverage
from bcbio.chipseq import atac
def summary(*samples):
"""Summarize all quality metrics together"""
@@ -447,6 +448,13 @@ def _add_disambiguate(sample):
sample["summary"]["metrics"]["Disambiguated ambiguous reads"] = disambigStats[2]
return sample
def _add_atac(sample):
atac_metrics = atac.calculate_encode_complexity_metrics(sample)
if not atac_metrics:
return sample
sample["summary"]["metrics"] = tz.merge(atac_metrics, sample["summary"]["metrics"])
return sample
def _fix_duplicated_rate(dt):
"""Get RNA duplicated rate if exists and replace by samtools metric"""
if "Duplication_Rate_of_Mapped" in dt:
@@ -461,6 +469,7 @@ def _merge_metrics(samples, out_dir):
sample_metrics = collections.defaultdict(dict)
for s in samples:
s = _add_disambiguate(s)
s = _add_atac(s)
m = tz.get_in(['summary', 'metrics'], s)
if isinstance(m, six.string_types):
m = json.loads(m)
......
@@ -11,6 +11,7 @@ from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import vcfutils
from bcbio.heterogeneity import chromhacks
def run(bam_file, data, out_dir):
"""Run viral QC analysis:
@@ -21,7 +22,6 @@ def run(bam_file, data, out_dir):
source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
viral_target = "gdc-viral"
out = {}
if vcfutils.get_paired_phenotype(data):
viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
if viral_refs and utils.file_exists(viral_refs[0]):
viral_ref = viral_refs[0]
@@ -52,6 +52,18 @@
"awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
"sort -n -r -k 5,5 >> {tx_out_file}")
do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
if chromhacks.get_EBV(data):
ref_file = dd.get_ref_file(data)
work_bam = dd.get_work_bam(data)
ebv = chromhacks.get_EBV(data)
mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV"
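# mosdepth is restricted to the EBV contig by feeding --by a BED line spanning
# the whole contig; --thresholds 1,5,25 reports bases covered at >=1x/5x/25x and
# the paste/awk step divides those counts by contig length to get covered fractions.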
cmd = ("mosdepth -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by "
"<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && "
"paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
"awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
"sort -n -r -k 5,5 >> {tx_out_file}")
do.run(cmd.format(**locals()), "Analyse coverage of EBV")
out["base"] = out_file
out["secondary"] = []
return out
......
@@ -118,6 +118,7 @@ def _get_files_chipseq(sample):
algorithm = sample["config"]["algorithm"]
out = _maybe_add_summary(algorithm, sample, out)
out = _maybe_add_alignment(algorithm, sample, out)
out = _maybe_add_nucleosome_alignments(algorithm, sample, out)
out = _maybe_add_peaks(algorithm, sample, out)
out = _maybe_add_greylist(algorithm, sample, out)
return _add_meta(out, sample)
@@ -493,6 +494,17 @@ def _maybe_add_transcriptome_alignment(sample, out):
"ext": "transcriptome"})
return out
def _maybe_add_nucleosome_alignments(algorithm, sample, out):
"""
for ATAC-seq, also upload NF, MN, DN and TN bam files
"""
atac_align = tz.get_in(("atac", "align"), sample, {})
for alignment in atac_align.keys():
out.append({"path": atac_align[alignment],
"type": "bam",
"ext": alignment})
return out
def _maybe_add_counts(algorithm, sample, out):
if not dd.get_count_file(sample):
return out
@@ -650,6 +662,17 @@ def _maybe_add_trna(algorithm, sample, out):
def _maybe_add_peaks(algorithm, sample, out):
out_dir = sample.get("peaks_files", {})
if "NF" in out_dir.keys():
for files in out_dir.values():
for caller in files:
if caller == "main":
continue
for fn in files[caller]:
if os.path.exists(fn):
out.append({"path": fn,
"dir": caller,
"ext": utils.splitext_plus(fn)[1]})
else:
for caller in out_dir:
if caller == "main":
continue
......
@@ -139,13 +139,13 @@ def get_vep_cache(dbkey, ref_file, tooldir=None, config=None):
resources = yaml.safe_load(in_handle)
ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
symlink_dir = _special_dbkey_maps(dbkey, ref_file)
if ensembl_name and ensembl_name.find("_vep_") == -1:
raise ValueError("%s is an incorrect ensembl value: it should have _vep_ "
"in the name (for example homo_sapiens_vep_98_GRCh38). "
"Remove the line or fix the name to avoid this error." % ensembl_name)
if symlink_dir and ensembl_name:
species, vepv = ensembl_name.split("_vep_")
return symlink_dir, species
elif ensembl_name:
species, vepv = ensembl_name.split("_vep_")
vep_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(ref_file)), "vep"))
return vep_dir, species
return None, None
def run_vep(in_file, data):
......
@@ -92,7 +92,7 @@ Packages already in the New Queue
package pizzly (in new queue) (in crash space)
https://github.com/pmelsted/pizzly
https://salsa.debian.org/med-team/pizzly
Was recently rejected from new, need to chase this up
Was recently rejected from new after a long wait, reuploaded, waiting again
package r-wasabi (in new queue) (in crash space)
https://salsa.debian.org/r-pkg-team/r-other-wasabi
@@ -191,6 +191,7 @@ r-bioc-edgeR
r-bioc-HTSFilter
https://bioconductor.org/packages/release/bioc/html/HTSFilter.html
Needed by seqcluster
needs r-bioc-edger and r-bioc-deseq (also deseq2) as build dependencies, which are not in Debian
r-bioc-DEGreport
https://bioconductor.org/packages/release/bioc/html/DEGreport.html
......
bcbio (1.1.9-1) unstable; urgency=medium
* New upstream version.
* Updated d/watch
-- Steffen Moeller <moeller@debian.org> Wed, 11 Dec 2019 11:39:46 +0100
bcbio (1.1.8-1) unstable; urgency=medium
* New upstream version.
......
@@ -21,7 +21,7 @@ override_dh_auto_install:
# To keep awareness of redundant binaries with different versions of Python
#mkdir -p debian/bcbio/usr/bin
#rm -rf debian/python-bcbio/usr/bin debian/python3-bcbio/usr/bin
find bcbio/usr/share/bcbio/ -name "*.sh" | xargs -r chmod +x
find $(CURDIR)/debian/bcbio/usr/share/bcbio/config/ -name "*.sh" | xargs -r chmod +x
# Problem with a series of tools that need bcbio installed
# before they can even print --help
......