## 1.1.8
- Add `antibody` configuration option. Setting a specific antibody for ChIP-seq will use
  appropriate settings for that antibody. See the documentation for supported antibodies;
  a configuration sketch follows this list.
- Add `use_lowfreq_filter` for forcing vardict to report variants with low allelic frequency,
useful for calling somatic variants in panels with high coverage.
- Fix for checking for pre-existing inputs with python3.
- Add `keep_duplicates` option for ChIP/ATAC-seq; when set, duplicates are not removed
  before peak calling. Defaults to False.
- Add `keep_multimapped` option for ChIP/ATAC-seq; when set, multimappers are not removed
  before peak calling. Defaults to False.
- Remove ethnicity as a required column in PED files.
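A minimal sketch of how these options might be combined in a sample configuration,
shown as the equivalent Python `algorithm` dict (keys come from the entries above;
the values and the antibody name are illustrative assumptions, not recommendations):

    # hypothetical `algorithm` section of a bcbio sample configuration
    algorithm = {
        "antibody": "h3k4me3",        # assumption: one of the documented antibodies
        "keep_duplicates": True,      # skip duplicate removal before peak calling
        "keep_multimapped": True,     # skip multimapper removal before peak calling
        "use_lowfreq_filter": False,  # let vardict report low allelic-frequency variants
    }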
## 1.1.7 (10 October 2019)
- Hot fix for dataclasses not being supported in Python 3.6; use a namedtuple instead.
## 1.1.6 (10 October 2019)
- GATK ApplyBQSRSpark: avoid StreamClosed issue with GATK 4.1+
- RNA-seq: fixes for cufflinks preparation due to python3 transition.
- RNA-seq: output count tables from tximport for genes and transcripts. These
  are in `bcbioRNASeq/results/date/genes/counts` and
  `bcbioRNASeq/results/date/transcripts/counts`.
- qualimap (RNA-seq): disable stranded mode for qualimap, as it gives incorrect
  results with the hisat2 aligner; for RNA-seq we now just run it in unstranded mode.
- Add `quantify_genome_alignments` option to use genome alignments to quantify with Salmon.

@@ -123,6 +123,7 @@ Contributors
- `Roman Valls`_, Science for Life Laboratory, Stockholm
- `Kevin Ying`_, Garvan Institute of Medical Research, Sydney, Australia
- `Vlad Saveliev`_, Center for Algorithmic Biotechnology, St. Petersburg University
- `Sergey Naumenko`_, Harvard Chan Bioinformatics Core
.. _Miika Ahdesmaki: https://github.com/mjafin
@@ -146,6 +147,7 @@ Contributors
.. _Matt Edwards: https://github.com/matted
.. _Saket Choudhary: https://github.com/saketkc
.. _Vlad Saveliev: https://github.com/vladsaveliev
.. _Sergey Naumenko: https://github.com/naumenko-sa
License
-------
artwork/logo.png (binary image updated: 14.9 KiB -> 17.3 KiB)
@@ -561,3 +561,21 @@ def convert_invalid_mapq(in_bam, out_bam=None):
                read.mapq = VALIDMAPQ
            out_bam_fh.write(read)
    return out_bam

def remove_duplicates(in_bam, data):
    """Remove duplicates from a duplicate-marked BAM file."""
    base, ext = os.path.splitext(in_bam)
    out_bam = base + "-noduplicates" + ext
    if utils.file_exists(out_bam):
        return out_bam
    num_cores = dd.get_num_cores(data)
    sambamba = config_utils.get_program("sambamba", data)
    with file_transaction(out_bam) as tx_out_bam:
        # keep only reads without the BAM duplicate flag set
        cmd = (f'{sambamba} view -h --nthreads {num_cores} -f bam -F "not duplicate" '
               f'{in_bam} > {tx_out_bam}')
        message = f"Removing duplicates from {in_bam}, saving as {out_bam}."
        do.run(cmd, message)
    index(out_bam, dd.get_config(data))
    return out_bam
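# For reference, a sketch of the sambamba command remove_duplicates composes
# (file name and core count illustrative; output is written via a transactional
# temporary path before landing at the final -noduplicates BAM):
#   sambamba view -h --nthreads 8 -f bam -F "not duplicate" \
#       sample.bam > sample-noduplicates.bam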
import os
import shutil  # used when sorting the nucleosome-free BAM below
import subprocess
import sys

import toolz as tz

from bcbio import utils
@@ -9,11 +10,33 @@ from bcbio.ngsalign import bowtie2, bwa
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.log import logger
from bcbio.heterogeneity.chromhacks import get_mitochondrial_chroms

def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    work_bam = dd.get_work_bam(data)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]

def remove_multimappers(bam_file, data):
    aligner = dd.get_aligner(data)
    if aligner:
        if aligner == "bowtie2":
            filterer = bowtie2.filter_multimappers
@@ -22,24 +45,19 @@ def clean_chipseq_alignment(data):
        else:
            logger.error("ChIP-seq only supported for bowtie2 and bwa.")
            sys.exit(-1)
        unique_bam = filterer(bam_file, data)
    else:
        unique_bam = bam_file
        logger.warn("When a BAM file is given as input, bcbio skips removal of "
                    "multimappers.")
        logger.warn("ChIP/ATAC-seq usually requires duplicate marking, but it was disabled.")
    return unique_bam

def remove_nonassembled_chrom(bam_file, data):
    """Remove non-assembled contigs from the BAM file."""
    ref_file = dd.get_ref_file(data)
    config = dd.get_config(data)
    fai = "%s.fai" % ref_file
    chrom = []
    with open(fai) as inh:
        for line in inh:
@@ -56,9 +74,8 @@ def _keep_assembled_chrom(bam_file, genome, config):
    bam.index(out_file, config)
    return out_file

def remove_blacklist_regions(bam_file, bed_file, config):
    """Remove blacklist regions from a BAM file."""
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
@@ -71,7 +88,31 @@ def _prepare_bam(bam_file, bed_file, config):

def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools."""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output

def _normalized_bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools but produce normalized bigWig files."""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
@@ -81,6 +122,12 @@ def _bam_coverage(name, bam_input, data):
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    method = dd.get_chip_method(data)
    cmd += "--normalizeUsing CPM "
    # normalize without counting mitochondrial reads, which can dominate coverage
    toignore = get_mitochondrial_chroms(data)
    if toignore:
        ignorenormflag = f"--ignoreForNormalization {' '.join(toignore)} "
        cmd += ignorenormflag
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
@@ -92,3 +139,35 @@ def _bam_coverage(name, bam_input, data):
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output
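# Illustrative expansion of the normalized command above (sample name, path,
# genome size and core count are made up; the chrM exclusion assumes a typical
# mitochondrial contig name):
#   bamCoverage --bam sample.bam --outFileName sample.bw --binSize 20 \
#       --effectiveGenomeSize 2913022398 --smoothLength 60 --extendReads 150 \
#       --centerReads -p 16 --normalizeUsing CPM --ignoreForNormalization chrM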

def _compute_deeptools_matrix(data):
    pass

def extract_NF_regions(data):
    """Extract the nucleosome-free regions from the work_bam. These regions
    will be < 100 bases.
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    if utils.file_exists(out_file):
        data["NF_bam"] = out_file
        return data
    with file_transaction(out_file) as tx_out_file, \
            file_transaction(log_file) as tx_log_file:
        tx_unsorted_bam = tx_out_file + ".unsorted"
        cmd = (
            f"{sieve} --bam {work_bam} --outFile {tx_unsorted_bam} --ATACshift "
            f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
            f"--minMappingQuality 10 "
            f"--filterMetrics {tx_log_file} ")
        do.run(cmd, f"Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
        # sort the sieved reads and move the result into the transactional
        # output path so file_transaction lands it at out_file
        sorted_bam = bam.sort(tx_unsorted_bam, dd.get_config(data))
        shutil.move(sorted_bam, tx_out_file)
    data["NF_bam"] = out_file
    return data
@@ -10,7 +10,6 @@ from bcbio import bam
from bcbio.chipseq import antibodies
from bcbio.log import logger
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
"""
Run macs2 for chip and input samples avoiding
@@ -24,7 +23,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
        _compress_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
    if antibody:
        logger.info(f"{antibody.name} specified, using {antibody.peaktype} peak settings.")
        peaksettings = select_peak_parameters(antibody)
@@ -35,7 +34,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
@@ -67,8 +66,9 @@ def _compress_bdg_files(out_dir):
        cmd = "gzip %s" % fn
        do.run(cmd, "compress bdg file: %s" % fn)

def _macs2_cmd(data):
    """Main command for the macs2 tool."""
    method = dd.get_chip_method(data)
    if method.lower() == "chip":
        cmd = ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
               "{genome_size} -n {name} --bdg {options} ")
@@ -109,7 +109,7 @@ def greylisting(data):
"""
input_bam = data.get("work_bam_input", None)
if not input_bam:
logger.info("No input BAM file detected, skipping greylisting.")
logger.info("No input control BAM file detected, skipping greylisting.")
return None
try:
greylister = config_utils.get_program("chipseq-greylist", data)
@@ -7,6 +7,8 @@ import os
from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.bam import ref

def is_autosomal(chrom):
    """Keep chromosomes that are a digit 1-22, or chr prefixed digit chr1-chr22
@@ -51,3 +53,8 @@ def bed_to_standardonly(in_file, data, headers=None, include_sex_chroms=False, o
            if checkfn(line.split()[0]) or (headers and line.startswith(headers)):
                out_handle.write(line)
    return out_file

def get_mitochondrial_chroms(data):
    """Return the names of the mitochondrial contigs in the reference."""
    ref_file = dd.get_ref_file(data)
    mito = [c.name for c in ref.file_contigs(ref_file) if is_mitochondrial(c.name)]
    return mito
@@ -57,7 +57,7 @@ def upgrade_bcbio(args):
    args = add_install_defaults(args)
    if args.upgrade in ["stable", "system", "deps", "development"]:
        if args.upgrade == "development":
            anaconda_dir = _update_conda_latest()
            _check_for_conda_problems()
            print("Upgrading bcbio-nextgen to latest development version")
            pip_bin = os.path.join(os.path.dirname(os.path.realpath(sys.executable)), "pip")
@@ -287,6 +287,27 @@ def _update_conda_packages():
        os.remove(req_file)
    return os.path.dirname(os.path.dirname(conda_bin))

def _update_conda_latest():
    """Update to the latest bcbio-nextgen conda package from bioconda.
    """
    conda_bin = _get_conda_bin()
    output = subprocess.run([conda_bin, "search", "-c", "bioconda", "bcbio-nextgen"],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
    lines = [l for l in output.decode().split("\n") if l]
    # `conda search` lists versions oldest to newest; take the final row
    latest = lines.pop()
    tokens = latest.split()
    conda_version = tokens[1].strip()
    print(f"Detected {conda_version} as latest version of bcbio-nextgen on bioconda.")
    channels = _get_conda_channels(conda_bin)
    bcbio_version = version.__version__
    if LooseVersion(bcbio_version) < LooseVersion(conda_version):
        print(f"Installing bcbio {conda_version} from bioconda.")
        subprocess.check_call([conda_bin, "install", "--quiet", "--yes"] + channels +
                              [f"bcbio-nextgen>={conda_version}"])
    else:
        print(f"bcbio version {bcbio_version} is current or newer than the conda version "
              f"{conda_version}, skipping upgrade from conda.")
    return os.path.dirname(os.path.dirname(conda_bin))
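# A sketch of what the parsing above expects from `conda search`: one row per
# package build, oldest first, with the version in the second column (rows
# below are illustrative, not real bioconda output):
#   bcbio-nextgen  1.1.7  py36_0  bioconda
#   bcbio-nextgen  1.1.8  py36_0  bioconda
# `lines.pop()` takes the last (newest) row and `tokens[1]` the version column.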
def _update_conda_devel():
"""Update to the latest development conda package.
"""
@@ -716,7 +716,7 @@ def _check_gzipped_input(in_file, data):
"""Determine if a gzipped input file is blocked gzip or standard.
"""
grabix = config_utils.get_program("grabix", data["config"])
is_bgzip = subprocess.check_output([grabix, "check", in_file])
is_bgzip = subprocess.check_output([grabix, "check", in_file]).decode().strip()
if is_bgzip.strip() == "yes":
return False, False
else:
@@ -88,7 +88,7 @@ def filter_multimappers(align_file, data):
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
@@ -277,7 +277,7 @@ def filter_multimappers(align_file, data):
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
@@ -198,14 +198,19 @@ def correct_umis(data):
    # Improve speeds by avoiding compression read/write bottlenecks
    io_opts = "--async-io=true --compression=0"
    umis_whitelist = tz.get_in(["config", "algorithm", "correct_umis"], data)
    fgbio = config_utils.get_program("fgbio", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    if not utils.file_exists(output_bam):
        umi_method, umi_tag = _check_umi_type(input_bam)
        cmd = ("unset JAVA_HOME && "
               "{fgbio} {jvm_opts} {io_opts} CorrectUmis "
               "-t {umi_tag} -m 3 -d 1 -x "
               "-U {umis_whitelist} "
               "-i {input_bam} -o /dev/stdout | {samtools} view -bh > {output_bam}")
        do.run(cmd.format(**locals()), "Correcting UMIs")
        bam.index(output_bam, data["config"])
    return output_bam

def umi_consensus(data):
@@ -216,6 +221,8 @@ def umi_consensus(data):
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    avg_coverage = coverage.get_average_coverage("rawumi", dd.get_variant_regions(data), data)
    fgbio = config_utils.get_program("fgbio", data["config"])
    bamtofastq = config_utils.get_program("bamtofastq", data["config"])
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
@@ -227,13 +234,13 @@ def umi_consensus(data):
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            # chain fgbio UMI grouping, consensus calling and filtering, then
            # convert the consensus BAM back to fastqs for realignment
            cmd = ("unset JAVA_HOME && "
                   "{fgbio} {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "{fgbio} {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "{fgbio} {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "{bamtofastq} collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out, avg_coverage
@@ -304,7 +311,7 @@ def _check_dedup(data):
    return dup_param

def dedup_bam(in_bam, data):
    """Perform non-stream based duplicate marking of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = os.path.join(utils.safe_makedir(os.path.join(os.getcwd(), "align", dd.get_sample_name(data))),
@@ -317,7 +324,7 @@ def dedup_bam(in_bam, data):
            cores, mem = _get_cores_memory(data, downscale=2)
            cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                   "markthreads={cores} I={in_bam} O={tx_out_file}")
            do.run(cmd.format(**locals()), f"Marking duplicates in {in_bam} with biobambam.")
        bam.index(out_file, data["config"])
        return out_file
    else:
@@ -71,6 +71,8 @@ LOOKUPS = {
"hetcaller": {"keys": ["config", "algorithm", "hetcaller"]},
"variantcaller": {"keys": ['config', 'algorithm', 'variantcaller']},
"variantcaller_order": {"keys": ['config', 'algorithm', 'variantcaller_order'], "default": 0},
"keep_duplicates": {"keys": ['config', 'algorithm', "keep_duplicates"], "default": False},
"keep_multimapped": {"keys": ['config', 'algorithm', "keep_multimapped"], "default": False},
"svcaller": {"keys": ['config', 'algorithm', 'svcaller'], "default": [], "always_list": True},
"jointcaller": {"keys": ['config', 'algorithm', 'jointcaller']},
"hlacaller": {"keys": ['config', 'algorithm', 'hlacaller']},
@@ -176,6 +178,7 @@ LOOKUPS = {
    "salmon": {"keys": ["salmon"]},
    "umi_type": {"keys": ["config", "algorithm", "umi_type"]},
    "correct_umis": {"keys": ["config", "algorithm", "correct_umis"]},
    "use_lowfreq_filter": {"keys": ["config", "algorithm", "use_lowfreq_filter"]},
    "sample_barcodes": {"keys": ["config", "algorithm", "sample_barcodes"]},
    "cellular_barcodes": {"keys": ["config", "algorithm", "cellular_barcodes"],
                          "default": []},
@@ -208,8 +211,7 @@ LOOKUPS = {
    "disc_bam": {"keys": ["work_bam_plus", "disc"]},
    "sr_bam": {"keys": ["work_bam_plus", "sr"]},
    "peddy_report": {"keys": ["peddy_report"]},
    "antibody": {"keys": ["config", "algorithm", "antibody"], "default": ""},
    "tools_off": {"keys": ["config", "algorithm", "tools_off"], "default": [], "always_list": True},
    "tools_on": {"keys": ["config", "algorithm", "tools_on"], "default": [], "always_list": True},
    "cwl_reporting": {"keys": ["config", "algorithm", "cwl_reporting"]},
@@ -544,10 +544,12 @@ ALG_ALLOW_BOOLEANS = set(["merge_bamprep", "mark_duplicates", "remove_lcr",
"demultiplexed", "clinical_reporting", "transcriptome_align",
"fusion_mode", "assemble_transcripts", "trim_reads",
"quantify_genome_alignments",
"recalibrate", "realign", "cwl_reporting", "save_diskspace"])
"recalibrate", "realign", "cwl_reporting", "save_diskspace", "keep_multimapped",
"keep_duplicates"])
ALG_ALLOW_FALSE = set(["aligner", "align_split_size", "bam_clean", "bam_sort",
"effects", "phasing", "mixup_check", "indelcaller",
"variantcaller", "positional_umi", "maxcov_downsample", "preseq"])
"variantcaller", "positional_umi", "maxcov_downsample", "preseq",
"use_lowfreq_filter"])
ALG_DOC_URL = "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#algorithm-parameters"
@@ -39,7 +39,9 @@ def run(bam_file, data, fastqc_out):
    frmt = "bam" if bam_file.endswith("bam") else "fastq"
    fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
    fastqc_clean_name = dd.get_sample_name(data)
    # FastQC scales memory with threads (250Mb per thread), so use at least
    # two threads to avoid running with very little memory
    num_cores = max(data["config"]["algorithm"].get("num_cores", 1), 2)
    with tx_tmpdir(data, work_dir) as tx_tmp_dir:
        with utils.chdir(tx_tmp_dir):
            cl = [config_utils.get_program("fastqc", data["config"]),
@@ -103,7 +103,9 @@ def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    if not os.path.exists(out_dir):
        tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
        eversion = vepv.split("_")[0]
        # Ensembl changed the cache tarball naming between releases
        if int(eversion) >= 98:
            vep_url_string = "indexed_vep_cache"
        elif int(eversion) == 97:
            vep_url_string = "vep"
        else:
            vep_url_string = "VEP"
@@ -179,7 +181,7 @@ def run_vep(in_file, data):
    for plugin in plugins:
        plugin_args = plugin_fns[plugin](data)
        config_args += plugin_args
    config_args += ["--sift", "b", "--polyphen", "b", "--humdiv"]
    if hgvs_compatible:
        config_args += ["--hgvsg", "--hgvs", "--shift_hgvs", "1"]
    if (dd.get_effects_transcripts(data).startswith("canonical")
@@ -235,8 +235,7 @@ def get_ped_info(data, samples):
"family_id": family_id,
"maternal_id": tz.get_in(["metadata", "maternal_id"], data, -9),
"paternal_id": tz.get_in(["metadata", "paternal_id"], data, -9),
"affected": get_affected_status(data),
"ethnicity": tz.get_in(["metadata", "ethnicity"], data, -9)
"affected": get_affected_status(data)
}
def create_ped_file(samples, base_vcf, out_dir=None):
@@ -249,7 +248,8 @@ def create_ped_file(samples, base_vcf, out_dir=None):
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    sample_ped_lines = {}
    header = ["#Family_ID", "Individual_ID", "Paternal_ID", "Maternal_ID",
              "Sex", "Phenotype"]
    for md_ped in list(set([x for x in [tz.get_in(["metadata", "ped"], data)
                                        for data in samples] if x is not None])):
        with open(md_ped) as in_handle:
@@ -273,10 +273,12 @@ def create_ped_file(samples, base_vcf, out_dir=None):
            if sname in sample_ped_lines:
                writer.writerow(sample_ped_lines[sname])
            else:
                writer.writerow([ped_info["family_id"],
                                 ped_info["individual_id"],
                                 ped_info["paternal_id"],
                                 ped_info["maternal_id"],
                                 ped_info["gender"],
                                 ped_info["affected"]])
    return out_file
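# For illustration, a row the writer above might emit after dropping the
# Ethnicity column (tab-separated; family and sample IDs are hypothetical,
# -9 marks missing parent IDs):
#   #Family_ID  Individual_ID  Paternal_ID  Maternal_ID  Sex  Phenotype
#   fam01       sample01       -9           -9           1    2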
def _find_shared_batch(samples):
@@ -170,13 +170,19 @@ def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            # lowfreq_filter splices an optional linear low allelic-frequency
            # filter into the pipeline below; disabling it keeps a bare pipe
            use_lowfreq_filter = config["algorithm"].get("use_lowfreq_filter")
            if use_lowfreq_filter is False:
                lowfreq_filter = " | "
            else:
                lowfreq_filter = " | " + _lowfreq_linear_filter(0, False) + " | "
            teststrandbias = config_utils.get_program("teststrandbias.R", config)
            var2vcf_valid = config_utils.get_program("var2vcf_valid.pl", config)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| {teststrandbias} "
                   "| {var2vcf_valid} -A -N {sample} -E {var2vcf_opts} "
                   "| {contig_cl} | bcftools filter -i 'QUAL >= 0' {lowfreq_filter} "
                   "{fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            if num_bams > 1:
                temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                tmp_out = temp_file_prefix + ".temp.vcf"
version: 19
aliases:
  ensembl: drosophila_melanogaster_vep_98_BDGP6.22
  snpeff: BDGP6.86