......@@ -14,25 +14,25 @@ before_install:
# Temporary fix for networking problem: https://github.com/travis-ci/travis-ci/issues/1484
- echo "127.0.1.1 "`hostname` | sudo tee /etc/hosts
# Get and install anaconda for custom Python installation
- wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
- bash Miniconda2-latest-Linux-x86_64.sh -b -p ~/install/bcbio-vm/anaconda
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
- bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/install/bcbio-vm/anaconda
install:
# Install bcbio-nextgen and bcbio-nextgen-vm
- df -h
- export PATH=~/install/bcbio-vm/anaconda/bin:$PATH
- conda install --yes nomkl
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen-vm
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen python=3
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen-vm python=3
# Clean up space with external tools we don't need for tests
- conda clean --yes --tarballs --index-cache
- conda remove --yes --force qt
- rm -rf ~/install/bcbio-vm/anaconda/pkgs/qt-*
- du -sch ~/install/bcbio-vm/anaconda/pkgs/* | sort -h
- df -h
# Update to latest bcbio-nextgen code externally and within the container
- rm -rf ~/install/bcbio-vm/anaconda/lib/python2.7/site-packages/bcbio
- rm -rf ~/install/bcbio-vm/anaconda/lib/python2.7/site-packages/bcbio_nextgen-*
# Update to latest bcbio-nextgen code externally
- rm -rf ~/install/bcbio-vm/anaconda/lib/python*/site-packages/bcbio
- rm -rf ~/install/bcbio-vm/anaconda/lib/python*/site-packages/bcbio_nextgen-*
- ~/install/bcbio-vm/anaconda/bin/python setup.py install
jobs:
......
## 1.1.5 (12 April 2019)
- Fixes for Python3 incompatibilities on distributed IPython runs.
- Numerous smaller Python3 incompatibilities with strings/unicode and types.
Thanks to the community for reporting these.
- GATK HaplotypeCaller: correctly apply skipping of marked duplicates only
for amplicon runs. Thanks to Ben Liesfeld.
- Fix format detection for bzip2 fastq inputs.
- Support latest GATK4 MuTect2 (4.1.1.0) with changes to ploidy and reference
parameters.
- Support GATK4 4.1.1.0 changes to the VQSR --resource specification. Thanks
to Timothee Cezard.
- Support latest bedtools (2.28.0), which expects SAM headers for bgzipped BED
inputs.
## 1.1.4 (3 April 2019)
- Move to Python 3.6. A python2 environment in the install runs non python3
......
......@@ -68,7 +68,18 @@ def _pack_n_log(f):
return ipython.zip_args(fn(*args))
return wrapper
def apply(object, args=None, kwargs=None):
"""Python3 apply replacement for double unpacking of inputs during apply.
Thanks to: https://github.com/stefanholek/apply
"""
if args is None:
args = ()
if kwargs is None:
kwargs = {}
return object(*args, **kwargs)
@require(sample)
def prepare_sample(*args):
args = ipython.unzip_args(args)
with _setup_logging(args) as config:
......
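Reviewer note: Python 3 drops the builtin `apply`, so the hunk above adds a local replacement used when unpacking task arguments. A minimal, self-contained sketch of the behavior (the `add` function below is made up for illustration, not part of the patch):

```python
def apply(obj, args=None, kwargs=None):
    """Local stand-in mirroring the patched helper: call obj with unpacked args/kwargs."""
    return obj(*(args or ()), **(kwargs or {}))

def add(x, y, scale=1):
    return (x + y) * scale

# Equivalent to Python 2's builtin: apply(add, (2, 3), {"scale": 10}) == add(2, 3, scale=10)
assert apply(add, (2, 3), {"scale": 10}) == 50
```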
......@@ -4,7 +4,6 @@ from __future__ import print_function
import collections
import copy
import glob
import gzip
import operator
import os
import subprocess
......@@ -53,7 +52,7 @@ def _prep_sample_and_config(ldetail_group, fastq_dir, fastq_final_dir):
return out
def _non_empty(f):
with gzip.open(f) as in_handle:
with utils.open_gzipsafe(f) as in_handle:
for line in in_handle:
return True
return False
......
......@@ -4,7 +4,6 @@ Provides high level summaries of calls in regions of interest.
"""
import csv
import collections
import gzip
import os
import decimal
import uuid
......@@ -61,7 +60,7 @@ def _civic_regions(civic_file, variant_types=None, diseases=None, drugs=None):
"""
if isinstance(diseases, six.string_types):
diseases = [diseases]
with gzip.open(civic_file) as in_handle:
with utils.open_gzipsafe(civic_file) as in_handle:
reader = csv.reader(in_handle, delimiter="\t")
for chrom, start, end, info_str in reader:
info = edn_loads(info_str)
......
......@@ -18,6 +18,7 @@ import subprocess
import sys
import glob
import six
from six.moves import urllib
import toolz as tz
import yaml
......@@ -441,7 +442,7 @@ def _is_old_database(db_dir, args):
pred_file = os.path.join(db_dir, "snpEffectPredictor.bin")
if not utils.file_exists(pred_file):
return True
with gzip.open(pred_file) as in_handle:
with utils.open_gzipsafe(pred_file, is_gz=True) as in_handle:
version_info = in_handle.readline().strip().split("\t")
program, version = version_info[:2]
if not program.lower() == "snpeff" or LooseVersion(snpeff_version) > LooseVersion(version):
......
......@@ -135,6 +135,8 @@ def _add_combine_info(output, combine_map, file_key):
data = samples[0]
data["region_bams"] = region_bams
data["region"] = regions
data = dd.set_mark_duplicates(data, data["config"]["algorithm"]["orig_markduplicates"])
del data["config"]["algorithm"]["orig_markduplicates"]
out.append([data])
return out
......@@ -155,6 +157,7 @@ def parallel_prep_region(samples, run_parallel):
extras.append([data])
else:
# Do not want to re-run duplicate marking after realignment
data["config"]["algorithm"]["orig_markduplicates"] = dd.get_mark_duplicates(data)
data = dd.set_mark_duplicates(data, False)
torun.append([data])
return extras + parallel_split_combine(torun, split_fn, run_parallel,
......
......@@ -240,7 +240,7 @@ def _fill_validation_targets(data):
sv_truth = tz.get_in(["config", "algorithm", "svvalidate"], data, {})
sv_targets = (zip(itertools.repeat("svvalidate"), sv_truth.keys()) if isinstance(sv_truth, dict)
else [["svvalidate"]])
for vtarget in [list(xs) for xs in [["validate"], ["validate_regions"], ["variant_regions"]] + sv_targets]:
for vtarget in [list(xs) for xs in [["validate"], ["validate_regions"], ["variant_regions"]] + list(sv_targets)]:
val = tz.get_in(["config", "algorithm"] + vtarget, data)
if val and not os.path.exists(val) and not objectstore.is_remote(val):
installed_val = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "validation", val))
......@@ -663,8 +663,7 @@ def _check_quality_format(items):
"is not supported. Supported values are %s."
% (SAMPLE_FORMAT.values()))
fastq_file = next((file for file in item.get('files') or [] if
any([ext for ext in fastq_extensions if ext in file])), None)
fastq_file = next((f for f in item.get("files") or [] if f.endswith(tuple(fastq_extensions))), None)
if fastq_file and specified_format and not objectstore.is_remote(fastq_file):
fastq_format = _detect_fastq_format(fastq_file)
......
......@@ -62,7 +62,7 @@ def _slurm_info(queue):
"""Returns machine information for a slurm job scheduler.
"""
cl = "sinfo -h -p {} --format '%c %m %D'".format(queue)
num_cpus, mem, num_nodes = subprocess.check_output(shlex.split(cl)).split()
num_cpus, mem, num_nodes = subprocess.check_output(shlex.split(cl)).decode().split()
# if the queue contains multiple memory configurations, the minimum value is printed with a trailing '+'
mem = float(mem.replace('+', ''))
num_cpus = int(num_cpus.replace('+', ''))
......@@ -83,7 +83,7 @@ def _torque_info(queue):
hosts are available, it uses the first host found from pbsnodes.
"""
nodes = _torque_queue_nodes(queue)
pbs_out = subprocess.check_output(["pbsnodes"])
pbs_out = subprocess.check_output(["pbsnodes"]).decode()
info = {}
for i, line in enumerate(pbs_out.split("\n")):
if i == 0 and len(nodes) == 0:
......@@ -104,7 +104,7 @@ def _torque_queue_nodes(queue):
Parses out nodes from `acl_hosts` in qstat -Qf and extracts the
initial names of nodes used in pbsnodes.
"""
qstat_out = subprocess.check_output(["qstat", "-Qf", queue])
qstat_out = subprocess.check_output(["qstat", "-Qf", queue]).decode()
hosts = []
in_hosts = False
for line in qstat_out.split("\n"):
......@@ -128,9 +128,9 @@ def median_left(x):
def _sge_info(queue):
"""Returns machine information for an sge job scheduler.
"""
qhost_out = subprocess.check_output(["qhost", "-q", "-xml"])
qhost_out = subprocess.check_output(["qhost", "-q", "-xml"]).decode()
qstat_queue = ["-q", queue] if queue and "," not in queue else []
qstat_out = subprocess.check_output(["qstat", "-f", "-xml"] + qstat_queue)
qstat_out = subprocess.check_output(["qstat", "-f", "-xml"] + qstat_queue).decode()
slot_info = _sge_get_slots(qstat_out)
mem_info = _sge_get_mem(qhost_out, queue)
machine_keys = slot_info.keys()
......
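Reviewer note: the scheduler queries above all gain `.decode()` because `subprocess.check_output` returns bytes under Python 3, and mixing those bytes with str arguments downstream (for example `mem.replace('+', '')` or `pbs_out.split("\n")`) raises `TypeError`. A minimal illustration with made-up sinfo-style output:

```python
import subprocess

# check_output returns bytes under Python 3; decode before using str operations.
out = subprocess.check_output(["echo", "16 64000+ 4"])
print(type(out))                      # <class 'bytes'>
num_cpus, mem, num_nodes = out.decode().split()
mem = float(mem.replace("+", ""))     # works because mem is now str
print(num_cpus, mem, num_nodes)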
......@@ -4,6 +4,7 @@ https://github.com/ewels/MultiQC
"""
import collections
import glob
import io
import json
import mimetypes
import os
......@@ -145,13 +146,13 @@ def _save_uploaded_data_json(samples, data_json_work, out_dir):
if not upload_path_mapping:
return data_json_work
with open(data_json_work) as f:
with io.open(data_json_work, encoding="utf-8") as f:
data = json.load(f, object_pairs_hook=OrderedDict)
upload_base = samples[0]["upload"]["dir"]
data = walk_json(data, lambda s: _work_path_to_rel_final_path(s, upload_path_mapping, upload_base))
data_json_final = os.path.join(out_dir, "multiqc_data_final.json")
with open(data_json_final, "w") as f:
with io.open(data_json_final, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4)
return data_json_final
......
......@@ -31,10 +31,10 @@ def _mirbase_stats(data, out_dir):
return {"base": out_file, "secondary": [out_file_novel]}
def _get_stats_from_miraligner(fn, out_file, name):
df = pd.read_csv(fn, sep="\t", dtype={"mism": "string",
"add": "string",
"t5": "string",
"t3": "string"},
df = pd.read_csv(fn, sep="\t", dtype={"mism": "str",
"add": "str",
"t5": "str",
"t3": "str"},
na_values=["."])
dfmirs = df[['mir', 'freq']].groupby(['mir']).count()
df5 = df.loc[df.t5 != "0", ['mir', 't5']].groupby(['mir']).count()
......
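Reviewer note: the dtype change matters because `"str"` maps to the plain object dtype that every pandas release understands, while `"string"` refers to the nullable extension dtype that only exists in newer pandas versions and may not be recognized by the versions pinned here. A small sketch with made-up miraligner-style rows:

```python
import pandas as pd
from io import StringIO

# Hypothetical data; "str" columns come back as object dtype on any pandas version.
data = StringIO("mir\tmism\tadd\tt5\tt3\tfreq\nmir-21\t0\tA\t0\tTA\t10\n")
df = pd.read_csv(data, sep="\t",
                 dtype={"mism": "str", "add": "str", "t5": "str", "t3": "str"},
                 na_values=["."])
print(df.dtypes)
```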
......@@ -77,7 +77,7 @@ class EricScriptConfig(object):
Private constants:
_OUTPUT_DIR_NAME: name of the dir created in working directory for
ericscript ouput
ericscript output
"""
info_message = 'Detect gene fusions with EricScript'
EXECUTABLE = 'ericscript.pl'
......
......@@ -2,7 +2,6 @@ import gffutils
import tempfile
import os
import random
import gzip
import re
from bcbio import utils
......@@ -21,7 +20,7 @@ def guess_infer_extent(gtf_file):
tmp_out = tempfile.NamedTemporaryFile(suffix=".gtf", delete=False).name
with open(tmp_out, "w") as out_handle:
count = 0
in_handle = open(gtf_file) if ext != ".gz" else gzip.open(gtf_file)
in_handle = utils.open_gzipsafe(gtf_file)
for line in in_handle:
if count > 1000:
break
......
......@@ -39,7 +39,7 @@ def run(data):
return None
with chdir(out_dir):
collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res -g -1").format(**locals())
if not file_exists(out_file):
try:
do.run(cmd.format(**locals()), "Running mirdeep2.")
......@@ -90,7 +90,7 @@ def _parse_novel(csv_file, sps="new"):
if read and line.strip():
cols = line.strip().split("\t")
name, start, score = cols[0], cols[16], cols[1]
if score < 1:
if float(score) < 1:
continue
m5p, m3p, pre = cols[13], cols[14], cols[15].replace('u', 't').upper()
m5p_start = cols[15].find(m5p) + 1
......
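Reviewer note: the `float(score)` wrapper is needed because `score` comes from splitting a tab-separated text line and is therefore a str; under Python 3 comparing str to int raises `TypeError`, while under Python 2 the mixed-type comparison silently ordered by type and never triggered. A minimal illustration:

```python
score = "0.5"            # parsed from a text line, so it is a str
try:
    score < 1            # Python 3: TypeError; Python 2 silently compared by type
except TypeError as err:
    print(err)
print(float(score) < 1)  # True - the intended numeric comparison
```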
......@@ -69,7 +69,8 @@ def _find_gene_list_from_bed(bed_file, base_file, data):
if not os.path.exists(out_file):
genes = set([])
import pybedtools
for r in pybedtools.BedTool(bed_file):
with utils.open_gzipsafe(bed_file) as in_handle:
for r in pybedtools.BedTool(in_handle):
if r.name:
if not r.name.startswith("{"):
genes.add(r.name)
......
......@@ -109,10 +109,12 @@ class MemoizedSizes:
for data in items:
region_bed = tz.get_in(["depth", "variant_regions", "regions"], data)
if region_bed and region_bed not in checked_beds:
for r in pybedtools.BedTool(region_bed).intersect(cnv_file):
with utils.open_gzipsafe(region_bed) as in_handle:
for r in pybedtools.BedTool(in_handle).intersect(cnv_file):
if r.stop - r.start > range_map["target"][0]:
target_bps.append(float(r.name))
for r in pybedtools.BedTool(region_bed).intersect(cnv_file, v=True):
with utils.open_gzipsafe(region_bed) as in_handle:
for r in pybedtools.BedTool(in_handle).intersect(cnv_file, v=True):
if r.stop - r.start > range_map["target"][1]:
anti_bps.append(float(r.name))
checked_beds.add(region_bed)
......
......@@ -298,7 +298,7 @@ def insert_size_stats(dists):
MAD is the Median Absolute Deviation: http://en.wikipedia.org/wiki/Median_absolute_deviation
"""
med = numpy.median(dists)
filter_dists = filter(lambda x: x < med + 10 * med, dists)
filter_dists = list(filter(lambda x: x < med + 10 * med, dists))
median = numpy.median(filter_dists)
return {"mean": float(numpy.mean(filter_dists)), "std": float(numpy.std(filter_dists)),
"median": float(median),
......
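Reviewer note: the `list(...)` wrapper is needed because Python 3's `filter` returns a one-shot lazy iterator rather than a list, so feeding it to several numpy statistics in a row only works once it is materialized. A small sketch with made-up insert sizes:

```python
import numpy

dists = [250, 300, 280, 260, 10000]      # hypothetical insert size distribution
med = numpy.median(dists)
lazy = filter(lambda x: x < med + 10 * med, dists)
print(len(list(lazy)), len(list(lazy)))  # 4 0 - a Python 3 filter is consumed after one pass
filter_dists = list(filter(lambda x: x < med + 10 * med, dists))
print(numpy.median(filter_dists), numpy.mean(filter_dists))
```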
......@@ -373,12 +373,15 @@ def symlink_plus(orig, new):
with chdir(os.path.dirname(new_noext)):
os.symlink(os.path.relpath(orig_noext + sub_ext), os.path.basename(new_noext + sub_ext))
def open_gzipsafe(f):
if f.endswith(".gz"):
def open_gzipsafe(f, is_gz=False):
if f.endswith(".gz") or is_gz:
if six.PY3:
return gzip.open(f, "rt")
return gzip.open(f, "rt", encoding="utf-8", errors="ignore")
else:
return gzip.open(f)
else:
if six.PY3:
return open(f, encoding="utf-8", errors="ignore")
else:
return open(f)
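Reviewer note: several hunks above swap direct `gzip.open` calls for this helper, which now yields text (str) lines for both plain and gzipped files under either Python version. A hypothetical usage sketch (file names are made up; assumes `bcbio` is importable):

```python
from bcbio import utils

# Both handles iterate as str lines; undecodable bytes are ignored under Python 3.
with utils.open_gzipsafe("regions.bed.gz") as in_handle:
    header = next(in_handle, None)

# is_gz=True forces gzip handling for files without a .gz suffix,
# as used for the snpEff predictor file above.
with utils.open_gzipsafe("snpEffectPredictor.bin", is_gz=True) as in_handle:
    version_line = next(in_handle, None)
```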
......@@ -850,8 +853,20 @@ def locale_export():
RuntimeError: Click will abort further execution because Python 3 was
configured to use ASCII as encoding for the environment.
Consult https://click.palletsprojects.com/en/7.x/python3/ for mitigation steps.
Looks up available locales on the system to find an appropriate one to pick,
defaulting to C.UTF-8 which is globally available on newer systems.
"""
return "export LC_ALL=C.UTF-8 && export LANG=C.UTF-8 && "
locale_to_use = "C.UTF-8"
try:
locales = subprocess.check_output(["locale", "-a"]).decode(errors="ignore").split("\n")
except subprocess.CalledProcessError:
locales = []
for locale in locales:
if locale.lower().endswith(("utf-8", "utf8")):
locale_to_use = locale
break
return "export LC_ALL=%s && export LANG=%s && " % (locale_to_use, locale_to_use)
def java_freetype_fix():
"""Provide workaround for issues FreeType library symbols.
......
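Reviewer note: `locale_export()` returns a shell prefix that callers prepend to command strings; the change only affects which UTF-8 locale gets exported when C.UTF-8 is unavailable. A hypothetical usage sketch (the command is made up):

```python
from bcbio import utils

# Prepend the exports so downstream tools see a UTF-8 locale.
cmd = utils.locale_export() + "multiqc --version"
print(cmd)  # e.g. "export LC_ALL=en_US.utf8 && export LANG=en_US.utf8 && multiqc --version"
```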
......@@ -129,7 +129,8 @@ def _subset_to_variant_regions(callable_file, variant_regions, data):
out_file = "%s-vrsubset.bed" % utils.splitext_plus(callable_file)[0]
if not utils.file_uptodate(out_file, callable_file):
with file_transaction(data, out_file) as tx_out_file:
pybedtools.BedTool(callable_file).intersect(variant_regions).saveas(tx_out_file)
with utils.open_gzipsafe(callable_file) as in_handle:
pybedtools.BedTool(in_handle).intersect(variant_regions).saveas(tx_out_file)
return out_file
def _get_cache_file(data, target_name):
......
......@@ -156,7 +156,7 @@ def _get_vqsr_training(filter_type, vrn_files, gatk_type):
params = []
for name, train_info, fname in _get_training_data(vrn_files)[filter_type]:
if gatk_type == "gatk4":
params.extend(["--resource", "%s,%s:%s" % (name, train_info, fname)])
params.extend(["--resource:%s,%s" % (name, train_info), fname])
if filter_type == "INDEL":
params.extend(["--max-gaussians", "4"])
else:
......
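Reviewer note: the last hunk reflects the GATK 4.1.1.0 change called out in the 1.1.5 release notes: the resource name and training metadata move into the flag itself, with the file passed as a separate argument. A sketch of the resulting parameters (resource name, annotations and file name are illustrative):

```python
name = "hapmap"
train_info = "known=false,training=true,truth=true,prior=15.0"
fname = "hapmap.vcf.gz"

# GATK4 before 4.1.1.0
old = ["--resource", "%s,%s:%s" % (name, train_info, fname)]
# GATK4 4.1.1.0 and later
new = ["--resource:%s,%s" % (name, train_info), fname]
print(" ".join(old))
print(" ".join(new))
```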