......@@ -14,25 +14,25 @@ before_install:
# Temporary fix for networking problem: https://github.com/travis-ci/travis-ci/issues/1484
- echo "127.0.1.1 "`hostname` | sudo tee /etc/hosts
# Get and install anaconda for custom Python installation
- wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
- bash Miniconda2-latest-Linux-x86_64.sh -b -p ~/install/bcbio-vm/anaconda
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
- bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/install/bcbio-vm/anaconda
install:
# Install bcbio-nextgen and bcbio-nextgen-vm
- df -h
- export PATH=~/install/bcbio-vm/anaconda/bin:$PATH
- conda install --yes nomkl
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen-vm
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen python=3
- conda install --yes -c conda-forge -c bioconda bcbio-nextgen-vm python=3
# Clean up space with external tools we don't need for tests
- conda clean --yes --tarballs --index-cache
- conda remove --yes --force qt
- rm -rf ~/install/bcbio-vm/anaconda/pkgs/qt-*
- du -sch ~/install/bcbio-vm/anaconda/pkgs/* | sort -h
- df -h
# Update to latest bcbio-nextgen code externally and within the container
- rm -rf ~/install/bcbio-vm/anaconda/lib/python2.7/site-packages/bcbio
- rm -rf ~/install/bcbio-vm/anaconda/lib/python2.7/site-packages/bcbio_nextgen-*
# Update to latest bcbio-nextgen code externally
- rm -rf ~/install/bcbio-vm/anaconda/lib/python*/site-packages/bcbio
- rm -rf ~/install/bcbio-vm/anaconda/lib/python*/site-packages/bcbio_nextgen-*
- ~/install/bcbio-vm/anaconda/bin/python setup.py install
jobs:
......
## 1.1.5 (12 April 2019)
- Fixes for Python3 incompatibilities on distributed IPython runs.
- Numerous smaller Python3 incompatibilities with strings/unicode and types.
Thanks to the community for reporting these.
- GATK HaplotypeCaller: correctly apply skipping of marked duplicates only
for amplicon runs. Thanks to Ben Liesfeld.
- Fix format detection for bzip2 fastq inputs.
- Support latest GATK4 MuTect2 (4.1.1.0) with changes to ploidy and reference
parameters.
- Support GATK4 4.1.1.0 changes to the VQSR --resource specification. Thanks
to Timothee Cezard.
- Support latest bedtools (2.28.0), which expects SAM headers for bgzipped BED
inputs.
## 1.1.4 (3 April 2019)
- Move to Python 3.6. A python2 environment in the install runs non python3
......
......@@ -68,7 +68,18 @@ def _pack_n_log(f):
return ipython.zip_args(fn(*args))
return wrapper
def apply(object, args=None, kwargs=None):
"""Python3 apply replacement for double unpacking of inputs during apply.
Thanks to: https://github.com/stefanholek/apply
"""
if args is None:
args = ()
if kwargs is None:
kwargs = {}
return object(*args, **kwargs)
@require(sample)
def prepare_sample(*args):
args = ipython.unzip_args(args)
with _setup_logging(args) as config:
......
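Reviewer note: Python 3 drops the builtin `apply`, so the hunk above adds a local replacement used when unpacking task arguments. A minimal, self-contained sketch of the behavior (the `add` function below is made up for illustration, not part of the patch):

```python
def apply(obj, args=None, kwargs=None):
    """Local stand-in mirroring the patched helper: call obj with unpacked args/kwargs."""
    return obj(*(args or ()), **(kwargs or {}))

def add(x, y, scale=1):
    return (x + y) * scale

# Equivalent to Python 2's builtin: apply(add, (2, 3), {"scale": 10}) == add(2, 3, scale=10)
assert apply(add, (2, 3), {"scale": 10}) == 50
```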
......@@ -4,7 +4,6 @@ from __future__ import print_function
import collections
import copy
import glob
import gzip
import operator
import os
import subprocess
......@@ -53,7 +52,7 @@ def _prep_sample_and_config(ldetail_group, fastq_dir, fastq_final_dir):
return out
def _non_empty(f):
with gzip.open(f) as in_handle:
with utils.open_gzipsafe(f) as in_handle:
for line in in_handle:
return True
return False
......
......@@ -4,7 +4,6 @@ Provides high level summaries of calls in regions of interest.
"""
import csv
import collections
import gzip
import os
import decimal
import uuid
......@@ -61,7 +60,7 @@ def _civic_regions(civic_file, variant_types=None, diseases=None, drugs=None):
"""
if isinstance(diseases, six.string_types):
diseases = [diseases]
with gzip.open(civic_file) as in_handle:
with utils.open_gzipsafe(civic_file) as in_handle:
reader = csv.reader(in_handle, delimiter="\t")
for chrom, start, end, info_str in reader:
info = edn_loads(info_str)
......
......@@ -18,6 +18,7 @@ import subprocess
import sys
import glob
import six
from six.moves import urllib
import toolz as tz
import yaml
......@@ -441,7 +442,7 @@ def _is_old_database(db_dir, args):
pred_file = os.path.join(db_dir, "snpEffectPredictor.bin")
if not utils.file_exists(pred_file):
return True
with gzip.open(pred_file) as in_handle:
with utils.open_gzipsafe(pred_file, is_gz=True) as in_handle:
version_info = in_handle.readline().strip().split("\t")
program, version = version_info[:2]
if not program.lower() == "snpeff" or LooseVersion(snpeff_version) > LooseVersion(version):
......
......@@ -135,6 +135,8 @@ def _add_combine_info(output, combine_map, file_key):
data = samples[0]
data["region_bams"] = region_bams
data["region"] = regions
data = dd.set_mark_duplicates(data, data["config"]["algorithm"]["orig_markduplicates"])
del data["config"]["algorithm"]["orig_markduplicates"]
out.append([data])
return out
......@@ -155,6 +157,7 @@ def parallel_prep_region(samples, run_parallel):
extras.append([data])
else:
# Do not want to re-run duplicate marking after realignment
data["config"]["algorithm"]["orig_markduplicates"] = dd.get_mark_duplicates(data)
data = dd.set_mark_duplicates(data, False)
torun.append([data])
return extras + parallel_split_combine(torun, split_fn, run_parallel,
......
......@@ -240,7 +240,7 @@ def _fill_validation_targets(data):
sv_truth = tz.get_in(["config", "algorithm", "svvalidate"], data, {})
sv_targets = (zip(itertools.repeat("svvalidate"), sv_truth.keys()) if isinstance(sv_truth, dict)
else [["svvalidate"]])
for vtarget in [list(xs) for xs in [["validate"], ["validate_regions"], ["variant_regions"]] + sv_targets]:
for vtarget in [list(xs) for xs in [["validate"], ["validate_regions"], ["variant_regions"]] + list(sv_targets)]:
val = tz.get_in(["config", "algorithm"] + vtarget, data)
if val and not os.path.exists(val) and not objectstore.is_remote(val):
installed_val = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "validation", val))
......@@ -663,8 +663,7 @@ def _check_quality_format(items):
"is not supported. Supported values are %s."
% (SAMPLE_FORMAT.values()))
fastq_file = next((file for file in item.get('files') or [] if
any([ext for ext in fastq_extensions if ext in file])), None)
fastq_file = next((f for f in item.get("files") or [] if f.endswith(tuple(fastq_extensions))), None)
if fastq_file and specified_format and not objectstore.is_remote(fastq_file):
fastq_format = _detect_fastq_format(fastq_file)
......
......@@ -62,7 +62,7 @@ def _slurm_info(queue):
"""Returns machine information for a slurm job scheduler.
"""
cl = "sinfo -h -p {} --format '%c %m %D'".format(queue)
num_cpus, mem, num_nodes = subprocess.check_output(shlex.split(cl)).split()
num_cpus, mem, num_nodes = subprocess.check_output(shlex.split(cl)).decode().split()
# if the queue contains multiple memory configurations, the minimum value is printed with a trailing '+'
mem = float(mem.replace('+', ''))
num_cpus = int(num_cpus.replace('+', ''))
......@@ -83,7 +83,7 @@ def _torque_info(queue):
hosts are available, it uses the first host found from pbsnodes.
"""
nodes = _torque_queue_nodes(queue)
pbs_out = subprocess.check_output(["pbsnodes"])
pbs_out = subprocess.check_output(["pbsnodes"]).decode()
info = {}
for i, line in enumerate(pbs_out.split("\n")):
if i == 0 and len(nodes) == 0:
......@@ -104,7 +104,7 @@ def _torque_queue_nodes(queue):
Parses out nodes from `acl_hosts` in qstat -Qf and extracts the
initial names of nodes used in pbsnodes.
"""
qstat_out = subprocess.check_output(["qstat", "-Qf", queue])
qstat_out = subprocess.check_output(["qstat", "-Qf", queue]).decode()
hosts = []
in_hosts = False
for line in qstat_out.split("\n"):
......@@ -128,9 +128,9 @@ def median_left(x):
def _sge_info(queue):
"""Returns machine information for an sge job scheduler.
"""
qhost_out = subprocess.check_output(["qhost", "-q", "-xml"])
qhost_out = subprocess.check_output(["qhost", "-q", "-xml"]).decode()
qstat_queue = ["-q", queue] if queue and "," not in queue else []
qstat_out = subprocess.check_output(["qstat", "-f", "-xml"] + qstat_queue)
qstat_out = subprocess.check_output(["qstat", "-f", "-xml"] + qstat_queue).decode()
slot_info = _sge_get_slots(qstat_out)
mem_info = _sge_get_mem(qhost_out, queue)
machine_keys = slot_info.keys()
......
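Reviewer note: the scheduler queries above all gain `.decode()` because `subprocess.check_output` returns bytes under Python 3, and mixing those bytes with str arguments downstream (for example `mem.replace('+', '')` or `pbs_out.split("\n")`) raises `TypeError`. A minimal illustration with made-up sinfo-style output:

```python
import subprocess

# check_output returns bytes under Python 3; decode before using str operations.
out = subprocess.check_output(["echo", "16 64000+ 4"])
print(type(out))                      # <class 'bytes'>
num_cpus, mem, num_nodes = out.decode().split()
mem = float(mem.replace("+", ""))     # works because mem is now str
print(num_cpus, mem, num_nodes)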
......@@ -4,6 +4,7 @@ https://github.com/ewels/MultiQC
"""
import collections
import glob
import io
import json
import mimetypes
import os
......@@ -145,13 +146,13 @@ def _save_uploaded_data_json(samples, data_json_work, out_dir):
if not upload_path_mapping:
return data_json_work
with open(data_json_work) as f:
with io.open(data_json_work, encoding="utf-8") as f:
data = json.load(f, object_pairs_hook=OrderedDict)
upload_base = samples[0]["upload"]["dir"]
data = walk_json(data, lambda s: _work_path_to_rel_final_path(s, upload_path_mapping, upload_base))
data_json_final = os.path.join(out_dir, "multiqc_data_final.json")
with open(data_json_final, "w") as f:
with io.open(data_json_final, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4)
return data_json_final
......
......@@ -31,10 +31,10 @@ def _mirbase_stats(data, out_dir):
return {"base": out_file, "secondary": [out_file_novel]}
def _get_stats_from_miraligner(fn, out_file, name):
df = pd.read_csv(fn, sep="\t", dtype={"mism": "string",
"add": "string",
"t5": "string",
"t3": "string"},
df = pd.read_csv(fn, sep="\t", dtype={"mism": "str",
"add": "str",
"t5": "str",
"t3": "str"},
na_values=["."])
dfmirs = df[['mir', 'freq']].groupby(['mir']).count()
df5 = df.loc[df.t5 != "0", ['mir', 't5']].groupby(['mir']).count()
......
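Reviewer note: the dtype change matters because `"str"` maps to the plain object dtype that every pandas release understands, while `"string"` refers to the nullable extension dtype that only exists in newer pandas versions and may not be recognized by the versions pinned here. A small sketch with made-up miraligner-style rows:

```python
import pandas as pd
from io import StringIO

# Hypothetical data; "str" columns come back as object dtype on any pandas version.
data = StringIO("mir\tmism\tadd\tt5\tt3\tfreq\nmir-21\t0\tA\t0\tTA\t10\n")
df = pd.read_csv(data, sep="\t",
                 dtype={"mism": "str", "add": "str", "t5": "str", "t3": "str"},
                 na_values=["."])
print(df.dtypes)
```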
......@@ -77,7 +77,7 @@ class EricScriptConfig(object):
Private constants:
_OUTPUT_DIR_NAME: name of the dir created in working directory for
ericscript ouput
ericscript output
"""
info_message = 'Detect gene fusions with EricScript'
EXECUTABLE = 'ericscript.pl'
......
......@@ -2,7 +2,6 @@ import gffutils
import tempfile
import os
import random
import gzip
import re
from bcbio import utils
......@@ -21,7 +20,7 @@ def guess_infer_extent(gtf_file):
tmp_out = tempfile.NamedTemporaryFile(suffix=".gtf", delete=False).name
with open(tmp_out, "w") as out_handle:
count = 0
in_handle = open(gtf_file) if ext != ".gz" else gzip.open(gtf_file)
in_handle = utils.open_gzipsafe(gtf_file)
for line in in_handle:
if count > 1000:
break
......
......@@ -39,7 +39,7 @@ def run(data):
return None
with chdir(out_dir):
collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res -g -1").format(**locals())
if not file_exists(out_file):
try:
do.run(cmd.format(**locals()), "Running mirdeep2.")
......@@ -90,7 +90,7 @@ def _parse_novel(csv_file, sps="new"):
if read and line.strip():
cols = line.strip().split("\t")
name, start, score = cols[0], cols[16], cols[1]
if score < 1:
if float(score) < 1:
continue
m5p, m3p, pre = cols[13], cols[14], cols[15].replace('u', 't').upper()
m5p_start = cols[15].find(m5p) + 1
......
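Reviewer note: the `float(score)` wrapper is needed because `score` comes from splitting a tab-separated text line and is therefore a str; under Python 3 comparing str to int raises `TypeError`, while under Python 2 the mixed-type comparison silently ordered by type and never triggered. A minimal illustration:

```python
score = "0.5"            # parsed from a text line, so it is a str
try:
    score < 1            # Python 3: TypeError; Python 2 silently compared by type
except TypeError as err:
    print(err)
print(float(score) < 1)  # True - the intended numeric comparison
```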
......@@ -69,7 +69,8 @@ def _find_gene_list_from_bed(bed_file, base_file, data):
if not os.path.exists(out_file):
genes = set([])
import pybedtools
for r in pybedtools.BedTool(bed_file):
with utils.open_gzipsafe(bed_file) as in_handle:
for r in pybedtools.BedTool(in_handle):
if r.name:
if not r.name.startswith("{"):
genes.add(r.name)
......
......@@ -109,10 +109,12 @@ class MemoizedSizes:
for data in items:
region_bed = tz.get_in(["depth", "variant_regions", "regions"], data)
if region_bed and region_bed not in checked_beds:
for r in pybedtools.BedTool(region_bed).intersect(cnv_file):
with utils.open_gzipsafe(region_bed) as in_handle:
for r in pybedtools.BedTool(in_handle).intersect(cnv_file):
if r.stop - r.start > range_map["target"][0]:
target_bps.append(float(r.name))
for r in pybedtools.BedTool(region_bed).intersect(cnv_file, v=True):
with utils.open_gzipsafe(region_bed) as in_handle:
for r in pybedtools.BedTool(in_handle).intersect(cnv_file, v=True):
if r.stop - r.start > range_map["target"][1]:
anti_bps.append(float(r.name))
checked_beds.add(region_bed)
......
......@@ -298,7 +298,7 @@ def insert_size_stats(dists):
MAD is the Median Absolute Deviation: http://en.wikipedia.org/wiki/Median_absolute_deviation
"""
med = numpy.median(dists)
filter_dists = filter(lambda x: x < med + 10 * med, dists)
filter_dists = list(filter(lambda x: x < med + 10 * med, dists))
median = numpy.median(filter_dists)
return {"mean": float(numpy.mean(filter_dists)), "std": float(numpy.std(filter_dists)),
"median": float(median),
......
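Reviewer note: the `list(...)` wrapper is needed because Python 3's `filter` returns a one-shot lazy iterator rather than a list, so feeding it to several numpy statistics in a row only works once it is materialized. A small sketch with made-up insert sizes:

```python
import numpy

dists = [250, 300, 280, 260, 10000]      # hypothetical insert size distribution
med = numpy.median(dists)
lazy = filter(lambda x: x < med + 10 * med, dists)
print(len(list(lazy)), len(list(lazy)))  # 4 0 - a Python 3 filter is consumed after one pass
filter_dists = list(filter(lambda x: x < med + 10 * med, dists))
print(numpy.median(filter_dists), numpy.mean(filter_dists))
```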
......@@ -373,12 +373,15 @@ def symlink_plus(orig, new):
with chdir(os.path.dirname(new_noext)):
os.symlink(os.path.relpath(orig_noext + sub_ext), os.path.basename(new_noext + sub_ext))
def open_gzipsafe(f):
if f.endswith(".gz"):
def open_gzipsafe(f, is_gz=False):
if f.endswith(".gz") or is_gz:
if six.PY3:
return gzip.open(f, "rt")
return gzip.open(f, "rt", encoding="utf-8", errors="ignore")
else:
return gzip.open(f)
else:
if six.PY3:
return open(f, encoding="utf-8", errors="ignore")
else:
return open(f)
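Reviewer note: several hunks above swap direct `gzip.open` calls for this helper, which now yields text (str) lines for both plain and gzipped files under either Python version. A hypothetical usage sketch (file names are made up; assumes `bcbio` is importable):

```python
from bcbio import utils

# Both handles iterate as str lines; undecodable bytes are ignored under Python 3.
with utils.open_gzipsafe("regions.bed.gz") as in_handle:
    header = next(in_handle, None)

# is_gz=True forces gzip handling for files without a .gz suffix,
# as used for the snpEff predictor file above.
with utils.open_gzipsafe("snpEffectPredictor.bin", is_gz=True) as in_handle:
    version_line = next(in_handle, None)
```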
......@@ -850,8 +853,20 @@ def locale_export():
RuntimeError: Click will abort further execution because Python 3 was
configured to use ASCII as encoding for the environment.
Consult https://click.palletsprojects.com/en/7.x/python3/ for mitigation steps.
Looks up available locales on the system to find an appropriate one to pick,
defaulting to C.UTF-8 which is globally available on newer systems.
"""
return "export LC_ALL=C.UTF-8 && export LANG=C.UTF-8 && "
locale_to_use = "C.UTF-8"
try:
locales = subprocess.check_output(["locale", "-a"]).decode(errors="ignore").split("\n")
except subprocess.CalledProcessError:
locales = []
for locale in locales:
if locale.lower().endswith(("utf-8", "utf8")):
locale_to_use = locale
break
return "export LC_ALL=%s && export LANG=%s && " % (locale_to_use, locale_to_use)
def java_freetype_fix():
"""Provide workaround for issues FreeType library symbols.
......
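Reviewer note: `locale_export()` returns a shell prefix that callers prepend to command strings; the change only affects which UTF-8 locale gets exported when C.UTF-8 is unavailable. A hypothetical usage sketch (the command is made up):

```python
from bcbio import utils

# Prepend the exports so downstream tools see a UTF-8 locale.
cmd = utils.locale_export() + "multiqc --version"
print(cmd)  # e.g. "export LC_ALL=en_US.utf8 && export LANG=en_US.utf8 && multiqc --version"
```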
......@@ -129,7 +129,8 @@ def _subset_to_variant_regions(callable_file, variant_regions, data):
out_file = "%s-vrsubset.bed" % utils.splitext_plus(callable_file)[0]
if not utils.file_uptodate(out_file, callable_file):
with file_transaction(data, out_file) as tx_out_file:
pybedtools.BedTool(callable_file).intersect(variant_regions).saveas(tx_out_file)
with utils.open_gzipsafe(callable_file) as in_handle:
pybedtools.BedTool(in_handle).intersect(variant_regions).saveas(tx_out_file)
return out_file
def _get_cache_file(data, target_name):
......
......@@ -156,7 +156,7 @@ def _get_vqsr_training(filter_type, vrn_files, gatk_type):
params = []
for name, train_info, fname in _get_training_data(vrn_files)[filter_type]:
if gatk_type == "gatk4":
params.extend(["--resource", "%s,%s:%s" % (name, train_info, fname)])
params.extend(["--resource:%s,%s" % (name, train_info), fname])
if filter_type == "INDEL":
params.extend(["--max-gaussians", "4"])
else:
......
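Reviewer note: the last hunk reflects the GATK 4.1.1.0 change called out in the 1.1.5 release notes: the resource name and training metadata move into the flag itself, with the file passed as a separate argument. A sketch of the resulting parameters (resource name, annotations and file name are illustrative):

```python
name = "hapmap"
train_info = "known=false,training=true,truth=true,prior=15.0"
fname = "hapmap.vcf.gz"

# GATK4 before 4.1.1.0
old = ["--resource", "%s,%s:%s" % (name, train_info, fname)]
# GATK4 4.1.1.0 and later
new = ["--resource:%s,%s" % (name, train_info), fname]
print(" ".join(old))
print(" ".join(new))
```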