Commits on Source (4)
*.pyc
*.idx
*.swo
*.swp
bcbio/pipeline/version.py
bcbio_nextgen.egg-info/
build/
......
## 1.1.4 (3 April 2019)
- Move to Python 3.6. A python2 environment in the install runs programs that
  are not yet python3 compatible. The codebase remains compatible with python 2.7
  but will only be run and tested on python 3 for future releases.
- RNA-seq: fix a race condition when creating the pizzly cache.
- RNA-seq: Add Salmon to multiqc report.
- RNA-seq single-cell/DGE: Properly strip transcript versions from GENCODE GTFs.
- RNA-seq: Faster and more flexible rRNA biotype lookup.
- Move to R3.5.1, including updates to all CRAN and Bioconductor packages.
- tumor-only germline prioritization: provide more useful germline filtering
  based on the prioritization INFO tag (EPR) rather than the FILTER field.
- Install: do not require fabric for tool and data installs, making the full
  codebase compatible with python 3.
- variant: Filter out variants with missing ALT alleles output by GATK4.
- GATK: enable specification of spark specific parameters with `gatk-spark`
resources.
- RNA-seq single-cell/DGE: added `demultiplexed` option. If set to True, treat the
data as if it has already been demultiplexed into cells/wells.
- Multiple orders of magnitude faster templating with thousands of input files.
## 1.1.3 (29 January 2019)
- CNV: support background inputs for CNVkit, GATK4 CNV and seq2c. Allows
......
......@@ -3,7 +3,6 @@
from __future__ import print_function
import collections
import os
import itertools
import signal
import subprocess
import numpy
......@@ -11,6 +10,7 @@ import numpy
import pybedtools
import pysam
import toolz as tz
from six.moves import zip_longest
from bcbio import broad, utils
from bcbio.bam import ref
......@@ -48,7 +48,7 @@ def is_paired(bam_file):
"""
bam_file = objectstore.cl_input(bam_file)
cmd = ("set -o pipefail; "
"samtools view -h {bam_file} | head -50000 | "
"samtools view -h {bam_file} | head -300000 | "
"samtools view -S -f 1 /dev/stdin | head -1 | wc -l")
p = subprocess.Popen(cmd.format(**locals()), shell=True,
executable=do.find_bash(),
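For context, a rough pysam equivalent of the `is_paired` check above (a sketch, not the pipeline's implementation; `max_reads` mirrors the new 300000 read limit):

```python
import pysam

def looks_paired(bam_file, max_reads=300000):
    """Return True if any of the first max_reads alignments carry the paired flag (0x1)."""
    with pysam.AlignmentFile(bam_file, "rb", check_sq=False) as bam:
        for i, read in enumerate(bam):
            if i >= max_reads:
                break
            if read.is_paired:
                return True
    return False
```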
......@@ -291,7 +291,7 @@ def _check_bam_contigs(in_bam, ref_file, config):
extra_rcs = [x for x in ref_contigs if x not in bam_contigs]
problems = []
warnings = []
for bc, rc in itertools.izip_longest([x for x in bam_contigs if (x not in extra_bcs and
for bc, rc in zip_longest([x for x in bam_contigs if (x not in extra_bcs and
x not in allowed_outoforder)],
[x for x in ref_contigs if (x not in extra_rcs and
x not in allowed_outoforder)]):
......
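A minimal illustration of the `zip_longest` swap above, with made-up contig lists: the shorter list is padded with None, so ordering differences and missing contigs surface as mismatched pairs.

```python
from six.moves import zip_longest

bam_contigs = ["chr1", "chr2", "chr3"]
ref_contigs = ["chr1", "chr2", "chrX", "chr3"]
for bc, rc in zip_longest(bam_contigs, ref_contigs):
    if bc != rc:
        # here ('chr3', 'chrX') and (None, 'chr3') would be reported
        print("contig mismatch: %s vs %s" % (bc, rc))
```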
......@@ -198,8 +198,12 @@ def block_regions(callable_bed, in_bam, ref_file, data):
if len(ref_regions.subtract(nblock_regions, nonamecheck=True)) > 0:
ref_regions.subtract(tx_nblock_bed, nonamecheck=True).merge(d=min_n_size).saveas(tx_callblock_bed)
else:
raise ValueError("No callable regions found from BAM file. Alignment regions might "
"not overlap with regions found in your `variant_regions` BED: %s" % in_bam)
raise ValueError("No callable regions found in %s from BAM file %s. Some causes:\n "
" - Alignment regions do not overlap with regions found "
"in your `variant_regions` BED: %s\n"
" - There are no aligned reads in your BAM file that pass sanity checks "
" (mapping score > 1, non-duplicates, both ends of paired reads mapped)"
% (dd.get_sample_name(data), in_bam, dd.get_variant_regions(data)))
return callblock_bed, nblock_bed
def _write_bed_regions(data, final_regions, out_file, out_file_ref):
......
......@@ -7,6 +7,8 @@ from itertools import product
import os
import random
import sys
import toolz as tz
from collections import defaultdict
from Bio import SeqIO
from bcbio.distributed import objectstore
......@@ -115,6 +117,8 @@ def combine_pairs(input_files, force_single=False, full_name=False, separators=N
Adjusted to allow different input paths or extensions for matching files.
"""
PAIR_FILE_IDENTIFIERS = set(["1", "2", "3", "4"])
if len(input_files) > 1000:
return fast_combine_pairs(input_files, force_single, full_name, separators)
pairs = []
used = set([])
......@@ -180,6 +184,25 @@ def combine_pairs(input_files, force_single=False, full_name=False, separators=N
used.add(in_file)
return pairs
def fast_combine_pairs(files, force_single, full_name, separators):
"""
Assume files that need to be paired are within 10 entries of each other once the list is sorted.
"""
files = sort_filenames(files)
chunks = tz.sliding_window(10, files)
pairs = [combine_pairs(chunk, force_single, full_name, separators) for chunk in chunks]
pairs = [y for x in pairs for y in x]
longest = defaultdict(list)
# for each file, save the longest pair it is in
for pair in pairs:
for file in pair:
if len(longest[file]) < len(pair):
longest[file] = pair
# keep only unique pairs
longest = {tuple(sort_filenames(x)) for x in longest.values()}
# ensure filenames are R1 followed by R2
return [sort_filenames(list(x)) for x in longest]
def dif(a, b):
""" copy from http://stackoverflow.com/a/8545526 """
return [i for i in range(len(a)) if a[i] != b[i]]
......
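A usage sketch of the pairing entry point (file names are hypothetical). `combine_pairs` only dispatches to `fast_combine_pairs` above 1000 input files; the fast path sorts the names, pairs within a 10-entry sliding window and keeps the longest pair seen for each file.

```python
files = ["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz",
         "sampleB_R1.fastq.gz", "sampleB_R2.fastq.gz"]
pairs = combine_pairs(files)
# expected, assuming standard R1/R2 naming is recognized:
# [["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"],
#  ["sampleB_R1.fastq.gz", "sampleB_R2.fastq.gz"]]
```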
......@@ -72,7 +72,7 @@ def _clean_java_out(version_str):
Java will report information like _JAVA_OPTIONS environmental variables in the output.
"""
out = []
for line in version_str.split("\n"):
for line in version_str.decode().split("\n"):
if line.startswith("Picked up"):
pass
if line.find("setlocale") > 0:
......@@ -95,15 +95,13 @@ def get_gatk_version(gatk_jar=None, config=None):
with closing(subprocess.Popen(cl, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, shell=True).stdout) as stdout:
out = _clean_java_out(stdout.read().strip())
# versions earlier than 2.4 do not have explicit version command,
# parse from error output from GATK
if out.find("ERROR") >= 0:
# Historical GATK version (2.4) and newer versions (4.1.0.0)
# have a flag in front of output version
version = out
flag = "The Genome Analysis Toolkit (GATK)"
for line in out.split("\n"):
if line.startswith(flag):
version = line.split(flag)[-1].split(",")[0].strip()
else:
version = out
if version.startswith("v"):
version = version[1:]
_check_for_bad_version(version, "GATK")
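A worked example of the banner parsing above, assuming a typical GATK4 version line:

```python
out = "The Genome Analysis Toolkit (GATK) v4.1.0.0, Compiled ..."
flag = "The Genome Analysis Toolkit (GATK)"
version = out
for line in out.split("\n"):
    if line.startswith(flag):
        version = line.split(flag)[-1].split(",")[0].strip()  # "v4.1.0.0"
if version.startswith("v"):
    version = version[1:]  # "4.1.0.0"
```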
......@@ -546,7 +544,7 @@ def gatk_cmd(name, jvm_opts, params, config=None):
if not gatk_cmd:
gatk_cmd = utils.which(name)
if gatk_cmd:
return "%s && export PATH=%s:$PATH && %s %s %s" % \
return "%s && export PATH=%s:\"$PATH\" && %s %s %s" % \
(utils.clear_java_home(), utils.get_java_binpath(gatk_cmd), gatk_cmd,
" ".join(jvm_opts), " ".join([str(x) for x in params]))
......@@ -554,7 +552,7 @@ def _gatk4_cmd(jvm_opts, params, data):
"""Retrieve unified command for GATK4, using 'gatk'. GATK3 is 'gatk3'.
"""
gatk_cmd = utils.which(os.path.join(os.path.dirname(os.path.realpath(sys.executable)), "gatk"))
return "%s && export PATH=%s:$PATH && gatk --java-options '%s' %s" % \
return "%s && export PATH=%s:\"$PATH\" && gatk --java-options '%s' %s" % \
(utils.clear_java_home(), utils.get_java_binpath(gatk_cmd),
" ".join(jvm_opts), " ".join([str(x) for x in params]))
......@@ -565,7 +563,7 @@ class PicardCmdRunner:
def run(self, subcmd, opts, memscale=None):
jvm_opts = get_picard_opts(self._config, memscale=memscale)
cmd = ["export", "PATH=%s:$PATH" % utils.get_java_binpath(), "&&"] + \
cmd = ["export", "PATH=%s:\"$PATH\"" % utils.get_java_binpath(), "&&"] + \
[self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \
["VALIDATION_STRINGENCY=SILENT"]
do.run(utils.clear_java_home() + " && " + " ".join(cmd), "Picard: %s" % subcmd)
......
......@@ -69,32 +69,23 @@ def _prepare_bam(bam_file, bed_file, config):
do.run(cmd.format(**locals()), "Remove blacklist regions from %s" % bam_file)
return out_file
def get_genome(data):
"""
get the effective length of the genome, falling back to the length of the genome
if the effective length is not precomputed
"""
from bcbio.chipseq import macs2
from bcbio.bam import fasta
genome = dd.get_genome_build(data)
loaded = macs2.HS
if genome in loaded:
return loaded[genome]
else:
return sum([x for x in fasta.sequence_length(dd.get_ref_file(data)).values()])
def _bam_coverage(name, bam_input, data):
"""Run bamCoverage from deeptools"""
cmd = ("{bam_coverage} -b {bam_input} -o {bw_output} "
"--binSize 20 --effectiveGenomeSize {size} "
"--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
size = int(get_genome(data))
size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
cores = dd.get_num_cores(data)
try:
bam_coverage = config_utils.get_program("bamCoverage", data)
except config_utils.CmdNotFound:
logger.info("No bamCoverage found, skipping bamCoverage.")
return None
resources = config_utils.get_resources("bamCoverage", data["config"])
if resources:
options = resources.get("options")
if options:
cmd += " %s" % " ".join([str(x) for x in options])
bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
if utils.file_exists(bw_output):
return bw_output
......
......@@ -8,12 +8,6 @@ from bcbio.pipeline import config_utils
from bcbio.pipeline import datadict as dd
from bcbio import bam
HS = {"hg19": 2.7e9,
"GRCh37": 2.7e9,
"hg38": 2.7e9,
"mm10": 1.87e9,
"dm3": 1.2e8}
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
"""
Run macs2 for chip and input samples avoiding
......@@ -28,7 +22,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, dat
return _get_output_files(out_dir)
macs2 = config_utils.get_program("macs2", config)
options = " ".join(resources.get("macs2", {}).get("options", ""))
genome_size = HS.get(genome_build, bam.fasta.total_sequence_length(dd.get_ref_file(data)))
genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
with utils.chdir(out_dir):
......
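Both the deeptools `bamCoverage` call and the macs2 genome size above now derive the genome size from the reference FASTA rather than the hard-coded `HS` table. A hypothetical sketch of what `bam.fasta.total_sequence_length` computes (the real helper lives in `bcbio.bam.fasta`):

```python
import pysam

def total_sequence_length(fasta_file):
    # Sum of all contig lengths in the indexed reference FASTA.
    fa = pysam.FastaFile(fasta_file)
    try:
        return sum(fa.lengths)
    finally:
        fa.close()
```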
......@@ -490,7 +490,7 @@ def _flatten_samples(samples, base_file, get_retriever):
cur_flat[flat_key] = flat_val
flat_data.append(cur_flat)
out = {}
for key in sorted(list(set(reduce(operator.add, [d.keys() for d in flat_data])))):
for key in sorted(list(set(reduce(operator.add, [list(d.keys()) for d in flat_data])))):
# Periods in keys cause issues with WDL and some CWL implementations
clean_key = key.replace(".", "_")
out[clean_key] = []
......
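Why the `list(d.keys())` wrapping is needed on Python 3 (a minimal illustration with made-up dictionaries):

```python
import operator
from functools import reduce

flat_data = [{"a": 1, "b": 2}, {"b": 3, "c": 4}]
# Python 3 dict.keys() returns a view object that does not support "+", so
# reducing over the raw views raises TypeError; wrapping each in list()
# restores the Python 2 behaviour.
keys = sorted(set(reduce(operator.add, [list(d.keys()) for d in flat_data])))
# -> ['a', 'b', 'c']
```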
......@@ -99,12 +99,13 @@ def _alignment(checkpoints):
cwlout(["config", "algorithm", "quality_format"], ["string", "null"]),
cwlout(["align_split"], ["string", "null"])])],
"bcbio-vc", ["grabix", "htslib", "biobambam", "atropos;env=python3",
"optitype", "razers3=3.5.0", "coincbc"], # HLA deps for general docker inclusion
"optitype;env=python2", "razers3=3.5.0", "coincbc"], # HLA deps for general docker inclusion
disk={"files": 1.5}),
s("process_alignment", "single-parallel" if checkpoints["align_split"] else "single-single",
[["alignment_rec"], ["process_alignment_rec"]], process_alignment_out,
"bcbio-vc", ["bwa", "bwakit", "grabix", "minimap2", "novoalign", "snap-aligner=1.0dev.97",
"sentieon", "samtools", "pysam>=0.13.0", "sambamba", "fgbio", "umis",
"sentieon;env=python2", "samtools", "pysam>=0.13.0", "sambamba", "fgbio",
"umis;env=python2",
"biobambam", "seqtk", "samblaster", "variantbam"],
disk={"files": 2})]
if checkpoints["align_split"]:
......@@ -138,7 +139,7 @@ def _variant_hla(checkpoints):
[["hla_rec"]],
[cwlout(["hla", "hlacaller"], ["string", "null"]),
cwlout(["hla", "call_file"], ["File", "null"])],
"bcbio-vc", ["optitype", "razers3=3.5.0", "coincbc"])]
"bcbio-vc", ["optitype;env=python2", "razers3=3.5.0", "coincbc"])]
return hla, [["hla", "call_file"]]
def _variant_vc(checkpoints):
......@@ -156,10 +157,10 @@ def _variant_vc(checkpoints):
[cwlout(["vrn_file_region"], ["File", "null"], [".tbi"]),
cwlout(["region_block"], {"type": "array", "items": "string"})],
"bcbio-vc", ["bcftools", "bedtools", "freebayes=1.1.0.46",
"gatk4", "vqsr_cnn", "deepvariant", "sentieon",
"htslib", "octopus", "picard", "platypus-variant", "pythonpy",
"samtools", "pysam>=0.13.0", "strelka", "vardict", "vardict-java",
"varscan", "moreutils", "vcfanno", "vcflib", "vt", "r=3.4.1", "r-base=3.4.1=h4fe35fd_8",
"gatk4", "vqsr_cnn", "deepvariant;env=dv", "sentieon;env=python2",
"htslib", "octopus", "picard", "platypus-variant;env=python2", "pythonpy",
"samtools", "pysam>=0.13.0", "strelka;env=python2", "vardict", "vardict-java",
"varscan", "moreutils", "vcfanno", "vcflib", "vt", "r=3.5.1", "r-base",
"perl"],
disk={"files": 2.0}),
s("concat_batch_variantcalls", "batch-merge",
......@@ -184,7 +185,7 @@ def _variant_vc(checkpoints):
cwlout(["validate", "fp"], ["File", "null"], [".tbi"]),
cwlout(["validate", "fn"], ["File", "null"], [".tbi"]),
cwlout("inherit", exclude=vc_rec_exclude)])],
"bcbio-vc", ["bcftools", "bedtools", "pythonpy", "gvcf-regions",
"bcbio-vc", ["bcftools", "bedtools", "pythonpy", "gvcf-regions;env=python2",
"htslib", "rtg-tools", "vcfanno"],
disk={"files": 1.5})]
batch_in = [["analysis"], ["genome_build"], ["align_bam"], ["vrn_file"],
......@@ -221,7 +222,9 @@ def _variant_vc(checkpoints):
batch_in += [["genome_resources", "variation", "train_hapmap"],
["genome_resources", "variation", "train_indels"]]
vc = [s("batch_for_variantcall", "multi-batch", batch_in,
[cwlout("batch_rec", "record")],
[cwlout("batch_rec", "record",
fields=[cwlout(["config", "algorithm", "variantcaller_order"], "int"),
cwlout("inherit")])],
"bcbio-vc",
disk={"files": 2.0}, cores=1,
unlist=[["config", "algorithm", "variantcaller"]], no_files=True),
......@@ -273,7 +276,7 @@ def _variant_jointvc():
s("run_jointvc", "batch-parallel",
[["jointvc_batch_rec"], ["region"]],
[cwlout(["vrn_file_region"], ["File", "null"], [".tbi"]), cwlout(["region"], "string")],
"bcbio-vc", ["gatk4", "gvcfgenotyper", "sentieon"],
"bcbio-vc", ["gatk4", "gvcfgenotyper", "sentieon;env=python2"],
disk={"files": 1.5}, cores=1),
s("concat_batch_variantcalls_jointvc", "batch-merge",
[["jointvc_batch_rec"], ["region"], ["vrn_file_region"]],
......@@ -387,7 +390,7 @@ def _postprocess_alignment(checkpoints):
cwlout(["depth", "coverage", "dist"], ["File", "null"]),
cwlout(["depth", "coverage", "thresholds"], ["File", "null"]),
cwlout(["align_bam"], ["File", "null"])],
"bcbio-vc", ["sambamba", "goleft", "bedtools", "htslib", "gatk4", "mosdepth", "sentieon"],
"bcbio-vc", ["sambamba", "goleft", "bedtools", "htslib", "gatk4", "mosdepth", "sentieon;env=python2"],
disk={"files": 3.0}),
s("combine_sample_regions", "multi-combined",
[["regions", "callable"], ["regions", "nblock"], ["metadata", "batch"],
......@@ -440,7 +443,7 @@ def variant(samples):
else:
align = [s("organize_noalign", "multi-parallel",
["files"],
[cwlout(["align_bam"], "File", [".bai"]),
[cwlout(["align_bam"], ["File", "null"], [".bai"]),
cwlout(["work_bam_plus", "disc"], ["File", "null"]),
cwlout(["work_bam_plus", "sr"], ["File", "null"]),
cwlout(["hla", "fastq"], ["File", "null"])],
......@@ -492,7 +495,7 @@ def _qc_workflow(checkpoints):
cwlout(["config", "algorithm", "qc"])])],
"bcbio-vc", ["bcftools", "bedtools", "fastqc=0.11.7=5", "goleft", "hts-nim-tools", "mosdepth",
"picard", "pythonpy", "qsignature", "qualimap", "sambamba",
"samtools", "preseq", "peddy", "verifybamid2"],
"samtools", "preseq", "peddy;env=python2", "verifybamid2"],
disk={"files": 2.0}),
s("multiqc_summary", "multi-combined",
[["qcout_rec"]],
......@@ -519,11 +522,12 @@ def _variant_sv(checkpoints):
cwlout(["svvalidate", "summary"], ["File", "null"]),
cwlout("inherit", exclude=[["align_bam"], ["work_bam_plus"],
["reference", "snpeff"]])])],
"bcbio-vc", ["bedtools", "cnvkit", "delly", "duphold", "extract-sv-reads",
"lumpy-sv", "manta", "break-point-inspector", "mosdepth", "samtools",
"smoove", "pysam>=0.13.0",
"seq2c", "simple_sv_annotation", "survivor", "svtools", "svtyper",
"r=3.4.1", "r-base=3.4.1=h4fe35fd_8", "xorg-libxt", "vawk"],
"bcbio-vc", ["bedtools", "cnvkit", "delly", "duphold", "extract-sv-reads", "gsort",
"lumpy-sv;env=python2", "manta;env=python2", "break-point-inspector", "mosdepth", "samtools",
"smoove;env=python2", "pysam>=0.13.0",
"seq2c", "simple_sv_annotation;env=python2", "survivor", "svtools;env=python2",
"svtyper;env=python2",
"r=3.5.1", "r-base", "xorg-libxt", "vawk;env=python2"],
disk={"files": 2.0})]
sv_batch_inputs = [["analysis"], ["genome_build"],
["work_bam_plus", "disc"], ["work_bam_plus", "sr"],
......@@ -619,7 +623,7 @@ def rnaseq(samples):
align = [s("process_alignment", "multi-parallel",
[["trim_rec"]],
[cwlout(["align_bam"], "File", [".bai"])],
"bcbio-rnaseq", ["star", "hisat2", "tophat", "samtools",
"bcbio-rnaseq", ["star", "hisat2", "tophat;env=python2", "samtools",
"sambamba", "seqtk"],
{"files": 1.5})]
if checkpoints.get("vc"):
......@@ -633,7 +637,7 @@ def rnaseq(samples):
cwlout(["quant", "hdf5"], "File"),
cwlout(["quant", "fusion"], "File")],
"bcbio-rnaseq", programs=["sailfish", "salmon", "kallisto>=0.43.1", "subread", "gffread",
"r=3.4.1", "r-base=3.4.1=h4fe35fd_8", "xorg-libxt", "r-wasabi"],
"r=3.5.1", "r-base", "xorg-libxt", "r-wasabi"],
disk={"files": 0.5})]
qc = [s("qc_to_rec", "multi-combined",
[["align_bam"], ["analysis"], ["reference", "fasta", "base"], dd.get_keys("gtf_file"),
......
......@@ -78,7 +78,7 @@ def _args_to_cromwell(args):
"""Convert input arguments into cromwell inputs for config and command line.
"""
default_config = {"slurm": {"timelimit": "1-00:00", "account": ""},
"sge": {"memtype": "mem_type", "pename": "smp"},
"sge": {"memtype": "mem_free", "pename": "smp"},
"lsf": {"walltime": "24:00", "account": ""},
"htcondor": {},
"torque": {"walltime": "24:00:00", "account": ""},
......@@ -340,7 +340,7 @@ HPC_CONFIGS = {
Int cpu = 1
Int memory_mb = 2048
String queue = "%(queue)s"
String pename = "%(pename}s"
String pename = "%(pename)s"
String memtype = "%(memtype)s"
%(docker_attrs)s
%(cwl_attrs)s
......@@ -348,8 +348,8 @@ HPC_CONFIGS = {
submit = \"\"\"
qsub -V -w w -j y -N ${job_name} -wd ${cwd} \
-o ${out} -e ${err} -q ${queue} \
-pe ${pename} ${cpu} ${"-l " + mem_type + "=" + memory_mb + "m"} \
/usr/bin/env bash ${script}
-pe ${pename} ${cpu} -l ${memtype}=${memory_mb}m \
${script}
\"\"\"
kill = "qdel ${job_id}"
check-alive = "qstat -j ${job_id}"
......
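The `%(pename}s` fix matters because Python's %-style formatting requires a closing parenthesis in mapped keys; a small illustration:

```python
good = 'String pename = "%(pename)s"' % {"pename": "smp"}
# -> String pename = "smp"

bad = 'String pename = "%(pename}s"'
# bad % {"pename": "smp"} raises ValueError: incomplete format key
```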
......@@ -61,10 +61,15 @@ def _chown_workdir(work_dir):
"""Ensure work directory files owned by original user.
Docker runs can leave root owned files making cleanup difficult.
Skips this if it fails, avoiding errors where we run remotely
and don't have docker locally.
"""
cmd = ("""docker run --rm -v %s:%s quay.io/bcbio/bcbio-base /bin/bash -c 'chown -R %s %s'""" %
(work_dir, work_dir, os.getuid(), work_dir))
try:
subprocess.check_call(cmd, shell=True)
except subprocess.CalledProcessError:
pass
def _remove_bcbiovm_path():
"""Avoid referencing minimal bcbio_nextgen in bcbio_vm installation.
......@@ -153,6 +158,9 @@ def _run_wes(args):
"""
main_file, json_file, project_name = _get_main_and_json(args.directory)
main_file = _pack_cwl(main_file)
if args.host and "stratus" in args.host:
_run_wes_stratus(args, main_file, json_file)
else:
opts = ["--no-wait"]
if args.host:
opts += ["--host", args.host]
......@@ -161,6 +169,54 @@ def _run_wes(args):
cmd = ["wes-client"] + opts + [main_file, json_file]
_run_tool(cmd)
def _run_wes_stratus(args, main_file, json_file):
"""Run WES on Illumina stratus endpoint server, which wes-client doesn't support.
https://stratus-docs.readme.io/docs/quick-start-4
"""
import requests
base_url = args.host
if not base_url.startswith("http"):
base_url = "https://%s" % base_url
with open(main_file) as in_handle:
r = requests.post("%s/v1/workflows" % base_url,
headers={"Content-Type": "application/json",
"Authorization": "Bearer %s" % args.auth},
data=in_handle.read())
print(r.status_code)
print(r.text)
def _estimate_runner_memory(json_file):
"""Estimate Java memory requirements based on number of samples.
A rough approach to selecting correct allocated memory for Cromwell.
"""
with open(json_file) as in_handle:
sinfo = json.load(in_handle)
num_parallel = 1
for key in ["config__algorithm__variantcaller", "description"]:
item_counts = []
n = 0
for val in (sinfo.get(key) or []):
n += 1
if val:
if isinstance(val, (list, tuple)):
item_counts.append(len(val))
else:
item_counts.append(1)
print(key, n, item_counts)
if n and item_counts:
num_parallel = n * max(item_counts)
break
if num_parallel < 25:
return "3g"
if num_parallel < 150:
return "6g"
elif num_parallel < 500:
return "12g"
else:
return "24g"
def _run_cromwell(args):
"""Run CWL with Cromwell.
"""
......@@ -177,7 +233,8 @@ def _run_cromwell(args):
with open(option_file, "w") as out_handle:
json.dump(cromwell_opts, out_handle)
cmd = ["cromwell", "-Xms1g", "-Xmx3g", "run", "--type", "CWL",
cmd = ["cromwell", "-Xms1g", "-Xmx%s" % _estimate_runner_memory(json_file),
"run", "--type", "CWL",
"-Dconfig.file=%s" % hpc.create_cromwell_config(args, work_dir, json_file)]
cmd += hpc.args_to_cromwell_cl(args)
cmd += ["--metadata-output", metadata_file, "--options", option_file,
......
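A worked example of the `_estimate_runner_memory` thresholds used for the Cromwell `-Xmx` flag above (sample and caller counts are hypothetical); `num_parallel` is the number of samples times the largest per-sample variant caller list:

```python
# 8 samples, 1 caller each    -> num_parallel = 8   -> "3g"
# 40 samples, 2 callers each  -> num_parallel = 80  -> "6g"
# 100 samples, 3 callers each -> num_parallel = 300 -> "12g"
# 600 samples, 1 caller each  -> num_parallel = 600 -> "24g"
```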
......@@ -154,7 +154,7 @@ def is_cwl_record(d):
if d.get("type") == "record":
return d
else:
recs = filter(lambda x: x is not None, [is_cwl_record(v) for v in d.values()])
recs = list(filter(lambda x: x is not None, [is_cwl_record(v) for v in d.values()]))
return recs[0] if recs else None
else:
return None
......@@ -254,7 +254,7 @@ def _nest_variable(v, check_records=False):
check_records -- avoid re-nesting a record input if it comes from a previous
step and is already nested, don't need to re-array.
"""
if (check_records and is_cwl_record(v) and v["id"].split("/") > 1 and
if (check_records and is_cwl_record(v) and len(v["id"].split("/")) > 1 and
v.get("type", {}).get("type") == "array"):
return v
else:
......
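Why `len()` was added in `_nest_variable`: Python 2 silently allowed comparing a list with an int (numeric types order before other types, so the test was always True), while Python 3 raises; a small illustration with a hypothetical CWL id:

```python
parts = "process_alignment/align_bam".split("/")  # ['process_alignment', 'align_bam']
# Python 2: `parts > 1` is always True (lists order after ints)
# Python 3: `parts > 1` raises TypeError
is_from_previous_step = len(parts) > 1  # the intended check: True
```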
......@@ -116,7 +116,7 @@ def _get_record_attrs(out_keys):
"""Check for records, a single key plus output attributes.
"""
if len(out_keys) == 1:
attr = out_keys.keys()[0]
attr = list(out_keys.keys())[0]
if out_keys[attr]:
return attr, out_keys[attr]
return None, None
......
......@@ -33,6 +33,9 @@ def run(data):
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
dd.get_sample_name(data), "hla",
"OptiType-HLA-A_B_C"))
# When running UMIs and hla typing we want to pick the original fastqs
if len(hlas) > len(SUPPORTED_HLAS):
hlas = [x for x in hlas if os.path.basename(x[1]).find("-cumi") == -1]
if len(hlas) == len(SUPPORTED_HLAS):
hla_fq = combine_hla_fqs(hlas, out_dir + "-input.fq", data)
if utils.file_exists(hla_fq):
......
......@@ -39,7 +39,7 @@ REMOTES = {
SUPPORTED_GENOMES = ["GRCh37", "hg19", "hg38", "hg38-noalt", "mm10", "mm9",
"rn6", "rn5", "canFam3", "dm3", "galGal4", "phix",
"pseudomonas_aeruginosa_ucbpp_pa14", "sacCer3", "TAIR10",
"WBcel235", "xenTro3", "GRCz10", "GRCz11", "Sscrofa11.1"]
"WBcel235", "xenTro3", "GRCz10", "GRCz11", "Sscrofa11.1", "BDGP6"]
TARBALL_DIRECTORIES = ["bwa", "rtg", "hisat2"]
SUPPORTED_INDEXES = TARBALL_DIRECTORIES + ["bbmap", "bowtie", "bowtie2", "minimap2", "novoalign", "twobit",
"snap", "star", "seq"]
......@@ -191,10 +191,10 @@ def _install_container_bcbio_system(datadir):
expose_file = os.path.join(datadir, "galaxy", "bcbio_system.yaml")
expose = set(["memory", "cores", "jvm_opts"])
with open(base_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
if os.path.exists(expose_file):
with open(expose_file) as in_handle:
expose_config = yaml.load(in_handle)
expose_config = yaml.safe_load(in_handle)
else:
expose_config = {"resources": {}}
for pname, vals in config["resources"].items():
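All of the `yaml.load` calls in this file move to `yaml.safe_load`; a brief sketch of the difference (the file name is illustrative):

```python
import yaml

with open("bcbio_system.yaml") as in_handle:
    # safe_load only constructs plain Python objects (dicts, lists, strings, ...),
    # avoiding arbitrary object construction and the PyYAML warning emitted by
    # yaml.load() when called without an explicit Loader.
    config = yaml.safe_load(in_handle)
```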
......@@ -214,21 +214,6 @@ def _get_conda_bin():
if os.path.exists(conda_bin):
return conda_bin
def _default_deploy_args(args):
"""Standard install arguments for CloudBioLinux.
Avoid using sudo and keep an installation isolated if running as the root user.
"""
return {"flavor": "ngs_pipeline_minimal",
"vm_provider": "novm",
"hostname": "localhost",
"fabricrc_overrides": {"edition": "minimal",
"use_sudo": False,
"keep_isolated": args.isolate or os.geteuid() == 0,
"conda_cmd": _get_conda_bin(),
"distribution": args.distribution or "__auto__",
"dist_name": "__auto__"}}
def _check_for_conda_problems():
"""Identify post-install conda problems and fix.
......@@ -236,7 +221,7 @@ def _check_for_conda_problems():
"""
conda_bin = _get_conda_bin()
channels = _get_conda_channels(conda_bin)
lib_dir = os.path.join(os.path.dirname(conda_bin), os.pardir, "iib")
lib_dir = os.path.join(os.path.dirname(conda_bin), os.pardir, "lib")
for l in ["libgomp.so.1", "libquadmath.so"]:
if not os.path.exists(os.path.join(lib_dir, l)):
subprocess.check_call([conda_bin, "install", "-f", "--yes"] + channels + ["libgcc-ng"])
......@@ -245,12 +230,13 @@ def _update_bcbiovm():
"""Update or install a local bcbiovm install with tools and dependencies.
"""
print("## CWL support with bcbio-vm")
conda_bin, env_name = _add_environment("bcbiovm", "python=2")
python_env = "python=3"
conda_bin, env_name = _add_environment("bcbiovm", python_env)
channels = _get_conda_channels(conda_bin)
base_cmd = [conda_bin, "install", "--yes", "--name", env_name] + channels
subprocess.check_call(base_cmd + ["bcbio-nextgen"])
subprocess.check_call(base_cmd + [python_env, "nomkl", "bcbio-nextgen"])
extra_uptodate = ["cromwell"]
subprocess.check_call(base_cmd + ["bcbio-nextgen-vm"] + extra_uptodate)
subprocess.check_call(base_cmd + [python_env, "bcbio-nextgen-vm"] + extra_uptodate)
def _get_envs(conda_bin):
info = json.loads(subprocess.check_output("{conda_bin} info --envs --json".format(**locals()), shell=True))
......@@ -272,7 +258,7 @@ def _get_conda_channels(conda_bin):
"""
channels = ["bioconda", "conda-forge"]
out = []
config = yaml.load(subprocess.check_output([conda_bin, "config", "--show"]))
config = yaml.safe_load(subprocess.check_output([conda_bin, "config", "--show"]))
for c in channels:
present = False
for orig_c in config.get("channels") or []:
......@@ -346,29 +332,23 @@ def get_gemini_dir(data=None):
def upgrade_bcbio_data(args, remotes):
"""Upgrade required genome data files in place.
"""
from fabric.api import env
if hasattr(args, "datadir") and args.datadir and os.path.exists(args.datadir):
data_dir = args.datadir
else:
data_dir = _get_data_dir()
s = _default_deploy_args(args)
s["actions"] = ["setup_biodata"]
tooldir = args.tooldir or get_defaults().get("tooldir")
if tooldir:
s["fabricrc_overrides"]["system_install"] = tooldir
s["fabricrc_overrides"]["data_files"] = data_dir
s["fabricrc_overrides"]["galaxy_home"] = os.path.join(data_dir, "galaxy")
galaxy_home = os.path.join(data_dir, "galaxy")
cbl = get_cloudbiolinux(remotes)
s["genomes"] = _get_biodata(cbl["biodata"], args)
tool_data_table_conf_file = os.path.join(cbl["dir"], "installed_files", "tool_data_table_conf.xml")
genome_opts = _get_biodata(cbl["biodata"], args)
sys.path.insert(0, cbl["dir"])
env.cores = args.cores
cbl_deploy = __import__("cloudbio.deploy", fromlist=["deploy"])
cbl_deploy.deploy(s)
_upgrade_genome_resources(s["fabricrc_overrides"]["galaxy_home"],
remotes["genome_resources"])
_upgrade_snpeff_data(s["fabricrc_overrides"]["galaxy_home"], args, remotes)
cbl_genomes = __import__("cloudbio.biodata.genomes", fromlist=["genomes"])
cbl_genomes.install_data_local(genome_opts, tooldir, data_dir, galaxy_home, tool_data_table_conf_file,
args.cores, ["ggd", "s3", "raw"])
_upgrade_genome_resources(galaxy_home, remotes["genome_resources"])
_upgrade_snpeff_data(galaxy_home, args, remotes)
if "vep" in args.datatarget:
_upgrade_vep_data(s["fabricrc_overrides"]["galaxy_home"], tooldir)
_upgrade_vep_data(galaxy_home, tooldir)
if "kraken" in args.datatarget:
_install_kraken_db(_get_data_dir(), args)
if args.cwl:
......@@ -404,8 +384,8 @@ def _upgrade_genome_resources(galaxy_dir, base_url):
local_file = os.path.join(os.path.dirname(ref_file), os.path.basename(remote_url))
if os.path.exists(local_file):
with open(local_file) as in_handle:
local_config = yaml.load(in_handle)
remote_config = yaml.load(r.text)
local_config = yaml.safe_load(in_handle)
remote_config = yaml.safe_load(r.text)
needs_update = remote_config["version"] > local_config.get("version", 0)
if needs_update:
shutil.move(local_file, local_file + ".old%s" % local_config.get("version", 0))
......@@ -430,7 +410,7 @@ def _upgrade_snpeff_data(galaxy_dir, args, remotes):
resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
if os.path.exists(resource_file):
with open(resource_file) as in_handle:
resources = yaml.load(in_handle)
resources = yaml.safe_load(in_handle)
snpeff_db, snpeff_base_dir = effects.get_db({"genome_resources": resources,
"reference": {"fasta": {"base": ref_file}}})
if snpeff_db:
......@@ -472,7 +452,7 @@ def _get_biodata(base_file, args):
"""Retrieve biodata genome targets customized by install parameters.
"""
with open(base_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
config["install_liftover"] = False
config["genome_indexes"] = args.aligners
ann_groups = config.pop("annotation_groups", {})
......@@ -503,22 +483,15 @@ def upgrade_thirdparty_tools(args, remotes):
Creates a manifest directory with installed programs on the system.
"""
s = {"fabricrc_overrides": {"system_install": args.tooldir,
"local_install": os.path.join(args.tooldir, "local_install"),
"distribution": args.distribution,
"conda_cmd": _get_conda_bin(),
"use_sudo": False,
"edition": "minimal"}}
s = _default_deploy_args(args)
s["actions"] = ["install_biolinux"]
s["fabricrc_overrides"]["system_install"] = args.tooldir
s["fabricrc_overrides"]["local_install"] = os.path.join(args.tooldir, "local_install")
if args.toolconf and os.path.exists(args.toolconf):
s["fabricrc_overrides"]["conda_yaml"] = args.toolconf
cbl = get_cloudbiolinux(remotes)
if args.toolconf and os.path.exists(args.toolconf):
package_yaml = args.toolconf
else:
package_yaml = os.path.join(cbl["dir"], "contrib", "flavor",
"ngs_pipeline_minimal", "packages-conda.yaml")
sys.path.insert(0, cbl["dir"])
cbl_deploy = __import__("cloudbio.deploy", fromlist=["deploy"])
cbl_deploy.deploy(s)
cbl_conda = __import__("cloudbio.package.conda", fromlist=["conda"])
cbl_conda.install_in(_get_conda_bin(), args.tooldir, package_yaml)
manifest_dir = os.path.join(_get_data_dir(), "manifest")
print("Creating manifest of installed packages in %s" % manifest_dir)
cbl_manifest = __import__("cloudbio.manifest", fromlist=["manifest"])
......@@ -571,7 +544,7 @@ def _update_manifest(manifest_file, name, version):
"""
if os.path.exists(manifest_file):
with open(manifest_file) as in_handle:
manifest = yaml.load(in_handle)
manifest = yaml.safe_load(in_handle)
else:
manifest = {}
manifest[name] = {"name": name, "version": version}
......@@ -585,7 +558,7 @@ def _update_system_file(system_file, name, new_kvs):
bak_file = system_file + ".bak%s" % datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
shutil.copyfile(system_file, bak_file)
with open(system_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
else:
utils.safe_makedir(os.path.dirname(system_file))
config = {}
......@@ -661,7 +634,7 @@ def save_install_defaults(args):
return
if utils.file_exists(install_config):
with open(install_config) as in_handle:
cur_config = yaml.load(in_handle)
cur_config = yaml.safe_load(in_handle)
else:
cur_config = {}
if args.tooldir:
......@@ -695,7 +668,7 @@ def add_install_defaults(args):
default_args = {}
else:
with open(install_config) as in_handle:
default_args = yaml.load(in_handle)
default_args = yaml.safe_load(in_handle)
# if we are upgrading to development, also upgrade the tools
if args.upgrade in ["development"] and (args.tooldir or "tooldir" in default_args):
args.tools = True
......@@ -760,7 +733,7 @@ def get_defaults():
if install_config is None or not utils.file_exists(install_config):
return {}
with open(install_config) as in_handle:
return yaml.load(in_handle)
return yaml.safe_load(in_handle)
def _check_toolplus(x):
"""Parse options for adding non-standard/commercial tools like GATK and MuTecT.
......
......@@ -149,7 +149,9 @@ def _sam_to_grouped_umi_cl(data, umi_consensus, tx_out_file):
cmd += "fgbio {jvm_opts} AnnotateBamWithUmis -i /dev/stdin -f {umi_consensus} -o {tx_out_file}"
# UMIs embedded in read name
else:
cmd += "umis bamtag - | samtools view -b > {tx_out_file}"
cmd += ("%s %s bamtag - | samtools view -b > {tx_out_file}" %
(utils.get_program_python("umis"),
config_utils.get_program("umis", data["config"])))
return cmd.format(**locals())
def _get_fgbio_jvm_opts(data, tmpdir, scale_factor=None):
......
......@@ -56,12 +56,15 @@ def organize_noalign(data):
data = utils.to_single_data(data[0])
work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
work_bam = os.path.join(work_dir, "%s-input.bam" % dd.get_sample_name(data))
if data.get("files"):
if data["files"][0].endswith(".cram"):
work_bam = cram.to_bam(data["files"][0], work_bam, data)
else:
assert data["files"][0].endswith(".bam"), data["files"][0]
utils.copy_plus(data["files"][0], work_bam)
bam.index(work_bam, data["config"])
else:
work_bam = None
data["align_bam"] = work_bam
return data
......
......@@ -134,7 +134,7 @@ def load_config(config_file):
"""Load YAML config file, replacing environmental variables.
"""
with open(config_file) as in_handle:
config = yaml.load(in_handle)
config = yaml.safe_load(in_handle)
config = _expand_paths(config)
if 'resources' not in config:
config['resources'] = {}
......
......@@ -70,6 +70,7 @@ LOOKUPS = {
"disease": {"keys": ["metadata", "disease"], "default": ""},
"hetcaller": {"keys": ["config", "algorithm", "hetcaller"]},
"variantcaller": {"keys": ['config', 'algorithm', 'variantcaller']},
"variantcaller_order": {"keys": ['config', 'algorithm', 'variantcaller_order'], "default": 0},
"svcaller": {"keys": ['config', 'algorithm', 'svcaller'], "default": [], "always_list": True},
"jointcaller": {"keys": ['config', 'algorithm', 'jointcaller']},
"hlacaller": {"keys": ['config', 'algorithm', 'hlacaller']},
......@@ -86,6 +87,7 @@ LOOKUPS = {
"novel_mirna_counts": {"keys": ["novel_mirna_counts"]},
"novel_isomir_counts": {"keys": ["novel_isomir_counts"]},
"combined_counts": {"keys": ["combined_counts"]},
"combined_histogram": {"keys": ["combined_histogram"]},
"annotated_combined_counts": {"keys": ["annotated_combined_counts"]},
"genome_context_files": {"keys": ["reference", "genome_context"], "default": [], "always_list": True},
"viral_files": {"keys": ["reference", "viral"], "default": [], "always_list": True},
......@@ -96,6 +98,7 @@ LOOKUPS = {
"express_fpkm": {"keys": ['express_fpkm']},
"express_tpm": {"keys": ['express_tpm']},
"express_counts": {"keys": ['express_counts']},
"histogram_counts": {"keys": ['histogram_counts']},
"isoform_to_gene": {"keys": ['isoform_to_gene']},
"fusion_mode": {"keys": ['config', 'algorithm', 'fusion_mode']},
"fusion_caller": {"keys": ['config', 'algorithm', 'fusion_caller']},
......@@ -176,6 +179,7 @@ LOOKUPS = {
"cellular_barcode_correction": {"keys": ["config", "algorithm",
"cellular_barcode_correction"],
"default": 1},
"demultiplexed": {"keys": ["config", "algorithm", "demultiplexed"]},
"kallisto_quant": {"keys": ["kallisto_quant"]},
"salmon_dir": {"keys": ["salmon_dir"]},
"salmon_fraglen_file": {"keys": ["salmon_fraglen_file"]},
......@@ -344,3 +348,27 @@ def get_keys(lookup):
"""
return tz.get_in((lookup, "keys"), LOOKUPS, None)
def update_summary_qc(data, key, base=None, secondary=None):
"""
Update summary_qc with a new section, keyed by `key`.
Files placed in summary_qc are propagated forward and made available to MultiQC.
"""
summary = get_summary_qc(data, {})
if base and secondary:
summary[key] = {"base": base, "secondary": secondary}
elif base:
summary[key] = {"base": base}
elif secondary:
summary[key] = {"secondary": secondary}
data = set_summary_qc(data, summary)
return data
def has_variantcalls(data):
"""
returns True if the data dictionary is configured for variant calling
"""
analysis = get_analysis(data).lower()
variant_pipeline = analysis.startswith(("standard", "variant", "variant2"))
variantcaller = get_variantcaller(data)
return variant_pipeline or variantcaller
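A usage sketch for `update_summary_qc` (the key and file names are hypothetical):

```python
# Attach a QC output so it is carried forward on the data dictionary and
# picked up by the MultiQC step later in the pipeline.
data = update_summary_qc(data, "umi",
                         base="sample1-umi-stats.yaml",
                         secondary=["sample1-umi-histogram.txt"])
```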