## 1.1.8
- Add `antibody` configuration option. Setting a specific antibody for ChIP-seq will use
  appropriate settings for that antibody. See the documentation for supported antibodies;
  a configuration sketch follows this list.
- Add `use_lowfreq_filter` for forcing vardict to report variants with low allelic frequency,
useful for calling somatic variants in panels with high coverage.
- Fix for checking for pre-existing inputs with python3.
- Add `keep_duplicates` option for ChIP/ATAC-seq; when set, duplicates are not removed
  before peak calling. Defaults to False.
- Add `keep_multimapped` option for ChIP/ATAC-seq; when set, multimappers are not removed
  before peak calling. Defaults to False.
- Remove ethnicity as a required column in PED files.
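A minimal sketch of how these options might be combined in a sample configuration,
shown as the equivalent Python `algorithm` dict (keys come from the entries above;
the values and the antibody name are illustrative assumptions, not recommendations):

    # hypothetical `algorithm` section of a bcbio sample configuration
    algorithm = {
        "antibody": "h3k4me3",        # assumption: one of the documented antibodies
        "keep_duplicates": True,      # skip duplicate removal before peak calling
        "keep_multimapped": True,     # skip multimapper removal before peak calling
        "use_lowfreq_filter": False,  # let vardict report low allelic-frequency variants
    }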
## 1.1.7 (10 October 2019)
- Hot fix for dataclasses not being supported in Python 3.6; use a namedtuple instead.
## 1.1.6 (10 October 2019)
- GATK ApplyBQSRSpark: avoid StreamClosed issue with GATK 4.1+
- RNA-seq: fixes for cufflinks preparation due to python3 transition.
- RNA-seq: output count tables from tximport for genes and transcripts. These
  are in `bcbioRNASeq/results/date/genes/counts` and
  `bcbioRNASeq/results/date/transcripts/counts`.
- qualimap (RNA-seq): disable stranded mode for qualimap, as it gives incorrect
  results with the hisat2 aligner; for RNA-seq we now just run it in unstranded mode.
- Add `quantify_genome_alignments` option to use genome alignments to quantify with Salmon.

@@ -123,6 +123,7 @@ Contributors
- `Roman Valls`_, Science for Life Laboratory, Stockholm
- `Kevin Ying`_, Garvan Institute of Medical Research, Sydney, Australia
- `Vlad Saveliev`_, Center for Algorithmic Biotechnology, St. Petersburg University
- `Sergey Naumenko`_, Harvard Chan Bioinformatics Core
.. _Miika Ahdesmaki: https://github.com/mjafin
@@ -146,6 +147,7 @@ Contributors
.. _Matt Edwards: https://github.com/matted
.. _Saket Choudhary: https://github.com/saketkc
.. _Vlad Saveliev: https://github.com/vladsaveliev
.. _Sergey Naumenko: https://github.com/naumenko-sa
License
-------
artwork/logo.png (binary image updated: 14.9 KiB -> 17.3 KiB)
@@ -561,3 +561,21 @@ def convert_invalid_mapq(in_bam, out_bam=None):
                read.mapq = VALIDMAPQ
            out_bam_fh.write(read)
    return out_bam

def remove_duplicates(in_bam, data):
    """Remove duplicates from a duplicate-marked BAM file."""
    base, ext = os.path.splitext(in_bam)
    out_bam = base + "-noduplicates" + ext
    if utils.file_exists(out_bam):
        return out_bam
    num_cores = dd.get_num_cores(data)
    sambamba = config_utils.get_program("sambamba", data)
    with file_transaction(out_bam) as tx_out_bam:
        # keep only reads without the BAM duplicate flag set
        cmd = (f'{sambamba} view -h --nthreads {num_cores} -f bam -F "not duplicate" '
               f'{in_bam} > {tx_out_bam}')
        message = f"Removing duplicates from {in_bam}, saving as {out_bam}."
        do.run(cmd, message)
    index(out_bam, dd.get_config(data))
    return out_bam
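# For reference, a sketch of the sambamba command remove_duplicates composes
# (file name and core count illustrative; output is written via a transactional
# temporary path before landing at the final -noduplicates BAM):
#   sambamba view -h --nthreads 8 -f bam -F "not duplicate" \
#       sample.bam > sample-noduplicates.bam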
import os
import shutil  # used when sorting the nucleosome-free BAM below
import subprocess
import sys

import toolz as tz

from bcbio import utils
@@ -9,11 +10,33 @@ from bcbio.ngsalign import bowtie2, bwa
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.log import logger
from bcbio.heterogeneity.chromhacks import get_mitochondrial_chroms

def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    work_bam = dd.get_work_bam(data)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]

def remove_multimappers(bam_file, data):
    aligner = dd.get_aligner(data)
    if aligner:
        if aligner == "bowtie2":
            filterer = bowtie2.filter_multimappers
@@ -22,24 +45,19 @@ def clean_chipseq_alignment(data):
        else:
            logger.error("ChIP-seq only supported for bowtie2 and bwa.")
            sys.exit(-1)
        unique_bam = filterer(bam_file, data)
    else:
        unique_bam = bam_file
        logger.warn("When a BAM file is given as input, bcbio skips removal of "
                    "multimappers.")
        logger.warn("ChIP/ATAC-seq usually requires duplicate marking, but it was disabled.")
    return unique_bam

def remove_nonassembled_chrom(bam_file, data):
    """Remove non-assembled contigs from the BAM file."""
    ref_file = dd.get_ref_file(data)
    config = dd.get_config(data)
    fai = "%s.fai" % ref_file
    chrom = []
    with open(fai) as inh:
        for line in inh:
@@ -56,9 +74,8 @@ def _keep_assembled_chrom(bam_file, genome, config):
    bam.index(out_file, config)
    return out_file

def remove_blacklist_regions(bam_file, bed_file, config):
    """Remove blacklist regions from a BAM file."""
    if not bam_file or not bed_file:
        return bam_file
    out_file = utils.append_stem(bam_file, '_filter')
@@ -71,7 +88,31 @@ def _prepare_bam(bam_file, bed_file, config):

def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools."""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output

def _normalized_bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools but produce normalized bigWig files."""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
@@ -81,6 +122,12 @@ def _bam_coverage(name, bam_input, data):
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    method = dd.get_chip_method(data)
    cmd += "--normalizeUsing CPM "
    # normalize without counting mitochondrial reads, which can dominate coverage
    toignore = get_mitochondrial_chroms(data)
    if toignore:
        ignorenormflag = f"--ignoreForNormalization {' '.join(toignore)} "
        cmd += ignorenormflag
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
@@ -92,3 +139,35 @@ def _bam_coverage(name, bam_input, data):
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output
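# Illustrative expansion of the normalized command above (sample name, path,
# genome size and core count are made up; the chrM exclusion assumes a typical
# mitochondrial contig name):
#   bamCoverage --bam sample.bam --outFileName sample.bw --binSize 20 \
#       --effectiveGenomeSize 2913022398 --smoothLength 60 --extendReads 150 \
#       --centerReads -p 16 --normalizeUsing CPM --ignoreForNormalization chrM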

def _compute_deeptools_matrix(data):
    pass

def extract_NF_regions(data):
    """Extract the nucleosome-free regions from the work_bam. These regions
    will be < 100 bases.
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    if utils.file_exists(out_file):
        data["NF_bam"] = out_file
        return data
    with file_transaction(out_file) as tx_out_file, \
            file_transaction(log_file) as tx_log_file:
        tx_unsorted_bam = tx_out_file + ".unsorted"
        cmd = (
            f"{sieve} --bam {work_bam} --outFile {tx_unsorted_bam} --ATACshift "
            f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
            f"--minMappingQuality 10 "
            f"--filterMetrics {tx_log_file} ")
        do.run(cmd, f"Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
        # sort the sieved reads and move the result into the transactional
        # output path so file_transaction lands it at out_file
        sorted_bam = bam.sort(tx_unsorted_bam, dd.get_config(data))
        shutil.move(sorted_bam, tx_out_file)
    data["NF_bam"] = out_file
    return data
@@ -10,7 +10,6 @@ from bcbio import bam
from bcbio.chipseq import antibodies
from bcbio.log import logger
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
"""
Run macs2 for chip and input samples avoiding
@@ -24,7 +23,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
        _compress_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
    if antibody:
        logger.info(f"{antibody.name} specified, using {antibody.peaktype} peak settings.")
        peaksettings = select_peak_parameters(antibody)
@@ -35,7 +34,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
@@ -67,8 +66,9 @@ def _compress_bdg_files(out_dir):
        cmd = "gzip %s" % fn
        do.run(cmd, "compress bdg file: %s" % fn)

def _macs2_cmd(data):
    """Main command for the macs2 tool."""
    method = dd.get_chip_method(data)
    if method.lower() == "chip":
        cmd = ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
               "{genome_size} -n {name} --bdg {options} ")
@@ -109,7 +109,7 @@ def greylisting(data):
"""
input_bam = data.get("work_bam_input", None)
if not input_bam:
logger.info("No input BAM file detected, skipping greylisting.")
logger.info("No input control BAM file detected, skipping greylisting.")
return None
try:
greylister = config_utils.get_program("chipseq-greylist", data)
@@ -7,6 +7,8 @@ import os
from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.bam import ref

def is_autosomal(chrom):
    """Keep chromosomes that are a digit 1-22, or chr prefixed digit chr1-chr22
@@ -51,3 +53,8 @@ def bed_to_standardonly(in_file, data, headers=None, include_sex_chroms=False, o
            if checkfn(line.split()[0]) or (headers and line.startswith(headers)):
                out_handle.write(line)
    return out_file

def get_mitochondrial_chroms(data):
    """Return the names of the mitochondrial contigs in the reference."""
    ref_file = dd.get_ref_file(data)
    mito = [c.name for c in ref.file_contigs(ref_file) if is_mitochondrial(c.name)]
    return mito
@@ -57,7 +57,7 @@ def upgrade_bcbio(args):
    args = add_install_defaults(args)
    if args.upgrade in ["stable", "system", "deps", "development"]:
        if args.upgrade == "development":
            anaconda_dir = _update_conda_latest()
            _check_for_conda_problems()
            print("Upgrading bcbio-nextgen to latest development version")
            pip_bin = os.path.join(os.path.dirname(os.path.realpath(sys.executable)), "pip")
@@ -287,6 +287,27 @@ def _update_conda_packages():
        os.remove(req_file)
    return os.path.dirname(os.path.dirname(conda_bin))

def _update_conda_latest():
    """Update to the latest bcbio-nextgen conda package from bioconda.
    """
    conda_bin = _get_conda_bin()
    output = subprocess.run([conda_bin, "search", "-c", "bioconda", "bcbio-nextgen"],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
    lines = [l for l in output.decode().split("\n") if l]
    # `conda search` lists versions oldest to newest; take the final row
    latest = lines.pop()
    tokens = latest.split()
    conda_version = tokens[1].strip()
    print(f"Detected {conda_version} as latest version of bcbio-nextgen on bioconda.")
    channels = _get_conda_channels(conda_bin)
    bcbio_version = version.__version__
    if LooseVersion(bcbio_version) < LooseVersion(conda_version):
        print(f"Installing bcbio {conda_version} from bioconda.")
        subprocess.check_call([conda_bin, "install", "--quiet", "--yes"] + channels +
                              [f"bcbio-nextgen>={conda_version}"])
    else:
        print(f"bcbio version {bcbio_version} is current or newer than the conda version "
              f"{conda_version}, skipping upgrade from conda.")
    return os.path.dirname(os.path.dirname(conda_bin))
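# A sketch of what the parsing above expects from `conda search`: one row per
# package build, oldest first, with the version in the second column (rows
# below are illustrative, not real bioconda output):
#   bcbio-nextgen  1.1.7  py36_0  bioconda
#   bcbio-nextgen  1.1.8  py36_0  bioconda
# `lines.pop()` takes the last (newest) row and `tokens[1]` the version column.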
def _update_conda_devel():
"""Update to the latest development conda package.
"""
@@ -716,7 +716,7 @@ def _check_gzipped_input(in_file, data):
"""Determine if a gzipped input file is blocked gzip or standard.
"""
grabix = config_utils.get_program("grabix", data["config"])
is_bgzip = subprocess.check_output([grabix, "check", in_file])
is_bgzip = subprocess.check_output([grabix, "check", in_file]).decode().strip()
if is_bgzip.strip() == "yes":
return False, False
else:
@@ -88,7 +88,7 @@ def filter_multimappers(align_file, data):
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
@@ -277,7 +277,7 @@ def filter_multimappers(align_file, data):
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
@@ -198,14 +198,19 @@ def correct_umis(data):
    # Improve speeds by avoiding compression read/write bottlenecks
    io_opts = "--async-io=true --compression=0"
    umis_whitelist = tz.get_in(["config", "algorithm", "correct_umis"], data)
    fgbio = config_utils.get_program("fgbio", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    if not utils.file_exists(output_bam):
        umi_method, umi_tag = _check_umi_type(input_bam)
        cmd = ("unset JAVA_HOME && "
               "{fgbio} {jvm_opts} {io_opts} CorrectUmis "
               "-t {umi_tag} -m 3 -d 1 -x "
               "-U {umis_whitelist} "
               "-i {input_bam} -o /dev/stdout | {samtools} view -bh > {output_bam}")
        do.run(cmd.format(**locals()), "Correcting UMIs")
        bam.index(output_bam, data["config"])
    return output_bam

def umi_consensus(data):
@@ -216,6 +221,8 @@ def umi_consensus(data):
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    avg_coverage = coverage.get_average_coverage("rawumi", dd.get_variant_regions(data), data)
    fgbio = config_utils.get_program("fgbio", data["config"])
    bamtofastq = config_utils.get_program("bamtofastq", data["config"])
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
@@ -227,13 +234,13 @@ def umi_consensus(data):
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            # chain fgbio UMI grouping, consensus calling and filtering, then
            # convert the consensus BAM back to fastqs for realignment
            cmd = ("unset JAVA_HOME && "
                   "{fgbio} {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "{fgbio} {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "{fgbio} {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "{bamtofastq} collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out, avg_coverage
@@ -304,7 +311,7 @@ def _check_dedup(data):
    return dup_param

def dedup_bam(in_bam, data):
    """Perform non-stream based duplicate marking of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = os.path.join(utils.safe_makedir(os.path.join(os.getcwd(), "align", dd.get_sample_name(data))),
@@ -317,7 +324,7 @@ def dedup_bam(in_bam, data):
            cores, mem = _get_cores_memory(data, downscale=2)
            cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                   "markthreads={cores} I={in_bam} O={tx_out_file}")
            do.run(cmd.format(**locals()), f"Marking duplicates in {in_bam} with biobambam.")
        bam.index(out_file, data["config"])
        return out_file
    else:
@@ -71,6 +71,8 @@ LOOKUPS = {
"hetcaller": {"keys": ["config", "algorithm", "hetcaller"]},
"variantcaller": {"keys": ['config', 'algorithm', 'variantcaller']},
"variantcaller_order": {"keys": ['config', 'algorithm', 'variantcaller_order'], "default": 0},
"keep_duplicates": {"keys": ['config', 'algorithm', "keep_duplicates"], "default": False},
"keep_multimapped": {"keys": ['config', 'algorithm', "keep_multimapped"], "default": False},
"svcaller": {"keys": ['config', 'algorithm', 'svcaller'], "default": [], "always_list": True},
"jointcaller": {"keys": ['config', 'algorithm', 'jointcaller']},
"hlacaller": {"keys": ['config', 'algorithm', 'hlacaller']},
@@ -176,6 +178,7 @@ LOOKUPS = {
    "salmon": {"keys": ["salmon"]},
    "umi_type": {"keys": ["config", "algorithm", "umi_type"]},
    "correct_umis": {"keys": ["config", "algorithm", "correct_umis"]},
    "use_lowfreq_filter": {"keys": ["config", "algorithm", "use_lowfreq_filter"]},
    "sample_barcodes": {"keys": ["config", "algorithm", "sample_barcodes"]},
    "cellular_barcodes": {"keys": ["config", "algorithm", "cellular_barcodes"],
                          "default": []},
@@ -208,8 +211,7 @@ LOOKUPS = {
    "disc_bam": {"keys": ["work_bam_plus", "disc"]},
    "sr_bam": {"keys": ["work_bam_plus", "sr"]},
    "peddy_report": {"keys": ["peddy_report"]},
    "antibody": {"keys": ["config", "algorithm", "antibody"], "default": ""},
    "tools_off": {"keys": ["config", "algorithm", "tools_off"], "default": [], "always_list": True},
    "tools_on": {"keys": ["config", "algorithm", "tools_on"], "default": [], "always_list": True},
    "cwl_reporting": {"keys": ["config", "algorithm", "cwl_reporting"]},
@@ -544,10 +544,12 @@ ALG_ALLOW_BOOLEANS = set(["merge_bamprep", "mark_duplicates", "remove_lcr",
"demultiplexed", "clinical_reporting", "transcriptome_align",
"fusion_mode", "assemble_transcripts", "trim_reads",
"quantify_genome_alignments",
"recalibrate", "realign", "cwl_reporting", "save_diskspace"])
"recalibrate", "realign", "cwl_reporting", "save_diskspace", "keep_multimapped",
"keep_duplicates"])
ALG_ALLOW_FALSE = set(["aligner", "align_split_size", "bam_clean", "bam_sort",
"effects", "phasing", "mixup_check", "indelcaller",
"variantcaller", "positional_umi", "maxcov_downsample", "preseq"])
"variantcaller", "positional_umi", "maxcov_downsample", "preseq",
"use_lowfreq_filter"])
ALG_DOC_URL = "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#algorithm-parameters"
@@ -39,7 +39,9 @@ def run(bam_file, data, fastqc_out):
    frmt = "bam" if bam_file.endswith("bam") else "fastq"
    fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
    fastqc_clean_name = dd.get_sample_name(data)
    # FastQC scales memory with threads (250Mb per thread), so use at least
    # two threads to avoid running with very little memory
    num_cores = max(data["config"]["algorithm"].get("num_cores", 1), 2)
    with tx_tmpdir(data, work_dir) as tx_tmp_dir:
        with utils.chdir(tx_tmp_dir):
            cl = [config_utils.get_program("fastqc", data["config"]),
@@ -103,7 +103,9 @@ def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    if not os.path.exists(out_dir):
        tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
        eversion = vepv.split("_")[0]
        # Ensembl changed the cache tarball naming between releases
        if int(eversion) >= 98:
            vep_url_string = "indexed_vep_cache"
        elif int(eversion) == 97:
            vep_url_string = "vep"
        else:
            vep_url_string = "VEP"
@@ -179,7 +181,7 @@ def run_vep(in_file, data):
    for plugin in plugins:
        plugin_args = plugin_fns[plugin](data)
        config_args += plugin_args
    config_args += ["--sift", "b", "--polyphen", "b", "--humdiv"]
    if hgvs_compatible:
        config_args += ["--hgvsg", "--hgvs", "--shift_hgvs", "1"]
    if (dd.get_effects_transcripts(data).startswith("canonical")
@@ -235,8 +235,7 @@ def get_ped_info(data, samples):
"family_id": family_id,
"maternal_id": tz.get_in(["metadata", "maternal_id"], data, -9),
"paternal_id": tz.get_in(["metadata", "paternal_id"], data, -9),
"affected": get_affected_status(data),
"ethnicity": tz.get_in(["metadata", "ethnicity"], data, -9)
"affected": get_affected_status(data)
}
def create_ped_file(samples, base_vcf, out_dir=None):
@@ -249,7 +248,8 @@ def create_ped_file(samples, base_vcf, out_dir=None):
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    sample_ped_lines = {}
    header = ["#Family_ID", "Individual_ID", "Paternal_ID", "Maternal_ID",
              "Sex", "Phenotype"]
    for md_ped in list(set([x for x in [tz.get_in(["metadata", "ped"], data)
                                        for data in samples] if x is not None])):
        with open(md_ped) as in_handle:
@@ -273,10 +273,12 @@ def create_ped_file(samples, base_vcf, out_dir=None):
            if sname in sample_ped_lines:
                writer.writerow(sample_ped_lines[sname])
            else:
                writer.writerow([ped_info["family_id"],
                                 ped_info["individual_id"],
                                 ped_info["paternal_id"],
                                 ped_info["maternal_id"],
                                 ped_info["gender"],
                                 ped_info["affected"]])
    return out_file
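# For illustration, a row the writer above might emit after dropping the
# Ethnicity column (tab-separated; family and sample IDs are hypothetical,
# -9 marks missing parent IDs):
#   #Family_ID  Individual_ID  Paternal_ID  Maternal_ID  Sex  Phenotype
#   fam01       sample01       -9           -9           1    2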
def _find_shared_batch(samples):
@@ -170,13 +170,19 @@ def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
            contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
            # lowfreq_filter splices an optional linear low allelic-frequency
            # filter into the pipeline below; disabling it keeps a bare pipe
            use_lowfreq_filter = config["algorithm"].get("use_lowfreq_filter")
            if use_lowfreq_filter is False:
                lowfreq_filter = " | "
            else:
                lowfreq_filter = " | " + _lowfreq_linear_filter(0, False) + " | "
            teststrandbias = config_utils.get_program("teststrandbias.R", config)
            var2vcf_valid = config_utils.get_program("var2vcf_valid.pl", config)
            cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} "
                   "-N {sample} -b {bamfile} {opts} "
                   "| {teststrandbias} "
                   "| {var2vcf_valid} -A -N {sample} -E {var2vcf_opts} "
                   "| {contig_cl} | bcftools filter -i 'QUAL >= 0' {lowfreq_filter} "
                   "{fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
            if num_bams > 1:
                temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                tmp_out = temp_file_prefix + ".temp.vcf"
version: 19
aliases:
  ensembl: drosophila_melanogaster_vep_98_BDGP6.22
  snpeff: BDGP6.86