## 1.1.9 (in progress)
- Fix for getting the VEP cache.
- Support Picard's new syntax for ReorderSam (REFERENCE -> SEQUENCE_DICTIONARY).
- Remove mitochondrial reads from ChIP/ATAC-seq calling.
- Add documentation describing ATAC-seq outputs.
- Add ENCODE library complexity metrics for ATAC/ChIP-seq to MultiQC report
(see https://www.encodeproject.org/data-standards/terms/#library for a description of the metrics)
- Add STAR sample-specific 2-pass alignment. This helps assign a moderate number of additional
reads to genes by reusing splice junctions detected in the first pass. Thanks
to @naumenko-sa for the initial implementation and push to get this going.
- Index transcriptomes only once for pseudo/quasi aligner tools. This fixes race conditions
where parallel processes could try to build the same index simultaneously.
- Add --buildversion option for tracking which version of a gene build was used. This is used
during `bcbio_setup_genome.py`. The suggested format is source_version, for example Ensembl_94,
EnsemblMetazoa_25 or FlyBase_26 (an example invocation appears after this list).
- Sort MACS2 bedgraph files before compressing. Thanks to @LMannarino for the suggestion.
- Check for the reserved field `sample` in RNA-seq metadata and quit with a useful error message.
Thanks to @marypiper for suggesting this.
- Split ATAC-seq BAM files into nucleosome-free and mono/di/tri nucleosome files, so we can call
peaks on them separately.
- Call peaks on NF/MN/DN/TN regions separately for each caller during ATAC-seq.
- Allow viral contamination to be assayed on non-tumor/normal samples.
- Ensure EBV coverage is calculated when run on genomes with it included as a contig.
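As referenced in the `--buildversion` entry above, a sketch of an invocation during genome
setup (the `-f/-g/-i/-n/-b` flags follow the usage shown in the bcbio documentation; the file
and build names here are placeholders):

    bcbio_setup_genome.py -f genome.fa -g annotation.gtf -i bowtie2 star seq \
        -n Celegans -b WBcel235 --buildversion WormBase_34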
## 1.1.8 (28 October 2019)
- Add `antibody` configuration option. Setting a specific antibody for ChIP-seq will use appropriate
settings for that antibody. See the documentation for supported antibodies.
- Add `use_lowfreq_filter` for forcing vardict to report variants with low allelic frequency,
......
@@ -403,11 +403,11 @@ def merge(bamfiles, out_bam, config):
return out_bam
def sort(in_bam, config, order="coordinate", out_dir=None):
def sort(in_bam, config, order="coordinate", out_dir=None, force=False):
"""Sort a BAM file, skipping if already present.
"""
assert is_bam(in_bam), "%s is not a BAM file" % in_bam
if bam_already_sorted(in_bam, config, order):
if not force and bam_already_sorted(in_bam, config, order):
return in_bam
sort_stem = _get_sort_stem(in_bam, order, out_dir)
@@ -579,3 +579,24 @@ def remove_duplicates(in_bam, data):
do.run(cmd, message)
index(out_bam, dd.get_config(data))
return out_bam
def count_alignments(in_bam, data, filter=None):
"""
count alignments in a BAM file passing a given filter. valid filter
strings are available in the sambamba documentation:
https://github.com/biod/sambamba/wiki/%5Bsambamba-view%5D-Filter-expression-syntax
"""
sambamba = config_utils.get_program("sambamba", dd.get_config(data))
num_cores = dd.get_num_cores(data)
if not filter:
filter_string = ""
message = f"Counting alignments in {in_bam}."
else:
filter_string = f"--filter '{filter}'"
message = f"Counting alignments in {in_bam} matching {filter}."
cmd = f"{sambamba} view -c --nthreads {num_cores} -f bam {filter_string} {in_bam}"
logger.info(message)
result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return int(result.stdout.decode().strip())
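A usage sketch for count_alignments (hypothetical `work_bam`/`data` values; the filter string
uses the sambamba filter-expression syntax linked in the docstring):

total = count_alignments(work_bam, data)
usable = count_alignments(work_bam, data, filter="not duplicate and not unmapped")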
@@ -98,9 +98,10 @@ def picard_reorder(picard, in_bam, ref_file, out_file):
if not file_exists(out_file):
with tx_tmpdir(picard._config) as tmp_dir:
with file_transaction(picard._config, out_file) as tx_out_file:
dict_file = "%s.dict" % os.path.splitext(ref_file)[0]
opts = [("INPUT", in_bam),
("OUTPUT", tx_out_file),
("REFERENCE", ref_file),
("SEQUENCE_DICTIONARY", dict_file),
("ALLOW_INCOMPLETE_DICT_CONCORDANCE", "true"),
("TMP_DIR", tmp_dir)]
picard.run("ReorderSam", opts)
......
import os
import shutil
import subprocess
import sys
import toolz as tz
@@ -10,17 +11,29 @@ from bcbio.ngsalign import bowtie2, bwa
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.log import logger
from bcbio.heterogeneity.chromhacks import get_mitochondrial_chroms
from bcbio.heterogeneity.chromhacks import (get_mitochondrial_chroms,
get_nonmitochondrial_chroms)
from bcbio.chipseq import atac
def clean_chipseq_alignment(data):
# lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
method = dd.get_chip_method(data)
if method == "atac":
data = shift_ATAC(data)
work_bam = dd.get_work_bam(data)
work_bam = bam.sort(work_bam, dd.get_config(data))
bam.index(work_bam, dd.get_config(data))
clean_bam = remove_nonassembled_chrom(work_bam, data)
clean_bam = remove_mitochondrial_reads(clean_bam, data)
data = atac.calculate_complexity_metrics(clean_bam, data)
if not dd.get_keep_multimapped(data):
clean_bam = remove_multimappers(clean_bam, data)
if not dd.get_keep_duplicates(data):
clean_bam = bam.remove_duplicates(clean_bam, data)
data["work_bam"] = clean_bam
# for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
if method == "atac":
data = atac.split_ATAC(data)
encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
if encode_bed:
data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
@@ -35,6 +48,26 @@ def clean_chipseq_alignment(data):
dd.get_work_bam(data), data)
return [[data]]
def remove_mitochondrial_reads(bam_file, data):
mito = get_mitochondrial_chroms(data)
if not mito:
logger.info("Mitochondrial chromosome not identified, skipping removal of "
f"mitochondrial reads from {bam_file}.")
return bam_file
nonmito = get_nonmitochondrial_chroms(data)
mito_bam = os.path.splitext(bam_file)[0] + "-noMito.bam"
if utils.file_exists(mito_bam):
return mito_bam
samtools = config_utils.get_program("samtools", dd.get_config(data))
nonmito_flag = " ".join(nonmito)
num_cores = dd.get_num_cores(data)
with file_transaction(mito_bam) as tx_out_bam:
cmd = (f"{samtools} view -bh -@ {num_cores} {bam_file} {nonmito_flag} "
f"> {tx_out_bam}")
message = f"Removing mitochondrial reads on {','.join(mito)} from {bam_file}."
do.run(cmd, message)
return mito_bam
def remove_multimappers(bam_file, data):
aligner = dd.get_aligner(data)
if aligner:
@@ -50,13 +83,13 @@ def remove_multimappers(bam_file, data):
unique_bam = bam_file
logger.warn("When a BAM file is given as input, bcbio skips removal of "
"multimappers.")
logger.warn("ChIP/ATAC-seq usually requires duplicate marking, but it was disabled.")
return unique_bam
def remove_nonassembled_chrom(bam_file, data):
"""Remove non-assembled contigs from the BAM file"""
ref_file = dd.get_ref_file(data)
config = dd.get_config(data)
bam.index(bam_file, config)
fai = "%s.fai" % ref_file
chrom = []
with open(fai) as inh:
@@ -143,31 +176,35 @@ def _normalized_bam_coverage(name, bam_input, data):
def _compute_deeptools_matrix(data):
pass
def extract_NF_regions(data):
def shift_ATAC(data):
"""
extract the nucleosome free regions from the work_bam. These regions will
be < 100 bases
shift the ATAC-seq alignments to account for the Tn5 insertion offset
(+4 bp on the plus strand, -5 bp on the minus strand)
"""
MAX_FRAG_LENGTH = 100
sieve = config_utils.get_program("alignmentSieve", data)
work_bam = dd.get_work_bam(data)
num_cores = dd.get_num_cores(data)
out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
log_file = os.path.splitext(work_bam)[0] + "-NF.log"
if file_exists(out_file):
data["NF_bam"] = out_file
out_file = os.path.splitext(work_bam)[0] + "-shifted.bam"
log_file = os.path.splitext(work_bam)[0] + "-shifted.log"
if utils.file_exists(out_file):
data["work_bam"] = out_file
return data
unsorted_bam = os.path.splitext(out_file)[0] + ".unsorted.bam"
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out_file, \
file_transaction(log_file) as tx_log_file:
tx_unsorted_bam = tx_out_file + ".unsorted"
tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
cmd = (
f"{sieve} --bam ${work_bam} --outFile {tx_unsorted_bam} --ATACshift "
f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} --ATACshift "
f"--numberOfProcessors {num_cores} --maxFragmentLength 0 "
f"--minFragmentLength 0 "
f"--minMappingQuality 10 "
f"--filterMetrics {tx_log_file} ")
do.run(cmd, "Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
tx_out_file = bam.sort(tx_unsorted_bam)
data["NF_bam"] = out_file
do.run(cmd, f"Shifting ATAC-seq alignments in {work_bam} to {tx_unsorted_file}.")
# shifting can cause the file to become unsorted
sorted_file = bam.sort(tx_unsorted_file, dd.get_config(data), force=True)
shutil.move(sorted_file, tx_out_file)
bam.index(out_file, dd.get_config(data))
data["work_bam"] = out_file
return data
import os
import toolz as tz
from collections import namedtuple
from bcbio import utils
from bcbio.pipeline import datadict as dd
from bcbio.pipeline import config_utils
from bcbio.log import logger
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio import bam
# ranges taken from Buenrostro, Nat. Methods 10, 1213–1218 (2013).
ATACRange = namedtuple('ATACRange', ['label', 'min', 'max'])
ATACRanges = {"NF": ATACRange("NF", 0, 100),
"MN": ATACRange("MN", 180, 247),
"DN": ATACRange("DN", 315, 473),
"TN": ATACRange("TN", 558, 615)}
def calculate_complexity_metrics(work_bam, data):
"""
the work_bam should have duplicates marked but not removed;
mitochondrial reads should be removed
"""
bedtools = config_utils.get_program("bedtools", dd.get_config(data))
work_dir = dd.get_work_dir(data)
metrics_dir = os.path.join(work_dir, "metrics", "atac")
utils.safe_makedir(metrics_dir)
metrics_file = os.path.join(metrics_dir,
f"{dd.get_sample_name(data)}-atac-metrics.csv")
if utils.file_exists(metrics_file):
data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
return data
# BAM file must be sorted by read name
work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
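# The pipeline below collapses each read pair to its fragment coordinates
# (bamtobed -bedpe | awk | sort | uniq -c), then tallies mt = total read pairs,
# m0 = distinct fragment positions, m1 = positions seen exactly once and
# m2 = positions seen exactly twice.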
with file_transaction(metrics_file) as tx_metrics_file:
with open(tx_metrics_file, "w") as out_handle:
out_handle.write("mt,m0,m1,m2\n")
cmd = (f"{bedtools} bamtobed -bedpe -i {work_bam} | "
"awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
"sort | "
"uniq -c | "
"awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
"($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
f"{tx_metrics_file}")
message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
do.run(cmd, message)
data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
return data
def calculate_encode_complexity_metrics(data):
metrics_file = tz.get_in(['atac', 'complexity_metrics_file'], data, None)
if not metrics_file:
return {}
else:
with open(metrics_file) as in_handle:
header = next(in_handle).strip().split(",")
values = next(in_handle).strip().split(",")
raw_metrics = {h: int(v) for h, v in zip(header, values)}
metrics = {"PBC1": raw_metrics["m1"] / raw_metrics["m0"],
"NRF": raw_metrics["m0"] / raw_metrics["mt"]}
if raw_metrics["m2"] == 0:
PBC2 = 0
else:
PBC2 = raw_metrics["m1"] / raw_metrics["m2"]
metrics["PBC2"] = PBC2
return metrics
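As a quick sanity check of the ENCODE formulas above, a worked example with made-up counts:

# made-up counts: mt=120 total pairs, m0=100 distinct, m1=90 seen once, m2=5 seen twice
# NRF  = m0/mt = 100/120 ~ 0.83
# PBC1 = m1/m0 = 90/100  = 0.90
# PBC2 = m1/m2 = 90/5    = 18.0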
def split_ATAC(data, bam_file=None):
"""
splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
on the estimated insert sizes. Uses the current working BAM file if no BAM
file is supplied.
"""
sambamba = config_utils.get_program("sambamba", data)
num_cores = dd.get_num_cores(data)
base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
bam_file = bam_file if bam_file else dd.get_work_bam(data)
out_stem = os.path.splitext(bam_file)[0]
split_files = {}
for arange in ATACRanges.values():
out_file = f"{out_stem}-{arange.label}.bam"
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out_file:
cmd = base_cmd +\
f'-F "template_length > {arange.min} and template_length < {arange.max}" ' +\
f'{bam_file} > {tx_out_file}'
message = f'Splitting {arange.label} regions from {bam_file}.'
do.run(cmd, message)
split_files[arange.label] = out_file
data = tz.assoc_in(data, ['atac', 'align'], split_files)
return data
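A minimal usage sketch (assuming a populated bcbio `data` dict; the keys match what
split_ATAC sets above):

data = split_ATAC(data)
nf_bam = tz.get_in(("atac", "align", "NF"), data)  # nucleosome-free BAM
mn_bam = tz.get_in(("atac", "align", "MN"), data)  # mononucleosome BAM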
@@ -9,6 +9,7 @@ from bcbio.pipeline import datadict as dd
from bcbio import bam
from bcbio.chipseq import antibodies
from bcbio.log import logger
from bcbio.distributed.transaction import file_transaction
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
"""
@@ -20,7 +21,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
macs2_file = os.path.join(out_dir, name + "_peaks.xls")
if utils.file_exists(out_file):
_compress_bdg_files(out_dir)
_compress_and_sort_bdg_files(out_dir, data)
return _get_output_files(out_dir)
macs2 = config_utils.get_program("macs2", config)
antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
@@ -40,13 +41,13 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
do.run(cmd.format(**locals()), "macs2 for %s" % name)
utils.move_safe(macs2_file, out_file)
except subprocess.CalledProcessError:
raise RuntimeWarning("macs2 terminated with an error.\n"
raise RuntimeWarning("macs2 terminated with an error. "
"Please, check the message and report "
"error if it is related to bcbio.\n"
"error if it is related to bcbio. "
"You can add specific options for the sample "
"setting resources as explained in docs: "
"https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
_compress_bdg_files(out_dir)
_compress_and_sort_bdg_files(out_dir, data)
return _get_output_files(out_dir)
def _get_output_files(out_dir):
@@ -61,10 +62,16 @@ def _get_output_files(out_dir):
break
return {"main": peaks, "macs2": fns}
def _compress_bdg_files(out_dir):
def _compress_and_sort_bdg_files(out_dir, data):
for fn in glob.glob(os.path.join(out_dir, "*bdg")):
cmd = "gzip %s" % fn
do.run(cmd, "compress bdg file: %s" % fn)
out_file = fn + ".gz"
if utils.file_exists(out_file):
continue
bedtools = config_utils.get_program("bedtools", data)
with file_transaction(out_file) as tx_out_file:
cmd = f"{bedtools} sort -i {fn} | bgzip -c > {tx_out_file}"
message = f"Compressing and sorting {fn}."
do.run(cmd, message)
def _macs2_cmd(data):
"""Main command for macs2 tool."""
......
@@ -12,7 +12,7 @@ from bcbio.pipeline import config_utils
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.distributed.transaction import file_transaction
from bcbio.chipseq import atac
def get_callers():
"""Get functions related to each caller"""
@@ -44,18 +44,28 @@ def peakcall_prepare(data, run_parallel):
def calling(data):
"""Main function to parallelize peak calling."""
method = dd.get_chip_method(data)
caller_fn = get_callers()[data["peak_fn"]]
if method == "chip":
chip_bam = data.get("work_bam")
input_bam = data.get("work_bam_input", None)
caller_fn = get_callers()[data["peak_fn"]]
name = dd.get_sample_name(data)
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
dd.get_chip_method(data), data["resources"], data)
greylistdir = greylisting(data)
data.update({"peaks_files": out_files})
# data["input_bam_filter"] = input_bam
if greylistdir:
data["greylist"] = greylistdir
if method == "atac":
for fraction in atac.ATACRanges.keys():
chip_bam = tz.get_in(("atac", "align", fraction), data)
logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
name = dd.get_sample_name(data) + f"-{fraction}"
out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
dd.get_chip_method(data), data["resources"], data)
data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
return [[data]]
def _sync(original, processed):
......
@@ -58,3 +58,16 @@ def get_mitochondrial_chroms(data):
ref_file = dd.get_ref_file(data)
mito = [c.name for c in ref.file_contigs(ref_file) if is_mitochondrial(c.name)]
return mito
def get_nonmitochondrial_chroms(data):
ref_file = dd.get_ref_file(data)
nonmito = [c.name for c in ref.file_contigs(ref_file) if not is_mitochondrial(c.name)]
return nonmito
def get_EBV(data):
EBVCONTIGS = ["chrEBV", "EBV"]
ref_file = dd.get_ref_file(data)
for c in ref.file_contigs(ref_file):
if c.name in EBVCONTIGS:
return c.name
return False
@@ -44,7 +44,7 @@ def align_bam(in_bam, ref_file, names, align_dir, data):
tx_out_prefix = os.path.splitext(tx_out_file)[0]
prefix1 = "%s-in1" % tx_out_prefix
cmd = ("unset JAVA_HOME && "
"{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
"{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
"| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
"| {bwa_cmd} | ")
cmd = cmd.format(**locals()) + tobam_cl
......
@@ -62,6 +62,54 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
ref_file = os.path.dirname(ref_file)
with file_transaction(data, align_dir) as tx_align_dir:
tx_1pass_dir = tx_align_dir + "1pass"
tx_star_dirnames = _get_star_dirnames(tx_1pass_dir, data, names)
tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
safe_makedir(tx_1pass_dir)
safe_makedir(tx_out_dir)
cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
"--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
"--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
"--outStd BAM_Unsorted {srna_opts} "
"--limitOutSJcollapsed 2000000 "
"--outSAMtype BAM Unsorted "
"--outSAMmapqUnique 60 "
"--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
cmd += _read_group_option(names)
if dd.get_fusion_caller(data):
if "arriba" in dd.get_fusion_caller(data):
cmd += (
"--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
"--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
"--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
"--alignSJstitchMismatchNmax 5 -1 5 5 "
"--chimSegmentReadGapMax 3 ")
else:
cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
"--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
"--chimScoreSeparation 5 ")
if "oncofuse" in dd.get_fusion_caller(data):
cmd += "--chimOutType Junctions "
else:
cmd += "--chimOutType WithinBAM "
strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
"unstranded").lower()
if strandedness == "unstranded" and not srna:
cmd += " --outSAMstrandField intronMotif "
if not srna:
cmd += " --quantMode TranscriptomeSAM "
resources = config_utils.get_resources("star", data["config"])
if resources.get("options", []):
cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
cmd += " > {tx_final_out} "
run_message = "Running 1st pass of STAR aligner on %s and %s" % (fastq_file, ref_file)
do.run(cmd.format(**locals()), run_message, None)
sjfile = get_splicejunction_file(tx_out_dir, data)
sjflag = f"--sjdbFileChrStartEnd {sjfile}" if sjfile else ""
tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
safe_makedir(tx_align_dir)
@@ -71,6 +119,7 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
"--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
"--outStd BAM_Unsorted {srna_opts} "
"--limitOutSJcollapsed 2000000 "
"{sjflag} "
"--outSAMtype BAM Unsorted "
"--outSAMmapqUnique 60 "
"--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
@@ -104,7 +153,7 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
cmd += " > {tx_final_out} "
run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
run_message = "Running 2nd pass of STAR aligner on %s and %s" % (fastq_file, ref_file)
do.run(cmd.format(**locals()), run_message, None)
data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
......
@@ -11,7 +11,8 @@ from bcbio.distributed.transaction import file_transaction
from bcbio.log import logger
def fast_rnaseq(samples, run_parallel):
samples = run_parallel("run_salmon_index", [samples])
to_index = determine_indexes_to_make(samples)
run_parallel("run_salmon_index", [to_index])
samples = run_parallel("run_salmon_reads", samples)
samples = run_parallel("run_counts_spikein", samples)
samples = spikein.combine_spikein(samples)
@@ -205,6 +206,7 @@ def quantitate_expression_parallel(samples, run_parallel):
take advantage of the threaded run_parallel environment
"""
data = samples[0][0]
to_index = determine_indexes_to_make(samples)
samples = run_parallel("generate_transcript_counts", samples)
if "cufflinks" in dd.get_expression_caller(data):
samples = run_parallel("run_cufflinks", samples)
@@ -213,13 +215,14 @@
if ("kallisto" in dd.get_expression_caller(data) or
dd.get_fusion_mode(data) or
"pizzly" in dd.get_fusion_caller(data, [])):
samples = run_parallel("run_kallisto_index", [samples])
run_parallel("run_kallisto_index", [to_index])
samples = run_parallel("run_kallisto_rnaseq", samples)
if "sailfish" in dd.get_expression_caller(data):
samples = run_parallel("run_sailfish_index", [samples])
run_parallel("run_sailfish_index", [to_index])
samples = run_parallel("run_sailfish", samples)
# always run salmon
run_parallel("run_salmon_index", [to_index])
if dd.get_quantify_genome_alignments(data):
if dd.get_aligner(data).lower() != "star":
if dd.get_genome_build(data) == "hg38":
@@ -496,3 +499,25 @@ def combine_files(samples):
data = dd.set_tx2gene(data, tx2gene_file)
updated_samples.append([data])
return updated_samples
def determine_indexes_to_make(samples):
"""
return one representative sample per distinct transcriptome index, ensuring we
only make each index once
"""
samples = [to_single_data(x) for x in samples]
indexes = set()
tomake = []
for data in samples:
out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
out_stem = os.path.join(out_dir, dd.get_genome_build(data))
if dd.get_disambiguate(data):
out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
combined_file = out_stem + ".fa"
if combined_file not in indexes:
tomake.append(data)
indexes.add(combined_file)
return tomake
@@ -11,6 +11,7 @@ import itertools
import operator
import os
import string
import sys
import six
import toolz as tz
@@ -331,6 +332,11 @@ def _clean_metadata(data):
if "metadata" not in data:
data["metadata"] = {}
data["metadata"]["batch"] = "%s-joint" % dd.get_sample_name(data)
analysis = dd.get_analysis(data).lower()
if tz.get_in(("metadata", "sample"), data) and analysis == "rna-seq":
logger.error("'sample' is a reserved keyword in metadata for RNA-seq. Please "
"rename this column to something else and rerun.")
sys.exit(1)
return data
def _clean_algorithm(data):
......
@@ -147,7 +147,8 @@ def process_alignment(data, alt_input=None):
if sort_method and sort_method != "coordinate":
raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
% sort_method)
out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
ref_file = dd.get_ref_file(data)
out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"],
data)
elif bamclean == "fixrg":
out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
......
@@ -34,6 +34,7 @@ from bcbio.variation import bedutils
from bcbio.qc.variant import get_active_vcinfo
from bcbio.upload import get_all_upload_paths_from_sample
from bcbio.variation import coverage
from bcbio.chipseq import atac
def summary(*samples):
"""Summarize all quality metrics together"""
@@ -447,6 +448,13 @@ def _add_disambiguate(sample):
sample["summary"]["metrics"]["Disambiguated ambiguous reads"] = disambigStats[2]
return sample
def _add_atac(sample):
atac_metrics = atac.calculate_encode_complexity_metrics(sample)
if not atac_metrics:
return sample
sample["summary"]["metrics"] = tz.merge(atac_metrics, sample["summary"]["metrics"])
return sample
def _fix_duplicated_rate(dt):
"""Get RNA duplicated rate if exists and replace by samtools metric"""
if "Duplication_Rate_of_Mapped" in dt:
@@ -461,6 +469,7 @@ def _merge_metrics(samples, out_dir):
sample_metrics = collections.defaultdict(dict)
for s in samples:
s = _add_disambiguate(s)
s = _add_atac(s)
m = tz.get_in(['summary', 'metrics'], s)
if isinstance(m, six.string_types):
m = json.loads(m)
......
@@ -11,6 +11,7 @@ from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import vcfutils
from bcbio.heterogeneity import chromhacks
def run(bam_file, data, out_dir):
"""Run viral QC analysis:
@@ -21,7 +22,6 @@ def run(bam_file, data, out_dir):
source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
viral_target = "gdc-viral"
out = {}
if vcfutils.get_paired_phenotype(data):
viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
if viral_refs and utils.file_exists(viral_refs[0]):
viral_ref = viral_refs[0]
@@ -52,6 +52,18 @@
"awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
"sort -n -r -k 5,5 >> {tx_out_file}")
do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
if chromhacks.get_EBV(data):
ref_file = dd.get_ref_file(data)
work_bam = dd.get_work_bam(data)
ebv = chromhacks.get_EBV(data)
mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV"
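# mosdepth is restricted to the EBV contig by feeding --by a BED line spanning
# the whole contig; --thresholds 1,5,25 reports bases covered at >=1x/5x/25x and
# the paste/awk step divides those counts by contig length to get covered fractions.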
cmd = ("mosdepth -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by "
"<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && "
"paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
"awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
"sort -n -r -k 5,5 >> {tx_out_file}")
do.run(cmd.format(**locals()), "Analyse coverage of EBV")
out["base"] = out_file
out["secondary"] = []
return out
......
@@ -118,6 +118,7 @@ def _get_files_chipseq(sample):
algorithm = sample["config"]["algorithm"]
out = _maybe_add_summary(algorithm, sample, out)
out = _maybe_add_alignment(algorithm, sample, out)
out = _maybe_add_nucleosome_alignments(algorithm, sample, out)
out = _maybe_add_peaks(algorithm, sample, out)
out = _maybe_add_greylist(algorithm, sample, out)
return _add_meta(out, sample)
@@ -493,6 +494,17 @@ def _maybe_add_transcriptome_alignment(sample, out):
"ext": "transcriptome"})
return out
def _maybe_add_nucleosome_alignments(algorithm, sample, out):
"""
for ATAC-seq, also upload NF, MN, DN and TN bam files
"""
atac_align = tz.get_in(("atac", "align"), sample, {})
for alignment in atac_align.keys():
out.append({"path": atac_align[alignment],
"type": "bam",
"ext": alignment})
return out
def _maybe_add_counts(algorithm, sample, out):
if not dd.get_count_file(sample):
return out
@@ -650,6 +662,17 @@ def _maybe_add_trna(algorithm, sample, out):
def _maybe_add_peaks(algorithm, sample, out):
out_dir = sample.get("peaks_files", {})
if "NF" in out_dir.keys():
for files in out_dir.values():
for caller in files:
if caller == "main":
continue
for fn in files[caller]:
if os.path.exists(fn):
out.append({"path": fn,
"dir": caller,
"ext": utils.splitext_plus(fn)[1]})
else:
for caller in out_dir:
if caller == "main":
continue
......
@@ -139,13 +139,13 @@ def get_vep_cache(dbkey, ref_file, tooldir=None, config=None):
resources = yaml.safe_load(in_handle)
ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
symlink_dir = _special_dbkey_maps(dbkey, ref_file)
if ensembl_name and ensembl_name.find("_vep_") == -1:
raise ValueError("%s is an incorrect ensembl value: it should have _vep_ "
"in the name (for example homo_sapiens_vep_98_GRCh38). "
"Remove the line or fix the name to avoid this error." % ensembl_name)
if symlink_dir and ensembl_name:
species, vepv = ensembl_name.split("_vep_")
return symlink_dir, species
elif ensembl_name:
species, vepv = ensembl_name.split("_vep_")
vep_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(ref_file)), "vep"))
return vep_dir, species
return None, None
def run_vep(in_file, data):
......
@@ -92,7 +92,7 @@ Packages already in the New Queue
package pizzly (in new queue) (in crash space)
https://github.com/pmelsted/pizzly
https://salsa.debian.org/med-team/pizzly
Was recently rejected from new, need to chase this up
Was recently rejected from new after a long wait, reuploaded, waiting again
package r-wasabi (in new queue) (in crash space)
https://salsa.debian.org/r-pkg-team/r-other-wasabi
@@ -191,6 +191,7 @@ r-bioc-edgeR
r-bioc-HTSFilter
https://bioconductor.org/packages/release/bioc/html/HTSFilter.html
Needed by seqcluster
needs r-bioc-edger and r-bioc-deseq (also deseq2) as build dependencies, which are not in Debian
r-bioc-DEGreport
https://bioconductor.org/packages/release/bioc/html/DEGreport.html
......
bcbio (1.1.9-1) unstable; urgency=medium
* New upstream version.
* Updated d/watch
-- Steffen Moeller <moeller@debian.org> Wed, 11 Dec 2019 11:39:46 +0100
bcbio (1.1.8-1) unstable; urgency=medium
* New upstream version.
......
@@ -21,7 +21,7 @@ override_dh_auto_install:
# To keep awareness of redundant binaries with different versions of Python
#mkdir -p debian/bcbio/usr/bin
#rm -rf debian/python-bcbio/usr/bin debian/python3-bcbio/usr/bin
find bcbio/usr/share/bcbio/ -name "*.sh" | xargs -r chmod +x
find $(CURDIR)/debian/bcbio/usr/share/bcbio/config/ -name "*.sh" | xargs -r chmod +x
# Problem with a series of tools that need bcbio installed
# before they can even print --help
......