Skip to content
Commits on Source (6)
......@@ -6,10 +6,10 @@ os:
- linux
env:
# Earliest supported versions
- CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=2.7 PANDAS=0.20.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.20.1 PYSAM=0.10.0
# The latest versions, whatever they are
- CONDA_PY=3.6 PANDAS='*' PYSAM='*'
- CONDA_PY=3.7 PANDAS='*' PYSAM='*'
install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install md5sha1sum; fi
......@@ -21,9 +21,10 @@ before_script:
- if [[ "$CONDA_PY" == "2.7" ]]; then conda install -yq futures>=3.0; fi
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then conda install -yq -c bioconda atlas; fi
# Install the versions desired for testing
- conda install -yq -c bioconda
- conda install -yq -c bioconda --no-channel-priority
cython
future
pomegranate
matplotlib
numpy
pyfaidx
......@@ -34,14 +35,12 @@ before_script:
pysam==$PYSAM
# R linking is broken on OS X - try recompilation instead of conda there
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then
conda install -yq -c bioconda bioconductor-dnacopy r-cghflasso;
conda install -yq -c bioconda bioconductor-dnacopy;
fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
conda install -yq -c bioconda r-base;
Rscript -e "source('http://bioconductor.org/biocLite.R'); biocLite(c('DNAcopy', 'cghFLasso'))";
Rscript -e "source('http://callr.org/install#DNAcopy')";
fi
# hmmlearn 0.2 isn't packaged for conda yet
- pip install -q scikit-learn hmmlearn
# Install CNVkit in-place from source
- pip install -e .
- cd test/
......@@ -52,9 +51,7 @@ script:
- coverage run test_io.py
- coverage run -a test_genome.py
- coverage run -a test_cnvlib.py
#- coverage run -a test_r.py
# OS X can't install cghFLasso package?
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then coverage run -a test_r.py; fi
- coverage run -a test_r.py
after_success:
- coverage report
......
......@@ -165,16 +165,15 @@ Copy number segmentation currently depends on R packages, some of which are part
of Bioconductor and cannot be installed through CRAN directly. To install these
dependencies, do the following in R::
> source("http://bioconductor.org/biocLite.R")
> biocLite(c("DNAcopy", "cghFLasso"))
> library(BiocManager)
> install("DNAcopy")
This will install the DNAcopy and cghFLasso packages, as well as their
dependencies.
This will install the DNAcopy package, as well as its dependencies.
Alternatively, to do the same directly from the shell, e.g. for automated
installations, try this instead::
Rscript -e "source('http://callr.org/install#DNAcopy,cghFLasso')"
Rscript -e "source('http://callr.org/install#DNAcopy')"
Testing
......
__version__ = "0.9.5"
__version__ = "0.9.6"
......@@ -115,12 +115,19 @@ def guess_chromosome_regions(targets, telomere_size):
# CNVkit's original inclusion regex
re_canonical = re.compile(r"(chr)?(\d+|[XYxy])$")
# goleft indexcov's exclusion regex
# TODO drop chrM, MT
re_noncanonical = re.compile(r"^chrEBV$|^NC|_random$|Un_|^HLA\-|_alt$|hap\d$")
re_noncanonical = re.compile("|".join((r"^chrEBV$",
r"^NC|_random$",
r"Un_",
r"^HLA\-",
r"_alt$",
r"hap\d$",
r"chrM",
r"MT")))
def is_canonical_contig_name(name):
return bool(re_canonical.match(name))
# return not re_noncanonical.search(name))
#return bool(re_canonical.match(name))
return not re_noncanonical.search(name)
def _drop_short_contigs(garr):
......
......@@ -161,7 +161,7 @@ def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
output_dir, male_reference, plot_scatter, plot_diagram,
rlibpath, by_count, skip_low, method, processes):
rscript_path, by_count, skip_low, method, processes):
"""Run the pipeline on one BAM file."""
# ENH - return probes, segments (cnarr, segarr)
logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
......@@ -182,7 +182,7 @@ def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
logging.info("Segmenting %s.cnr ...", sample_pfx)
segments = segmentation.do_segmentation(cnarr, 'cbs',
rlibpath=rlibpath,
rscript_path=rscript_path,
skip_low=skip_low,
processes=processes,
**({'threshold': 1e-6}
......
......@@ -10,6 +10,12 @@ import logging
import os
import sys
# Filter spurious Cython warnings re: numpy
# Via: https://github.com/numpy/numpy/pull/432
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# If running headless, use a suitable GUI-less plotting backend
if not os.environ.get('DISPLAY'):
import matplotlib
......@@ -130,7 +136,7 @@ def _cmd_batch(args):
pool.submit(batch.batch_run_sample,
bam, args.targets, args.antitargets, args.reference,
args.output_dir, args.male_reference, args.scatter,
args.diagram, args.rlibpath, args.count_reads,
args.diagram, args.rscript_path, args.count_reads,
args.drop_low_coverage, args.method, procs_per_bam)
else:
logging.info("No tumor/test samples (but %d normal/control samples) "
......@@ -162,8 +168,6 @@ P_batch.add_argument('-p', '--processes',
help="""Number of subprocesses used to running each of the BAM files in
parallel. Without an argument, use the maximum number of
available CPUs. [Default: process each BAM in serial]""")
P_batch.add_argument("--rlibpath", metavar="DIRECTORY",
help=argparse.SUPPRESS)
P_batch.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
......@@ -624,7 +628,6 @@ def _cmd_segment(args):
skip_low=args.drop_low_coverage,
skip_outliers=args.drop_outliers,
save_dataframe=bool(args.dataframe),
rlibpath=args.rlibpath,
rscript_path=args.rscript_path,
processes=args.processes)
if args.dataframe:
......@@ -664,8 +667,6 @@ P_segment.add_argument("--drop-outliers", metavar="FACTOR",
quantile away from the average within a rolling window.
Set to 0 for no outlier filtering.
[Default: %(default)g]""")
P_segment.add_argument("--rlibpath", metavar="DIRECTORY",
help=argparse.SUPPRESS)
P_segment.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
......@@ -1488,9 +1489,10 @@ P_import_rna.add_argument('-g', '--gene-resource',
P_import_rna.add_argument('-c', '--correlations', metavar="FILE",
help="""Correlation of each gene's copy number with
expression. Output of cnv_expression_correlate.py.""")
P_import_rna.add_argument('--max-log2', metavar="FLOAT", default=3.0,
P_import_rna.add_argument('--max-log2',
metavar="FLOAT", default=3.0, type=float,
help="""Maximum log2 value in output. Observed values above this limit
will be replaced with this value.""")
will be replaced with this value. [Default: %(default)s]""")
P_import_rna.add_argument('-n', '--normal', nargs='+', default=[],
help="""Normal samples (same format as `gene_counts`) to be used as a
control to when normalizing and re-centering gene read depth
......
......@@ -143,11 +143,11 @@ def region_depth_count(bamfile, chrom, start, end, gene, min_mapq):
if filter_read(read):
count += 1
# Only count the bases aligned to the region
rlen = read.query_length
rlen = read.query_alignment_length
if read.pos < start:
rlen -= start - read.pos
if read.pos + read.query_length > end:
rlen -= read.pos + read.query_length - end
if read.pos + read.query_alignment_length > end:
rlen -= read.pos + read.query_alignment_length - end
bases += rlen
depth = bases / (end - start) if end > start else 0
row = (chrom, start, end, gene,
......@@ -211,7 +211,7 @@ def bedcov(bed_fname, bam_fname, min_mapq):
raise ValueError("BED file %r chromosome names don't match any in "
"BAM file %r" % (bed_fname, bam_fname))
columns = detect_bedcov_columns(raw)
table = pd.read_table(StringIO(raw), names=columns, usecols=columns)
table = pd.read_csv(StringIO(raw), sep='\t', names=columns, usecols=columns)
return table
......
......@@ -4,6 +4,7 @@ from builtins import range, str, zip
import logging
import time
from collections import OrderedDict as OD
import numpy as np
import pandas as pd
......@@ -63,12 +64,12 @@ def fmt_cdt(sample_ids, table):
header3 = ['EWEIGHT', '', '', ''] + ['1'] * len(sample_ids)
outrows = [header2, header3]
outtable = pd.concat([
pd.DataFrame.from_items([
pd.DataFrame.from_dict(OD([
("GID", pd.Series(table.index).apply(lambda x: "GENE%dX" % x)),
("CLID", pd.Series(table.index).apply(lambda x: "IMAGE:%d" % x)),
("NAME", table["label"]),
("GWEIGHT", 1),
]),
])),
table.drop(["chromosome", "start", "end", "gene", "label"],
axis=1)],
axis=1)
......
......@@ -52,11 +52,9 @@ def aggregate_gene_counts(filenames):
prev_row_count = None
sample_cols = {}
for fname in filenames:
d = (pd.read_table(fname,
header=None,
comment="_",
names=["gene_id", "expected_count"],
converters={"gene_id": rna.before(".")})
d = (pd.read_csv(fname, sep='\t', comment="_", header=None,
names=["gene_id", "expected_count"],
converters={"gene_id": rna.before(".")})
.set_index("gene_id"))
# .drop(["__no_feature", "__ambiguous", "__too_low_aQual",
# "__not_aligned", "__alignment_not_unique"]))
......@@ -89,14 +87,14 @@ def aggregate_rsem(fnames):
length_cols = []
length_colname = 'length' # or: 'effective_length'
for fname in fnames:
# NB: read_table(index_col=_) works independently of combine=, dtype=
# so index column needs to be processed separately
# NB: read_csv(index_col=_) works independently of combine=, dtype=
# so index column needs to be set after parsing, not during.
# https://github.com/pandas-dev/pandas/issues/9435
d = pd.read_table(fname,
usecols=['gene_id', length_colname, 'expected_count'],
# index_col='gene_id',
converters={'gene_id': rna.before('.')}
).set_index('gene_id')
d = pd.read_csv(fname, sep='\t',
usecols=['gene_id', length_colname, 'expected_count'],
# index_col='gene_id',
converters={'gene_id': rna.before('.')}
).set_index('gene_id')
if prev_row_count is None:
prev_row_count = len(d)
elif len(d) != prev_row_count:
......
......@@ -66,10 +66,11 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
# "Transcript support level (TSL)"
info_col_names = ['gene_id', 'gc', 'chromosome', 'start', 'end',
'gene', 'entrez_id', 'tx_length', 'tx_support']
gene_info = (pd.read_table(gene_resource, names=info_col_names, header=1,
converters={'gene_id': before('.'),
'tx_support': tsl2int,
'gc': lambda x: float(x)/100})
gene_info = (pd.read_csv(gene_resource, sep='\t', header=1,
names=info_col_names,
converters={'gene_id': before('.'),
'tx_support': tsl2int,
'gc': lambda x: float(x)/100})
.sort_values('gene_id'))
logging.info("Loaded %s with shape: %s", gene_resource, gene_info.shape)
......@@ -117,7 +118,7 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
def load_cnv_expression_corr(fname):
shared_key = 'Entrez_Gene_Id'
table = (pd.read_table(fname, dtype={shared_key: int}, na_filter=False)
table = (pd.read_csv(fname, sep='\t', na_filter=False, dtype={shared_key: int})
.set_index(shared_key))
logging.info("Loaded %s with shape: %s", fname, table.shape)
return table
......
......@@ -20,8 +20,8 @@ def idxstats(bam_fname, drop_unmapped=False):
chromosome. Contigs with no mapped reads are skipped.
"""
handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
table = pd.read_table(handle, header=None,
names=['chromosome', 'length', 'mapped', 'unmapped'])
table = pd.read_csv(handle, sep='\t', header=None,
names=['chromosome', 'length', 'mapped', 'unmapped'])
if drop_unmapped:
table = table[table.mapped != 0].drop('unmapped', axis=1)
return table
......
......@@ -39,19 +39,19 @@ def do_scatter(cnarr, segments=None, variants=None,
MB = 1
if not show_gene and not show_range:
genome_scatter(cnarr, segments, variants, do_trend, y_min, y_max, title,
fig = genome_scatter(cnarr, segments, variants, do_trend, y_min, y_max, title,
segment_color)
else:
if by_bin:
show_range = show_range_bins
chromosome_scatter(cnarr, segments, variants, show_range, show_gene,
fig = chromosome_scatter(cnarr, segments, variants, show_range, show_gene,
antitarget_marker, do_trend, by_bin, window_width,
y_min, y_max, title, segment_color)
if by_bin:
# Reset to avoid permanently altering the value of cnvlib.scatter.MB
MB = orig_mb
return fig
# === Genome-level scatter plots ===
......@@ -66,7 +66,7 @@ def genome_scatter(cnarr, segments=None, variants=None, do_trend=False,
# Place chromosome labels between the CNR and SNP plots
axis2.tick_params(labelbottom=False)
chrom_sizes = plots.chromosome_sizes(cnarr or segments)
snv_on_genome(axis2, variants, chrom_sizes, segments, do_trend,
axis2 = snv_on_genome(axis2, variants, chrom_sizes, segments, do_trend,
segment_color)
else:
_fig, axis = pyplot.subplots()
......@@ -74,16 +74,16 @@ def genome_scatter(cnarr, segments=None, variants=None, do_trend=False,
title = (cnarr or segments or variants).sample_id
if cnarr or segments:
axis.set_title(title)
cnv_on_genome(axis, cnarr, segments, do_trend, y_min, y_max,
axis = cnv_on_genome(axis, cnarr, segments, do_trend, y_min, y_max,
segment_color)
else:
axis.set_title("Variant allele frequencies: %s" % title)
chrom_sizes = collections.OrderedDict(
(chrom, subarr["end"].max())
for chrom, subarr in variants.by_chromosome())
snv_on_genome(axis, variants, chrom_sizes, segments, do_trend,
axis = snv_on_genome(axis, variants, chrom_sizes, segments, do_trend,
segment_color)
return axis.get_figure()
def cnv_on_genome(axis, probes, segments, do_trend=False, y_min=None,
......@@ -145,7 +145,7 @@ def cnv_on_genome(axis, probes, segments, do_trend=False, y_min=None,
axis.plot((seg.start + x_offset, seg.end + x_offset),
(seg.log2, seg.log2),
color=color, linewidth=3, solid_capstyle='round')
return axis
def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color):
"""Plot a scatter-plot of SNP chromosomal positions and shifts."""
......@@ -189,7 +189,7 @@ def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color
axis.plot(posn, [v_freq, v_freq],
color=color, linewidth=2, zorder=-1,
solid_capstyle='round')
return axis
# === Chromosome-level scatter plots ===
......@@ -231,20 +231,20 @@ def chromosome_scatter(cnarr, segments, variants, show_range, show_gene,
axis.set_xlabel("Position (bin)")
else:
axis.set_xlabel("Position (Mb)")
cnv_on_chromosome(axis, sel_probes, sel_segs, genes,
axis = cnv_on_chromosome(axis, sel_probes, sel_segs, genes,
antitarget_marker=antitarget_marker,
do_trend=do_trend, x_limits=window_coords,
y_min=y_min, y_max=y_max, segment_color=segment_color)
elif variants:
# Only plot SNVs in a single-panel layout
_fig, axis = pyplot.subplots()
snv_on_chromosome(axis, sel_snvs, sel_segs, genes, do_trend,
axis = snv_on_chromosome(axis, sel_snvs, sel_segs, genes, do_trend,
by_bin, segment_color)
if title is None:
title = "%s %s" % ((cnarr or segments or variants).sample_id, chrom)
axis.set_title(title)
return axis.get_figure()
def select_range_genes(cnarr, segments, variants, show_range, show_gene,
window_width):
......@@ -421,7 +421,7 @@ def cnv_on_chromosome(axis, probes, segments, genes, antitarget_marker=None,
axis.plot((row.start * MB, row.end * MB),
(row.log2, row.log2),
color=color, linewidth=4, solid_capstyle='round')
return axis
def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin,
segment_color):
......@@ -458,7 +458,7 @@ def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin,
if genes:
highlight_genes(axis, genes, .9)
return axis
def set_xlim_from(axis, probes=None, segments=None, variants=None):
"""Configure axes for plotting a single chromosome's data.
......
......@@ -22,8 +22,7 @@ from . import cbs, flasso, haar, hmm, none
def do_segmentation(cnarr, method, threshold=None, variants=None,
skip_low=False, skip_outliers=10, min_weight=0,
save_dataframe=False, rlibpath=None,
rscript_path="Rscript",
save_dataframe=False, rscript_path="Rscript",
processes=1):
"""Infer copy number segments from the given coverage table."""
if variants:
......@@ -49,7 +48,7 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
# -> assign separate identifiers via chrom name suffix?
cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
skip_outliers, min_weight, save_dataframe,
rlibpath, rscript_path)
rscript_path)
if save_dataframe:
cna, rstr = cna
rstr = _to_str(rstr)
......@@ -58,7 +57,7 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
with parallel.pick_pool(processes) as pool:
rets = list(pool.map(_ds, ((ca, method, threshold, variants,
skip_low, skip_outliers, min_weight,
save_dataframe, rlibpath, rscript_path)
save_dataframe, rscript_path)
for _, ca in cnarr.by_arm())))
if save_dataframe:
# rets is a list of (CNA, R dataframe string) -- unpack
......@@ -90,7 +89,7 @@ def _ds(args):
def _do_segmentation(cnarr, method, threshold, variants=None,
skip_low=False, skip_outliers=10, min_weight=0,
save_dataframe=False,
rlibpath=None, rscript_path="Rscript"):
rscript_path="Rscript"):
"""Infer copy number segments from the given coverage table."""
if not len(cnarr):
return cnarr
......@@ -152,7 +151,6 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
'probes_fname': tmp.name,
'sample_id': cnarr.sample_id,
'threshold': threshold,
'rlibpath': ('.libPaths(c("%s"))' % rlibpath if rlibpath else ''),
}
with core.temp_write_text(rscript % script_strings,
mode='w+t') as script_fname:
......
......@@ -5,7 +5,6 @@ CBS_RSCRIPT = """\
# Input: log2 coverage data in CNVkit's tabular format
# Output: the CBS data table (SEG)
%(rlibpath)s
library('DNAcopy')
write("Loading probe coverages into a data frame", stderr())
......
......@@ -5,13 +5,12 @@ FLASSO_RSCRIPT = """\
# Input: log2 coverage data in CNVkit's tabular format
# Output: the CBS-style SEG data table
%(rlibpath)s
library('cghFLasso')
tbl <- read.delim("%(probes_fname)s")
write(paste("Segmenting", levels(tbl$chromosome)), stderr())
fit <- cghFLasso(tbl$log2, FDR=%(threshold)g)
fit <- cghFLasso(tbl$log2, FDR=%(threshold)g, chromosome=tbl$chromosome)
# Reformat the output table as SEG
outtable <- data.frame(sample="%(sample_id)s",
......
#!/usr/bin/env python
"""Segmentation by Hidden Markov Model."""
from __future__ import absolute_import, division, print_function
import collections
import logging
import numpy as np
import pandas as pd
import scipy.special
import pomegranate as pom
from ..descriptives import biweight_midvariance
from ..segfilters import squash_by_groups
def segment_hmm(cnarr, method, window=None):
def segment_hmm(cnarr, method, window=None, processes=1):
"""Segment bins by Hidden Markov Model.
Use Viterbi method to infer copy number segments from sequential data.
......@@ -31,30 +37,31 @@ def segment_hmm(cnarr, method, window=None):
# NB: Incorporate weights into smoothed log2 estimates
# (Useful kludge until weighted HMM is in place)
orig_log2 = cnarr['log2'].values.copy()
cnarr['log2'] = cnarr.smoothed(window)
cnarr['log2'] = cnarr.smoothed()#window)
logging.info("Building model from observations")
model = hmm_get_model(cnarr, method, processes)
logging.debug("Building model from observations")
model = hmm_get_model(cnarr, method)
logging.debug("Predicting states from model")
obs = as_observation_matrix(cnarr)
# A sequence of inferred discrete states. Length is the same as `freqs`,
# with one state assigned to each input datapoint.
states = model.predict(obs, lengths=chrom_arm_lengths(cnarr))
logging.info("Predicting states from model")
observations = as_observation_matrix(cnarr)
states = np.concatenate([np.array(model.predict(obs, algorithm='map'))
for obs in observations])
# TODO logging.warn if model did not converge
# print(model, end="\n\n")
# print("Model params:\nmeans =", sorted(model.means_.flat))
# print(model.monitor_, end="\n\n")
logging.info("Done, now finalizing")
logging.debug("Model states: %s", model.states)
logging.debug("Predicted states: %s", states[:100])
logging.debug(str(collections.Counter(states)))
logging.debug("Observations: %s", observations[0][:100])
logging.debug("Edges: %s", model.edges)
# Merge adjacent bins with the same state to create segments
from ..segfilters import squash_by_groups
cnarr['log2'] = orig_log2
cnarr['probes'] = 1
segarr = squash_by_groups(cnarr, pd.Series(states), by_arm=True)
return segarr
def hmm_get_model(cnarr, method):
def hmm_get_model(cnarr, method, processes):
"""
Parameters
......@@ -62,65 +69,94 @@ def hmm_get_model(cnarr, method):
cnarr : CopyNumArray
The normalized bin-level values to be segmented.
method : string
One of 'hmm', 'hmm-tumor', 'hmm-germline'
One of 'hmm', 'hmm-tumor', 'hmm-germline'.
processes : int
Number of parallel jobs to run.
Returns
-------
model :
A hmmlearn Model trained on the given dataset.
A pomegranate HiddenMarkovModel trained on the given dataset.
"""
# Delayed import so hmmlearn is an optional dependency
from hmmlearn import hmm
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', RuntimeWarning)
# warnings.simplefilter('error', RuntimeWarning)
assert method in ('hmm-tumor', 'hmm-germline', 'hmm')
if method == 'hmm-tumor':
state_means = np.column_stack([[-4, -1, 0, .585, 1]])
else:
state_means = np.column_stack([[-1, 0, .585]])
kwargs = {
'n_components': len(state_means),
'means_prior': state_means,
'means_weight': .5,
}
if method == 'hmm-germline':
# Don't alter the state means when training the model
kwargs.update({
'init_params': 'stc',
'params': 'stc',
})
model = hmm.GaussianHMM(covariance_type='diag', n_iter=100,
random_state=0xA5EED, **kwargs)
if method == 'germline':
model.means_ = state_means
# Create observation matrix
# TODO incorporate weights -- currently handled by smoothing
# TODO incorporate inter-bin distances
cnarr = cnarr.autosomes()
freqs = as_observation_matrix(cnarr)
observations = as_observation_matrix(cnarr.autosomes())
if cnarr.chromosome.nunique() == 1:
model.fit(freqs)
# Estimate standard deviation from the full distribution, robustly
stdev = biweight_midvariance(np.concatenate(observations), initial=0)
if method == 'hmm-germline':
state_names = ["loss", "neutral", "gain"]
distributions = [
pom.NormalDistribution(-1.0, stdev, frozen=True),
pom.NormalDistribution(0.0, stdev, frozen=True),
pom.NormalDistribution(0.585, stdev, frozen=True),
]
elif method == 'hmm-tumor':
state_names = ["del", "loss", "neutral", "gain", "amp"]
distributions = [
pom.NormalDistribution(-2.0, stdev, frozen=False),
pom.NormalDistribution(-0.5, stdev, frozen=False),
pom.NormalDistribution(0.0, stdev, frozen=True),
pom.NormalDistribution(0.3, stdev, frozen=False),
pom.NormalDistribution(1.0, stdev, frozen=False),
]
else:
model.fit(freqs, lengths=chrom_arm_lengths(cnarr))
state_names = ["loss", "neutral", "gain"]
distributions = [
pom.NormalDistribution(-1.0, stdev, frozen=False),
pom.NormalDistribution(0.0, stdev, frozen=False),
pom.NormalDistribution(0.585, stdev, frozen=False),
]
n_states = len(distributions)
# Starts -- prefer neutral
binom_coefs = scipy.special.binom(n_states - 1, range(n_states))
start_probabilities = binom_coefs / binom_coefs.sum()
# Ends -- equally likely
#end_probabilities = np.ones(n_states) / n_states
# Prefer to keep the current state in each transition
# All other transitions are equally likely, to start
transition_matrix = (np.identity(n_states) * 100
+ np.ones((n_states, n_states)) / n_states)
model = pom.HiddenMarkovModel.from_matrix(transition_matrix, distributions,
start_probabilities, state_names=state_names, name=method)
model.fit(sequences=observations,
weights=[len(obs) for obs in observations],
distribution_inertia = .8, # Allow updating dists, but slowly
edge_inertia=0.1,
# lr_decay=.75,
pseudocount=5,
use_pseudocount=True,
max_iterations=100000,
n_jobs=processes,
verbose=False)
return model
def as_observation_matrix(cnarr):
columns = [cnarr['log2'].values]
if 'baf' in cnarr:
# XXX untested
columns.append(cnarr['baf'].values)
return np.column_stack(columns)
def as_observation_matrix(cnarr, variants=None):
"""Extract HMM fitting values from `cnarr`.
For each chromosome arm, extract log2 ratios as a numpy array.
Future: If VCF of variants is given, or 'baf' column has already been
added to `cnarr` from the same, then the BAF values are a second row/column
in each numpy array.
def chrom_arm_lengths(cnarr):
lengths = [len(x) for _, x in cnarr.by_arm()]
assert sum(lengths) == len(cnarr)
return lengths
Returns: List of numpy.ndarray, one per chromosome arm.
"""
# TODO incorporate weights -- currently handled by smoothing
# TODO incorporate inter-bin distances
observations = [arm.log2.values
for _c, arm in cnarr.by_arm()]
return observations
# --- TODO incorporate variant BAF/zygosity/LOH/ROH ---
if variants is not None:
print("Variants!")
segarr['baf'] = variants.baf_by_ranges(segarr)
elif 'baf' in cnarr:
print("BAF!")
observations = [np.hstack([arm.log2.values, arm['baf'].values])
for _c, arm in cnarr.by_arm()]
# /---
#!/usr/bin/env python
"""Segmentation by Hidden Markov Model."""
from __future__ import absolute_import, division, print_function
import logging
import numpy as np
import pandas as pd
def segment_hmm(cnarr, method, window=None):
"""Segment bins by Hidden Markov Model.
Use Viterbi method to infer copy number segments from sequential data.
With b-allele frequencies ('baf' column in `cnarr`), jointly segment
log-ratios and b-allele frequencies across a chromosome.
Parameters
----------
cnarr : CopyNumArray
The bin-level data to segment.
method : string
One of 'hmm' (3 states, flexible means), 'hmm-tumor' (5 states, flexible
means), 'hmm-germline' (3 states, fixed means).
Results
-------
segarr : CopyNumArray
The segmented data.
"""
# NB: Incorporate weights into smoothed log2 estimates
# (Useful kludge until weighted HMM is in place)
orig_log2 = cnarr['log2'].values.copy()
cnarr['log2'] = cnarr.smoothed(window)
logging.debug("Building model from observations")
model = hmm_get_model(cnarr, method)
logging.debug("Predicting states from model")
obs = as_observation_matrix(cnarr)
# A sequence of inferred discrete states. Length is the same as `freqs`,
# with one state assigned to each input datapoint.
states = model.predict(obs, lengths=chrom_arm_lengths(cnarr))
# TODO logging.warn if model did not converge
# print(model, end="\n\n")
# print("Model params:\nmeans =", sorted(model.means_.flat))
# print(model.monitor_, end="\n\n")
# Merge adjacent bins with the same state to create segments
from ..segfilters import squash_by_groups
cnarr['log2'] = orig_log2
cnarr['probes'] = 1
segarr = squash_by_groups(cnarr, pd.Series(states), by_arm=True)
return segarr
def hmm_get_model(cnarr, method):
"""
Parameters
----------
cnarr : CopyNumArray
The normalized bin-level values to be segmented.
method : string
One of 'hmm', 'hmm-tumor', 'hmm-germline'
Returns
-------
model :
A hmmlearn Model trained on the given dataset.
"""
# Delayed import so hmmlearn is an optional dependency
from hmmlearn import hmm
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', RuntimeWarning)
# warnings.simplefilter('error', RuntimeWarning)
assert method in ('hmm-tumor', 'hmm-germline', 'hmm')
if method == 'hmm-tumor':
state_means = np.column_stack([[-4, -1, 0, .585, 1]])
else:
state_means = np.column_stack([[-1, 0, .585]])
kwargs = {
'n_components': len(state_means),
'means_prior': state_means,
'means_weight': .5,
}
if method == 'hmm-germline':
# Don't alter the state means when training the model
kwargs.update({
'init_params': 'stc',
'params': 'stc',
})
model = hmm.GaussianHMM(covariance_type='diag', n_iter=100,
random_state=0xA5EED, **kwargs)
if method == 'germline':
model.means_ = state_means
# Create observation matrix
# TODO incorporate weights -- currently handled by smoothing
# TODO incorporate inter-bin distances
cnarr = cnarr.autosomes()
freqs = as_observation_matrix(cnarr)
if cnarr.chromosome.nunique() == 1:
model.fit(freqs)
else:
model.fit(freqs, lengths=chrom_arm_lengths(cnarr))
return model
def as_observation_matrix(cnarr):
columns = [cnarr['log2'].values]
if 'baf' in cnarr:
# XXX untested
columns.append(cnarr['baf'].values)
return np.column_stack(columns)
def chrom_arm_lengths(cnarr):
lengths = [len(x) for _, x in cnarr.by_arm()]
assert sum(lengths) == len(cnarr)
return lengths
"""An array of genomic intervals, treated as variant loci."""
from __future__ import absolute_import, division, print_function
from builtins import str
import logging
import numpy as np
......
cnvkit (0.9.6-1) unstable; urgency=medium
* New upstream version
* debhelper-compat 12
* Standards-Version: 4.4.0
-- Steffen Moeller <moeller@debian.org> Wed, 28 Aug 2019 19:31:59 +0200
cnvkit (0.9.5-3) unstable; urgency=medium
* Fix buster rebuilds (Closes: #925568)
......