language: c
sudo: false
os:
- osx
- linux
env:
# Earliest supported versions
- CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
# The latest versions, whatever they are
- CONDA_PY=3.6 PANDAS='*' PYSAM='*'
install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install md5sha1sum; fi
- source devtools/travis-ci/install_miniconda.sh
before_script:
- conda create -q -y --name testenv python=$CONDA_PY
- conda create -yq --name testenv python=$CONDA_PY
- source activate testenv
- conda build -c bioconda --python $CONDA_PY ./devtools/conda-recipe/ # Build the conda recipe for the latest dev
- conda install -c bioconda --use-local -yq cnvkit # Install the locally built package and its requirements
- conda install -c bioconda -yq pandas==$PANDAS pysam==$PYSAM # Install the versions desired for testing
- conda remove cnvkit -y -q # Remove the conda cnvkit package but keep the dependencies installed
- python setup.py install # Install directly from source
- if [[ "$CONDA_PY" == "2.7" ]]; then conda install -yq futures>=3.0; fi
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then conda install -yq -c bioconda atlas; fi
# Install the versions desired for testing
- conda install -yq -c bioconda
cython
future
matplotlib
numpy
pyfaidx
pysam
reportlab
scipy
pandas==$PANDAS
pysam==$PYSAM
# R linking is broken on OS X - try recompilation instead of conda there
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then
conda install -yq -c bioconda bioconductor-dnacopy r-cghflasso;
fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
conda install -yq -c bioconda r-base;
Rscript -e "source('http://bioconductor.org/biocLite.R'); biocLite(c('DNAcopy', 'cghFLasso'))";
fi
# hmmlearn 0.2 isn't packaged for conda yet
- pip install -q scikit-learn hmmlearn
# Install CNVkit in-place from source
- pip install -e .
- cd test/
# For codecov.io
- pip install codecov
......@@ -21,18 +52,10 @@ script:
- coverage run test_io.py
- coverage run -a test_genome.py
- coverage run -a test_cnvlib.py
- coverage run -a test_r.py
#- coverage run -a test_r.py
# OS X can't install cghFLasso package?
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then coverage run -a test_r.py; fi
after_success:
- coverage report
- codecov
os:
# - osx
- linux
env:
- CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=2.7 PANDAS=0.22.0 PYSAM=0.13.0
- CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.22.0 PYSAM=0.13.0
Copyright (c) 2013-2017 Eric Talevich
Copyright (c) 2013-2018 Eric Talevich
Copyright (c) 2013-2018 University of California
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
......@@ -78,11 +78,20 @@ packages.
This approach is preferred on Mac OS X, and is a solid choice on Linux, too.
To download and install CNVkit and its Python dependencies::
To download and install CNVkit and its Python dependencies in a clean
environment::
# Configure the sources where conda will find packages
conda config --add channels defaults
conda config --add channels conda-forge
conda config --add channels bioconda
# Install CNVkit in a new environment named "cnvkit"
conda create -n cnvkit cnvkit
# Activate the environment with CNVkit installed:
source activate cnvkit
Or, in an existing environment::
conda install cnvkit
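To sanity-check the installation (a minimal sketch; this assumes the ``cnvkit``
environment created above is active), print the version and list the available
subcommands::
cnvkit.py version
cnvkit.py --help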
......@@ -143,7 +152,7 @@ Then install the rest of CNVkit's dependencies::
Alternatively, you can use `Homebrew <http://brew.sh/>`_ to install an
up-to-date Python (e.g. ``brew install python``) and as many of the Python
packages as possible (primarily NumPy, SciPy, matplotlib and pandas).
packages as possible (primarily NumPy and SciPy; ideally matplotlib and pandas).
Then, proceed with pip::
pip install numpy scipy pandas matplotlib reportlab biopython pyfaidx pysam pyvcf
......
__version__ = "0.9.3"
__version__ = "0.9.5"
......@@ -9,6 +9,7 @@ import pandas as pd
from skgenome import tabio, GenomicArray as GA
from . import coverage, samutil
from .antitarget import compare_chrom_names
from .descriptives import weighted_median
......@@ -47,6 +48,14 @@ def do_autobin(bam_fname, method, targets=None, access=None,
((target depth, target avg. bin size),
(antitarget depth, antitarget avg. bin size))
"""
if method in ('amplicon', 'hybrid'):
if targets is None:
raise ValueError("Target regions are required for method %r "
"but were not provided." % method)
if not len(targets):
raise ValueError("Target regions are required for method %r "
"but were not provided." % method)
# Closes over bp_per_bin
def depth2binsize(depth, min_size, max_size):
if depth:
......@@ -95,7 +104,9 @@ def hybrid(rc_table, read_len, bam_fname, targets, access=None):
"""Hybrid capture sequencing."""
# Identify off-target regions
if access is None:
access = idxstats2ga(rc_table)
access = idxstats2ga(rc_table, bam_fname)
# Verify BAM chromosome names match those in target BED
compare_chrom_names(access, targets)
antitargets = access.subtract(targets)
# Only examine chromosomes present in all 2-3 input datasets
rc_table, targets, antitargets = shared_chroms(rc_table, targets,
......@@ -126,9 +137,10 @@ def average_depth(rc_table, read_length):
return weighted_median(mean_depths, rc_table.length)
def idxstats2ga(table):
def idxstats2ga(table, bam_fname):
return GA(table.assign(start=0, end=table.length)
.loc[:, ('chromosome', 'start', 'end')])
.loc[:, ('chromosome', 'start', 'end')],
meta_dict={'filename': bam_fname})
def sample_region_cov(bam_fname, regions, max_num=100):
......
......@@ -276,7 +276,7 @@ class CopyNumArray(GenomicArray):
self._chr_y_label, stats['chry_ratio'],
stats['chrx_male_lr'], stats['chry_male_lr'],
stats['chrx_male_lr'] * stats['chry_male_lr'],
('female', 'male')[is_xy])
'male' if is_xy else 'female')
return ~is_xy
def compare_sex_chromosomes(self, male_reference=False, skip_low=False):
......
......@@ -163,7 +163,11 @@ P_batch.add_argument('-p', '--processes',
parallel. Without an argument, use the maximum number of
available CPUs. [Default: process each BAM in serial]""")
P_batch.add_argument("--rlibpath", metavar="DIRECTORY",
help="Path to an alternative site-library to use for R packages.")
help=argparse.SUPPRESS)
P_batch.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
[Default: %(default)s]""")
# Reference-building options
P_batch_newref = P_batch.add_argument_group(
......@@ -401,20 +405,25 @@ P_autobin.add_argument('-g', '--access', metavar="FILENAME",
P_autobin.add_argument('-t', '--targets',
help="""Potentially targeted genomic regions, e.g. all possible exons
for the reference genome. Format: BED, interval list, etc.""")
P_autobin.add_argument('-b', '--bp-per-bin', type=float, default=100000.,
P_autobin.add_argument('-b', '--bp-per-bin',
type=float, default=100000.,
help="""Desired average number of sequencing read bases mapped to each
bin. [Default: %(default)s]""")
P_autobin.add_argument('--target-max-size', type=int, default=20000,
help="Maximum size of target bins.")
P_autobin.add_argument('--target-min-size', type=int, default=20,
help="Minimum size of target bins.")
P_autobin.add_argument('--antitarget-max-size', type=int, default=500000,
help="Maximum size of antitarget bins.")
P_autobin.add_argument('--antitarget-min-size', type=int, default=500,
help="Minimum size of antitarget bins.")
P_autobin.add_argument('--annotate',
P_autobin.add_argument('--target-max-size', metavar="BASES",
type=int, default=20000,
help="Maximum size of target bins. [Default: %(default)s]")
P_autobin.add_argument('--target-min-size', metavar="BASES",
type=int, default=20,
help="Minimum size of target bins. [Default: %(default)s]")
P_autobin.add_argument('--antitarget-max-size', metavar="BASES",
type=int, default=500000,
help="Maximum size of antitarget bins. [Default: %(default)s]")
P_autobin.add_argument('--antitarget-min-size', metavar="BASES",
type=int, default=500,
help="Minimum size of antitarget bins. [Default: %(default)s]")
P_autobin.add_argument('--annotate', metavar="FILENAME",
help="""Use gene models from this file to assign names to the target
regions. Format: UCSC refFlat.txt or ensFlat.txt file
(preferred), or BED, interval list, GFF, or similar.""")
......@@ -616,6 +625,7 @@ def _cmd_segment(args):
skip_outliers=args.drop_outliers,
save_dataframe=bool(args.dataframe),
rlibpath=args.rlibpath,
rscript_path=args.rscript_path,
processes=args.processes)
if args.dataframe:
segments, dframe = results
......@@ -648,14 +658,18 @@ P_segment.add_argument('-t', '--threshold', type=float,
P_segment.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_segment.add_argument("--drop-outliers",
type=float, default=10, metavar="FACTOR",
P_segment.add_argument("--drop-outliers", metavar="FACTOR",
type=float, default=10,
help="""Drop outlier bins more than this many multiples of the 95th
quantile away from the average within a rolling window.
Set to 0 for no outlier filtering.
[Default: %(default)g]""")
P_segment.add_argument("--rlibpath", metavar="DIRECTORY",
help="Path to an alternative site-library to use for R packages.")
help=argparse.SUPPRESS)
P_segment.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
[Default: %(default)s]""")
P_segment.add_argument('-p', '--processes',
nargs='?', type=int, const=0, default=1,
help="""Number of subprocesses to segment in parallel.
......@@ -701,7 +715,8 @@ def _cmd_call(args):
logging.info("Shifting log2 values by %f", -args.center_at)
cnarr['log2'] -= args.center_at
elif args.center:
cnarr.center_all(args.center, verbose=True)
cnarr.center_all(args.center, skip_low=args.drop_low_coverage,
verbose=True)
varr = load_het_snps(args.vcf, args.sample_id, args.normal_id,
args.min_variant_depth, args.zygosity_freq)
......@@ -747,6 +762,9 @@ P_call.add_argument("--ploidy", type=int, default=2,
help="Ploidy of the sample cells. [Default: %(default)d]")
P_call.add_argument("--purity", type=float,
help="Estimated tumor cell fraction, a.k.a. purity or cellularity.")
P_call.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_call.add_argument('-x', '--sample-sex', '-g', '--gender', dest='sample_sex',
choices=('m', 'y', 'male', 'Male', 'f', 'x', 'female', 'Female'),
help="""Specify the sample's chromosomal sex as male or female.
......@@ -1176,8 +1194,8 @@ def do_sex(cnarrs, is_male_reference):
is_xy, stats = cna.compare_sex_chromosomes(is_male_reference)
return (cna.meta["filename"] or cna.sample_id,
"Male" if is_xy else "Female",
strsign(stats['chrx_ratio']),
strsign(stats['chry_ratio']))
strsign(stats['chrx_ratio']) if stats else "NA",
strsign(stats['chry_ratio']) if stats else "NA")
rows = (guess_and_format(cna) for cna in cnarrs)
columns = ["sample", "sex", "X_logratio", "Y_logratio"]
......@@ -1440,7 +1458,7 @@ def _cmd_import_rna(args):
"""Convert a cohort of per-gene log2 ratios to CNVkit .cnr format."""
all_data, cnrs = import_rna.do_import_rna(
args.gene_counts, args.format, args.gene_resource, args.correlations,
args.normal)
args.normal, args.do_gc, args.do_txlen, args.max_log2)
logging.info("Writing output files")
if args.output:
all_data.to_csv(args.output, sep='\t', index=True)
......@@ -1470,7 +1488,10 @@ P_import_rna.add_argument('-g', '--gene-resource',
P_import_rna.add_argument('-c', '--correlations', metavar="FILE",
help="""Correlation of each gene's copy number with
expression. Output of cnv_expression_correlate.py.""")
P_import_rna.add_argument('-n', '--normal', nargs='+',
P_import_rna.add_argument('--max-log2', metavar="FLOAT", default=3.0,
help="""Maximum log2 value in output. Observed values above this limit
will be replaced with this value.""")
P_import_rna.add_argument('-n', '--normal', nargs='+', default=[],
help="""Normal samples (same format as `gene_counts`) to be used as a
control when normalizing and re-centering gene read depth
ratios. All filenames following this option will be used.""")
......@@ -1480,6 +1501,14 @@ P_import_rna.add_argument('-d', '--output-dir',
sample. [Default: %(default)s]""")
P_import_rna.add_argument('-o', '--output', metavar="FILE",
help="Output file name (summary table).")
P_import_rna_bias = P_import_rna.add_argument_group(
"To disable specific automatic bias corrections")
P_import_rna_bias.add_argument('--no-gc', dest='do_gc', action='store_false',
help="Skip GC correction.")
P_import_rna_bias.add_argument('--no-txlen', dest='do_txlen', action='store_false',
help="Skip transcript length correction.")
P_import_rna.set_defaults(func=_cmd_import_rna)
......@@ -1504,10 +1533,15 @@ def _cmd_export_bed(args):
# ENH: args.sample_sex as a comma-separated list
is_sample_female = verify_sample_sex(segments, args.sample_sex,
args.male_reference)
if args.sample_id:
label = args.sample_id
elif args.label_genes:
label = None
else:
label = segments.sample_id
tbl = export.export_bed(segments, args.ploidy,
args.male_reference, is_sample_female,
args.sample_id or segments.sample_id,
args.show)
label, args.show)
bed_tables.append(tbl)
table = pd.concat(bed_tables)
write_dataframe(args.output, table, header=False)
......@@ -1520,6 +1554,9 @@ P_export_bed.add_argument('segments', nargs='+',
P_export_bed.add_argument("-i", "--sample-id", metavar="LABEL",
help="""Identifier to write in the 4th column of the BED file.
[Default: use the sample ID, taken from the file name]""")
P_export_bed.add_argument('--label-genes', action='store_true',
help="""Show gene names in the 4th column of the BED file.
(This is a bad idea if more than one input file is given.)""")
P_export_bed.add_argument("--ploidy", type=int, default=2,
help="Ploidy of the sample cells. [Default: %(default)d]")
P_export_bed.add_argument('-x', '--sample-sex', '-g', '--gender',
......
......@@ -178,7 +178,7 @@ def export_bed(segments, ploidy, is_reference_male, is_sample_female,
the reference ploidy on autosomes, or half that on sex chromosomes.
"""
out = segments.data.loc[:, ["chromosome", "start", "end"]]
out["label"] = label
out["label"] = label if label else segments["gene"]
out["ncopies"] = (segments["cn"] if "cn" in segments
else call.absolute_pure(segments, ploidy, is_reference_male)
.round().astype('int'))
......
......@@ -9,14 +9,15 @@ from . import rna
def do_import_rna(gene_count_fnames, in_format, gene_resource_fname,
correlations_fname=None, normal_fnames=()):
correlations_fname=None, normal_fnames=(),
do_gc=True, do_txlen=True, max_log2=3):
"""Convert a cohort of per-gene read counts to CNVkit .cnr format.
The expected data source is TCGA gene-level expression counts for individual
samples, but other sources should be fine, too.
"""
# Deduplicate and ensure all normals are included in the analysis
gene_count_fnames = sorted(set(gene_count_fnames) + set(normal_fnames))
gene_count_fnames = sorted(set(list(gene_count_fnames) + list(normal_fnames)))
if in_format == 'rsem':
sample_counts, tx_lengths = aggregate_rsem(gene_count_fnames)
......@@ -43,7 +44,7 @@ def do_import_rna(gene_count_fnames, in_format, gene_resource_fname,
# CNVkit files have both absolute and log2-normalized read counts
cnrs = rna.attach_gene_info_to_cnr(sample_counts, sample_data_log2,
gene_info)
cnrs = (rna.correct_cnr(cnr) for cnr in cnrs)
cnrs = (rna.correct_cnr(cnr, do_gc, do_txlen, max_log2) for cnr in cnrs)
return all_data, cnrs
......
......@@ -230,11 +230,9 @@ def combine_probes(filenames, fa_fname, is_male_reference, sexes, skip_low,
logging.info("Loading target %s", fname)
cnarrx = read_cna(fname)
# Bin information should match across all files
if not (len(cnarr1) == len(cnarrx)
and (cnarr1.chromosome == cnarrx.chromosome).all()
and (cnarr1.start == cnarrx.start).all()
and (cnarr1.end == cnarrx.end).all()
and (cnarr1['gene'] == cnarrx['gene']).all()):
if not np.array_equal(
cnarr1.data.loc[:, ('chromosome', 'start', 'end', 'gene')].values,
cnarrx.data.loc[:, ('chromosome', 'start', 'end', 'gene')].values):
raise RuntimeError("%s bins do not match those in %s"
% (fname, filenames[0]))
all_depths.append(cnarrx['depth'] if 'depth' in cnarrx
......@@ -273,8 +271,14 @@ def warn_bad_bins(cnarr, max_name_width=50):
if len(fg_bad_bins) > 0:
bad_pct = (100 * len(fg_bad_bins)
/ sum(~cnarr['gene'].isin(params.ANTITARGET_ALIASES)))
logging.info("Targets: %d (%s) bins failed filters:",
len(fg_bad_bins), "%.4f" % bad_pct + '%')
logging.info("Targets: %d (%s) bins failed filters "
"(log2 < %s, log2 > %s, spread > %s)",
len(fg_bad_bins),
"%.4f" % bad_pct + '%',
params.MIN_REF_COVERAGE,
-params.MIN_REF_COVERAGE,
params.MAX_REF_SPREAD)
if len(fg_bad_bins) < 500:
gene_cols = min(max_name_width, max(map(len, fg_bad_bins['gene'])))
labels = fg_bad_bins.labels()
chrom_cols = max(labels.apply(len))
......
......@@ -4,9 +4,7 @@ Process per-gene expression levels, or the equivalent, by cohort.
"""
from __future__ import absolute_import, division, print_function
from builtins import zip
import logging
import sys
import numpy as np
import pandas as pd
......@@ -14,8 +12,7 @@ from scipy.stats import gmean
from .cnary import CopyNumArray as CNA
from .fix import center_by_window
# from .descriptives import biweight_midvariance
# from .params import NULL_LOG2_COVERAGE
NULL_LOG2_COVERAGE = -5
......@@ -37,10 +34,10 @@ def filter_probes(sample_counts):
For the purposes of copy number inference, these rows are best removed.
"""
gene_medians = sample_counts.median(axis=1)
## Make sure the gene has detectable transcript in at least half of samples
# Make sure the gene has detectable transcript in at least half of samples
is_mostly_transcribed = (gene_medians >= 1.0)
print("Dropping", (~is_mostly_transcribed).sum(), "/",
len(is_mostly_transcribed), "rarely expressed genes from input samples")
logging.info("Dropping %d / %d rarely expressed genes from input samples",
(~is_mostly_transcribed).sum(), len(is_mostly_transcribed))
return sample_counts[is_mostly_transcribed]
......@@ -74,8 +71,7 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
'tx_support': tsl2int,
'gc': lambda x: float(x)/100})
.sort_values('gene_id'))
print("Loaded", gene_resource, "with shape:", gene_info.shape,
file=sys.stderr)
logging.info("Loaded %s with shape: %s", gene_resource, gene_info.shape)
if corr_fname:
corr_table = load_cnv_expression_corr(corr_fname)
......@@ -85,10 +81,21 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
unique_idx = gi_corr.groupby('gene_id').apply(dedupe_ens_hugo)
gi_corr = gi_corr.loc[unique_idx]
# DBG verify that the tables were aligned correctly
# for colname in ('pearson_r', 'spearman_r', 'kendall_t'):
# print("In column", colname, "length", len(gi_corr),
# ", have", gi_corr[colname].count(), "not-NA,",
# (gi_corr[colname] > 0.1).sum(), ">0.1",
# ", max value", np.nanmax(gi_corr[colname]))
if not gene_info['entrez_id'].is_unique:
# Treat untraceable entrez_id duplicates (scarce at this point, ~15)
# as unknown correlation, i.e. default, as if not in TCGA
gi_corr.loc[tuple(locate_entrez_dupes(gi_corr)),
entrez_dupes_idx = tuple(locate_entrez_dupes(gi_corr))
logging.info("Resetting %d ambiguous genes' correlation "
"coefficients to default %f",
len(entrez_dupes_idx), default_r)
gi_corr.loc[entrez_dupes_idx,
('pearson_r', 'spearman_r', 'kendall_t')] = default_r
# Genes w/o TCGA info get default correlation 0.1 (= .5*corr.median())
......@@ -102,7 +109,7 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
unique_idx = gene_info.groupby('gene_id').apply(dedupe_ens_no_hugo)
gene_info = gene_info.loc[unique_idx]
print("Trimmed gene info table to shape:", gene_info.shape)
logging.info("Trimmed gene info table to shape: %s", gene_info.shape)
assert gene_info['gene_id'].is_unique
gene_info['entrez_id'] = gene_info['entrez_id'].fillna(0).astype('int')
return gene_info.set_index('gene_id')
......@@ -110,9 +117,9 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
def load_cnv_expression_corr(fname):
shared_key = 'Entrez_Gene_Id'
table = (pd.read_table(fname, dtype={shared_key: str}, na_filter=False)
table = (pd.read_table(fname, dtype={shared_key: int}, na_filter=False)
.set_index(shared_key))
print("Loaded", fname, "with shape:", table.shape, file=sys.stderr)
logging.info("Loaded %s with shape: %s", fname, table.shape)
return table
......@@ -242,8 +249,8 @@ def align_gene_info_to_samples(gene_info, sample_counts, tx_lengths,
Also calculate weights and add to gene_info as 'weight', along with
transcript lengths as 'tx_length'.
"""
print("Dimensions: gene_info=%s, sample_counts=%s"
% (gene_info.shape, sample_counts.shape))
logging.debug("Dimensions: gene_info=%s, sample_counts=%s",
gene_info.shape, sample_counts.shape)
sc, gi = sample_counts.align(gene_info, join='inner', axis=0)
gi = gi.sort_values(by=['chromosome', 'start'])
sc = sc.loc[gi.index]
......@@ -299,8 +306,6 @@ def normalize_read_depths(sample_depths, normal_ids):
Finally, convert to log2 ratios.
"""
# TODO use normal_ids as a control here
# e.g. subtract their IQR after the loop?
assert sample_depths.values.sum() > 0
sample_depths = sample_depths.fillna(0)
for _i in range(4):
......@@ -313,6 +318,31 @@ def normalize_read_depths(sample_depths, normal_ids):
# print(" round", _i, "grand mean:", sample_depths.values.mean(),
# ": sample-wise denom =", q3.mean(),
# ", gene-wise denom =", sm.mean())
if normal_ids:
# Use normal samples as a baseline for read depths
normal_ids = pd.Series(normal_ids)
if not normal_ids.isin(sample_depths.columns).all():
raise ValueError("Normal sample IDs not in samples: %s"
% normal_ids.drop(sample_depths.columns, errors='ignore'))
normal_depths = sample_depths.loc[:, normal_ids]
use_median = True # TODO - benchmark & pick one
if use_median:
# Simple approach: divide normalized (1=neutral) values by gene medians
normal_avg = normal_depths.median(axis=1)
sample_depths = sample_depths.divide(normal_avg, axis=0).clip_lower(0)
else:
# Alternate approach: divide their IQR
# At each gene, sample depths above the normal sample depth 75%ile
# are divided by the 75%ile, those below 25%ile are divided by
# 25%ile, and those in between are reset to neutral (=1.0)
n25, n75 = np.nanpercentile(normal_depths.values, [25, 75], axis=1)
below_idx = sample_depths.values < n25[:, np.newaxis]
above_idx = sample_depths.values > n75[:, np.newaxis]
factor = np.zeros_like(sample_depths.values)
factor[below_idx] = (below_idx / n25[:, np.newaxis])[below_idx]
factor[above_idx] = (above_idx / n75[:, np.newaxis])[above_idx]
sample_depths *= factor
sample_depths[~(below_idx | above_idx)] = 1.0
# Finally, convert normalized read depths to log2 scale
return safe_log2(sample_depths, NULL_LOG2_COVERAGE)
......@@ -349,7 +379,7 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info,
# Fill NA fields with the lowest finite value in the same row.
# Only use NULL_LOG2_COVERAGE if all samples are NA / zero-depth.
gene_minima = sample_data_log2.min(axis=1, skipna=True)
assert not gene_minima.hasnans
assert not gene_minima.hasnans, gene_minima.head()
for (sample_id, sample_col), (_sid_log2, sample_log2) \
in zip(sample_counts.iteritems(), sample_data_log2.iteritems()):
tx_len = cnr_info.tx_length
......@@ -362,7 +392,7 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info,
yield cnr
def correct_cnr(cnr):
def correct_cnr(cnr, do_gc, do_txlen, max_log2):
"""Apply bias corrections & smoothing.
- Biases: 'gc', 'length'
......@@ -370,9 +400,12 @@ def correct_cnr(cnr):
"""
cnr.center_all()
# Biases, similar to stock CNVkit
if 'gc' in cnr:
if any((do_gc, do_txlen)):
if do_gc and 'gc' in cnr:
cnr = center_by_window(cnr, .1, cnr['gc'])
if 'tx_length' in cnr:
if do_txlen and 'tx_length' in cnr:
cnr = center_by_window(cnr, .1, cnr['tx_length'])
cnr.center_all()
if max_log2:
cnr[cnr['log2'] > max_log2, 'log2'] = max_log2
return cnr
"""Segmentation of copy number values."""
from __future__ import absolute_import, division, print_function
from builtins import map
from builtins import map, zip
import locale
import logging
......@@ -12,6 +12,7 @@ import numpy as np
import pandas as pd
from Bio._py3k import StringIO
from skgenome import tabio
from skgenome.intersect import iter_slices
from .. import core, parallel, params, smoothing, vary
from ..cnary import CopyNumArray as CNA
......@@ -22,14 +23,15 @@ from . import cbs, flasso, haar, hmm, none
def do_segmentation(cnarr, method, threshold=None, variants=None,
skip_low=False, skip_outliers=10, min_weight=0,
save_dataframe=False, rlibpath=None,
rscript_path="Rscript",
processes=1):
"""Infer copy number segments from the given coverage table."""
if variants:
variants = variants.heterozygous()
if not threshold:
threshold = {'cbs': 0.0001,
'flasso': 0.005,
'haar': 0.001,
'flasso': 0.0001,
'haar': 0.0001,
}.get(method)
msg = "Segmenting with method " + repr(method)
if threshold is not None:
......@@ -46,7 +48,8 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
# ENH segment p/q arms separately
# -> assign separate identifiers via chrom name suffix?
cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
skip_outliers, min_weight, save_dataframe, rlibpath)
skip_outliers, min_weight, save_dataframe,
rlibpath, rscript_path)
if save_dataframe:
cna, rstr = cna
rstr = _to_str(rstr)
......@@ -55,7 +58,7 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
with parallel.pick_pool(processes) as pool:
rets = list(pool.map(_ds, ((ca, method, threshold, variants,
skip_low, skip_outliers, min_weight,
save_dataframe, rlibpath)
save_dataframe, rlibpath, rscript_path)
for _, ca in cnarr.by_arm())))
if save_dataframe:
# rets is a list of (CNA, R dataframe string) -- unpack
......@@ -86,7 +89,8 @@ def _ds(args):
def _do_segmentation(cnarr, method, threshold, variants=None,
skip_low=False, skip_outliers=10, min_weight=0,
save_dataframe=False, rlibpath=None):
save_dataframe=False,
rlibpath=None, rscript_path="Rscript"):
"""Infer copy number segments from the given coverage table."""
if not len(cnarr):
return cnarr
......@@ -152,7 +156,7 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
}
with core.temp_write_text(rscript % script_strings,
mode='w+t') as script_fname:
seg_out = core.call_quiet('Rscript', '--vanilla', script_fname)
seg_out = core.call_quiet(rscript_path, '--vanilla', script_fname)
# Convert R dataframe contents (SEG) to a proper CopyNumArray
# NB: Automatically shifts 'start' back from 1- to 0-indexed
segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
......@@ -163,7 +167,6 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
else:
segarr['weight'] = 1.0
segarr = squash_by_groups(segarr, segarr['log2'], by_arm=True)
segarr = repair_segments(segarr, cnarr)
else:
raise ValueError("Unknown method %r" % method)
......@@ -176,8 +179,7 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
segarr = segarr.as_dataframe(pd.concat(newsegs))
segarr['baf'] = variants.baf_by_ranges(segarr)
segarr['gene'], segarr['weight'], segarr['depth'] = \
transfer_fields(segarr, cnarr)
segarr = transfer_fields(segarr, cnarr)
if save_dataframe:
return segarr, seg_out
else:
......@@ -214,40 +216,16 @@ def transfer_fields(segments, cnarr, ignore=params.IGNORE_GENE_NAMES):
Segment gene name is the comma-separated list of bin gene names. Segment
weight is the sum of bin weights, and depth is the (weighted) mean of bin
depths.
"""
if not len(cnarr):
return [], [], []
ignore += params.ANTITARGET_ALIASES
if 'weight' not in cnarr:
cnarr['weight'] = 1
if 'depth' not in cnarr:
cnarr['depth'] = np.exp2(cnarr['log2'])
seggenes = ['-'] * len(segments)
segweights = np.zeros(len(segments))
segdepths = np.zeros(len(segments))
for i, (_seg, subprobes) in enumerate(cnarr.by_ranges(segments)):
if not len(subprobes):
continue
segweights[i] = subprobes['weight'].sum()
if subprobes['weight'].sum() > 0:
segdepths[i] = np.average(subprobes['depth'], weights=subprobes['weight'])
subgenes = [g for g in pd.unique(subprobes['gene']) if g not in ignore]
if subgenes:
seggenes[i] = ",".join(subgenes)
return seggenes, segweights, segdepths
# TODO/ENH combine with transfer_fields
# Recalculate segment means, weights, depths here instead of in R
def repair_segments(segments, orig_probes):
"""Post-process segmentation output.
Also: Post-process segmentation output.
1. Ensure every chromosome has at least one segment.
2. Ensure first and last segment ends match 1st/last bin ends
(but keep log2 as-is).
"""
def make_null_segment(chrom, orig_start, orig_end):
"""Closes over 'segments'."""
vals = {'chromosome': chrom,
'start': orig_start,
'end': orig_end,
......@@ -260,19 +238,60 @@ def repair_segments(segments, orig_probes):
row_vals = tuple(vals[c] for c in segments.data.columns)
return row_vals
# Adjust segment endpoints on each chromosome
segments = segments.copy()
extra_segments = []
for chrom, subprobes in orig_probes.by_chromosome():
chr_seg_idx = np.where(segments.chromosome == chrom)[0]
orig_start = subprobes[0, 'start']
orig_end = subprobes[len(subprobes)-1, 'end']
if len(chr_seg_idx):
segments[chr_seg_idx[0], 'start'] = orig_start
segments[chr_seg_idx[-1], 'end'] = orig_end
if not len(cnarr):
# This Should Never Happen (TM)
# raise RuntimeError("No bins for:\n" + str(segments.data))
logging.warn("No bins for:\n%s", segments.data)
return segments
# Adjust segment endpoints to cover the chromosome arm's original bins
# (Stretch first and last segment endpoints to match first/last bins)
bins_chrom = cnarr.chromosome.iat[0]
bins_start = cnarr.start.iat[0]
bins_end = cnarr.end.iat[-1]
if not len(segments):
# All bins in this chromosome arm were dropped: make a dummy segment
return make_null_segment(bins_chrom, bins_start, bins_end)
segments.start.iat[0] = bins_start
segments.end.iat[-1] = bins_end
# Aggregate segment depths, weights, gene names
# ENH refactor so that np/CNA.data access is encapsulated in skgenome
ignore += params.ANTITARGET_ALIASES
assert bins_chrom == segments.chromosome.iat[0]
cdata = cnarr.data.reset_index()
if 'depth' not in cdata.columns:
cdata['depth'] = np.exp2(cnarr['log2'].values)
bin_genes = cdata['gene'].values
bin_weights = cdata['weight'].values if 'weight' in cdata.columns else None
bin_depths = cdata['depth'].values
seg_genes = ['-'] * len(segments)
seg_weights = np.zeros(len(segments))
seg_depths = np.zeros(len(segments))
for i, bin_idx in enumerate(iter_slices(cdata, segments.data, 'outer', False)):
if bin_weights is not None:
seg_wt = bin_weights[bin_idx].sum()
if seg_wt > 0:
seg_dp = np.average(bin_depths[bin_idx],
weights=bin_weights[bin_idx])
else:
seg_dp = 0.0
else:
bin_count = len(cdata.iloc[bin_idx])
seg_wt = float(bin_count)
seg_dp = bin_depths[bin_idx].mean()
subgenes = [g for g in pd.unique(bin_genes[bin_idx]) if g not in ignore]
if subgenes:
seg_gn = ",".join(subgenes)
else:
extra_segments.append(
make_null_segment(chrom, orig_start, orig_end))
if extra_segments:
segments.add(segments.as_rows(extra_segments))
seg_gn = '-'
seg_genes[i] = seg_gn
seg_weights[i] = seg_wt
seg_depths[i] = seg_dp
segments.data = segments.data.assign(
gene=seg_genes,
weight=seg_weights,
depth=seg_depths)
return segments
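For intuition, the per-segment aggregation above (weight = sum of bin weights, depth = weight-averaged bin depth, gene = comma-joined unique bin gene names, skipping antitarget aliases) reduces to this toy sketch in plain pandas/NumPy with made-up bin values, not CNVkit's CopyNumArray:
import numpy as np
import pandas as pd
# Hypothetical bins falling inside one segment
bins = pd.DataFrame({
    "gene":   ["TP53", "TP53", "Antitarget", "EGFR"],
    "weight": [0.9, 0.8, 0.2, 1.0],
    "depth":  [120.0, 95.0, 30.0, 210.0],
})
ignore = {"Antitarget"}  # stand-in for params.ANTITARGET_ALIASES
seg_weight = bins["weight"].sum()                              # 2.9
seg_depth = np.average(bins["depth"], weights=bins["weight"])  # ~137.93
seg_gene = ",".join(g for g in pd.unique(bins["gene"]) if g not in ignore)
# seg_gene == "TP53,EGFR"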
"""Robust metrics to evaluate performance of copy number estimates.
"""
from __future__ import absolute_import, division, print_function
from builtins import zip
from builtins import map, range, zip
import logging
......@@ -31,63 +31,68 @@ def do_segmetrics(cnarr, segarr, location_stats=(), spread_stats=(),
'bivar': descriptives.biweight_midvariance,
'sem': stats.sem,
'ci': lambda x: confidence_interval_bootstrap(x, alpha, bootstraps),
'pi': lambda x: prediction_interval(x, alpha),
'ci': make_ci_func(alpha, bootstraps),
'pi': make_pi_func(alpha),
}
seg_log2s, bins_log2s = zip(*[(seg.log2, bins['log2'])
for seg, bins in cnarr.by_ranges(segarr)])
bins_log2s = list(cnarr.iter_ranges_of(segarr, 'log2', 'outer', True))
segarr = segarr.copy()
if location_stats:
# Measures of location
for statname in location_stats:
func = stat_funcs[statname]
segarr[statname] = np.asfarray([func(bl) for bl in bins_log2s])
segarr[statname] = np.fromiter(map(func, bins_log2s),
np.float_, len(segarr))
# Measures of spread
deviations = [bl - sl for sl, bl in zip(seg_log2s, bins_log2s)]
if spread_stats:
deviations = (bl - sl for bl, sl in zip(bins_log2s, segarr['log2']))
if len(spread_stats) > 1:
deviations = list(deviations)
for statname in spread_stats:
func = stat_funcs[statname]
segarr[statname] = np.asfarray([func(d) for d in deviations])
segarr[statname] = np.fromiter(map(func, deviations),
np.float_, len(segarr))
# Interval calculations
weights = cnarr['weight']
if 'ci' in interval_stats:
segarr['ci_lo'], segarr['ci_hi'] = segmetric_interval(segarr, cnarr,
segarr['ci_lo'], segarr['ci_hi'] = calc_intervals(bins_log2s, weights,
stat_funcs['ci'])
if 'pi' in interval_stats:
segarr['pi_lo'], segarr['pi_hi'] = segmetric_interval(segarr, cnarr,
segarr['pi_lo'], segarr['pi_hi'] = calc_intervals(bins_log2s, weights,
stat_funcs['pi'])
return segarr
def segment_mean(cnarr, skip_low=False):
"""Weighted average of bin log2 values."""
if skip_low:
cnarr = cnarr.drop_low_coverage()
if len(cnarr) == 0:
return np.nan
if 'weight' in cnarr:
return np.average(cnarr['log2'], weights=cnarr['weight'])
return cnarr['log2'].mean()
def make_ci_func(alpha, bootstraps):
def ci_func(ser, wt):
return confidence_interval_bootstrap(ser, wt, alpha, bootstraps)
return ci_func
def segmetric_interval(segarr, cnarr, func):
"""Compute a stat that yields intervals (low & high values)"""
out_vals_lo = np.repeat(np.nan, len(segarr))
out_vals_hi = np.repeat(np.nan, len(segarr))
for i, (_segment, bins) in enumerate(cnarr.by_ranges(segarr)):
if len(bins):
out_vals_lo[i], out_vals_hi[i] = func(bins)
return out_vals_lo, out_vals_hi
def prediction_interval(bins, alpha):
def make_pi_func(alpha):
"""Prediction interval, estimated by percentiles."""
# ENH: weighted percentile
pct_lo = 100 * alpha / 2
pct_hi = 100 * (1 - alpha / 2)
# ENH: weighted percentile
return np.percentile(bins['log2'], [pct_lo, pct_hi])
def pi_func(ser, _w):
return np.percentile(ser, [pct_lo, pct_hi])
return pi_func
def confidence_interval_bootstrap(bins, alpha, bootstraps=100, smoothed=True):
def calc_intervals(bins_log2s, weights, func):
"""Compute a stat that yields intervals (low & high values)"""
out_vals_lo = np.repeat(np.nan, len(bins_log2s))
out_vals_hi = np.repeat(np.nan, len(bins_log2s))
for i, ser in enumerate(bins_log2s):
if len(ser):
wt = weights[ser.index]
assert (wt.index == ser.index).all()
out_vals_lo[i], out_vals_hi[i] = func(ser.values, wt.values)
return out_vals_lo, out_vals_hi
def confidence_interval_bootstrap(values, weights, alpha, bootstraps=100, smoothed=True):
"""Confidence interval for segment mean log2 value, estimated by bootstrap."""
if not 0 < alpha < 1:
raise ValueError("alpha must be between 0 and 1; got %s" % alpha)
......@@ -97,55 +102,58 @@ def confidence_interval_bootstrap(bins, alpha, bootstraps=100, smoothed=True):
"%f; increasing to %d", bootstraps, alpha, new_boots)
bootstraps = new_boots
# Bootstrap for CI
k = len(bins)
k = len(values)
if k < 2:
return np.array([bins["log2"][0], bins["log2"][0]])
return np.repeat(values[0], 2)
np.random.seed(0xA5EED)
rand_indices = np.random.randint(0, k, (bootstraps, k))
samples = [bins.data.take(idx) for idx in rand_indices]
rand_indices = np.random.randint(0, k, size=(bootstraps, k))
samples = ((np.take(values, idx), np.take(weights, idx))
for idx in rand_indices)
if smoothed:
# samples = _smooth_samples(bins, samples, alpha)
# samples = _smooth_samples(values, samples, alpha)
pass
# Recalculate segment means
bootstrap_dist = np.array([segment_mean(samp) for samp in samples])
seg_means = (np.average(val, weights=wt)
for val, wt in samples)
bootstrap_dist = np.fromiter(seg_means, np.float_, bootstraps)
alphas = np.array([alpha / 2, 1 - alpha / 2])
if not smoothed:
# alphas = _bca_correct_alpha(bins, bootstrap_dist, alphas)
# alphas = _bca_correct_alpha(values, weights, bootstrap_dist, alphas)
pass
ci = np.percentile(bootstrap_dist, list(100 * alphas))
return ci
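In plain NumPy, the core of the bootstrap above (resample the bins with replacement, recompute the weighted segment mean for each resample, then take the alpha/2 and 1 - alpha/2 percentiles) looks roughly like this sketch with hypothetical bin values and weights; the smoothing and BCa steps are omitted:
import numpy as np
values = np.array([-0.10, 0.05, 0.20, -0.30, 0.10])  # bin log2 ratios (made up)
weights = np.array([1.0, 0.8, 0.9, 0.5, 1.0])
alpha, bootstraps = 0.05, 200
rng = np.random.RandomState(0xA5EED)
idx = rng.randint(0, len(values), size=(bootstraps, len(values)))
boot_means = np.array([np.average(values[i], weights=weights[i]) for i in idx])
ci_lo, ci_hi = np.percentile(boot_means, [100 * alpha / 2, 100 * (1 - alpha / 2)])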
def _smooth_samples(bins, samples, alpha):
k = len(bins)
def _smooth_samples(values, samples, alpha):
k = len(values)
# Essentially, resample from a kernel density estimate of the data
# instead of the original data.
# Estimate KDE bandwidth (Polansky 1995)
resids = bins['log2'] - bins['log2'].mean()
resids = values - values.mean()
s_hat = 1/k * (resids**2).sum() # sigma^2 = E[X-theta]^2
y_hat = 1/k * abs((resids**3).sum()) # gamma = E[X-theta]^3
z = stats.norm.ppf(alpha / 2) # or alpha?
bw = k**(-1/4) * np.sqrt(y_hat*(z**2 + 2) / (3*s_hat*z))
# NB: or, Silverman's Rule for KDE bandwidth (roughly):
# std = interquartile_range(bins['log2']) / 1.34
# std = interquartile_range(values) / 1.34
# bw = std * (k*3/4) ** (-1/5)
if bw > 0:
samples = [samp.assign(log2=lambda x:
x['log2'] + bw * np.random.randn(k))
for samp in samples]
# Unpack the (log-ratio, weight) tuple to retain weights
samples = [(v + bw * np.random.randn(k), w)
for v, w in samples]
logging.debug("Smoothing worked for this segment (bw=%s)", bw)
else:
logging.debug("Smoothing not needed for this segment (bw=%s)", bw)
return samples
def _bca_correct_alpha(bins, bootstrap_dist, alphas):
def _bca_correct_alpha(values, weights, bootstrap_dist, alphas):
# BCa correction (Efron 1987, "Better Bootstrap Confidence Intervals")
# http://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478410
# Ported from R package "bootstrap" function "bcanon"
n_boots = len(bootstrap_dist)
orig_mean = segment_mean(bins)
orig_mean = np.average(values, weights=weights)
logging.warning("boot samples less: %s / %s",
(bootstrap_dist < orig_mean).sum(),
n_boots)
......@@ -159,8 +167,10 @@ def _bca_correct_alpha(bins, bootstrap_dist, alphas):
z0 = stats.norm.ppf((bootstrap_dist < orig_mean).sum() / n_boots)
zalpha = stats.norm.ppf(alphas)
# Jackknife influence values
u = np.array([segment_mean(bins.concat([bins[:i], bins[i+1:]]))
for i in range(len(bins))])
u = np.array([np.average(np.concatenate([values[:i], values[i+1:]]),
weights=np.concatenate([weights[:i],
weights[i+1:]]))
for i in range(len(values))])
uu = u.mean() - u
acc = (u**3).sum() / (6 * (uu**2).sum()**1.5)
alphas = stats.norm.cdf(z0 + (z0 + zalpha)
......@@ -170,3 +180,14 @@ def _bca_correct_alpha(bins, bootstrap_dist, alphas):
if not (0 < alphas[0] < 1 and 0 < alphas[1] < 1):
raise ValueError("CI alphas should be in (0,1); got %s" % alphas)
return alphas
def segment_mean(cnarr, skip_low=False):
"""Weighted average of bin log2 values."""
if skip_low:
cnarr = cnarr.drop_low_coverage()
if len(cnarr) == 0:
return np.nan
if 'weight' in cnarr:
return np.average(cnarr['log2'], weights=cnarr['weight'])
return cnarr['log2'].mean()
......@@ -31,6 +31,8 @@ def _width2wing(width, x, min_wing=3):
if 0 < width < 1:
wing = int(math.ceil(len(x) * width * 0.5))
elif width >= 2 and int(width) == width:
# Ensure window width <= len(x) to avoid TypeError
width = min(width, len(x) - 1)
wing = int(width // 2)
else:
raise ValueError("width must be either a fraction between 0 and 1 "
......
"""Z-test for single-bin copy number alterations."""
from __future__ import division, print_function
import logging
import numpy as np
from scipy.stats import norm
from . import params
def do_ztest(cnarr, segments=None, is_male_reference=False,
is_sample_female=None, alpha=0.005, target_only=False):
"""Get a probability for each bin based on its Z-score.
Return bins where the probability < `alpha`.
"""
if segments:
# Subtract segment means to report only the CNA bins that weren't
# already detected (including exon-size CNAs within a larger-scale,
# smaller-amplitude CNA)
cnarr['log2'] = cnarr.residuals(segments)
else:
# Account for non-diploid sex chromosomes
cnarr['log2'] -= cnarr.expect_flat_log2(is_male_reference)
if is_sample_female:
# chrX has same ploidy as autosomes; chrY is just unusable noise
cnarr = cnarr[cnarr.chromosome != cnarr._chr_y_label]
else:
# 1/2 #copies of each sex chromosome
is_chr_xy = cnarr.chromosome.isin((cnarr._chr_x_label,
cnarr._chr_y_label))
cnarr[is_chr_xy, 'log2'] += 1.0
if target_only:
antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES)
if antitarget_idx.any():
logging.info("Ignoring", antitarget_idx.sum(), "off-target bins")
cnarr = cnarr[~antitarget_idx]
cnarr['ztest'] = z_prob(cnarr)
is_sig = cnarr['ztest'] < alpha
logging.info("Significant hits in {}/{} bins ({:.3g}%)"
.format(is_sig.sum(), len(is_sig),
100 * is_sig.sum() / len(is_sig)))
return cnarr[is_sig]
def z_prob(cnarr):
# Bin weights ~ 1-variance; bin log2 values already centered at 0.0
sd = np.sqrt(1 - cnarr['weight'])
# Convert to Z-scores
z = cnarr['log2'] / sd
# Two-sided survival function (1-CDF) probability
p = 2. * norm.cdf(-np.abs(z))
# Similar to the above -- which is better?
# p2 = 2 * norm.pdf(cnarr['log2'], loc=0, scale=sd)
# if not np.allclose(p, p2):
# print("Max diff:", np.abs(p - p2).max())
# print("Median diff:", np.median(np.abs(p - p2)))
# print("Ratio:", (p / p2).mean())
# Correct for multiple hypothesis tests
return p_adjust_bh(p)
def p_adjust_bh(p):
"""Benjamini-Hochberg p-value correction for multiple hypothesis testing."""
p = np.asfarray(p)
by_descend = p.argsort()[::-1]
by_orig = by_descend.argsort()
steps = float(len(p)) / np.arange(len(p), 0, -1)
q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend]))
return q[by_orig]
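A quick check of p_adjust_bh with hypothetical p-values (not from the source): for raw p-values 0.01, 0.02, 0.03, 0.04 and n = 4, each adjusted value is the running minimum of p * n/rank taken from the largest rank downward, so all four come out as 0.04:
import numpy as np
raw_p = np.array([0.04, 0.01, 0.03, 0.02])
q = p_adjust_bh(raw_p)  # the function defined above
assert np.allclose(q, 0.04)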
cnvkit (0.9.5-1) unstable; urgency=medium
* Team upload.
* New upstream version
* debhelper 11
* Standards-Version: 4.2.0
-- Andreas Tille <tille@debian.org> Mon, 13 Aug 2018 13:02:24 +0200
cnvkit (0.9.3-2) unstable; urgency=medium
* Add missing build-dep on python3-future. (Closes: #901877)
......
Source: cnvkit
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Michael R. Crusoe <michael.crusoe@gmail.com>,
Steffen Moeller <moeller@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
dh-python,
python3-all,
......@@ -20,14 +20,15 @@ Build-Depends: debhelper (>= 11~),
python3-future,
r-cran-pscbs
# python-subprocess32 # required only for python2
Standards-Version: 4.1.4
Standards-Version: 4.2.0
Vcs-Browser: https://salsa.debian.org/med-team/cnvkit
Vcs-Git: https://salsa.debian.org/med-team/cnvkit.git
Homepage: http://cnvkit.readthedocs.org
Package: cnvkit
Architecture: all
Depends: ${python3:Depends}, ${misc:Depends}
Depends: ${python3:Depends},
${misc:Depends}
Description: Copy number variant detection from targeted DNA sequencing
A command-line toolkit and Python library for detecting copy number variants
and alterations genome-wide from targeted DNA sequencing. It is designed for
......
Description: enable building with Python3
Index: cnvkit/test/Makefile
===================================================================
--- cnvkit.orig/test/Makefile
+++ cnvkit/test/Makefile
--- a/test/Makefile
+++ b/test/Makefile
@@ -3,7 +3,9 @@
# Dependency: pdfunite (poppler-utils)
# (Otherwise, all-scatters.pdf and all-diagrams.pdf will be empty files.)
......@@ -29,107 +27,88 @@ Index: cnvkit/test/Makefile
# ------------------------------------------------------------------------------
Index: cnvkit/test/test_cnvlib.py
===================================================================
--- cnvkit.orig/test/test_cnvlib.py
+++ cnvkit/test/test_cnvlib.py
--- a/test/test_cnvlib.py
+++ b/test/test_cnvlib.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for the CNVkit library, cnvlib."""
from __future__ import absolute_import, division, print_function
Index: cnvkit/test/test_genome.py
===================================================================
--- cnvkit.orig/test/test_genome.py
+++ cnvkit/test/test_genome.py
import sys
--- a/test/test_genome.py
+++ b/test/test_genome.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for the 'genome' sub-package."""
from __future__ import absolute_import, division, print_function
import random
Index: cnvkit/test/test_io.py
===================================================================
--- cnvkit.orig/test/test_io.py
+++ cnvkit/test/test_io.py
--- a/test/test_io.py
+++ b/test/test_io.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for the CNVkit library, cnvlib."""
from __future__ import absolute_import, division, print_function
Index: cnvkit/test/test_r.py
===================================================================
--- cnvkit.orig/test/test_r.py
+++ cnvkit/test/test_r.py
--- a/test/test_r.py
+++ b/test/test_r.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for CNVkit that require an R installation."""
from __future__ import absolute_import, division, print_function
Index: cnvkit/cnvkit.py
===================================================================
--- cnvkit.orig/cnvkit.py
+++ /dev/null
@@ -1,1 +1,1 @@
--- a/cnvkit.py
+++ b/cnvkit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
Index: cnvkit/scripts/cnv_annotate.py
===================================================================
--- cnvkit.orig/scripts/cnv_annotate.py
+++ cnvkit/scripts/cnv_annotate.py
"""Command-line interface for CNVkit, the Copy Number Variation toolkit."""
from future import standard_library
--- a/scripts/cnv_annotate.py
+++ b/scripts/cnv_annotate.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Update gene names in CNVkit .cnn/.cnr files.
"""
Index: cnvkit/scripts/cnv_updater.py
===================================================================
--- cnvkit.orig/scripts/cnv_updater.py
+++ cnvkit/scripts/cnv_updater.py
--- a/scripts/cnv_updater.py
+++ b/scripts/cnv_updater.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Update .cnn/.cnr files from older CNVkit versions to match current defaults.
CNVkit v0.8.0 and later uses a 'depth' column in the *.targetcoverage.cnn and
Index: cnvkit/scripts/cnv_ztest.py
===================================================================
--- cnvkit.orig/scripts/cnv_ztest.py
+++ cnvkit/scripts/cnv_ztest.py
--- a/scripts/cnv_ztest.py
+++ b/scripts/cnv_ztest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Z-test for single-bin copy number alterations."""
from __future__ import division, print_function
Index: cnvkit/scripts/guess_baits.py
===================================================================
--- cnvkit.orig/scripts/guess_baits.py
+++ cnvkit/scripts/guess_baits.py
--- a/scripts/guess_baits.py
+++ b/scripts/guess_baits.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Guess the coordinates of captured regions from sample read depths.
Two approaches available:
Index: cnvkit/scripts/reference2targets.py
===================================================================
--- cnvkit.orig/scripts/reference2targets.py
+++ cnvkit/scripts/reference2targets.py
--- a/scripts/reference2targets.py
+++ b/scripts/reference2targets.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Extract target and antitarget BED files from a CNVkit reference file.
Index: cnvkit/scripts/skg_convert.py
===================================================================
--- cnvkit.orig/scripts/skg_convert.py
+++ cnvkit/scripts/skg_convert.py
--- a/scripts/skg_convert.py
+++ b/scripts/skg_convert.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
......
......@@ -19,10 +19,11 @@ requirements:
- atlas # [not osx]
- biopython >=1.62
- future >=0.15.2
- futures >=3.0 # [py2k]
- futures >=3.0 # [py27]
- matplotlib >=1.3.1
- numpy >=1.9
- pandas >=0.18.1
- python-dateutil >=2.5.0
- pyfaidx >=0.4.7
- pysam >=0.10.0
- reportlab >=3.0
......