language: c
sudo: false
os:
- osx
- linux
env:
# Earliest supported versions
- CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
# The latest versions, whatever they are
- CONDA_PY=3.6 PANDAS='*' PYSAM='*'
install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install md5sha1sum; fi
- source devtools/travis-ci/install_miniconda.sh
before_script:
- conda create -q -y --name testenv python=$CONDA_PY
- conda create -yq --name testenv python=$CONDA_PY
- source activate testenv
- conda build -c bioconda --python $CONDA_PY ./devtools/conda-recipe/ # Build the conda recipe for the latest dev
- conda install -c bioconda --use-local -yq cnvkit # Install the locally built package and its requirements
- conda install -c bioconda -yq pandas==$PANDAS pysam==$PYSAM # Install the versions desired for testing
- conda remove cnvkit -y -q # Remove the conda cnvkit package but keep the dependencies installed
- python setup.py install # Install directly from source
- if [[ "$CONDA_PY" == "2.7" ]]; then conda install -yq futures>=3.0; fi
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then conda install -yq -c bioconda atlas; fi
# Install the versions desired for testing
- conda install -yq -c bioconda
cython
future
matplotlib
numpy
pyfaidx
pysam
reportlab
scipy
pandas==$PANDAS
pysam==$PYSAM
# R linking is broken on OS X - try recompilation instead of conda there
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then
conda install -yq -c bioconda bioconductor-dnacopy r-cghflasso;
fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
conda install -yq -c bioconda r-base;
Rscript -e "source('http://bioconductor.org/biocLite.R'); biocLite(c('DNAcopy', 'cghFLasso'))";
fi
# hmmlearn 0.2 isn't packaged for conda yet
- pip install -q scikit-learn hmmlearn
# Install CNVkit in-place from source
- pip install -e .
- cd test/
# For codecov.io
- pip install codecov
......@@ -21,18 +52,10 @@ script:
- coverage run test_io.py
- coverage run -a test_genome.py
- coverage run -a test_cnvlib.py
- coverage run -a test_r.py
#- coverage run -a test_r.py
# OS X can't install cghFLasso package?
- if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then coverage run -a test_r.py; fi
after_success:
- coverage report
- codecov
os:
# - osx
- linux
env:
- CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=2.7 PANDAS=0.22.0 PYSAM=0.13.0
- CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.22.0 PYSAM=0.13.0
Copyright (c) 2013-2017 Eric Talevich
Copyright (c) 2013-2018 Eric Talevich
Copyright (c) 2013-2018 University of California
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......
......@@ -78,11 +78,20 @@ packages.
This approach is preferred on Mac OS X, and is a solid choice on Linux, too.
To download and install CNVkit and its Python dependencies::
To download and install CNVkit and its Python dependencies in a clean
environment::
# Configure the sources where conda will find packages
conda config --add channels defaults
conda config --add channels conda-forge
conda config --add channels bioconda
# Install CNVkit in a new environment named "cnvkit"
conda create -n cnvkit cnvkit
# Activate the environment with CNVkit installed:
source activate cnvkit
Or, in an existing environment::
conda install cnvkit
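To sanity-check the installation (a minimal sketch; this assumes the ``cnvkit``
environment created above is active), print the version and list the available
subcommands::
cnvkit.py version
cnvkit.py --help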
......@@ -143,7 +152,7 @@ Then install the rest of CNVkit's dependencies::
Alternatively, you can use `Homebrew <http://brew.sh/>`_ to install an
up-to-date Python (e.g. ``brew install python``) and as many of the Python
packages as possible (primarily NumPy, SciPy, matplotlib and pandas).
packages as possible (primarily NumPy and SciPy; ideally matplotlib and pandas).
Then, proceed with pip::
pip install numpy scipy pandas matplotlib reportlab biopython pyfaidx pysam pyvcf
......
__version__ = "0.9.3"
__version__ = "0.9.5"
......@@ -9,6 +9,7 @@ import pandas as pd
from skgenome import tabio, GenomicArray as GA
from . import coverage, samutil
from .antitarget import compare_chrom_names
from .descriptives import weighted_median
......@@ -47,6 +48,14 @@ def do_autobin(bam_fname, method, targets=None, access=None,
((target depth, target avg. bin size),
(antitarget depth, antitarget avg. bin size))
"""
if method in ('amplicon', 'hybrid'):
if targets is None:
raise ValueError("Target regions are required for method %r "
"but were not provided." % method)
if not len(targets):
raise ValueError("Target regions are required for method %r "
"but were not provided." % method)
# Closes over bp_per_bin
def depth2binsize(depth, min_size, max_size):
if depth:
......@@ -95,7 +104,9 @@ def hybrid(rc_table, read_len, bam_fname, targets, access=None):
"""Hybrid capture sequencing."""
# Identify off-target regions
if access is None:
access = idxstats2ga(rc_table)
access = idxstats2ga(rc_table, bam_fname)
# Verify BAM chromosome names match those in target BED
compare_chrom_names(access, targets)
antitargets = access.subtract(targets)
# Only examine chromosomes present in all 2-3 input datasets
rc_table, targets, antitargets = shared_chroms(rc_table, targets,
......@@ -126,9 +137,10 @@ def average_depth(rc_table, read_length):
return weighted_median(mean_depths, rc_table.length)
def idxstats2ga(table):
def idxstats2ga(table, bam_fname):
return GA(table.assign(start=0, end=table.length)
.loc[:, ('chromosome', 'start', 'end')])
.loc[:, ('chromosome', 'start', 'end')],
meta_dict={'filename': bam_fname})
def sample_region_cov(bam_fname, regions, max_num=100):
......
......@@ -276,7 +276,7 @@ class CopyNumArray(GenomicArray):
self._chr_y_label, stats['chry_ratio'],
stats['chrx_male_lr'], stats['chry_male_lr'],
stats['chrx_male_lr'] * stats['chry_male_lr'],
('female', 'male')[is_xy])
'male' if is_xy else 'female')
return ~is_xy
def compare_sex_chromosomes(self, male_reference=False, skip_low=False):
......
......@@ -163,7 +163,11 @@ P_batch.add_argument('-p', '--processes',
parallel. Without an argument, use the maximum number of
available CPUs. [Default: process each BAM in serial]""")
P_batch.add_argument("--rlibpath", metavar="DIRECTORY",
help="Path to an alternative site-library to use for R packages.")
help=argparse.SUPPRESS)
P_batch.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
[Default: %(default)s]""")
# Reference-building options
P_batch_newref = P_batch.add_argument_group(
......@@ -401,20 +405,25 @@ P_autobin.add_argument('-g', '--access', metavar="FILENAME",
P_autobin.add_argument('-t', '--targets',
help="""Potentially targeted genomic regions, e.g. all possible exons
for the reference genome. Format: BED, interval list, etc.""")
P_autobin.add_argument('-b', '--bp-per-bin', type=float, default=100000.,
P_autobin.add_argument('-b', '--bp-per-bin',
type=float, default=100000.,
help="""Desired average number of sequencing read bases mapped to each
bin. [Default: %(default)s]""")
P_autobin.add_argument('--target-max-size', type=int, default=20000,
help="Maximum size of target bins.")
P_autobin.add_argument('--target-min-size', type=int, default=20,
help="Minimum size of target bins.")
P_autobin.add_argument('--antitarget-max-size', type=int, default=500000,
help="Maximum size of antitarget bins.")
P_autobin.add_argument('--antitarget-min-size', type=int, default=500,
help="Minimum size of antitarget bins.")
P_autobin.add_argument('--annotate',
P_autobin.add_argument('--target-max-size', metavar="BASES",
type=int, default=20000,
help="Maximum size of target bins. [Default: %(default)s]")
P_autobin.add_argument('--target-min-size', metavar="BASES",
type=int, default=20,
help="Minimum size of target bins. [Default: %(default)s]")
P_autobin.add_argument('--antitarget-max-size', metavar="BASES",
type=int, default=500000,
help="Maximum size of antitarget bins. [Default: %(default)s]")
P_autobin.add_argument('--antitarget-min-size', metavar="BASES",
type=int, default=500,
help="Minimum size of antitarget bins. [Default: %(default)s]")
P_autobin.add_argument('--annotate', metavar="FILENAME",
help="""Use gene models from this file to assign names to the target
regions. Format: UCSC refFlat.txt or ensFlat.txt file
(preferred), or BED, interval list, GFF, or similar.""")
......@@ -616,6 +625,7 @@ def _cmd_segment(args):
skip_outliers=args.drop_outliers,
save_dataframe=bool(args.dataframe),
rlibpath=args.rlibpath,
rscript_path=args.rscript_path,
processes=args.processes)
if args.dataframe:
segments, dframe = results
......@@ -648,14 +658,18 @@ P_segment.add_argument('-t', '--threshold', type=float,
P_segment.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_segment.add_argument("--drop-outliers",
type=float, default=10, metavar="FACTOR",
P_segment.add_argument("--drop-outliers", metavar="FACTOR",
type=float, default=10,
help="""Drop outlier bins more than this many multiples of the 95th
quantile away from the average within a rolling window.
Set to 0 for no outlier filtering.
[Default: %(default)g]""")
P_segment.add_argument("--rlibpath", metavar="DIRECTORY",
help="Path to an alternative site-library to use for R packages.")
help=argparse.SUPPRESS)
P_segment.add_argument("--rscript-path", metavar="PATH", default="Rscript",
help="""Path to the Rscript excecutable to use for running R code.
Use this option to specify a non-default R installation.
[Default: %(default)s]""")
P_segment.add_argument('-p', '--processes',
nargs='?', type=int, const=0, default=1,
help="""Number of subprocesses to segment in parallel.
......@@ -701,7 +715,8 @@ def _cmd_call(args):
logging.info("Shifting log2 values by %f", -args.center_at)
cnarr['log2'] -= args.center_at
elif args.center:
cnarr.center_all(args.center, verbose=True)
cnarr.center_all(args.center, skip_low=args.drop_low_coverage,
verbose=True)
varr = load_het_snps(args.vcf, args.sample_id, args.normal_id,
args.min_variant_depth, args.zygosity_freq)
......@@ -747,6 +762,9 @@ P_call.add_argument("--ploidy", type=int, default=2,
help="Ploidy of the sample cells. [Default: %(default)d]")
P_call.add_argument("--purity", type=float,
help="Estimated tumor cell fraction, a.k.a. purity or cellularity.")
P_call.add_argument("--drop-low-coverage", action='store_true',
help="""Drop very-low-coverage bins before segmentation to avoid
false-positive deletions in poor-quality tumor samples.""")
P_call.add_argument('-x', '--sample-sex', '-g', '--gender', dest='sample_sex',
choices=('m', 'y', 'male', 'Male', 'f', 'x', 'female', 'Female'),
help="""Specify the sample's chromosomal sex as male or female.
......@@ -1176,8 +1194,8 @@ def do_sex(cnarrs, is_male_reference):
is_xy, stats = cna.compare_sex_chromosomes(is_male_reference)
return (cna.meta["filename"] or cna.sample_id,
"Male" if is_xy else "Female",
strsign(stats['chrx_ratio']),
strsign(stats['chry_ratio']))
strsign(stats['chrx_ratio']) if stats else "NA",
strsign(stats['chry_ratio']) if stats else "NA")
rows = (guess_and_format(cna) for cna in cnarrs)
columns = ["sample", "sex", "X_logratio", "Y_logratio"]
......@@ -1440,7 +1458,7 @@ def _cmd_import_rna(args):
"""Convert a cohort of per-gene log2 ratios to CNVkit .cnr format."""
all_data, cnrs = import_rna.do_import_rna(
args.gene_counts, args.format, args.gene_resource, args.correlations,
args.normal)
args.normal, args.do_gc, args.do_txlen, args.max_log2)
logging.info("Writing output files")
if args.output:
all_data.to_csv(args.output, sep='\t', index=True)
......@@ -1470,7 +1488,10 @@ P_import_rna.add_argument('-g', '--gene-resource',
P_import_rna.add_argument('-c', '--correlations', metavar="FILE",
help="""Correlation of each gene's copy number with
expression. Output of cnv_expression_correlate.py.""")
P_import_rna.add_argument('-n', '--normal', nargs='+',
P_import_rna.add_argument('--max-log2', metavar="FLOAT", default=3.0,
help="""Maximum log2 value in output. Observed values above this limit
will be replaced with this value.""")
P_import_rna.add_argument('-n', '--normal', nargs='+', default=[],
help="""Normal samples (same format as `gene_counts`) to be used as a
control when normalizing and re-centering gene read depth
ratios. All filenames following this option will be used.""")
......@@ -1480,6 +1501,14 @@ P_import_rna.add_argument('-d', '--output-dir',
sample. [Default: %(default)s]""")
P_import_rna.add_argument('-o', '--output', metavar="FILE",
help="Output file name (summary table).")
P_import_rna_bias = P_import_rna.add_argument_group(
"To disable specific automatic bias corrections")
P_import_rna_bias.add_argument('--no-gc', dest='do_gc', action='store_false',
help="Skip GC correction.")
P_import_rna_bias.add_argument('--no-txlen', dest='do_txlen', action='store_false',
help="Skip transcript length correction.")
P_import_rna.set_defaults(func=_cmd_import_rna)
......@@ -1504,10 +1533,15 @@ def _cmd_export_bed(args):
# ENH: args.sample_sex as a comma-separated list
is_sample_female = verify_sample_sex(segments, args.sample_sex,
args.male_reference)
if args.sample_id:
label = args.sample_id
elif args.label_genes:
label = None
else:
label = segments.sample_id
tbl = export.export_bed(segments, args.ploidy,
args.male_reference, is_sample_female,
args.sample_id or segments.sample_id,
args.show)
label, args.show)
bed_tables.append(tbl)
table = pd.concat(bed_tables)
write_dataframe(args.output, table, header=False)
......@@ -1520,6 +1554,9 @@ P_export_bed.add_argument('segments', nargs='+',
P_export_bed.add_argument("-i", "--sample-id", metavar="LABEL",
help="""Identifier to write in the 4th column of the BED file.
[Default: use the sample ID, taken from the file name]""")
P_export_bed.add_argument('--label-genes', action='store_true',
help="""Show gene names in the 4th column of the BED file.
(This is a bad idea if more than one input file is given.)""")
P_export_bed.add_argument("--ploidy", type=int, default=2,
help="Ploidy of the sample cells. [Default: %(default)d]")
P_export_bed.add_argument('-x', '--sample-sex', '-g', '--gender',
......
......@@ -178,7 +178,7 @@ def export_bed(segments, ploidy, is_reference_male, is_sample_female,
the reference ploidy on autosomes, or half that on sex chromosomes.
"""
out = segments.data.loc[:, ["chromosome", "start", "end"]]
out["label"] = label
out["label"] = label if label else segments["gene"]
out["ncopies"] = (segments["cn"] if "cn" in segments
else call.absolute_pure(segments, ploidy, is_reference_male)
.round().astype('int'))
......
......@@ -9,14 +9,15 @@ from . import rna
def do_import_rna(gene_count_fnames, in_format, gene_resource_fname,
correlations_fname=None, normal_fnames=()):
correlations_fname=None, normal_fnames=(),
do_gc=True, do_txlen=True, max_log2=3):
"""Convert a cohort of per-gene read counts to CNVkit .cnr format.
The expected data source is TCGA gene-level expression counts for individual
samples, but other sources should be fine, too.
"""
# Deduplicate and ensure all normals are included in the analysis
gene_count_fnames = sorted(set(gene_count_fnames) + set(normal_fnames))
gene_count_fnames = sorted(set(list(gene_count_fnames) + list(normal_fnames)))
if in_format == 'rsem':
sample_counts, tx_lengths = aggregate_rsem(gene_count_fnames)
......@@ -43,7 +44,7 @@ def do_import_rna(gene_count_fnames, in_format, gene_resource_fname,
# CNVkit files have both absolute and log2-normalized read counts
cnrs = rna.attach_gene_info_to_cnr(sample_counts, sample_data_log2,
gene_info)
cnrs = (rna.correct_cnr(cnr) for cnr in cnrs)
cnrs = (rna.correct_cnr(cnr, do_gc, do_txlen, max_log2) for cnr in cnrs)
return all_data, cnrs
......
......@@ -230,11 +230,9 @@ def combine_probes(filenames, fa_fname, is_male_reference, sexes, skip_low,
logging.info("Loading target %s", fname)
cnarrx = read_cna(fname)
# Bin information should match across all files
if not (len(cnarr1) == len(cnarrx)
and (cnarr1.chromosome == cnarrx.chromosome).all()
and (cnarr1.start == cnarrx.start).all()
and (cnarr1.end == cnarrx.end).all()
and (cnarr1['gene'] == cnarrx['gene']).all()):
if not np.array_equal(
cnarr1.data.loc[:, ('chromosome', 'start', 'end', 'gene')].values,
cnarrx.data.loc[:, ('chromosome', 'start', 'end', 'gene')].values):
raise RuntimeError("%s bins do not match those in %s"
% (fname, filenames[0]))
all_depths.append(cnarrx['depth'] if 'depth' in cnarrx
......@@ -273,8 +271,14 @@ def warn_bad_bins(cnarr, max_name_width=50):
if len(fg_bad_bins) > 0:
bad_pct = (100 * len(fg_bad_bins)
/ sum(~cnarr['gene'].isin(params.ANTITARGET_ALIASES)))
logging.info("Targets: %d (%s) bins failed filters:",
len(fg_bad_bins), "%.4f" % bad_pct + '%')
logging.info("Targets: %d (%s) bins failed filters "
"(log2 < %s, log2 > %s, spread > %s)",
len(fg_bad_bins),
"%.4f" % bad_pct + '%',
params.MIN_REF_COVERAGE,
-params.MIN_REF_COVERAGE,
params.MAX_REF_SPREAD)
if len(fg_bad_bins) < 500:
gene_cols = min(max_name_width, max(map(len, fg_bad_bins['gene'])))
labels = fg_bad_bins.labels()
chrom_cols = max(labels.apply(len))
......
......@@ -4,9 +4,7 @@ Process per-gene expression levels, or the equivalent, by cohort.
"""
from __future__ import absolute_import, division, print_function
from builtins import zip
import logging
import sys
import numpy as np
import pandas as pd
......@@ -14,8 +12,7 @@ from scipy.stats import gmean
from .cnary import CopyNumArray as CNA
from .fix import center_by_window
# from .descriptives import biweight_midvariance
# from .params import NULL_LOG2_COVERAGE
NULL_LOG2_COVERAGE = -5
......@@ -37,10 +34,10 @@ def filter_probes(sample_counts):
For the purposes of copy number inference, these rows are best removed.
"""
gene_medians = sample_counts.median(axis=1)
## Make sure the gene has detectable transcript in at least half of samples
# Make sure the gene has detectable transcript in at least half of samples
is_mostly_transcribed = (gene_medians >= 1.0)
print("Dropping", (~is_mostly_transcribed).sum(), "/",
len(is_mostly_transcribed), "rarely expressed genes from input samples")
logging.info("Dropping %d / %d rarely expressed genes from input samples",
(~is_mostly_transcribed).sum(), len(is_mostly_transcribed))
return sample_counts[is_mostly_transcribed]
......@@ -74,8 +71,7 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
'tx_support': tsl2int,
'gc': lambda x: float(x)/100})
.sort_values('gene_id'))
print("Loaded", gene_resource, "with shape:", gene_info.shape,
file=sys.stderr)
logging.info("Loaded %s with shape: %s", gene_resource, gene_info.shape)
if corr_fname:
corr_table = load_cnv_expression_corr(corr_fname)
......@@ -85,10 +81,21 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
unique_idx = gi_corr.groupby('gene_id').apply(dedupe_ens_hugo)
gi_corr = gi_corr.loc[unique_idx]
# DBG verify that the tables were aligned correctly
# for colname in ('pearson_r', 'spearman_r', 'kendall_t'):
# print("In column", colname, "length", len(gi_corr),
# ", have", gi_corr[colname].count(), "not-NA,",
# (gi_corr[colname] > 0.1).sum(), ">0.1",
# ", max value", np.nanmax(gi_corr[colname]))
if not gene_info['entrez_id'].is_unique:
# Treat untraceable entrez_id duplicates (scarce at this point, ~15)
# as unknown correlation, i.e. default, as if not in TCGA
gi_corr.loc[tuple(locate_entrez_dupes(gi_corr)),
entrez_dupes_idx = tuple(locate_entrez_dupes(gi_corr))
logging.info("Resetting %d ambiguous genes' correlation "
"coefficients to default %f",
len(entrez_dupes_idx), default_r)
gi_corr.loc[entrez_dupes_idx,
('pearson_r', 'spearman_r', 'kendall_t')] = default_r
# Genes w/o TCGA info get default correlation 0.1 (= .5*corr.median())
......@@ -102,7 +109,7 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
unique_idx = gene_info.groupby('gene_id').apply(dedupe_ens_no_hugo)
gene_info = gene_info.loc[unique_idx]
print("Trimmed gene info table to shape:", gene_info.shape)
logging.info("Trimmed gene info table to shape: %s", gene_info.shape)
assert gene_info['gene_id'].is_unique
gene_info['entrez_id'] = gene_info['entrez_id'].fillna(0).astype('int')
return gene_info.set_index('gene_id')
......@@ -110,9 +117,9 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
def load_cnv_expression_corr(fname):
shared_key = 'Entrez_Gene_Id'
table = (pd.read_table(fname, dtype={shared_key: str}, na_filter=False)
table = (pd.read_table(fname, dtype={shared_key: int}, na_filter=False)
.set_index(shared_key))
print("Loaded", fname, "with shape:", table.shape, file=sys.stderr)
logging.info("Loaded %s with shape: %s", fname, table.shape)
return table
......@@ -242,8 +249,8 @@ def align_gene_info_to_samples(gene_info, sample_counts, tx_lengths,
Also calculate weights and add to gene_info as 'weight', along with
transcript lengths as 'tx_length'.
"""
print("Dimensions: gene_info=%s, sample_counts=%s"
% (gene_info.shape, sample_counts.shape))
logging.debug("Dimensions: gene_info=%s, sample_counts=%s",
gene_info.shape, sample_counts.shape)
sc, gi = sample_counts.align(gene_info, join='inner', axis=0)
gi = gi.sort_values(by=['chromosome', 'start'])
sc = sc.loc[gi.index]
......@@ -299,8 +306,6 @@ def normalize_read_depths(sample_depths, normal_ids):
Finally, convert to log2 ratios.
"""
# TODO use normal_ids as a control here
# e.g. subtract their IQR after the loop?
assert sample_depths.values.sum() > 0
sample_depths = sample_depths.fillna(0)
for _i in range(4):
......@@ -313,6 +318,31 @@ def normalize_read_depths(sample_depths, normal_ids):
# print(" round", _i, "grand mean:", sample_depths.values.mean(),
# ": sample-wise denom =", q3.mean(),
# ", gene-wise denom =", sm.mean())
if normal_ids:
# Use normal samples as a baseline for read depths
normal_ids = pd.Series(normal_ids)
if not normal_ids.isin(sample_depths.columns).all():
raise ValueError("Normal sample IDs not in samples: %s"
% normal_ids.drop(sample_depths.columns, errors='ignore'))
normal_depths = sample_depths.loc[:, normal_ids]
use_median = True # TODO - benchmark & pick one
if use_median:
# Simple approach: divide normalized (1=neutral) values by gene medians
normal_avg = normal_depths.median(axis=1)
sample_depths = sample_depths.divide(normal_avg, axis=0).clip_lower(0)
else:
# Alternate approach: divide their IQR
# At each gene, sample depths above the normal sample depth 75%ile
# are divided by the 75%ile, those below 25%ile are divided by
# 25%ile, and those in between are reset to neutral (=1.0)
n25, n75 = np.nanpercentile(normal_depths.values, [25, 75], axis=1)
below_idx = sample_depths.values < n25[:, np.newaxis]
above_idx = sample_depths.values > n75[:, np.newaxis]
factor = np.zeros_like(sample_depths.values)
factor[below_idx] = (below_idx / n25[:, np.newaxis])[below_idx]
factor[above_idx] = (above_idx / n75[:, np.newaxis])[above_idx]
sample_depths *= factor
sample_depths[~(below_idx | above_idx)] = 1.0
# Finally, convert normalized read depths to log2 scale
return safe_log2(sample_depths, NULL_LOG2_COVERAGE)
......@@ -349,7 +379,7 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info,
# Fill NA fields with the lowest finite value in the same row.
# Only use NULL_LOG2_COVERAGE if all samples are NA / zero-depth.
gene_minima = sample_data_log2.min(axis=1, skipna=True)
assert not gene_minima.hasnans
assert not gene_minima.hasnans, gene_minima.head()
for (sample_id, sample_col), (_sid_log2, sample_log2) \
in zip(sample_counts.iteritems(), sample_data_log2.iteritems()):
tx_len = cnr_info.tx_length
......@@ -362,7 +392,7 @@ def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info,
yield cnr
def correct_cnr(cnr):
def correct_cnr(cnr, do_gc, do_txlen, max_log2):
"""Apply bias corrections & smoothing.
- Biases: 'gc', 'length'
......@@ -370,9 +400,12 @@ def correct_cnr(cnr):
"""
cnr.center_all()
# Biases, similar to stock CNVkit
if 'gc' in cnr:
if any((do_gc, do_txlen)):
if do_gc and 'gc' in cnr:
cnr = center_by_window(cnr, .1, cnr['gc'])
if 'tx_length' in cnr:
if do_txlen and 'tx_length' in cnr:
cnr = center_by_window(cnr, .1, cnr['tx_length'])
cnr.center_all()
if max_log2:
cnr[cnr['log2'] > max_log2, 'log2'] = max_log2
return cnr
"""Segmentation of copy number values."""
from __future__ import absolute_import, division, print_function
from builtins import map
from builtins import map, zip
import locale
import logging
......@@ -12,6 +12,7 @@ import numpy as np
import pandas as pd
from Bio._py3k import StringIO
from skgenome import tabio
from skgenome.intersect import iter_slices
from .. import core, parallel, params, smoothing, vary
from ..cnary import CopyNumArray as CNA
......@@ -22,14 +23,15 @@ from . import cbs, flasso, haar, hmm, none
def do_segmentation(cnarr, method, threshold=None, variants=None,
skip_low=False, skip_outliers=10, min_weight=0,
save_dataframe=False, rlibpath=None,
rscript_path="Rscript",
processes=1):
"""Infer copy number segments from the given coverage table."""
if variants:
variants = variants.heterozygous()
if not threshold:
threshold = {'cbs': 0.0001,
'flasso': 0.005,
'haar': 0.001,
'flasso': 0.0001,
'haar': 0.0001,
}.get(method)
msg = "Segmenting with method " + repr(method)
if threshold is not None:
......@@ -46,7 +48,8 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
# ENH segment p/q arms separately
# -> assign separate identifiers via chrom name suffix?
cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
skip_outliers, min_weight, save_dataframe, rlibpath)
skip_outliers, min_weight, save_dataframe,
rlibpath, rscript_path)
if save_dataframe:
cna, rstr = cna
rstr = _to_str(rstr)
......@@ -55,7 +58,7 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
with parallel.pick_pool(processes) as pool:
rets = list(pool.map(_ds, ((ca, method, threshold, variants,
skip_low, skip_outliers, min_weight,
save_dataframe, rlibpath)
save_dataframe, rlibpath, rscript_path)
for _, ca in cnarr.by_arm())))
if save_dataframe:
# rets is a list of (CNA, R dataframe string) -- unpack
......@@ -86,7 +89,8 @@ def _ds(args):
def _do_segmentation(cnarr, method, threshold, variants=None,
skip_low=False, skip_outliers=10, min_weight=0,
save_dataframe=False, rlibpath=None):
save_dataframe=False,
rlibpath=None, rscript_path="Rscript"):
"""Infer copy number segments from the given coverage table."""
if not len(cnarr):
return cnarr
......@@ -152,7 +156,7 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
}
with core.temp_write_text(rscript % script_strings,
mode='w+t') as script_fname:
seg_out = core.call_quiet('Rscript', '--vanilla', script_fname)
seg_out = core.call_quiet(rscript_path, '--vanilla', script_fname)
# Convert R dataframe contents (SEG) to a proper CopyNumArray
# NB: Automatically shifts 'start' back from 1- to 0-indexed
segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
......@@ -163,7 +167,6 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
else:
segarr['weight'] = 1.0
segarr = squash_by_groups(segarr, segarr['log2'], by_arm=True)
segarr = repair_segments(segarr, cnarr)
else:
raise ValueError("Unknown method %r" % method)
......@@ -176,8 +179,7 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
segarr = segarr.as_dataframe(pd.concat(newsegs))
segarr['baf'] = variants.baf_by_ranges(segarr)
segarr['gene'], segarr['weight'], segarr['depth'] = \
transfer_fields(segarr, cnarr)
segarr = transfer_fields(segarr, cnarr)
if save_dataframe:
return segarr, seg_out
else:
......@@ -214,40 +216,16 @@ def transfer_fields(segments, cnarr, ignore=params.IGNORE_GENE_NAMES):
Segment gene name is the comma-separated list of bin gene names. Segment
weight is the sum of bin weights, and depth is the (weighted) mean of bin
depths.
"""
if not len(cnarr):
return [], [], []
ignore += params.ANTITARGET_ALIASES
if 'weight' not in cnarr:
cnarr['weight'] = 1
if 'depth' not in cnarr:
cnarr['depth'] = np.exp2(cnarr['log2'])
seggenes = ['-'] * len(segments)
segweights = np.zeros(len(segments))
segdepths = np.zeros(len(segments))
for i, (_seg, subprobes) in enumerate(cnarr.by_ranges(segments)):
if not len(subprobes):
continue
segweights[i] = subprobes['weight'].sum()
if subprobes['weight'].sum() > 0:
segdepths[i] = np.average(subprobes['depth'], weights=subprobes['weight'])
subgenes = [g for g in pd.unique(subprobes['gene']) if g not in ignore]
if subgenes:
seggenes[i] = ",".join(subgenes)
return seggenes, segweights, segdepths
# TODO/ENH combine with transfer_fields
# Recalculate segment means, weights, depths here instead of in R
def repair_segments(segments, orig_probes):
"""Post-process segmentation output.
Also: Post-process segmentation output.
1. Ensure every chromosome has at least one segment.
2. Ensure first and last segment ends match 1st/last bin ends
(but keep log2 as-is).
"""
def make_null_segment(chrom, orig_start, orig_end):
"""Closes over 'segments'."""
vals = {'chromosome': chrom,
'start': orig_start,
'end': orig_end,
......@@ -260,19 +238,60 @@ def repair_segments(segments, orig_probes):
row_vals = tuple(vals[c] for c in segments.data.columns)
return row_vals
# Adjust segment endpoints on each chromosome
segments = segments.copy()
extra_segments = []
for chrom, subprobes in orig_probes.by_chromosome():
chr_seg_idx = np.where(segments.chromosome == chrom)[0]
orig_start = subprobes[0, 'start']
orig_end = subprobes[len(subprobes)-1, 'end']
if len(chr_seg_idx):
segments[chr_seg_idx[0], 'start'] = orig_start
segments[chr_seg_idx[-1], 'end'] = orig_end
if not len(cnarr):
# This Should Never Happen (TM)
# raise RuntimeError("No bins for:\n" + str(segments.data))
logging.warn("No bins for:\n%s", segments.data)
return segments
# Adjust segment endpoints to cover the chromosome arm's original bins
# (Stretch first and last segment endpoints to match first/last bins)
bins_chrom = cnarr.chromosome.iat[0]
bins_start = cnarr.start.iat[0]
bins_end = cnarr.end.iat[-1]
if not len(segments):
# All bins in this chromosome arm were dropped: make a dummy segment
return make_null_segment(bins_chrom, bins_start, bins_end)
segments.start.iat[0] = bins_start
segments.end.iat[-1] = bins_end
# Aggregate segment depths, weights, gene names
# ENH refactor so that np/CNA.data access is encapsulated in skgenome
ignore += params.ANTITARGET_ALIASES
assert bins_chrom == segments.chromosome.iat[0]
cdata = cnarr.data.reset_index()
if 'depth' not in cdata.columns:
cdata['depth'] = np.exp2(cnarr['log2'].values)
bin_genes = cdata['gene'].values
bin_weights = cdata['weight'].values if 'weight' in cdata.columns else None
bin_depths = cdata['depth'].values
seg_genes = ['-'] * len(segments)
seg_weights = np.zeros(len(segments))
seg_depths = np.zeros(len(segments))
for i, bin_idx in enumerate(iter_slices(cdata, segments.data, 'outer', False)):
if bin_weights is not None:
seg_wt = bin_weights[bin_idx].sum()
if seg_wt > 0:
seg_dp = np.average(bin_depths[bin_idx],
weights=bin_weights[bin_idx])
else:
seg_dp = 0.0
else:
bin_count = len(cdata.iloc[bin_idx])
seg_wt = float(bin_count)
seg_dp = bin_depths[bin_idx].mean()
subgenes = [g for g in pd.unique(bin_genes[bin_idx]) if g not in ignore]
if subgenes:
seg_gn = ",".join(subgenes)
else:
extra_segments.append(
make_null_segment(chrom, orig_start, orig_end))
if extra_segments:
segments.add(segments.as_rows(extra_segments))
seg_gn = '-'
seg_genes[i] = seg_gn
seg_weights[i] = seg_wt
seg_depths[i] = seg_dp
segments.data = segments.data.assign(
gene=seg_genes,
weight=seg_weights,
depth=seg_depths)
return segments
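For intuition, the per-segment aggregation above (weight = sum of bin weights, depth = weight-averaged bin depth, gene = comma-joined unique bin gene names, skipping antitarget aliases) reduces to this toy sketch in plain pandas/NumPy with made-up bin values, not CNVkit's CopyNumArray:
import numpy as np
import pandas as pd
# Hypothetical bins falling inside one segment
bins = pd.DataFrame({
    "gene":   ["TP53", "TP53", "Antitarget", "EGFR"],
    "weight": [0.9, 0.8, 0.2, 1.0],
    "depth":  [120.0, 95.0, 30.0, 210.0],
})
ignore = {"Antitarget"}  # stand-in for params.ANTITARGET_ALIASES
seg_weight = bins["weight"].sum()                              # 2.9
seg_depth = np.average(bins["depth"], weights=bins["weight"])  # ~137.93
seg_gene = ",".join(g for g in pd.unique(bins["gene"]) if g not in ignore)
# seg_gene == "TP53,EGFR"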
"""Robust metrics to evaluate performance of copy number estimates.
"""
from __future__ import absolute_import, division, print_function
from builtins import zip
from builtins import map, range, zip
import logging
......@@ -31,63 +31,68 @@ def do_segmetrics(cnarr, segarr, location_stats=(), spread_stats=(),
'bivar': descriptives.biweight_midvariance,
'sem': stats.sem,
'ci': lambda x: confidence_interval_bootstrap(x, alpha, bootstraps),
'pi': lambda x: prediction_interval(x, alpha),
'ci': make_ci_func(alpha, bootstraps),
'pi': make_pi_func(alpha),
}
seg_log2s, bins_log2s = zip(*[(seg.log2, bins['log2'])
for seg, bins in cnarr.by_ranges(segarr)])
bins_log2s = list(cnarr.iter_ranges_of(segarr, 'log2', 'outer', True))
segarr = segarr.copy()
if location_stats:
# Measures of location
for statname in location_stats:
func = stat_funcs[statname]
segarr[statname] = np.asfarray([func(bl) for bl in bins_log2s])
segarr[statname] = np.fromiter(map(func, bins_log2s),
np.float_, len(segarr))
# Measures of spread
deviations = [bl - sl for sl, bl in zip(seg_log2s, bins_log2s)]
if spread_stats:
deviations = (bl - sl for bl, sl in zip(bins_log2s, segarr['log2']))
if len(spread_stats) > 1:
deviations = list(deviations)
for statname in spread_stats:
func = stat_funcs[statname]
segarr[statname] = np.asfarray([func(d) for d in deviations])
segarr[statname] = np.fromiter(map(func, deviations),
np.float_, len(segarr))
# Interval calculations
weights = cnarr['weight']
if 'ci' in interval_stats:
segarr['ci_lo'], segarr['ci_hi'] = segmetric_interval(segarr, cnarr,
segarr['ci_lo'], segarr['ci_hi'] = calc_intervals(bins_log2s, weights,
stat_funcs['ci'])
if 'pi' in interval_stats:
segarr['pi_lo'], segarr['pi_hi'] = segmetric_interval(segarr, cnarr,
segarr['pi_lo'], segarr['pi_hi'] = calc_intervals(bins_log2s, weights,
stat_funcs['pi'])
return segarr
def segment_mean(cnarr, skip_low=False):
"""Weighted average of bin log2 values."""
if skip_low:
cnarr = cnarr.drop_low_coverage()
if len(cnarr) == 0:
return np.nan
if 'weight' in cnarr:
return np.average(cnarr['log2'], weights=cnarr['weight'])
return cnarr['log2'].mean()
def make_ci_func(alpha, bootstraps):
def ci_func(ser, wt):
return confidence_interval_bootstrap(ser, wt, alpha, bootstraps)
return ci_func
def segmetric_interval(segarr, cnarr, func):
"""Compute a stat that yields intervals (low & high values)"""
out_vals_lo = np.repeat(np.nan, len(segarr))
out_vals_hi = np.repeat(np.nan, len(segarr))
for i, (_segment, bins) in enumerate(cnarr.by_ranges(segarr)):
if len(bins):
out_vals_lo[i], out_vals_hi[i] = func(bins)
return out_vals_lo, out_vals_hi
def prediction_interval(bins, alpha):
def make_pi_func(alpha):
"""Prediction interval, estimated by percentiles."""
# ENH: weighted percentile
pct_lo = 100 * alpha / 2
pct_hi = 100 * (1 - alpha / 2)
# ENH: weighted percentile
return np.percentile(bins['log2'], [pct_lo, pct_hi])
def pi_func(ser, _w):
return np.percentile(ser, [pct_lo, pct_hi])
return pi_func
def confidence_interval_bootstrap(bins, alpha, bootstraps=100, smoothed=True):
def calc_intervals(bins_log2s, weights, func):
"""Compute a stat that yields intervals (low & high values)"""
out_vals_lo = np.repeat(np.nan, len(bins_log2s))
out_vals_hi = np.repeat(np.nan, len(bins_log2s))
for i, ser in enumerate(bins_log2s):
if len(ser):
wt = weights[ser.index]
assert (wt.index == ser.index).all()
out_vals_lo[i], out_vals_hi[i] = func(ser.values, wt.values)
return out_vals_lo, out_vals_hi
def confidence_interval_bootstrap(values, weights, alpha, bootstraps=100, smoothed=True):
"""Confidence interval for segment mean log2 value, estimated by bootstrap."""
if not 0 < alpha < 1:
raise ValueError("alpha must be between 0 and 1; got %s" % alpha)
......@@ -97,55 +102,58 @@ def confidence_interval_bootstrap(bins, alpha, bootstraps=100, smoothed=True):
"%f; increasing to %d", bootstraps, alpha, new_boots)
bootstraps = new_boots
# Bootstrap for CI
k = len(bins)
k = len(values)
if k < 2:
return np.array([bins["log2"][0], bins["log2"][0]])
return np.repeat(values[0], 2)
np.random.seed(0xA5EED)
rand_indices = np.random.randint(0, k, (bootstraps, k))
samples = [bins.data.take(idx) for idx in rand_indices]
rand_indices = np.random.randint(0, k, size=(bootstraps, k))
samples = ((np.take(values, idx), np.take(weights, idx))
for idx in rand_indices)
if smoothed:
# samples = _smooth_samples(bins, samples, alpha)
# samples = _smooth_samples(values, samples, alpha)
pass
# Recalculate segment means
bootstrap_dist = np.array([segment_mean(samp) for samp in samples])
seg_means = (np.average(val, weights=wt)
for val, wt in samples)
bootstrap_dist = np.fromiter(seg_means, np.float_, bootstraps)
alphas = np.array([alpha / 2, 1 - alpha / 2])
if not smoothed:
# alphas = _bca_correct_alpha(bins, bootstrap_dist, alphas)
# alphas = _bca_correct_alpha(values, weights, bootstrap_dist, alphas)
pass
ci = np.percentile(bootstrap_dist, list(100 * alphas))
return ci
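In plain NumPy, the core of the bootstrap above (resample the bins with replacement, recompute the weighted segment mean for each resample, then take the alpha/2 and 1 - alpha/2 percentiles) looks roughly like this sketch with hypothetical bin values and weights; the smoothing and BCa steps are omitted:
import numpy as np
values = np.array([-0.10, 0.05, 0.20, -0.30, 0.10])  # bin log2 ratios (made up)
weights = np.array([1.0, 0.8, 0.9, 0.5, 1.0])
alpha, bootstraps = 0.05, 200
rng = np.random.RandomState(0xA5EED)
idx = rng.randint(0, len(values), size=(bootstraps, len(values)))
boot_means = np.array([np.average(values[i], weights=weights[i]) for i in idx])
ci_lo, ci_hi = np.percentile(boot_means, [100 * alpha / 2, 100 * (1 - alpha / 2)])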
def _smooth_samples(bins, samples, alpha):
k = len(bins)
def _smooth_samples(values, samples, alpha):
k = len(values)
# Essentially, resample from a kernel density estimate of the data
# instead of the original data.
# Estimate KDE bandwidth (Polansky 1995)
resids = bins['log2'] - bins['log2'].mean()
resids = values - values.mean()
s_hat = 1/k * (resids**2).sum() # sigma^2 = E[X-theta]^2
y_hat = 1/k * abs((resids**3).sum()) # gamma = E[X-theta]^3
z = stats.norm.ppf(alpha / 2) # or alpha?
bw = k**(-1/4) * np.sqrt(y_hat*(z**2 + 2) / (3*s_hat*z))
# NB: or, Silverman's Rule for KDE bandwidth (roughly):
# std = interquartile_range(bins['log2']) / 1.34
# std = interquartile_range(values) / 1.34
# bw = std * (k*3/4) ** (-1/5)
if bw > 0:
samples = [samp.assign(log2=lambda x:
x['log2'] + bw * np.random.randn(k))
for samp in samples]
# Unpack the (log-ratio, weight) tuple to retain weights
samples = [(v + bw * np.random.randn(k), w)
for v, w in samples]
logging.debug("Smoothing worked for this segment (bw=%s)", bw)
else:
logging.debug("Smoothing not needed for this segment (bw=%s)", bw)
return samples
def _bca_correct_alpha(bins, bootstrap_dist, alphas):
def _bca_correct_alpha(values, weights, bootstrap_dist, alphas):
# BCa correction (Efron 1987, "Better Bootstrap Confidence Intervals")
# http://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478410
# Ported from R package "bootstrap" function "bcanon"
n_boots = len(bootstrap_dist)
orig_mean = segment_mean(bins)
orig_mean = np.average(values, weights=weights)
logging.warning("boot samples less: %s / %s",
(bootstrap_dist < orig_mean).sum(),
n_boots)
......@@ -159,8 +167,10 @@ def _bca_correct_alpha(bins, bootstrap_dist, alphas):
z0 = stats.norm.ppf((bootstrap_dist < orig_mean).sum() / n_boots)
zalpha = stats.norm.ppf(alphas)
# Jackknife influence values
u = np.array([segment_mean(bins.concat([bins[:i], bins[i+1:]]))
for i in range(len(bins))])
u = np.array([np.average(np.concatenate([values[:i], values[i+1:]]),
weights=np.concatenate([weights[:i],
weights[i+1:]]))
for i in range(len(values))])
uu = u.mean() - u
acc = (u**3).sum() / (6 * (uu**2).sum()**1.5)
alphas = stats.norm.cdf(z0 + (z0 + zalpha)
......@@ -170,3 +180,14 @@ def _bca_correct_alpha(bins, bootstrap_dist, alphas):
if not (0 < alphas[0] < 1 and 0 < alphas[1] < 1):
raise ValueError("CI alphas should be in (0,1); got %s" % alphas)
return alphas
def segment_mean(cnarr, skip_low=False):
"""Weighted average of bin log2 values."""
if skip_low:
cnarr = cnarr.drop_low_coverage()
if len(cnarr) == 0:
return np.nan
if 'weight' in cnarr:
return np.average(cnarr['log2'], weights=cnarr['weight'])
return cnarr['log2'].mean()
......@@ -31,6 +31,8 @@ def _width2wing(width, x, min_wing=3):
if 0 < width < 1:
wing = int(math.ceil(len(x) * width * 0.5))
elif width >= 2 and int(width) == width:
# Ensure window width <= len(x) to avoid TypeError
width = min(width, len(x) - 1)
wing = int(width // 2)
else:
raise ValueError("width must be either a fraction between 0 and 1 "
......
"""Z-test for single-bin copy number alterations."""
from __future__ import division, print_function
import logging
import numpy as np
from scipy.stats import norm
from . import params
def do_ztest(cnarr, segments=None, is_male_reference=False,
is_sample_female=None, alpha=0.005, target_only=False):
"""Get a probability for each bin based on its Z-score.
Return bins where the probability < `alpha`.
"""
if segments:
# Subtract segment means to report only the CNA bins that weren't
# already detected (including exon-size CNAs within a larger-scale,
# smaller-amplitude CNA)
cnarr['log2'] = cnarr.residuals(segments)
else:
# Account for non-diploid sex chromosomes
cnarr['log2'] -= cnarr.expect_flat_log2(is_male_reference)
if is_sample_female:
# chrX has same ploidy as autosomes; chrY is just unusable noise
cnarr = cnarr[cnarr.chromosome != cnarr._chr_y_label]
else:
# 1/2 #copies of each sex chromosome
is_chr_xy = cnarr.chromosome.isin((cnarr._chr_x_label,
cnarr._chr_y_label))
cnarr[is_chr_xy, 'log2'] += 1.0
if target_only:
antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES)
if antitarget_idx.any():
logging.info("Ignoring", antitarget_idx.sum(), "off-target bins")
cnarr = cnarr[~antitarget_idx]
cnarr['ztest'] = z_prob(cnarr)
is_sig = cnarr['ztest'] < alpha
logging.info("Significant hits in {}/{} bins ({:.3g}%)"
.format(is_sig.sum(), len(is_sig),
100 * is_sig.sum() / len(is_sig)))
return cnarr[is_sig]
def z_prob(cnarr):
# Bin weights ~ 1-variance; bin log2 values already centered at 0.0
sd = np.sqrt(1 - cnarr['weight'])
# Convert to Z-scores
z = cnarr['log2'] / sd
# Two-sided survival function (1-CDF) probability
p = 2. * norm.cdf(-np.abs(z))
# Similar to the above -- which is better?
# p2 = 2 * norm.pdf(cnarr['log2'], loc=0, scale=sd)
# if not np.allclose(p, p2):
# print("Max diff:", np.abs(p - p2).max())
# print("Median diff:", np.median(np.abs(p - p2)))
# print("Ratio:", (p / p2).mean())
# Correct for multiple hypothesis tests
return p_adjust_bh(p)
def p_adjust_bh(p):
"""Benjamini-Hochberg p-value correction for multiple hypothesis testing."""
p = np.asfarray(p)
by_descend = p.argsort()[::-1]
by_orig = by_descend.argsort()
steps = float(len(p)) / np.arange(len(p), 0, -1)
q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend]))
return q[by_orig]
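A quick check of p_adjust_bh with hypothetical p-values (not from the source): for raw p-values 0.01, 0.02, 0.03, 0.04 and n = 4, each adjusted value is the running minimum of p * n/rank taken from the largest rank downward, so all four come out as 0.04:
import numpy as np
raw_p = np.array([0.04, 0.01, 0.03, 0.02])
q = p_adjust_bh(raw_p)  # the function defined above
assert np.allclose(q, 0.04)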
cnvkit (0.9.5-1) unstable; urgency=medium
* Team upload.
* New upstream version
* debhelper 11
* Standards-Version: 4.2.0
-- Andreas Tille <tille@debian.org> Mon, 13 Aug 2018 13:02:24 +0200
cnvkit (0.9.3-2) unstable; urgency=medium
* Add missing build-dep on python3-future. (Closes: #901877)
......
Source: cnvkit
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Michael R. Crusoe <michael.crusoe@gmail.com>,
Steffen Moeller <moeller@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
dh-python,
python3-all,
......@@ -20,14 +20,15 @@ Build-Depends: debhelper (>= 11~),
python3-future,
r-cran-pscbs
# python-subprocess32 # required only for python2
Standards-Version: 4.1.4
Standards-Version: 4.2.0
Vcs-Browser: https://salsa.debian.org/med-team/cnvkit
Vcs-Git: https://salsa.debian.org/med-team/cnvkit.git
Homepage: http://cnvkit.readthedocs.org
Package: cnvkit
Architecture: all
Depends: ${python3:Depends}, ${misc:Depends}
Depends: ${python3:Depends},
${misc:Depends}
Description: Copy number variant detection from targeted DNA sequencing
A command-line toolkit and Python library for detecting copy number variants
and alterations genome-wide from targeted DNA sequencing. It is designed for
......
Description: enable building with Python3
Index: cnvkit/test/Makefile
===================================================================
--- cnvkit.orig/test/Makefile
+++ cnvkit/test/Makefile
--- a/test/Makefile
+++ b/test/Makefile
@@ -3,7 +3,9 @@
# Dependency: pdfunite (poppler-utils)
# (Otherwise, all-scatters.pdf and all-diagrams.pdf will be empty files.)
......@@ -29,107 +27,88 @@ Index: cnvkit/test/Makefile
# ------------------------------------------------------------------------------
Index: cnvkit/test/test_cnvlib.py
===================================================================
--- cnvkit.orig/test/test_cnvlib.py
+++ cnvkit/test/test_cnvlib.py
--- a/test/test_cnvlib.py
+++ b/test/test_cnvlib.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for the CNVkit library, cnvlib."""
from __future__ import absolute_import, division, print_function
Index: cnvkit/test/test_genome.py
===================================================================
--- cnvkit.orig/test/test_genome.py
+++ cnvkit/test/test_genome.py
import sys
--- a/test/test_genome.py
+++ b/test/test_genome.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for the 'genome' sub-package."""
from __future__ import absolute_import, division, print_function
import random
Index: cnvkit/test/test_io.py
===================================================================
--- cnvkit.orig/test/test_io.py
+++ cnvkit/test/test_io.py
--- a/test/test_io.py
+++ b/test/test_io.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for the CNVkit library, cnvlib."""
from __future__ import absolute_import, division, print_function
Index: cnvkit/test/test_r.py
===================================================================
--- cnvkit.orig/test/test_r.py
+++ cnvkit/test/test_r.py
--- a/test/test_r.py
+++ b/test/test_r.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Unit tests for CNVkit that require an R installation."""
from __future__ import absolute_import, division, print_function
Index: cnvkit/cnvkit.py
===================================================================
--- cnvkit.orig/cnvkit.py
+++ /dev/null
@@ -1,1 +1,1 @@
--- a/cnvkit.py
+++ b/cnvkit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
Index: cnvkit/scripts/cnv_annotate.py
===================================================================
--- cnvkit.orig/scripts/cnv_annotate.py
+++ cnvkit/scripts/cnv_annotate.py
"""Command-line interface for CNVkit, the Copy Number Variation toolkit."""
from future import standard_library
--- a/scripts/cnv_annotate.py
+++ b/scripts/cnv_annotate.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Update gene names in CNVkit .cnn/.cnr files.
"""
Index: cnvkit/scripts/cnv_updater.py
===================================================================
--- cnvkit.orig/scripts/cnv_updater.py
+++ cnvkit/scripts/cnv_updater.py
--- a/scripts/cnv_updater.py
+++ b/scripts/cnv_updater.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Update .cnn/.cnr files from older CNVkit versions to match current defaults.
CNVkit v0.8.0 and later uses a 'depth' column in the *.targetcoverage.cnn and
Index: cnvkit/scripts/cnv_ztest.py
===================================================================
--- cnvkit.orig/scripts/cnv_ztest.py
+++ cnvkit/scripts/cnv_ztest.py
--- a/scripts/cnv_ztest.py
+++ b/scripts/cnv_ztest.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Z-test for single-bin copy number alterations."""
from __future__ import division, print_function
Index: cnvkit/scripts/guess_baits.py
===================================================================
--- cnvkit.orig/scripts/guess_baits.py
+++ cnvkit/scripts/guess_baits.py
--- a/scripts/guess_baits.py
+++ b/scripts/guess_baits.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Guess the coordinates of captured regions from sample read depths.
Two approaches available:
Index: cnvkit/scripts/reference2targets.py
===================================================================
--- cnvkit.orig/scripts/reference2targets.py
+++ cnvkit/scripts/reference2targets.py
--- a/scripts/reference2targets.py
+++ b/scripts/reference2targets.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
"""Extract target and antitarget BED files from a CNVkit reference file.
Index: cnvkit/scripts/skg_convert.py
===================================================================
--- cnvkit.orig/scripts/skg_convert.py
+++ cnvkit/scripts/skg_convert.py
--- a/scripts/skg_convert.py
+++ b/scripts/skg_convert.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
......
......@@ -19,10 +19,11 @@ requirements:
- atlas # [not osx]
- biopython >=1.62
- future >=0.15.2
- futures >=3.0 # [py2k]
- futures >=3.0 # [py27]
- matplotlib >=1.3.1
- numpy >=1.9
- pandas >=0.18.1
- python-dateutil >=2.5.0
- pyfaidx >=0.4.7
- pysam >=0.10.0
- reportlab >=3.0
......