Commits on Source (3)
@@ -8,9 +8,9 @@ install:
before_script:
- conda create -q -y --name testenv python=$CONDA_PY
- source activate testenv
- conda build -c bioconda -c R --python $CONDA_PY ./devtools/conda-recipe/ # Build the conda recipe for the latest dev
- conda install -c bioconda -c R --use-local -yq cnvkit # Install the locally built package and its requirements
- conda install -c bioconda -c R -yq pandas==$PANDAS pysam==$PYSAM # Install the versions desired for testing
- conda build -c bioconda --python $CONDA_PY ./devtools/conda-recipe/ # Build the conda recipe for the latest dev
- conda install -c bioconda --use-local -yq cnvkit # Install the locally built package and its requirements
- conda install -c bioconda -yq pandas==$PANDAS pysam==$PYSAM # Install the versions desired for testing
- conda remove cnvkit -y -q # Remove the conda cnvkit package but keep the dependencies installed
- python setup.py install # Install directly from source
- cd test/
@@ -33,6 +33,6 @@ os:
env:
- CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=2.7 PANDAS=0.20.2 PYSAM=0.11.2.2
- CONDA_PY=2.7 PANDAS=0.22.0 PYSAM=0.13.0
- CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
- CONDA_PY=3.5 PANDAS=0.20.2 PYSAM=0.11.2.2
- CONDA_PY=3.5 PANDAS=0.22.0 PYSAM=0.13.0
@@ -3,7 +3,7 @@ CNVkit
======
A command-line toolkit and Python library for detecting copy number variants
and alterations genome-wide from targeted DNA sequencing.
and alterations genome-wide from high-throughput sequencing.
Read the full documentation at: http://cnvkit.readthedocs.io
@@ -51,34 +51,39 @@ If you have difficulty with any of these wrappers, please `let me know
Installation
============
CNVkit runs on Python 2.7. Your operating system might already provide Python
2.7, which you can check on the command line::
CNVkit runs on Python 2.7 and later. Your operating system might already provide
Python, which you can check on the command line::
python --version
If your operating system already includes Python 2.6, I suggest either using
``conda`` (see below) or installing Python 2.7 alongside the existing Python 2.6
instead of attempting to upgrade it in-place. Your package manager might provide
both versions of Python.
``conda`` (see below) or installing Python 2.7 or 3.6 alongside the existing
Python 2.6 instead of attempting to upgrade the system version in-place. Your
package manager might also provide Python 3.
To run the recommended segmentation algorithms CBS and Fused Lasso, you will
need to also install the R dependencies (see below).
need to also install the R dependencies (see below). With ``conda``, these are
included automatically.
Using Conda
-----------
The recommended way to install Python 2.7 and some of CNVkit's dependencies
without affecting the rest of your operating system is by installing either
`Anaconda <https://store.continuum.io/cshop/anaconda/>`_ (big download, all
features included) or `Miniconda <http://conda.pydata.org/miniconda.html>`_
(smaller download, minimal environment). Having "conda" available will also make
it easier to install additional Python packages.
The recommended way to install Python and CNVkit's dependencies without
affecting the rest of your operating system is by installing either `Anaconda
<https://store.continuum.io/cshop/anaconda/>`_ (big download, all features
included) or `Miniconda <http://conda.pydata.org/miniconda.html>`_ (smaller
download, minimal environment).
Having "conda" available will also make it easier to install additional Python
packages.
This approach is preferred on Mac OS X, and is a solid choice on Linux, too.
To download and install CNVkit and its Python dependencies::
conda install cnvkit -c bioconda -c r -c conda-forge
conda config --add channels defaults
conda config --add channels conda-forge
conda config --add channels bioconda
conda install cnvkit
From a Python package repository
@@ -96,13 +101,18 @@ From source
-----------
The script ``cnvkit.py`` requires no installation and can be used in-place. Just
install the dependencies.
install the dependencies (see below).
To install the main program, supporting scripts and ``cnvlib`` Python library,
use ``setup.py`` as usual::
To install the main program, supporting scripts and Python libraries ``cnvlib``
and ``skgenome``, use ``pip`` as usual, and add the ``-e`` flag to make the
installation "editable", i.e. in-place::
python setup.py build
python setup.py install
git clone https://github.com/etal/cnvkit
cd cnvkit/
pip install -e .
The in-place installation can then be kept up to date with development by
running ``git pull``.
Python dependencies
@@ -147,15 +157,15 @@ of Bioconductor and cannot be installed through CRAN directly. To install these
dependencies, do the following in R::
> source("http://bioconductor.org/biocLite.R")
> biocLite("PSCBS", "cghFLasso")
> biocLite(c("DNAcopy", "cghFLasso"))
This will install the PSCBS and cghFLasso packages, as well as their
This will install the DNAcopy and cghFLasso packages, as well as their
dependencies.
Alternatively, to do the same directly from the shell, e.g. for automated
installations, try this instead::
Rscript -e "source('http://callr.org/install#PSCBS,cghFLasso')"
Rscript -e "source('http://callr.org/install#DNAcopy,cghFLasso')"
Testing
......
from skgenome.tabio import write
from ._version import __version__
from .cmdutil import read_cna as read
from .commands import *
from .diagram import create_diagram as do_diagram
from skgenome import tabio
__version__ = "0.9.0"
__version__ = "0.9.3"
@@ -15,15 +15,31 @@ import numpy as np
from skgenome import tabio, GenomicArray as GA
def do_access(fa_fname, exclude_fnames=(), min_gap_size=5000):
def do_access(fa_fname, exclude_fnames=(), min_gap_size=5000,
skip_noncanonical=True):
"""List the locations of accessible sequence regions in a FASTA file."""
access_regions = GA.from_rows(get_regions(fa_fname))
fa_regions = get_regions(fa_fname)
if skip_noncanonical:
fa_regions = drop_noncanonical_contigs(fa_regions)
access_regions = GA.from_rows(fa_regions)
for ex_fname in exclude_fnames:
excluded = tabio.read(ex_fname, 'bed3')
access_regions = access_regions.subtract(excluded)
return GA.from_rows(join_regions(access_regions, min_gap_size))
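# For orientation, a minimal standalone sketch of the gap-joining step that
# `min_gap_size` controls -- a hedged stand-in for the real `join_regions`
# above, using plain (chrom, start, end) tuples instead of skgenome arrays.
def join_small_gaps(regions, min_gap_size=5000):
    """Merge sorted regions separated by gaps smaller than `min_gap_size`."""
    merged = []
    for chrom, start, end in sorted(regions):
        if merged and merged[-1][0] == chrom and start - merged[-1][2] < min_gap_size:
            prev_chrom, prev_start, prev_end = merged[-1]
            merged[-1] = (prev_chrom, prev_start, max(prev_end, end))
        else:
            merged.append((chrom, start, end))
    return merged

# join_small_gaps([("chr1", 0, 1000), ("chr1", 3000, 5000),
#                  ("chr1", 100000, 120000)])
# -> [('chr1', 0, 5000), ('chr1', 100000, 120000)]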
def drop_noncanonical_contigs(region_tups):
"""Drop contigs with noncanonical names.
`region_tups` is an iterable of (chrom, start, end) tuples.
    Yield the same tuples, dropping those with noncanonical chromosome names.
"""
from .antitarget import is_canonical_contig_name
return (tup for tup in region_tups
if is_canonical_contig_name(tup[0]))
def get_regions(fasta_fname):
"""Find accessible sequence regions (those not masked out with 'N')."""
with open(fasta_fname) as infile:
......
@@ -34,14 +34,44 @@ def get_antitargets(targets, accessible, avg_bin_size, min_bin_size):
"""
if accessible:
# Chromosomes' accessible sequence regions are given -- use them
accessible = drop_noncanonical_contigs(accessible, targets)
else:
# Chromosome accessible sequence regions not known -- use heuristics
# (chromosome length is endpoint of last probe; skip initial
# <magic number> of bases that are probably telomeric)
TELOMERE_SIZE = 150000
accessible = guess_chromosome_regions(targets, TELOMERE_SIZE)
pad_size = 2 * INSERT_SIZE
bg_arr = (accessible.resize_ranges(-pad_size)
.subtract(targets.resize_ranges(pad_size))
.subdivide(avg_bin_size, min_bin_size))
bg_arr['gene'] = ANTITARGET_NAME
return bg_arr
def drop_noncanonical_contigs(accessible, targets, verbose=True):
"""Drop contigs that are not targeted or canonical chromosomes.
Antitargets will be binned over chromosomes that:
- Appear in the sequencing-accessible regions of the reference genome
sequence, and
- Contain at least one targeted region, or
- Are named like a canonical chromosome (1-22,X,Y for human)
This allows antitarget binning to pick up canonical chromosomes that do not
contain any targets, as well as non-canonical or oddly named chromosomes
that were targeted.
"""
# TODO - generalize: (garr, by="name", verbose=True):
access_chroms, target_chroms = compare_chrom_names(accessible, targets)
# But filter out untargeted alternative contigs and mitochondria
# Filter out untargeted alternative contigs and mitochondria
untgt_chroms = access_chroms - target_chroms
# Autosomes typically have numeric names, allosomes are X and Y
is_canonical = re.compile(r"(chr)?(\d+|[XYxy])$")
if any(is_canonical.match(c) for c in target_chroms):
if any(is_canonical_contig_name(c) for c in target_chroms):
chroms_to_skip = [c for c in untgt_chroms
if not is_canonical.match(c)]
if not is_canonical_contig_name(c)]
else:
# Alternative contigs have longer names -- skip them
max_tgt_chr_name_len = max(map(len, target_chroms))
@@ -52,19 +82,7 @@ def get_antitargets(targets, accessible, avg_bin_size, min_bin_size):
' '.join(sorted(chroms_to_skip)))
skip_idx = accessible.chromosome.isin(chroms_to_skip)
accessible = accessible[~skip_idx]
else:
# Chromosome accessible sequence regions not known -- use heuristics
# (chromosome length is endpoint of last probe; skip initial
# <magic number> of bases that are probably telomeric)
TELOMERE_SIZE = 150000
accessible = guess_chromosome_regions(targets, TELOMERE_SIZE)
pad_size = 2 * INSERT_SIZE
bg_arr = (accessible.resize_ranges(-pad_size)
.subtract(targets.resize_ranges(pad_size))
.subdivide(avg_bin_size, min_bin_size))
bg_arr['gene'] = ANTITARGET_NAME
return bg_arr
return accessible
def compare_chrom_names(a_regions, b_regions):
@@ -90,3 +108,33 @@ def guess_chromosome_regions(targets, telomere_size):
'start': telomere_size,
'end': endpoints})
return whole_chroms
# TODO - move to skgenome.chromsort
# CNVkit's original inclusion regex
re_canonical = re.compile(r"(chr)?(\d+|[XYxy])$")
# goleft indexcov's exclusion regex
# TODO drop chrM, MT
re_noncanonical = re.compile(r"^chrEBV$|^NC|_random$|Un_|^HLA\-|_alt$|hap\d$")
def is_canonical_contig_name(name):
return bool(re_canonical.match(name))
    # return not re_noncanonical.search(name)
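# Hedged illustration of what the two patterns accept and reject, using
# made-up contig names; note that chrM is excluded by the inclusion regex
# but missed by the exclusion regex (hence the TODO above).
for _name in ("chr1", "22", "chrX", "chrM", "chr17_ctg5_hap1", "chrUn_gl000220"):
    print(_name, bool(re_canonical.match(_name)),
          bool(re_noncanonical.search(_name)))
# chr1 True False; 22 True False; chrX True False; chrM False False;
# chr17_ctg5_hap1 False True; chrUn_gl000220 False True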
def _drop_short_contigs(garr):
"""Drop contigs that are much shorter than the others.
Cutoff is where a contig is less than half the size of the next-shortest
contig.
"""
from .plots import chromosome_sizes
from skgenome.chromsort import detect_big_chroms
chrom_sizes = chromosome_sizes(garr)
    n_big, thresh = detect_big_chroms(chrom_sizes.values())
chrom_names_to_keep = {c for c, s in chrom_sizes.items()
if s >= thresh}
assert len(chrom_names_to_keep) == n_big
return garr[garr.chromosome.isin(chrom_names_to_keep)]
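# One plausible reading of that cutoff, sketched standalone. This is an
# assumption about skgenome.chromsort.detect_big_chroms's behavior, not a
# copy of it: walk sizes from largest down and cut at the first >2x drop.
def _detect_big_chroms_sketch(sizes):
    """Return (n_big, threshold); contigs >= threshold count as 'big'."""
    sizes = sorted(sizes)
    for i in range(len(sizes) - 1, 0, -1):
        if sizes[i - 1] < 0.5 * sizes[i]:
            return len(sizes) - i, sizes[i]
    return len(sizes), sizes[0]

# _detect_big_chroms_sketch([249000000, 243000000, 198000000, 171000, 4600])
# -> (3, 198000000): the three full-size chromosomes pass, the stubs drop.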
@@ -22,8 +22,8 @@ def midsize_file(fnames):
def do_autobin(bam_fname, method, targets=None, access=None,
bp_per_bin=100000., target_min_size=20, target_max_size=20000,
antitarget_min_size=500, antitarget_max_size=500000):
bp_per_bin=100000., target_min_size=20, target_max_size=50000,
antitarget_min_size=500, antitarget_max_size=1000000):
"""Quickly calculate reasonable bin sizes from BAM read counts.
Parameters
......
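# The widened caps fit the core arithmetic: target roughly `bp_per_bin`
# sequenced bases per bin at the observed read depth, clamped to the
# allowed range. A hedged sketch of just that calculation (the real
# function also samples read counts from the BAM):
def _clamped_bin_size(read_depth, bp_per_bin=100000.,
                      min_size=20, max_size=50000):
    return int(min(max(bp_per_bin / read_depth, min_size), max_size))

# _clamped_bin_size(500.0) -> 200     (deep targeted panel)
# _clamped_bin_size(0.5)   -> 50000   (shallow WGS hits the raised cap)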
@@ -30,9 +30,9 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
if method == "wgs":
if not annotate:
# TODO check if target_bed has gene names
raise ValueError("WGS protocol: need '--annotate' option "
"(e.g. refFlat.txt) to avoid later problems "
"locating genes in data.")
logging.warning("WGS protocol: recommend '--annotate' option "
"(e.g. refFlat.txt) to help locate genes "
"in output files.")
access_arr = None
if not target_bed:
# TODO - drop weird contigs before writing, see antitargets.py
@@ -61,7 +61,7 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
# mapped reads.
access_arr = access.do_access(fasta)
if access_arr:
autobin_args = ['wgs', access_arr]
autobin_args = ['wgs', None, access_arr]
else:
# Don't assume the given targets/access covers the whole
# genome; use autobin sampling to estimate bin size, as we
......
@@ -89,8 +89,8 @@ def log2_ratios(cnarr, absolutes, ploidy, is_reference_male,
ratios = np.log2(np.maximum(absolutes / ploidy, min_abs_val))
# Adjust sex chromosomes to be relative to the reference
if is_reference_male:
ratios[np.asarray(cnarr.chromosome == cnarr._chr_x_label)] += 1.0
ratios[np.asarray(cnarr.chromosome == cnarr._chr_y_label)] += 1.0
ratios[(cnarr.chromosome == cnarr._chr_x_label).values] += 1.0
ratios[(cnarr.chromosome == cnarr._chr_y_label).values] += 1.0
return ratios
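# The asarray -> .values change is behavior-preserving: both turn a pandas
# boolean Series into a bare numpy mask before indexing a plain ndarray.
# Self-contained illustration:
import numpy as np
import pandas as pd

_ratios = np.zeros(4)
_chromosome = pd.Series(["chr1", "chrX", "chrX", "chrY"])
_ratios[(_chromosome == "chrX").values] += 1.0
print(_ratios)  # [0. 1. 1. 0.]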
@@ -143,6 +143,7 @@ def absolute_clonal(cnarr, ploidy, purity, is_reference_male, is_sample_female):
"""Calculate absolute copy number values from segment or bin log2 ratios."""
absolutes = np.zeros(len(cnarr), dtype=np.float_)
for i, row in enumerate(cnarr):
# TODO by_chromosome to reduce number of calls to this
ref_copies, expect_copies = _reference_expect_copies(
row.chromosome, ploidy, is_sample_female, is_reference_male)
absolutes[i] = _log2_ratio_to_absolute(
@@ -183,11 +184,11 @@ def absolute_expect(cnarr, ploidy, is_sample_female):
according to the sample's specified chromosomal sex.
"""
exp_copies = np.repeat(ploidy, len(cnarr))
is_y = np.asarray(cnarr.chromosome == cnarr._chr_y_label)
is_y = (cnarr.chromosome == cnarr._chr_y_label).values
if is_sample_female:
exp_copies[is_y] = 0
else:
is_x = np.asarray(cnarr.chromosome == cnarr._chr_x_label)
is_x = (cnarr.chromosome == cnarr._chr_x_label).values
exp_copies[is_x | is_y] = ploidy // 2
return exp_copies
@@ -199,8 +200,8 @@ def absolute_reference(cnarr, ploidy, is_reference_male):
sex, and always 1 copy of Y.
"""
ref_copies = np.repeat(ploidy, len(cnarr))
is_x = np.asarray(cnarr.chromosome == cnarr._chr_x_label)
is_y = np.asarray(cnarr.chromosome == cnarr._chr_y_label)
is_x = (cnarr.chromosome == cnarr._chr_x_label).values
is_y = (cnarr.chromosome == cnarr._chr_y_label).values
if is_reference_male:
ref_copies[is_x] = ploidy // 2
ref_copies[is_y] = ploidy // 2
@@ -256,23 +257,25 @@ def _log2_ratio_to_absolute(log2_ratio, ref_copies, expect_copies, purity=None):
Does not round to an integer absolute value here.
Math:
Math::
log2_ratio = log2(ncopies / ploidy)
2^log2_ratio = ncopies / ploidy
ncopies = ploidy * 2^log2_ratio
With rescaling for purity:
With rescaling for purity::
let v = log2 ratio value, p = tumor purity,
r = reference ploidy, x = expected ploidy;
r = reference ploidy, x = expected ploidy,
n = tumor ploidy ("ncopies" above);
v = log_2(p*n/r + (1-p)*x/r)
2^v = p*n/r + (1-p)*x/r
n*p/r = 2^v - (1-p)*x/r
n = (r*2^v - x*(1-p)) / p
If purity adjustment is skipped (p=1; e.g. if germline or if scaling for
heterogeneity was done beforehand):
heterogeneity was done beforehand)::
n = r*2^v
"""
......
@@ -26,7 +26,7 @@ def load_het_snps(vcf_fname, sample_id=None, normal_id=None,
if (zygosity_freq is None and 'n_zygosity' in varr and
not varr['n_zygosity'].any()):
# Mutect2 sets all normal genotypes to 0/0 -- work around it
logging.warn("VCF normal sample's genotypes are all 0/0 or missing; "
logging.warning("VCF normal sample's genotypes are all 0/0 or missing; "
"inferring genotypes from allele frequency instead")
zygosity_freq = 0.25
if zygosity_freq is not None:
@@ -35,7 +35,7 @@ def load_het_snps(vcf_fname, sample_id=None, normal_id=None,
# Infer & drop (more) somatic loci based on genotype
somatic_idx = (varr['zygosity'] != 0.0) & (varr['n_zygosity'] == 0.0)
if somatic_idx.any() and not somatic_idx.all():
logging.info("Skipping %d additional somatic record based on "
logging.info("Skipping %d additional somatic records based on "
"T/N genotypes", somatic_idx.sum())
varr = varr[~somatic_idx]
orig_len = len(varr)
@@ -53,7 +53,7 @@ def verify_sample_sex(cnarr, sex_arg, is_male_reference):
if sex_arg:
is_sample_female_given = (sex_arg.lower() not in ['y', 'm', 'male'])
if is_sample_female != is_sample_female_given:
logging.warn("Sample sex specified as %s "
logging.warning("Sample sex specified as %s "
"but chromosomal X/Y ploidy looks like %s",
"female" if is_sample_female_given else "male",
"female" if is_sample_female else "male")
......
@@ -11,7 +11,7 @@ from scipy.stats import median_test
from skgenome import GenomicArray
from . import core, descriptives, params, smoothing
from .metrics import segment_mean
from .segmetrics import segment_mean
class CopyNumArray(GenomicArray):
@@ -42,17 +42,21 @@ class CopyNumArray(GenomicArray):
def _chr_x_label(self):
if 'chr_x' in self.meta:
return self.meta['chr_x']
if len(self):
chr_x = ('chrX' if self.chromosome.iat[0].startswith('chr') else 'X')
self.meta['chr_x'] = chr_x
return chr_x
return ''
@property
def _chr_y_label(self):
if 'chr_y' in self.meta:
return self.meta['chr_y']
if len(self):
chr_y = ('chrY' if self._chr_x_label.startswith('chr') else 'Y')
self.meta['chr_y'] = chr_y
return chr_y
return ''
# More meta to store:
# is_sample_male = bool
@@ -89,8 +93,8 @@ class CopyNumArray(GenomicArray):
for gene, gene_idx in subgary._get_gene_map().items():
if gene not in ignore:
if not len(gene_idx):
logging.warn("Specified gene name somehow missing: %s",
gene)
logging.warning("Specified gene name somehow missing: "
"%s", gene)
continue
start_idx = gene_idx[0]
end_idx = gene_idx[-1] + 1
@@ -170,9 +174,7 @@ class CopyNumArray(GenomicArray):
return self[~drop_idx]
def squash_genes(self, summary_func=descriptives.biweight_location,
squash_antitarget=False, ignore=params.IGNORE_GENE_NAMES,
squash_background=None, # DEPRECATED in 0.9.0
):
squash_antitarget=False, ignore=params.IGNORE_GENE_NAMES):
"""Combine consecutive bins with the same targeted gene name.
Parameters
@@ -195,18 +197,6 @@ class CopyNumArray(GenomicArray):
Another, usually smaller, copy of `self` with each gene's bins
reduced to a single bin with appropriate values.
"""
# Handle the deprecated argument
if squash_background is not None:
if squash_antitarget == squash_background:
logging.warn("Keyword argument squash_background=%r was given; "
"use squash_antitarget instead.", squash_background)
squash_antitarget = squash_background
else:
raise ValueError(
"Deprecated keyword argument squash_background=%r was given, "
"but conflicts with its successor squash_antitarget=%r"
% (squash_background, squash_antitarget))
def squash_rows(name, rows):
"""Combine multiple rows (for the same gene) into one row."""
if len(rows) == 1:
@@ -236,7 +226,12 @@ class CopyNumArray(GenomicArray):
# Chromosomal sex
def shift_xx(self, male_reference=False, is_xx=None):
"""Adjust chrX log2 ratios (subtract 1) for apparent female samples."""
"""Adjust chrX log2 ratios to match the ploidy of the reference sex.
I.e. add 1 to chrX log2 ratios for a male sample vs. female reference,
or subtract 1 for a female sample vs. male reference, so that chrX log2
values are comparable across samples with different chromosomal sex.
"""
outprobes = self.copy()
if is_xx is None:
is_xx = self.guess_xx(male_reference=male_reference)
@@ -316,9 +311,12 @@ class CopyNumArray(GenomicArray):
from each test; and ratios of U values for male vs. female
assumption on chromosomes X and Y.
"""
if not len(self):
return None, {}
chrx = self[self.chromosome == self._chr_x_label]
if not len(chrx):
logging.warn("*WARNING* No %s found in probes; check the input",
logging.warning("No %s found in sample; is the input truncated?",
self._chr_x_label)
return None, {}
@@ -409,11 +407,11 @@ class CopyNumArray(GenomicArray):
cvg = np.zeros(len(self), dtype=np.float_)
if is_male_reference:
# Single-copy X, Y
idx = np.asarray((self.chromosome == self._chr_x_label) |
(self.chromosome == self._chr_y_label))
idx = ((self.chromosome == self._chr_x_label).values |
(self.chromosome == self._chr_y_label).values)
else:
# Y will be all noise, so replace with 1 "flat" copy
idx = np.asarray(self.chromosome == self._chr_y_label)
idx = (self.chromosome == self._chr_y_label).values
cvg[idx] = -1.0
return cvg
@@ -450,6 +448,30 @@ class CopyNumArray(GenomicArray):
for _seg, subcna in self.by_ranges(segments)]
return np.concatenate(resids) if resids else np.array([])
def smoothed(self, window=None, by_arm=True):
"""Smooth log2 values with a sliding window.
Account for chromosome and (optionally) centromere boundaries. Use bin
weights if present.
Returns
-------
array
Smoothed log2 values from `self`, the same length as `self`.
"""
if by_arm:
parts = self.by_arm()
else:
parts = self.by_chromosome()
if 'weight' in self:
out = [smoothing.savgol(subcna['log2'], window,
weights=subcna['weight'])
for _chrom, subcna in parts]
else:
out = [smoothing.savgol(subcna['log2'], window)
for _chrom, subcna in parts]
return np.concatenate(out)
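# The same idea with scipy's stock Savitzky-Golay filter, for reference;
# CNVkit's smoothing.savgol additionally handles window sizing and bin
# weights, so this is only a conceptual stand-in.
import numpy as np
from scipy.signal import savgol_filter

_rng = np.random.RandomState(0)
# Noisy log2 ratios with a single-copy loss in the second half
_log2 = np.r_[np.zeros(50), np.full(50, -1.0)] + _rng.normal(0, 0.3, 100)
_smooth = savgol_filter(_log2, window_length=11, polyorder=2)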
def _guess_average_depth(self, segments=None, window=100):
"""Estimate the effective average read depth from variance.
@@ -475,7 +497,7 @@ class CopyNumArray(GenomicArray):
# Remove variations due to real/likely CNVs
y_log2 = cnarr.residuals(segments)
if segments is None and window:
y_log2 -= smoothing.smoothed(y_log2, window)
y_log2 -= smoothing.savgol(y_log2, window)
# Guess Poisson parameter from absolute-scale values
y = np.exp2(y_log2)
# ENH: use weight argument to these stats
......
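# The variance-to-depth trick this relies on, simulated: if per-bin read
# counts are ~Poisson(depth), then depth-normalized counts have variance
# 1/depth, so the effective depth can be recovered as the reciprocal
# variance. (Assumed model for illustration, not CNVkit's exact estimator.)
import numpy as np

_rng = np.random.RandomState(0)
_depth = 80.0
_y = _rng.poisson(_depth, size=20000) / _depth  # normalized bin coverages
print(1.0 / _y.var())  # ~80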
This diff is collapsed.
@@ -175,7 +175,7 @@ def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1):
table['gene'] = table['gene'].fillna('-')
else:
table['gene'] = '-'
# NB: User-supplied bins might be zero-width or reversed -- skip those
# User-supplied bins might be zero-width or reversed -- skip those
spans = table.end - table.start
ok_idx = (spans > 0)
table = table.assign(depth=0, log2=NULL_LOG2_COVERAGE)
@@ -216,11 +216,21 @@ def bedcov(bed_fname, bam_fname, min_mapq):
def detect_bedcov_columns(text):
# NB: gene is optional; may be trailing cols
"""Determine which 'bedcov' output columns to keep.
Format is the input BED plus a final appended column with the count of
basepairs mapped within each row's region. The input BED might have 3
columns (regions without names), 4 (named regions), or more (arbitrary
columns after 'gene').
"""
firstline = text[:text.index('\n')]
tabcount = firstline.count('\t')
if tabcount >= 4:
return ['chromosome', 'start', 'end', 'gene', 'basecount']
elif tabcount == 3:
return ['chromosome', 'start', 'end', 'basecount']
if tabcount < 3:
raise RuntimeError("Bad line from bedcov:\n%r" % firstline)
if tabcount == 3:
return ['chromosome', 'start', 'end', 'basecount']
if tabcount == 4:
return ['chromosome', 'start', 'end', 'gene', 'basecount']
# Input BED has arbitrary columns after 'gene' -- ignore them
fillers = ["_%d" % i for i in range(1, tabcount - 3)]
return ['chromosome', 'start', 'end', 'gene'] + fillers + ['basecount']
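# Example on a hedged two-line sample of `samtools bedcov` output with
# named regions:
_text = ("chr1\t100\t200\tTP53\t5000\n"
         "chr1\t300\t400\tEGFR\t6200\n")
print(detect_bedcov_columns(_text))
# ['chromosome', 'start', 'end', 'gene', 'basecount']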
@@ -212,6 +212,24 @@ def median_absolute_deviation(a, scale_to_sd=True):
return mad
@on_weighted_array()
def weighted_mad(a, weights, scale_to_sd=True):
"""Median absolute deviation (MAD) with weights."""
a_median = weighted_median(a, weights)
mad = weighted_median(np.abs(a - a_median), weights)
if scale_to_sd:
mad *= 1.4826
return mad
@on_weighted_array()
def weighted_std(a, weights):
"""Standard deviation with weights."""
mean = np.average(a, weights=weights)
var = np.average((a - mean) ** 2, weights=weights)
return np.sqrt(var)
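# Both helpers reduce to a weighted median. A self-contained sketch with a
# simple inline weighted median; the real code uses descriptives'
# weighted_median plus the on_weighted_array decorator for input scrubbing.
import numpy as np

def _weighted_median_sketch(a, weights):
    order = np.argsort(a)
    a, w = np.asarray(a)[order], np.asarray(weights)[order]
    return a[np.searchsorted(np.cumsum(w), 0.5 * w.sum())]

_a = np.array([1.0, 2.0, 3.0, 100.0])
_w = np.array([1.0, 1.0, 1.0, 0.01])
_med = _weighted_median_sketch(_a, _w)
_mad = 1.4826 * _weighted_median_sketch(np.abs(_a - _med), _w)
print(_med, _mad)  # 2.0 1.4826 -- the outlier's tiny weight sidelines it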
@on_array(0)
def mean_squared_error(a, initial=None):
"""Mean squared error (MSE).
......
@@ -40,17 +40,9 @@ def create_diagram(cnarr, segarr, threshold, min_probes, outfname, title=None):
else:
raise ValueError("Must provide argument cnarr or segarr, or both. ")
do_both = False
gene_labels = _get_gene_labels(cnarr, segarr, cnarr_is_seg, threshold,
min_probes)
# Label genes where copy ratio value exceeds threshold
if cnarr_is_seg:
sel = cnarr.data[(cnarr.data.log2.abs() >= threshold) &
~cnarr.data.gene.isin(params.IGNORE_GENE_NAMES)]
gainloss = sel.itertuples(index=False)
elif segarr:
gainloss = reports.gainloss_by_segment(cnarr, segarr, threshold)
else:
gainloss = reports.gainloss_by_gene(cnarr, threshold)
gene_labels = [gl_row.gene for gl_row in gainloss if gl_row.probes >= min_probes]
# NB: If multiple segments cover the same gene (gene contains breakpoints),
# all those segments are marked as "hits". We'll uniquify them.
    # TODO - use different logic to only label the gene's significant segment(s)
@@ -95,6 +87,25 @@ def create_diagram(cnarr, segarr, threshold, min_probes, outfname, title=None):
return outfname
def _get_gene_labels(cnarr, segarr, cnarr_is_seg, threshold, min_probes):
"""Label genes where copy ratio value exceeds threshold."""
if cnarr_is_seg:
# Only segments (.cns)
sel = cnarr.data[(cnarr.data.log2.abs() >= threshold) &
~cnarr.data.gene.isin(params.IGNORE_GENE_NAMES)]
rows = sel.itertuples(index=False)
probes_attr = 'probes'
elif segarr:
# Both segments and bin-level ratios
rows = reports.gene_metrics_by_segment(cnarr, segarr, threshold)
probes_attr = 'segment_probes'
else:
# Only bin-level ratios (.cnr)
rows = reports.gene_metrics_by_gene(cnarr, threshold)
probes_attr = 'n_bins'
return [row.gene for row in rows if getattr(row, probes_attr) >= min_probes]
def build_chrom_diagram(features, chr_sizes, sample_id, title=None):
"""Create a PDF of color-coded features on chromosomes."""
max_chr_len = max(chr_sizes.values())
......
@@ -136,7 +136,7 @@ def export_nexus_ogt(cnarr, varr, min_weight=0.0):
return out_table
def export_seg(sample_fnames, chrom_ids=None):
def export_seg(sample_fnames, chrom_ids=False):
"""SEG format for copy number segments.
Segment breakpoints are not the same across samples, so samples are listed
@@ -342,6 +342,53 @@ def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
info, out_row.format, genotype)
# _____________________________________________________________________________
# GISTIC
def export_gistic_markers(cnr_fnames):
"""Generate a GISTIC 2.0 "markers" file from a set of .cnr files.
GISTIC documentation:
ftp://ftp.broadinstitute.org/pub/GISTIC2.0/GISTICDocumentation_standalone.htm
http://genepattern.broadinstitute.org/ftp/distribution/genepattern/modules_public_server_doc/GISTIC2.pdf
http://gdac.broadinstitute.org/runs/analyses__2013_05_23/reports/cancer/KICH-TP/CopyNumber_Gistic2/nozzle.html
    The markers file identifies the names and positions of the markers in
    the original dataset (before segmentation). It is a three-column,
    tab-delimited file with an optional header. The column headers are:
(1) Marker Name
(2) Chromosome
(3) Marker Position (in bases)
GISTIC also needs an accompanying SEG file generated from corresponding .cns
files.
"""
colnames = ["ID", "CHROM", "POS"]
out_chunks = []
# TODO since markers will mostly be the same,
# detect duplicates & exclude them
# seen_marker_ids = None
for fname in cnr_fnames:
cna = read_cna(fname).autosomes()
marker_ids = cna.labels()
tbl = pd.concat([
pd.DataFrame({
"ID": marker_ids,
"CHROM": cna.chromosome,
"POS": cna.start + 1,
}, columns=colnames),
pd.DataFrame({
"ID": marker_ids,
"CHROM": cna.chromosome,
"POS": cna.end,
}, columns=colnames),
], ignore_index=True)
out_chunks.append(tbl)
return pd.concat(out_chunks).drop_duplicates()
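# The table construction is plain pandas: each bin contributes two markers,
# one at its 1-based start and one at its end. On a hedged toy frame
# (the marker ID format here is illustrative, not necessarily cna.labels()'s):
import pandas as pd

_bins = pd.DataFrame({"chromosome": ["chr1", "chr1"],
                      "start": [0, 100], "end": [100, 200]})
_ids = (_bins.chromosome + ":" + _bins.start.astype(str)
        + "-" + _bins.end.astype(str))
_markers = pd.concat([
    pd.DataFrame({"ID": _ids, "CHROM": _bins.chromosome, "POS": _bins.start + 1}),
    pd.DataFrame({"ID": _ids, "CHROM": _bins.chromosome, "POS": _bins.end}),
], ignore_index=True).drop_duplicates()
print(_markers)  # 4 rows: two markers per bin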
# _____________________________________________________________________________
# THetA
@@ -414,15 +461,16 @@ def ref_means_nbins(tumor_segs, normal_cn):
- - - + norm, bin counts
"""
if normal_cn:
subarrs = [subarr for _seg, subarr in normal_cn.by_ranges(tumor_segs)]
log2s_in_segs = [bins['log2']
for _seg, bins in normal_cn.by_ranges(tumor_segs)]
# For the normal/reference bin count, take the mean of the bin values
# within each segment so that segments match between tumor and normal.
# ENH: weighted mean, like gainloss
ref_means = np.asarray([s.log2.mean() for s in subarrs])
# ENH: weighted mean, like genemetrics
ref_means = np.array([s.mean() for s in log2s_in_segs])
if "probes" in tumor_segs:
nbins = tumor_segs["probes"]
else:
nbins = np.asarray([len(s) for s in subarrs])
nbins = np.array([len(s) for s in log2s_in_segs])
else:
# Assume neutral reference log2 across all segments
# (weights already account for reference log2)
@@ -436,7 +484,7 @@ def ref_means_nbins(tumor_segs, normal_cn):
if "probes" in tumor_segs:
nbins = tumor_segs["probes"]
else:
logging.warn("No probe counts in tumor segments file and no "
logging.warning("No probe counts in tumor segments file and no "
"normal reference given; guessing normal "
"read-counts-per-segment from segment sizes")
sizes = tumor_segs.end - tumor_segs.start
@@ -503,6 +551,7 @@ def export_theta_snps(varr):
EXPORT_FORMATS = {
'cdt': fmt_cdt,
# 'gct': fmt_gct,
'gistic': export_gistic_markers,
'jtv': fmt_jtv,
'nexus-basic': export_nexus_basic,
'nexus-ogt': export_nexus_ogt,
......
@@ -25,7 +25,7 @@ def do_fix(target_raw, antitarget_raw, reference,
frac_anti_low = 1 - (len(anti_resid) / len(anti_cnarr))
if frac_anti_low > .5:
# Off-target bins are mostly garbage -- skip reweighting
logging.warn("WARNING: Most antitarget bins ({:.2f}%, {:d}/{:d}) "
logging.warning("WARNING: Most antitarget bins ({:.2f}%, {:d}/{:d})"
" have low or no coverage; is this amplicon/WGS?"
.format(100 * frac_anti_low,
len(anti_cnarr) - len(anti_resid),
@@ -74,7 +74,7 @@ def load_adjust_coverages(cnarr, ref_cnarr, skip_low,
# Skip bias corrections if most bins have no coverage (e.g. user error)
if (cnarr['log2'] > params.NULL_LOG2_COVERAGE - params.MIN_REF_COVERAGE
).sum() <= len(cnarr) // 2:
logging.warn("WARNING: most bins have no or very low coverage; "
logging.warning("WARNING: most bins have no or very low coverage; "
"check that the right BED file was used")
else:
if fix_gc:
@@ -82,7 +82,7 @@ def load_adjust_coverages(cnarr, ref_cnarr, skip_low,
logging.info("Correcting for GC bias...")
cnarr = center_by_window(cnarr, .1, ref_matched['gc'])
else:
logging.warn("WARNING: Skipping correction for GC bias")
logging.warning("WARNING: Skipping correction for GC bias")
if fix_edge:
logging.info("Correcting for density bias...")
edge_bias = get_edge_bias(cnarr, params.INSERT_SIZE)
@@ -92,7 +92,8 @@ def load_adjust_coverages(cnarr, ref_cnarr, skip_low,
logging.info("Correcting for RepeatMasker bias...")
cnarr = center_by_window(cnarr, .1, ref_matched['rmask'])
else:
logging.warn("WARNING: Skipping correction for RepeatMasker bias")
logging.warning("WARNING: Skipping correction for "
"RepeatMasker bias")
# Normalize coverages according to the reference
# (Subtract the reference log2 copy number to get the log2 ratio)
@@ -157,7 +158,7 @@ def center_by_window(cnarr, fraction, sort_key):
order = np.argsort(sort_key, kind='mergesort')
df = df.iloc[order]
biases = smoothing.rolling_median(df['log2'], fraction)
# biases = smoothing.smoothed(df['log2'], fraction)
# biases = smoothing.savgol(df['log2'], fraction)
df['log2'] -= biases
fixarr = cnarr.as_dataframe(df)
fixarr.sort()
@@ -176,15 +177,15 @@ def get_edge_bias(cnarr, margin):
"""
output_by_chrom = []
for _chrom, subarr in cnarr.by_chromosome():
tile_starts = np.asarray(subarr['start'])
tile_ends = np.asarray(subarr['end'])
tile_starts = subarr['start'].values
tile_ends = subarr['end'].values
tgt_sizes = tile_ends - tile_starts
# Calculate coverage loss at (both edges of) each tile
losses = edge_losses(tgt_sizes, margin)
# Find tiled intervals within a margin (+/- bp) of the given probe
# (excluding the probe itself), then calculate the relative coverage
# "gain" due to the neighbors, if any
gap_sizes = np.asarray(tile_starts[1:]) - np.asarray(tile_ends[:-1])
gap_sizes = tile_starts[1:] - tile_ends[:-1]
ok_gaps_mask = (gap_sizes < margin)
ok_gaps = gap_sizes[ok_gaps_mask]
left_gains = edge_gains(tgt_sizes[1:][ok_gaps_mask], ok_gaps, margin)
@@ -265,6 +266,7 @@ def apply_weights(cnarr, ref_matched, epsilon=1e-4):
logging.debug("Weighting bins by relative coverage depths in reference")
# Penalize bins that deviate from neutral coverage
flat_cvgs = ref_matched.expect_flat_log2()
# XXX sqrt? (*0.5 inside exp2) -- need benchmark
weights *= np.exp2(-np.abs(ref_matched['log2'] - flat_cvgs))
if (ref_matched['spread'] > epsilon).any():
# NB: Not used with a flat or paired reference
@@ -273,6 +275,8 @@ def apply_weights(cnarr, ref_matched, epsilon=1e-4):
variances = ref_matched['spread'] ** 2
invvars = 1.0 - (variances / variances.max())
weights = (weights + invvars) / 2
# Rescale so max is 1.0
# weights /= weights.max()
# Avoid 0-value bins -- CBS doesn't like these
weights = np.maximum(weights, epsilon)
return cnarr.add_columns(weight=weights)
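# The deviation penalty above in isolation: a bin whose reference log2
# coverage sits `d` away from the flat expectation keeps 2**-|d| of its
# weight. Hedged example values:
import numpy as np

_ref_log2 = np.array([0.0, -0.5, -2.0])
_flat = np.zeros(3)  # neutral expectation for autosomes
print(np.exp2(-np.abs(_ref_log2 - _flat)).round(3))  # [1.    0.707 0.25 ]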
@@ -14,7 +14,7 @@ from skgenome.rangelabel import unpack_range
from . import plots
def do_heatmap(cnarrs, show_range=None, do_desaturate=False):
def do_heatmap(cnarrs, show_range=None, do_desaturate=False, by_bin=False):
"""Plot copy number for multiple samples as a heatmap."""
_fig, axis = plt.subplots()
set_colorbar(axis)
@@ -25,12 +25,33 @@ def do_heatmap(cnarrs, show_range=None, do_desaturate=False):
axis.set_ylim(0, len(cnarrs))
axis.invert_yaxis()
axis.set_ylabel("Samples")
if hasattr(axis, 'set_facecolor'):
# matplotlib >= 2.0
axis.set_facecolor('#DDDDDD')
else:
# Older matplotlib
axis.set_axis_bgcolor('#DDDDDD')
if by_bin and show_range:
try:
a_cnarr = next(c for c in cnarrs if "probes" not in c)
except StopIteration:
r_chrom, r_start, r_end = unpack_range(show_range)
if r_start is not None or r_end is not None:
raise ValueError("Need at least 1 .cnr input file if --by-bin "
"(by_bin) and --chromosome (show_range) are "
"both used to specify a sub-chromosomal "
"region.")
else:
logging.info("Using sample %s to map %s to bin coordinates",
a_cnarr.sample_id, show_range)
r_chrom, r_start, r_end = plots.translate_region_to_bins(show_range,
a_cnarr)
else:
r_chrom, r_start, r_end = unpack_range(show_range)
if r_start is not None or r_end is not None:
logging.info("Showing log2 ratios in range %s:%d-%s",
r_chrom, r_start, r_end or '*')
r_chrom, r_start or 0, r_end or '*')
elif r_chrom:
logging.info("Showing log2 ratios on chromosome %s", r_chrom)
@@ -46,6 +67,9 @@ def do_heatmap(cnarrs, show_range=None, do_desaturate=False):
# Calculate the size (max endpoint value) of each chromosome
chrom_sizes = collections.OrderedDict()
for i, cnarr in enumerate(cnarrs):
if by_bin:
cnarr = plots.update_binwise_positions_simple(cnarr)
if r_chrom:
subcna = cnarr.in_range(r_chrom, r_start, r_end, mode="trim")
sample_data[i][r_chrom] = cna2df(subcna)
@@ -70,18 +94,26 @@ def do_heatmap(cnarrs, show_range=None, do_desaturate=False):
if show_range:
# Lay out only the selected chromosome
# Set x-axis the chromosomal positions (in Mb), title as the selection
axis.set_xlim((r_start or 0) * plots.MB,
(r_end or chrom_sizes[r_chrom]) * plots.MB)
axis.set_title(show_range)
if by_bin:
MB = 1
axis.set_xlabel("Position (bin)")
else:
MB = plots.MB
axis.set_xlabel("Position (Mb)")
axis.set_xlim((r_start or 0) * MB,
(r_end or chrom_sizes[r_chrom]) * MB)
axis.set_title(show_range)
axis.tick_params(which='both', direction='out')
axis.get_xaxis().tick_bottom()
axis.get_yaxis().tick_left()
# Plot the individual probe/segment coverages
for i, sample in enumerate(sample_data):
crow = sample[r_chrom]
crow["start"] *= plots.MB
crow["end"] *= plots.MB
if not len(crow):
logging.warning("Sample #%d has no datapoints in selection %s",
i+1, show_range)
crow["start"] *= MB
crow["end"] *= MB
plot_sample_chrom(i, crow)
else:
@@ -96,6 +128,8 @@ def do_heatmap(cnarrs, show_range=None, do_desaturate=False):
crow["start"] += curr_offset
crow["end"] += curr_offset
plot_sample_chrom(i, crow)
else:
logging.warning("Sample #%d has no datapoints", i+1)
return axis
......
from __future__ import absolute_import, division, print_function
import logging
import os
import numpy as np
import pandas as pd
from . import rna
def do_import_rna(gene_count_fnames, in_format, gene_resource_fname,
correlations_fname=None, normal_fnames=()):
"""Convert a cohort of per-gene read counts to CNVkit .cnr format.
The expected data source is TCGA gene-level expression counts for individual
samples, but other sources should be fine, too.
"""
# Deduplicate and ensure all normals are included in the analysis
    gene_count_fnames = sorted(set(gene_count_fnames) | set(normal_fnames))
if in_format == 'rsem':
sample_counts, tx_lengths = aggregate_rsem(gene_count_fnames)
elif in_format == 'counts':
sample_counts = aggregate_gene_counts(gene_count_fnames)
tx_lengths = None
else:
raise RuntimeError("Unrecognized input format name: %r" % in_format)
sample_counts = rna.filter_probes(sample_counts)
logging.info("Loading gene metadata" +
(" and TCGA gene expression/CNV profiles"
if correlations_fname else ""))
gene_info = rna.load_gene_info(gene_resource_fname, correlations_fname)
logging.info("Aligning gene info to sample gene counts")
normal_ids = [os.path.basename(f).split('.')[0] for f in normal_fnames]
gene_info, sample_counts, sample_data_log2 = rna.align_gene_info_to_samples(
gene_info, sample_counts, tx_lengths, normal_ids)
# Summary table has log2-normalized values, not raw counts
# ENH show both, with column header suffixes to distinguish?
all_data = pd.concat([gene_info, sample_data_log2], axis=1)
# CNVkit files have both absolute and log2-normalized read counts
cnrs = rna.attach_gene_info_to_cnr(sample_counts, sample_data_log2,
gene_info)
cnrs = (rna.correct_cnr(cnr) for cnr in cnrs)
return all_data, cnrs
def aggregate_gene_counts(filenames):
prev_row_count = None
sample_cols = {}
for fname in filenames:
d = (pd.read_table(fname,
header=None,
comment="_",
names=["gene_id", "expected_count"],
converters={"gene_id": rna.before(".")})
.set_index("gene_id"))
# .drop(["__no_feature", "__ambiguous", "__too_low_aQual",
# "__not_aligned", "__alignment_not_unique"]))
if prev_row_count is None:
prev_row_count = len(d)
elif len(d) != prev_row_count:
raise RuntimeError("Number of rows in each input file is not equal")
sample_id = rna.before(".")(os.path.basename(fname))
sample_cols[sample_id] = d.expected_count.fillna(0)
sample_counts = pd.DataFrame(sample_cols)
return sample_counts
def aggregate_rsem(fnames):
"""Pull out the expected read counts from each RSEM file.
The format of RSEM's ``*_rsem.genes.results`` output files is tab-delimited
with a header row. We extract the Ensembl gene ID, expected read counts, and
transcript lengths from each file.
Returns
-------
sample_counts : DataFrame
Row index is Ensembl gene ID, column index is filename.
tx_lengths : Series
Gene lengths.
"""
prev_row_count = None
sample_cols = {}
length_cols = []
length_colname = 'length' # or: 'effective_length'
for fname in fnames:
# NB: read_table(index_col=_) works independently of combine=, dtype=
# so index column needs to be processed separately
# https://github.com/pandas-dev/pandas/issues/9435
d = pd.read_table(fname,
usecols=['gene_id', length_colname, 'expected_count'],
# index_col='gene_id',
converters={'gene_id': rna.before('.')}
).set_index('gene_id')
if prev_row_count is None:
prev_row_count = len(d)
elif len(d) != prev_row_count:
raise RuntimeError("Number of rows in each input file is not equal")
sample_id = rna.before(".")(os.path.basename(fname))
sample_cols[sample_id] = d.expected_count.fillna(0)
length_cols.append(d[length_colname])
sample_counts = pd.DataFrame(sample_cols)
tx_lengths = pd.Series(np.vstack(length_cols).mean(axis=0),
index=sample_counts.index)
return sample_counts, tx_lengths
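# The final length averaging stacks each sample's per-gene lengths and
# takes the column-wise mean. Hedged toy example:
import numpy as np
import pandas as pd

_length_cols = [pd.Series([1000, 2000], index=["g1", "g2"]),
                pd.Series([1100, 1900], index=["g1", "g2"])]
print(pd.Series(np.vstack(_length_cols).mean(axis=0), index=["g1", "g2"]))
# g1    1050.0
# g2    1950.0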
@@ -20,7 +20,7 @@ def do_import_picard(fname, too_many_no_coverage=100):
coverages = garr["ratio"].copy()
no_cvg_idx = (coverages == 0)
if no_cvg_idx.sum() > too_many_no_coverage:
logging.warn("*WARNING* Sample %s has >%d bins with no coverage",
logging.warning("WARNING: Sample %s has >%d bins with no coverage",
garr.sample_id, too_many_no_coverage)
coverages[no_cvg_idx] = 2**params.NULL_LOG2_COVERAGE
garr["log2"] = np.log2(coverages)
@@ -50,7 +50,7 @@ def unpipe_name(name):
gene_names = cleaned_names
new_name = sorted(gene_names, key=len, reverse=True)[0]
if len(gene_names) > 1:
logging.warn("*WARNING* Ambiguous gene name %r; using %r",
logging.warning("WARNING: Ambiguous gene name %r; using %r",
name, new_name)
return new_name
......