Steffen Möller · Steffen Möller · Steffen Möller · Steffen Möller · Steffen Möller · Steffen Möller
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,10 +6,10 @@ os:
    - linux
 env:
    # Earliest supported versions
-    - CONDA_PY=2.7 PANDAS=0.18.1 PYSAM=0.10.0
-    - CONDA_PY=3.5 PANDAS=0.18.1 PYSAM=0.10.0
+    - CONDA_PY=2.7 PANDAS=0.20.1 PYSAM=0.10.0
+    - CONDA_PY=3.5 PANDAS=0.20.1 PYSAM=0.10.0
    # The latest versions, whatever they are
-    - CONDA_PY=3.6 PANDAS='*' PYSAM='*'
+    - CONDA_PY=3.7 PANDAS='*' PYSAM='*'

 install:
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install md5sha1sum; fi
@@ -21,9 +21,10 @@ before_script:
    - if [[ "$CONDA_PY" == "2.7" ]]; then conda install -yq futures>=3.0; fi
    - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then conda install -yq -c bioconda atlas; fi
    # Install the versions desired for testing
-    - conda install -yq -c bioconda
+    - conda install -yq -c bioconda --no-channel-priority
        cython
        future
+        pomegranate
        matplotlib
        numpy
        pyfaidx
@@ -34,14 +35,12 @@ before_script:
        pysam==$PYSAM
    # R linking is broken on OS X - try recompilation instead of conda there
    - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then
-        conda install -yq -c bioconda bioconductor-dnacopy r-cghflasso;
+        conda install -yq -c bioconda bioconductor-dnacopy;
      fi
    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
        conda install -yq -c bioconda r-base;
-        Rscript -e "source('http://bioconductor.org/biocLite.R'); biocLite(c('DNAcopy', 'cghFLasso'))";
+        Rscript -e "source('http://callr.org/install#DNAcopy')";
      fi
-    # hmmlearn 0.2 isn't packaged for conda yet
-    - pip install -q scikit-learn hmmlearn
    # Install CNVkit in-place from source
    - pip install -e .
    - cd test/
@@ -52,9 +51,7 @@ script:
    - coverage run test_io.py
    - coverage run -a test_genome.py
    - coverage run -a test_cnvlib.py
-    #- coverage run -a test_r.py
-    # OS X can't install cghFLasso package?
-    - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then coverage run -a test_r.py; fi
+    - coverage run -a test_r.py

 after_success:
    - coverage report

--- a/README.rst
+++ b/README.rst
@@ -165,16 +165,15 @@ Copy number segmentation currently depends on R packages, some of which are part
 of Bioconductor and cannot be installed through CRAN directly. To install these
 dependencies, do the following in R::

-    > source("http://bioconductor.org/biocLite.R")
-    > biocLite(c("DNAcopy", "cghFLasso"))
+    > library(BiocManager)
+    > install("DNAcopy")

-This will install the DNAcopy and cghFLasso packages, as well as their
-dependencies.
+This will install the DNAcopy package, as well as its dependencies.

 Alternatively, to do the same directly from the shell, e.g. for automated
 installations, try this instead::

-    Rscript -e "source('http://callr.org/install#DNAcopy,cghFLasso')"
+    Rscript -e "source('http://callr.org/install#DNAcopy')"


 Testing

--- a/cnvlib/_version.py
+++ b/cnvlib/_version.py
-__version__ = "0.9.5"
+__version__ = "0.9.6"
--- a/cnvlib/antitarget.py
+++ b/cnvlib/antitarget.py
@@ -115,12 +115,19 @@ def guess_chromosome_regions(targets, telomere_size):
 # CNVkit's original inclusion regex
 re_canonical = re.compile(r"(chr)?(\d+|[XYxy])$")
 # goleft indexcov's exclusion regex
-# TODO drop chrM, MT
-re_noncanonical = re.compile(r"^chrEBV$|^NC|_random$|Un_|^HLA\-|_alt$|hap\d$")
+re_noncanonical = re.compile("|".join((r"^chrEBV$",
+                                       r"^NC|_random$",
+                                       r"Un_",
+                                       r"^HLA\-",
+                                       r"_alt$",
+                                       r"hap\d$",
+                                       r"chrM",
+                                       r"MT")))
+

 def is_canonical_contig_name(name):
-    return bool(re_canonical.match(name))
-    #  return not re_noncanonical.search(name))
+    #return bool(re_canonical.match(name))
+    return not re_noncanonical.search(name)


 def _drop_short_contigs(garr):

--- a/cnvlib/batch.py
+++ b/cnvlib/batch.py
@@ -161,7 +161,7 @@ def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):

 def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
-                     rlibpath, by_count, skip_low, method, processes):
+                     rscript_path, by_count, skip_low, method, processes):
    """Run the pipeline on one BAM file."""
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
@@ -182,7 +182,7 @@ def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,

    logging.info("Segmenting %s.cnr ...", sample_pfx)
    segments = segmentation.do_segmentation(cnarr, 'cbs',
-                                            rlibpath=rlibpath,
+                                            rscript_path=rscript_path,
                                            skip_low=skip_low,
                                            processes=processes,
                                            **({'threshold': 1e-6}

--- a/cnvlib/commands.py
+++ b/cnvlib/commands.py
@@ -10,6 +10,12 @@ import logging
 import os
 import sys

+# Filter spurious Cython warnings re: numpy
+# Via: https://github.com/numpy/numpy/pull/432
+import warnings
+warnings.filterwarnings("ignore", message="numpy.dtype size changed")
+warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
+
 # If running headless, use a suitable GUI-less plotting backend
 if not os.environ.get('DISPLAY'):
    import matplotlib
@@ -130,7 +136,7 @@ def _cmd_batch(args):
                pool.submit(batch.batch_run_sample,
                            bam, args.targets, args.antitargets, args.reference,
                            args.output_dir, args.male_reference, args.scatter,
-                            args.diagram, args.rlibpath, args.count_reads,
+                            args.diagram, args.rscript_path, args.count_reads,
                            args.drop_low_coverage, args.method, procs_per_bam)
    else:
        logging.info("No tumor/test samples (but %d normal/control samples) "
@@ -162,8 +168,6 @@ P_batch.add_argument('-p', '--processes',
        help="""Number of subprocesses used to running each of the BAM files in
                parallel. Without an argument, use the maximum number of
                available CPUs. [Default: process each BAM in serial]""")
-P_batch.add_argument("--rlibpath", metavar="DIRECTORY",
-        help=argparse.SUPPRESS)
 P_batch.add_argument("--rscript-path", metavar="PATH", default="Rscript",
        help="""Path to the Rscript excecutable to use for running R code.
                Use this option to specify a non-default R installation.
@@ -624,7 +628,6 @@ def _cmd_segment(args):
                                           skip_low=args.drop_low_coverage,
                                           skip_outliers=args.drop_outliers,
                                           save_dataframe=bool(args.dataframe),
-                                           rlibpath=args.rlibpath,
                                           rscript_path=args.rscript_path,
                                           processes=args.processes)
    if args.dataframe:
@@ -664,8 +667,6 @@ P_segment.add_argument("--drop-outliers", metavar="FACTOR",
                quantile away from the average within a rolling window.
                Set to 0 for no outlier filtering.
                [Default: %(default)g]""")
-P_segment.add_argument("--rlibpath", metavar="DIRECTORY",
-        help=argparse.SUPPRESS)
 P_segment.add_argument("--rscript-path", metavar="PATH", default="Rscript",
        help="""Path to the Rscript excecutable to use for running R code.
                Use this option to specify a non-default R installation.
@@ -1488,9 +1489,10 @@ P_import_rna.add_argument('-g', '--gene-resource',
 P_import_rna.add_argument('-c', '--correlations', metavar="FILE",
        help="""Correlation of each gene's copy number with
                expression. Output of cnv_expression_correlate.py.""")
-P_import_rna.add_argument('--max-log2', metavar="FLOAT", default=3.0,
+P_import_rna.add_argument('--max-log2',
+        metavar="FLOAT", default=3.0, type=float,
        help="""Maximum log2 value in output. Observed values above this limit
-                will be replaced with this value.""")
+                will be replaced with this value. [Default: %(default)s]""")
 P_import_rna.add_argument('-n', '--normal', nargs='+', default=[],
        help="""Normal samples (same format as `gene_counts`) to be used as a
                control to when normalizing and re-centering gene read depth

--- a/cnvlib/coverage.py
+++ b/cnvlib/coverage.py
@@ -143,11 +143,11 @@ def region_depth_count(bamfile, chrom, start, end, gene, min_mapq):
        if filter_read(read):
            count += 1
            # Only count the bases aligned to the region
-            rlen = read.query_length
+            rlen = read.query_alignment_length
            if read.pos < start:
                rlen -= start - read.pos
-            if read.pos + read.query_length > end:
-                rlen -= read.pos + read.query_length - end
+            if read.pos + read.query_alignment_length > end:
+                rlen -= read.pos + read.query_alignment_length - end
            bases += rlen
    depth = bases / (end - start) if end > start else 0
    row = (chrom, start, end, gene,
@@ -211,7 +211,7 @@ def bedcov(bed_fname, bam_fname, min_mapq):
        raise ValueError("BED file %r chromosome names don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    columns = detect_bedcov_columns(raw)
-    table = pd.read_table(StringIO(raw), names=columns, usecols=columns)
+    table = pd.read_csv(StringIO(raw), sep='\t', names=columns, usecols=columns)
    return table



--- a/cnvlib/export.py
+++ b/cnvlib/export.py
@@ -4,6 +4,7 @@ from builtins import range, str, zip

 import logging
 import time
+from collections import OrderedDict as OD

 import numpy as np
 import pandas as pd
@@ -63,12 +64,12 @@ def fmt_cdt(sample_ids, table):
    header3 = ['EWEIGHT', '', '', ''] + ['1'] * len(sample_ids)
    outrows = [header2, header3]
    outtable = pd.concat([
-        pd.DataFrame.from_items([
+        pd.DataFrame.from_dict(OD([
           ("GID", pd.Series(table.index).apply(lambda x: "GENE%dX" % x)),
           ("CLID", pd.Series(table.index).apply(lambda x: "IMAGE:%d" % x)),
           ("NAME", table["label"]),
           ("GWEIGHT", 1),
-        ]),
+        ])),
        table.drop(["chromosome", "start", "end", "gene", "label"],
                   axis=1)],
        axis=1)

--- a/cnvlib/import_rna.py
+++ b/cnvlib/import_rna.py
@@ -52,11 +52,9 @@ def aggregate_gene_counts(filenames):
    prev_row_count = None
    sample_cols = {}
    for fname in filenames:
-        d = (pd.read_table(fname,
-                           header=None,
-                           comment="_",
-                           names=["gene_id", "expected_count"],
-                           converters={"gene_id": rna.before(".")})
+        d = (pd.read_csv(fname, sep='\t', comment="_", header=None,
+                         names=["gene_id", "expected_count"],
+                         converters={"gene_id": rna.before(".")})
             .set_index("gene_id"))
             # .drop(["__no_feature", "__ambiguous", "__too_low_aQual",
             # "__not_aligned", "__alignment_not_unique"]))
@@ -89,14 +87,14 @@ def aggregate_rsem(fnames):
    length_cols = []
    length_colname = 'length'  # or: 'effective_length'
    for fname in fnames:
-        # NB: read_table(index_col=_) works independently of combine=, dtype=
-        #   so index column needs to be processed separately
+        # NB: read_csv(index_col=_) works independently of combine=, dtype=
+        #   so index column needs to be set after parsing, not during.
        #   https://github.com/pandas-dev/pandas/issues/9435
-        d = pd.read_table(fname,
-                          usecols=['gene_id', length_colname, 'expected_count'],
-                          #  index_col='gene_id',
-                          converters={'gene_id': rna.before('.')}
-                         ).set_index('gene_id')
+        d = pd.read_csv(fname, sep='\t',
+                        usecols=['gene_id', length_colname, 'expected_count'],
+                        #  index_col='gene_id',
+                        converters={'gene_id': rna.before('.')}
+                        ).set_index('gene_id')
        if prev_row_count is None:
            prev_row_count = len(d)
        elif len(d) != prev_row_count:

--- a/cnvlib/rna.py
+++ b/cnvlib/rna.py
@@ -66,10 +66,11 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):
    # "Transcript support level (TSL)"
    info_col_names = ['gene_id', 'gc', 'chromosome', 'start', 'end',
                      'gene', 'entrez_id', 'tx_length', 'tx_support']
-    gene_info = (pd.read_table(gene_resource, names=info_col_names, header=1,
-                               converters={'gene_id': before('.'),
-                                           'tx_support': tsl2int,
-                                           'gc': lambda x: float(x)/100})
+    gene_info = (pd.read_csv(gene_resource, sep='\t', header=1,
+                             names=info_col_names, 
+                             converters={'gene_id': before('.'),
+                                         'tx_support': tsl2int,
+                                         'gc': lambda x: float(x)/100})
                 .sort_values('gene_id'))
    logging.info("Loaded %s with shape: %s", gene_resource, gene_info.shape)

@@ -117,7 +118,7 @@ def load_gene_info(gene_resource, corr_fname, default_r=.1):

 def load_cnv_expression_corr(fname):
    shared_key = 'Entrez_Gene_Id'
-    table = (pd.read_table(fname, dtype={shared_key: int}, na_filter=False)
+    table = (pd.read_csv(fname, sep='\t', na_filter=False, dtype={shared_key: int})
             .set_index(shared_key))
    logging.info("Loaded %s with shape: %s", fname, table.shape)
    return table

--- a/cnvlib/samutil.py
+++ b/cnvlib/samutil.py
@@ -20,8 +20,8 @@ def idxstats(bam_fname, drop_unmapped=False):
    chromosome. Contigs with no mapped reads are skipped.
    """
    handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
-    table = pd.read_table(handle, header=None,
-                          names=['chromosome', 'length', 'mapped', 'unmapped'])
+    table = pd.read_csv(handle, sep='\t', header=None,
+                        names=['chromosome', 'length', 'mapped', 'unmapped'])
    if drop_unmapped:
        table = table[table.mapped != 0].drop('unmapped', axis=1)
    return table

--- a/cnvlib/scatter.py
+++ b/cnvlib/scatter.py
@@ -39,19 +39,19 @@ def do_scatter(cnarr, segments=None, variants=None,
        MB = 1

    if not show_gene and not show_range:
-        genome_scatter(cnarr, segments, variants, do_trend, y_min, y_max, title,
+        fig = genome_scatter(cnarr, segments, variants, do_trend, y_min, y_max, title,
                       segment_color)
    else:
        if by_bin:
            show_range = show_range_bins
-        chromosome_scatter(cnarr, segments, variants, show_range, show_gene,
+        fig = chromosome_scatter(cnarr, segments, variants, show_range, show_gene,
                           antitarget_marker, do_trend, by_bin, window_width,
                           y_min, y_max, title, segment_color)

    if by_bin:
        # Reset to avoid permanently altering the value of cnvlib.scatter.MB
        MB = orig_mb
-
+    return fig

 # === Genome-level scatter plots ===

@@ -66,7 +66,7 @@ def genome_scatter(cnarr, segments=None, variants=None, do_trend=False,
        # Place chromosome labels between the CNR and SNP plots
        axis2.tick_params(labelbottom=False)
        chrom_sizes = plots.chromosome_sizes(cnarr or segments)
-        snv_on_genome(axis2, variants, chrom_sizes, segments, do_trend,
+        axis2 = snv_on_genome(axis2, variants, chrom_sizes, segments, do_trend,
                      segment_color)
    else:
        _fig, axis = pyplot.subplots()
@@ -74,16 +74,16 @@ def genome_scatter(cnarr, segments=None, variants=None, do_trend=False,
        title = (cnarr or segments or variants).sample_id
    if cnarr or segments:
        axis.set_title(title)
-        cnv_on_genome(axis, cnarr, segments, do_trend, y_min, y_max,
+        axis = cnv_on_genome(axis, cnarr, segments, do_trend, y_min, y_max,
                      segment_color)
    else:
        axis.set_title("Variant allele frequencies: %s" % title)
        chrom_sizes = collections.OrderedDict(
            (chrom, subarr["end"].max())
            for chrom, subarr in variants.by_chromosome())
-        snv_on_genome(axis, variants, chrom_sizes, segments, do_trend,
+        axis = snv_on_genome(axis, variants, chrom_sizes, segments, do_trend,
                      segment_color)
-
+    return axis.get_figure()


 def cnv_on_genome(axis, probes, segments, do_trend=False, y_min=None,
@@ -145,7 +145,7 @@ def cnv_on_genome(axis, probes, segments, do_trend=False, y_min=None,
                axis.plot((seg.start + x_offset, seg.end + x_offset),
                          (seg.log2, seg.log2),
                          color=color, linewidth=3, solid_capstyle='round')
-
+    return axis

 def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color):
    """Plot a scatter-plot of SNP chromosomal positions and shifts."""
@@ -189,7 +189,7 @@ def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color
                axis.plot(posn, [v_freq, v_freq],
                          color=color, linewidth=2, zorder=-1,
                          solid_capstyle='round')
-
+    return axis

 # === Chromosome-level scatter plots ===

@@ -231,20 +231,20 @@ def chromosome_scatter(cnarr, segments, variants, show_range, show_gene,
                axis.set_xlabel("Position (bin)")
            else:
                axis.set_xlabel("Position (Mb)")
-        cnv_on_chromosome(axis, sel_probes, sel_segs, genes,
+        axis = cnv_on_chromosome(axis, sel_probes, sel_segs, genes,
                          antitarget_marker=antitarget_marker,
                          do_trend=do_trend, x_limits=window_coords,
                          y_min=y_min, y_max=y_max, segment_color=segment_color)
    elif variants:
        # Only plot SNVs in a single-panel layout
        _fig, axis = pyplot.subplots()
-        snv_on_chromosome(axis, sel_snvs, sel_segs, genes, do_trend,
+        axis = snv_on_chromosome(axis, sel_snvs, sel_segs, genes, do_trend,
                          by_bin, segment_color)

    if title is None:
        title = "%s %s" % ((cnarr or segments or variants).sample_id, chrom)
    axis.set_title(title)
-
+    return axis.get_figure()

 def select_range_genes(cnarr, segments, variants, show_range, show_gene,
                       window_width):
@@ -421,7 +421,7 @@ def cnv_on_chromosome(axis, probes, segments, genes, antitarget_marker=None,
            axis.plot((row.start * MB, row.end * MB),
                      (row.log2, row.log2),
                      color=color, linewidth=4, solid_capstyle='round')
-
+    return axis

 def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin,
                      segment_color):
@@ -458,7 +458,7 @@ def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin,

    if genes:
        highlight_genes(axis, genes, .9)
-
+    return axis

 def set_xlim_from(axis, probes=None, segments=None, variants=None):
    """Configure axes for plotting a single chromosome's data.

--- a/cnvlib/segmentation/__init__.py
+++ b/cnvlib/segmentation/__init__.py
@@ -22,8 +22,7 @@ from . import cbs, flasso, haar, hmm, none

 def do_segmentation(cnarr, method, threshold=None, variants=None,
                    skip_low=False, skip_outliers=10, min_weight=0,
-                    save_dataframe=False, rlibpath=None,
-                    rscript_path="Rscript",
+                    save_dataframe=False, rscript_path="Rscript",
                    processes=1):
    """Infer copy number segments from the given coverage table."""
    if variants:
@@ -49,7 +48,7 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
        # -> assign separate identifiers via chrom name suffix?
        cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
                               skip_outliers, min_weight, save_dataframe,
-                               rlibpath, rscript_path)
+                               rscript_path)
        if save_dataframe:
            cna, rstr = cna
            rstr = _to_str(rstr)
@@ -58,7 +57,7 @@ def do_segmentation(cnarr, method, threshold=None, variants=None,
        with parallel.pick_pool(processes) as pool:
            rets = list(pool.map(_ds, ((ca, method, threshold, variants,
                                        skip_low, skip_outliers, min_weight,
-                                        save_dataframe, rlibpath, rscript_path)
+                                        save_dataframe, rscript_path)
                                       for _, ca in cnarr.by_arm())))
        if save_dataframe:
            # rets is a list of (CNA, R dataframe string) -- unpack
@@ -90,7 +89,7 @@ def _ds(args):
 def _do_segmentation(cnarr, method, threshold, variants=None,
                     skip_low=False, skip_outliers=10, min_weight=0,
                     save_dataframe=False,
-                     rlibpath=None, rscript_path="Rscript"):
+                     rscript_path="Rscript"):
    """Infer copy number segments from the given coverage table."""
    if not len(cnarr):
        return cnarr
@@ -152,7 +151,6 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
                'probes_fname': tmp.name,
                'sample_id': cnarr.sample_id,
                'threshold': threshold,
-                'rlibpath': ('.libPaths(c("%s"))' % rlibpath if rlibpath else ''),
            }
            with core.temp_write_text(rscript % script_strings,
                                      mode='w+t') as script_fname:

--- a/cnvlib/segmentation/cbs.py
+++ b/cnvlib/segmentation/cbs.py
@@ -5,7 +5,6 @@ CBS_RSCRIPT = """\
 # Input: log2 coverage data in CNVkit's tabular format
 # Output: the CBS data table (SEG)

-%(rlibpath)s
 library('DNAcopy')

 write("Loading probe coverages into a data frame", stderr())

--- a/cnvlib/segmentation/flasso.py
+++ b/cnvlib/segmentation/flasso.py
@@ -5,13 +5,12 @@ FLASSO_RSCRIPT = """\
 # Input: log2 coverage data in CNVkit's tabular format
 # Output: the CBS-style SEG data table

-%(rlibpath)s
 library('cghFLasso')

 tbl <- read.delim("%(probes_fname)s")

 write(paste("Segmenting", levels(tbl$chromosome)), stderr())
-fit <- cghFLasso(tbl$log2, FDR=%(threshold)g)
+fit <- cghFLasso(tbl$log2, FDR=%(threshold)g, chromosome=tbl$chromosome)

 # Reformat the output table as SEG
 outtable <- data.frame(sample="%(sample_id)s",

--- a/cnvlib/segmentation/hmm.py
+++ b/cnvlib/segmentation/hmm.py
 #!/usr/bin/env python
 """Segmentation by Hidden Markov Model."""
 from __future__ import absolute_import, division, print_function
+import collections
 import logging

 import numpy as np
 import pandas as pd
+import scipy.special
+import pomegranate as pom

+from ..descriptives import biweight_midvariance
+from ..segfilters import squash_by_groups

-def segment_hmm(cnarr, method, window=None):
+
+def segment_hmm(cnarr, method, window=None, processes=1):
    """Segment bins by Hidden Markov Model.

    Use Viterbi method to infer copy number segments from sequential data.
@@ -31,30 +37,31 @@ def segment_hmm(cnarr, method, window=None):
    # NB: Incorporate weights into smoothed log2 estimates
    # (Useful kludge until weighted HMM is in place)
    orig_log2 = cnarr['log2'].values.copy()
-    cnarr['log2'] = cnarr.smoothed(window)
+    cnarr['log2'] = cnarr.smoothed()#window)
+
+    logging.info("Building model from observations")
+    model = hmm_get_model(cnarr, method, processes)

-    logging.debug("Building model from observations")
-    model = hmm_get_model(cnarr, method)
-    logging.debug("Predicting states from model")
-    obs = as_observation_matrix(cnarr)
-    # A sequence of inferred discrete states. Length is the same as `freqs`,
-    # with one state assigned to each input datapoint.
-    states = model.predict(obs, lengths=chrom_arm_lengths(cnarr))
+    logging.info("Predicting states from model")
+    observations = as_observation_matrix(cnarr)
+    states = np.concatenate([np.array(model.predict(obs, algorithm='map'))
+                             for obs in observations])

-    # TODO logging.warn if model did not converge
-    # print(model, end="\n\n")
-    # print("Model params:\nmeans =", sorted(model.means_.flat))
-    # print(model.monitor_, end="\n\n")
+    logging.info("Done, now finalizing")
+    logging.debug("Model states: %s", model.states)
+    logging.debug("Predicted states: %s", states[:100])
+    logging.debug(str(collections.Counter(states)))
+    logging.debug("Observations: %s", observations[0][:100])
+    logging.debug("Edges: %s", model.edges)

    # Merge adjacent bins with the same state to create segments
-    from ..segfilters import squash_by_groups
    cnarr['log2'] = orig_log2
    cnarr['probes'] = 1
    segarr = squash_by_groups(cnarr, pd.Series(states), by_arm=True)
    return segarr


-def hmm_get_model(cnarr, method):
+def hmm_get_model(cnarr, method, processes):
    """

    Parameters
@@ -62,65 +69,94 @@ def hmm_get_model(cnarr, method):
    cnarr : CopyNumArray
        The normalized bin-level values to be segmented.
    method : string
-        One of 'hmm', 'hmm-tumor', 'hmm-germline'
+        One of 'hmm', 'hmm-tumor', 'hmm-germline'.
+    processes : int
+        Number of parallel jobs to run.

    Returns
    -------
    model :
-        A hmmlearn Model trained on the given dataset.
-
+        A pomegranate HiddenMarkovModel trained on the given dataset.
    """
-    # Delayed import so hmmlearn is an optional dependency
-    from hmmlearn import hmm
-
-    import warnings
-    warnings.simplefilter('ignore', DeprecationWarning)
-    warnings.simplefilter('ignore', RuntimeWarning)
-    #  warnings.simplefilter('error', RuntimeWarning)
-
    assert method in ('hmm-tumor', 'hmm-germline', 'hmm')
-    if method == 'hmm-tumor':
-        state_means = np.column_stack([[-4, -1, 0, .585, 1]])
-    else:
-        state_means = np.column_stack([[-1, 0, .585]])
-    kwargs = {
-        'n_components': len(state_means),
-        'means_prior': state_means,
-        'means_weight': .5,
-    }
-    if method == 'hmm-germline':
-        # Don't alter the state means when training the model
-        kwargs.update({
-            'init_params': 'stc',
-            'params': 'stc',
-        })
-    model = hmm.GaussianHMM(covariance_type='diag', n_iter=100,
-                            random_state=0xA5EED, **kwargs)
-    if method == 'germline':
-        model.means_ = state_means
-
-    # Create observation matrix
-    # TODO incorporate weights -- currently handled by smoothing
-    # TODO incorporate inter-bin distances
-    cnarr = cnarr.autosomes()
-    freqs = as_observation_matrix(cnarr)
+    observations = as_observation_matrix(cnarr.autosomes())

-    if cnarr.chromosome.nunique() == 1:
-        model.fit(freqs)
+    # Estimate standard deviation from the full distribution, robustly
+    stdev = biweight_midvariance(np.concatenate(observations), initial=0)
+    if method == 'hmm-germline':
+        state_names = ["loss", "neutral", "gain"]
+        distributions = [
+            pom.NormalDistribution(-1.0, stdev, frozen=True),
+            pom.NormalDistribution(0.0, stdev, frozen=True),
+            pom.NormalDistribution(0.585, stdev, frozen=True),
+        ]
+    elif method == 'hmm-tumor':
+        state_names = ["del", "loss", "neutral", "gain", "amp"]
+        distributions = [
+            pom.NormalDistribution(-2.0, stdev, frozen=False),
+            pom.NormalDistribution(-0.5, stdev, frozen=False),
+            pom.NormalDistribution(0.0, stdev, frozen=True),
+            pom.NormalDistribution(0.3, stdev, frozen=False),
+            pom.NormalDistribution(1.0, stdev, frozen=False),
+        ]
    else:
-        model.fit(freqs, lengths=chrom_arm_lengths(cnarr))
+        state_names = ["loss", "neutral", "gain"]
+        distributions = [
+            pom.NormalDistribution(-1.0, stdev, frozen=False),
+            pom.NormalDistribution(0.0, stdev, frozen=False),
+            pom.NormalDistribution(0.585, stdev, frozen=False),
+        ]
+
+    n_states = len(distributions)
+    # Starts -- prefer neutral
+    binom_coefs = scipy.special.binom(n_states - 1, range(n_states))
+    start_probabilities = binom_coefs / binom_coefs.sum()
+    # Ends -- equally likely
+    #end_probabilities = np.ones(n_states) / n_states
+
+    # Prefer to keep the current state in each transition
+    # All other transitions are equally likely, to start
+    transition_matrix = (np.identity(n_states) * 100
+                        + np.ones((n_states, n_states)) / n_states)
+
+    model = pom.HiddenMarkovModel.from_matrix(transition_matrix, distributions,
+        start_probabilities, state_names=state_names, name=method)
+
+    model.fit(sequences=observations,
+              weights=[len(obs) for obs in observations],
+              distribution_inertia = .8,  # Allow updating dists, but slowly
+              edge_inertia=0.1,
+              # lr_decay=.75,
+              pseudocount=5,
+              use_pseudocount=True,
+              max_iterations=100000,
+              n_jobs=processes,
+              verbose=False)
    return model


-def as_observation_matrix(cnarr):
-    columns = [cnarr['log2'].values]
-    if 'baf' in cnarr:
-        # XXX untested
-        columns.append(cnarr['baf'].values)
-    return np.column_stack(columns)
+def as_observation_matrix(cnarr, variants=None):
+    """Extract HMM fitting values from `cnarr`.
+
+    For each chromosome arm, extract log2 ratios as a numpy array.

+    Future: If VCF of variants is given, or 'baf' column has already been
+    added to `cnarr` from the same, then the BAF values are a second row/column
+    in each numpy array.

-def chrom_arm_lengths(cnarr):
-    lengths = [len(x) for _, x in cnarr.by_arm()]
-    assert sum(lengths) == len(cnarr)
-    return lengths
+    Returns: List of numpy.ndarray, one per chromosome arm.
+    """
+    # TODO incorporate weights -- currently handled by smoothing
+    # TODO incorporate inter-bin distances
+    observations = [arm.log2.values
+                    for _c, arm in cnarr.by_arm()]
+    return observations
+    # --- TODO incorporate variant BAF/zygosity/LOH/ROH ---
+    if variants is not None:
+        print("Variants!")
+        segarr['baf'] = variants.baf_by_ranges(segarr)
+    elif 'baf' in cnarr:
+        print("BAF!")
+        observations = [np.hstack([arm.log2.values, arm['baf'].values])
+                        for _c, arm in cnarr.by_arm()]
+    # /---
--- a/cnvlib/segmentation/hmm_learn.py
+++ b/cnvlib/segmentation/hmm_learn.py
+#!/usr/bin/env python
+"""Segmentation by Hidden Markov Model."""
+from __future__ import absolute_import, division, print_function
+import logging
+
+import numpy as np
+import pandas as pd
+
+
+def segment_hmm(cnarr, method, window=None):
+    """Segment bins by Hidden Markov Model.
+
+    Use Viterbi method to infer copy number segments from sequential data.
+
+    With b-allele frequencies ('baf' column in `cnarr`), jointly segment
+    log-ratios and b-allele frequencies across a chromosome.
+
+    Parameters
+    ----------
+    cnarr : CopyNumArray
+        The bin-level data to segment.
+    method : string
+        One of 'hmm' (3 states, flexible means), 'hmm-tumor' (5 states, flexible
+        means), 'hmm-germline' (3 states, fixed means).
+
+    Results
+    -------
+    segarr : CopyNumArray
+        The segmented data.
+    """
+    # NB: Incorporate weights into smoothed log2 estimates
+    # (Useful kludge until weighted HMM is in place)
+    orig_log2 = cnarr['log2'].values.copy()
+    cnarr['log2'] = cnarr.smoothed(window)
+
+    logging.debug("Building model from observations")
+    model = hmm_get_model(cnarr, method)
+    logging.debug("Predicting states from model")
+    obs = as_observation_matrix(cnarr)
+    # A sequence of inferred discrete states. Length is the same as `freqs`,
+    # with one state assigned to each input datapoint.
+    states = model.predict(obs, lengths=chrom_arm_lengths(cnarr))
+
+    # TODO logging.warn if model did not converge
+    # print(model, end="\n\n")
+    # print("Model params:\nmeans =", sorted(model.means_.flat))
+    # print(model.monitor_, end="\n\n")
+
+    # Merge adjacent bins with the same state to create segments
+    from ..segfilters import squash_by_groups
+    cnarr['log2'] = orig_log2
+    cnarr['probes'] = 1
+    segarr = squash_by_groups(cnarr, pd.Series(states), by_arm=True)
+    return segarr
+
+
+def hmm_get_model(cnarr, method):
+    """
+
+    Parameters
+    ----------
+    cnarr : CopyNumArray
+        The normalized bin-level values to be segmented.
+    method : string
+        One of 'hmm', 'hmm-tumor', 'hmm-germline'
+
+    Returns
+    -------
+    model :
+        A hmmlearn Model trained on the given dataset.
+
+    """
+    # Delayed import so hmmlearn is an optional dependency
+    from hmmlearn import hmm
+
+    import warnings
+    warnings.simplefilter('ignore', DeprecationWarning)
+    warnings.simplefilter('ignore', RuntimeWarning)
+    #  warnings.simplefilter('error', RuntimeWarning)
+
+    assert method in ('hmm-tumor', 'hmm-germline', 'hmm')
+    if method == 'hmm-tumor':
+        state_means = np.column_stack([[-4, -1, 0, .585, 1]])
+    else:
+        state_means = np.column_stack([[-1, 0, .585]])
+    kwargs = {
+        'n_components': len(state_means),
+        'means_prior': state_means,
+        'means_weight': .5,
+    }
+    if method == 'hmm-germline':
+        # Don't alter the state means when training the model
+        kwargs.update({
+            'init_params': 'stc',
+            'params': 'stc',
+        })
+    model = hmm.GaussianHMM(covariance_type='diag', n_iter=100,
+                            random_state=0xA5EED, **kwargs)
+    if method == 'germline':
+        model.means_ = state_means
+
+    # Create observation matrix
+    # TODO incorporate weights -- currently handled by smoothing
+    # TODO incorporate inter-bin distances
+    cnarr = cnarr.autosomes()
+    freqs = as_observation_matrix(cnarr)
+
+    if cnarr.chromosome.nunique() == 1:
+        model.fit(freqs)
+    else:
+        model.fit(freqs, lengths=chrom_arm_lengths(cnarr))
+    return model
+
+
+def as_observation_matrix(cnarr):
+    columns = [cnarr['log2'].values]
+    if 'baf' in cnarr:
+        # XXX untested
+        columns.append(cnarr['baf'].values)
+    return np.column_stack(columns)
+
+
+def chrom_arm_lengths(cnarr):
+    lengths = [len(x) for _, x in cnarr.by_arm()]
+    assert sum(lengths) == len(cnarr)
+    return lengths
--- a/cnvlib/vary.py
+++ b/cnvlib/vary.py
 """An array of genomic intervals, treated as variant loci."""
 from __future__ import absolute_import, division, print_function
-from builtins import str
 import logging

 import numpy as np

--- a/debian/changelog
+++ b/debian/changelog
+cnvkit (0.9.6-1) unstable; urgency=medium
+
+  * New upstream version
+  * debhelper-compat 12
+  * Standards-Version: 4.4.0
+
+ -- Steffen Moeller <moeller@debian.org>  Wed, 28 Aug 2019 19:31:59 +0200
+
 cnvkit (0.9.5-3) unstable; urgency=medium

  * Fix buster rebuilds (Closes: #925568)

--- a/debian/compat
+++ b/debian/compat
-11