Michael R. Crusoe · Michael R. Crusoe · Michael R. Crusoe · Michael R. Crusoe · Michael R. Crusoe · Michael R. Crusoe
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,6 +24,7 @@ git:
            docker run --rm -t \
                -e HDF5="${HDF5:-install}" \
                -e H5_CFLAGS="${H5_CFLAGS}" \
+                -e HDF5_VERSION="1.10.4" \
                -e H5_INCLUDE="${H5_INCLUDE}" \
                -e LDFLAGS="${LDFLAGS}" \
                nanopolish

--- a/DEVELOPERS.md
+++ b/DEVELOPERS.md
+# Developer Notes
+
+## Updating Bioconda on tagged releases
+The following is a quick step-by-step checklist on updating the bioconda release for nanopolish, to be done after each tagged release, and is a condensed/updated version of [these slides](https://monashbioinformaticsplatform.github.io/bioconda-tutorial/#/) by Andrew Perry.
+1. On GitHub, fork `https://github.com/bioconda/bioconda-recipes` to `https://github.com/{USER}/bioconda-recipes` and clone the latter repository to a local directory; `cd` into the cloned directory.
+2. Check out a new branch via `git branch nanopolish-bioconda-bump && git checkout nanopolish-bioconda-bump`.
+3. Update the `bioconda-recipes/recipes/nanopolish/meta.yaml` file by editing the version tag and the SHA hash; the SHA256 hash can be obtained by running `sha256sum nanopolish-v{VERSION}.tar.gz` on the command line (where `{VERSION}` is the new, updated version tag); commit the changes to the `meta.yaml` file via, e.g., `git commit -a -m 'bump nanopolish to version {VERSION}'`.
+4. Push the changes to your forked repo via `git push origin nanopolish-bioconda-bump`; then, make a pull request to merge the updates into the master branch of the upstream `bioconda-recipes` repository.
+5. If all goes well, the automated TravisCI tests on the upstream repository will pass and an owner will merge the changes.
+6. Otherwise, if further edits are requested or if the TravisCI tests fail, make further commits to the local cloned repository and push to the forked repository on GitHub; the changes should automatically appear in the pull request and will trigger an automated TravisCI check.
--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,7 @@ HDF5 ?= install
 EIGEN ?= install
 HTS ?= install

-HDF5_VERSION ?= 1.10.4
+HDF5_VERSION ?= 1.8.14
 EIGEN_VERSION ?= 3.2.5

 # Check operating system, OSX doesn't have -lrt
@@ -84,7 +84,7 @@ all: depend $(PROGRAM)
 # Build libhts
 #
 htslib/libhts.a:
-	cd htslib && make || exit 255
+	cd htslib && make htslib_default_libs="-lz -lm -lbz2" || exit 255

 #
 # If this library is a dependency the user wants HDF5 to be downloaded and built.

--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@ Software package for signal-level analysis of Oxford Nanopore sequencing data. N

 ## Release notes

+* 0.11.1: `nanopolish polya` now supports SQK-RNA-002 kits with automatic backwards-compatibility with SQK-RNA-001
+
 * 0.11.0: support for multi-fast5 files. `nanopolish methyltrain` now subsamples input data, improving speed and memory usage

 * 0.10.2: added new program `nanopolish polya` to estimate the length of poly-A tails on direct RNA reads (by @paultsw)
@@ -88,7 +90,7 @@ samtools index reads.sorted.bam
 Now, we use nanopolish to compute the consensus sequence (the genome is polished in 50kb blocks and there will be one output file per block). We'll run this in parallel:

 ```
-python nanopolish_makerange.py draft.fa | parallel --results nanopolish.results -P 8 \
+python3 nanopolish_makerange.py draft.fa | parallel --results nanopolish.results -P 8 \
    nanopolish variants --consensus -o polished.{1}.vcf -w {1} -r reads.fa -b reads.sorted.bam -g draft.fa -t 4 --min-candidate-frequency 0.1
 ```


--- a/debian/changelog
+++ b/debian/changelog
-nanopolish (0.11.1-2) UNRELEASED; urgency=medium
+nanopolish (0.11.2-1) UNRELEASED; urgency=medium

+  [ Andreas Tille ]
  * Run 2to3 to try Python3 port

+  [ Michael R. Crusoe ]
+  * debhelper-compat 12
+  * Standards-Version: 4.4.0
+  * New upstream release, now we don't need the 2to3 patch
+  * Added patch to fix the #! line of the Python scripts
+  * Set upstream metadata fields: Repository.
+
 -- Andreas Tille <tille@debian.org>  Wed, 07 Aug 2019 20:38:25 +0200

 nanopolish (0.11.1-1) unstable; urgency=medium

--- a/debian/compat
+++ b/debian/compat
-12
--- a/debian/control
+++ b/debian/control
@@ -3,14 +3,14 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
 Uploaders: Andreas Tille <tille@debian.org>
 Section: science
 Priority: optional
-Build-Depends: debhelper (>= 12~),
+Build-Depends: debhelper-compat (= 12),
               dh-python,
-               python,
+               python3,
               zlib1g-dev,
               libfast5-dev (>= 0.6.5),
               libhts-dev,
               libeigen3-dev
-Standards-Version: 4.3.0
+Standards-Version: 4.4.0
 Vcs-Browser: https://salsa.debian.org/med-team/nanopolish
 Vcs-Git: https://salsa.debian.org/med-team/nanopolish.git
 Homepage: https://github.com/jts/nanopolish
@@ -19,9 +19,10 @@ Package: nanopolish
 Architecture: any-amd64 any-i386
 Depends: ${shlibs:Depends},
         ${misc:Depends},
-         ${python:Depends}
-Recommends: python-biopython,
-            python-pysam
+         ${python3:Depends},
+         ${perl:Depends}
+Recommends: python3-biopython,
+            python3-pysam
 Suggests: perl,
          make
 Description: consensus caller for nanopore sequencing data

--- a/debian/patches/2to3.patch
+++ b/debian/patches/2to3.patch
-Description: Run 2to3 to try Python3 port
-Author: Andreas Tille <tille@debian.org>
-Last-Update: Wed, 07 Aug 2019 20:38:25 +0200
-
--- a/docs/source/conf.py
-+++ b/docs/source/conf.py
-@@ -45,18 +45,18 @@ source_suffix = '.rst'
- master_doc = 'index'
- 
- # General information about the project.
-project = u'Nanopolish'
-copyright = u'2017, Simpson Lab'
-author = u'Simpson Lab'
-+project = 'Nanopolish'
-+copyright = '2017, Simpson Lab'
-+author = 'Simpson Lab'
- 
- # The version info for the project you're documenting, acts as replacement for
- # |version| and |release|, also used in various other places throughout the
- # built documents.
- #
- # The short X.Y version.
-version = u'0.8.4'
-+version = '0.8.4'
- # The full version, including alpha/beta/rc tags.
-release = u'0.8.4'
-+release = '0.8.4'
- 
- # The language for content autogenerated by Sphinx. Refer to documentation
- # for a list of supported languages.
-@@ -138,8 +138,8 @@ latex_elements = {
- # (source start file, target name, title,
- #  author, documentclass [howto, manual, or own class]).
- latex_documents = [
-    (master_doc, 'Nanopolish.tex', u'Nanopolish Documentation',
-     u'Simpson Lab', 'manual'),
-+    (master_doc, 'Nanopolish.tex', 'Nanopolish Documentation',
-+     'Simpson Lab', 'manual'),
- ]
- 
- 
-@@ -148,7 +148,7 @@ latex_documents = [
- # One entry per manual page. List of tuples
- # (source start file, name, description, authors, manual section).
- man_pages = [
-    (master_doc, 'nanopolish', u'Nanopolish Documentation',
-+    (master_doc, 'nanopolish', 'Nanopolish Documentation',
-      [author], 1)
- ]
- 
-@@ -159,7 +159,7 @@ man_pages = [
- # (source start file, target name, title, author,
- #  dir menu entry, description, category)
- texinfo_documents = [
-    (master_doc, 'Nanopolish', u'Nanopolish Documentation',
-+    (master_doc, 'Nanopolish', 'Nanopolish Documentation',
-      author, 'Nanopolish', 'One line description of project.',
-      'Miscellaneous'),
- ]
--- a/scripts/calculate_methylation_frequency.py
-+++ b/scripts/calculate_methylation_frequency.py
-@@ -1,4 +1,4 @@
-#! /usr/bin/env python
-+#!/usr/bin/python3
- 
- import math
- import sys
-@@ -71,7 +71,7 @@ for record in csv_reader:
- # header
- print("\t".join(["chromosome", "start", "end", "num_motifs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))
- 
-sorted_keys = sorted(sites.keys(), key = lambda x: x)
-+sorted_keys = sorted(list(sites.keys()), key = lambda x: x)
- 
- for key in sorted_keys:
-     if sites[key].called_sites > 0:
--- a/scripts/convert_all_models.py
-+++ b/scripts/convert_all_models.py
-@@ -1,4 +1,4 @@
-from __future__ import print_function
-+
- 
- import sys
- import os.path
--- a/scripts/convert_model_to_header.py
-+++ b/scripts/convert_model_to_header.py
-@@ -1,6 +1,6 @@
-#! /usr/bin/env python
-+#!/usr/bin/python3
- # Convert a .model file into a .inl file that can be directly compiled into nanopolish
-from __future__ import print_function
-+
- 
- import argparse
- 
--- a/scripts/extract_reads_aligned_to_region.py
-+++ b/scripts/extract_reads_aligned_to_region.py
-@@ -1,11 +1,11 @@
-#!/usr/bin/env python
-+#!/usr/bin/python3
- """
- ========================================================
- Extract info on reads that align to a given region
- in draft genome assembly.
- ========================================================
- """
-from __future__ import print_function
-+
- 
- try:
- 	from Bio.SeqIO.FastaIO import SimpleFastaParser
-@@ -186,7 +186,7 @@ def main():
- 	bad_read_id = ""
- 	bad_f5_path = ""
- 	num_bad_cases = 0
-	for r in region_fast5_files.keys():
-+	for r in list(region_fast5_files.keys()):
- 		read_id = r
- 		f5 = region_fast5_files[r]
- 
--- a/scripts/nanopolish_makerange.py
-+++ b/scripts/nanopolish_makerange.py
-@@ -1,5 +1,5 @@
-#! /usr/bin/env python
-from __future__ import print_function
-+#!/usr/bin/python3
-+
- 
- import sys
- import argparse
--- a/scripts/nanopolish_merge.py
-+++ b/scripts/nanopolish_merge.py
-@@ -1,4 +1,4 @@
-from __future__ import print_function
-+
- 
- import sys
- import glob
--- a/scripts/reestimate_polya_emissions.py
-+++ b/scripts/reestimate_polya_emissions.py
-@@ -117,7 +117,7 @@ def make_segmentation_dict(segmentations
-         headers = ['tag', 'read_id', 'pos', 'L_start', 'A_start', 'P_start', 'P_end', 'rate', 'plen', 'alen']
-         rdr = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE, fieldnames=headers)
-         for row in rdr:
-            if row['read_id'] not in segments.keys():
-+            if row['read_id'] not in list(segments.keys()):
-                 segments[row['read_id']] = { 'L_start': int(float(row['L_start'])),
-                                              'A_start': int(float(row['A_start'])),
-                                              'P_start': int(float(row['P_start'])),
-@@ -138,7 +138,7 @@ def region_search(read_id, sample_ix, se
-     """
-     # find read ID in segmentations:
-     read_key = None
-    for long_read_id in segmentations.keys():
-+    for long_read_id in list(segmentations.keys()):
-         if long_read_id[0:len(read_id)] == read_id:
-             read_key = long_read_id
- 
--- a/scripts/dropmodel.py
-+++ b/scripts/dropmodel.py
-@@ -1,4 +1,4 @@
-#! /usr/bin/env python
-+#!/usr/bin/python3
- # Generate a new model with a lower k-mer size than the input model
- 
- import sys
--- a/scripts/import_ont_model.py
-+++ b/scripts/import_ont_model.py
-@@ -1,4 +1,4 @@
-#! /usr/bin/python
-+#!/usr/bin/python3
- 
- # This script takes a .model file provided by ONT and adds metadata that allows it
- # to be compiled into nanopolish
--- a/debian/patches/add_interp
+++ b/debian/patches/add_interp
+From: Michael R. Crusoe <michael.crusoe@gmail.com>
+Subject: Add #! /usr/bin/env python3 to the scripts
+--- nanopolish.orig/scripts/convert_all_models.py
+++ nanopolish/scripts/convert_all_models.py
+@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
+ 
+ 
+ import sys
+--- nanopolish.orig/scripts/import_ont_model.py
+++ nanopolish/scripts/import_ont_model.py
+@@ -1,4 +1,4 @@
+-#! /usr/bin/python
+#! /usr/bin/env python3
+ 
+ # This script takes a .model file provided by ONT and adds metadata that allows it
+ # to be compiled into nanopolish
+--- nanopolish.orig/scripts/nanopolish_merge.py
+++ nanopolish/scripts/nanopolish_merge.py
+@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
+ 
+ 
+ import sys
+--- nanopolish.orig/scripts/reestimate_polya_emissions.py
+++ nanopolish/scripts/reestimate_polya_emissions.py
+@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
+ """
+ reestimate_polya_emissions.py: given two `polya-samples` TSV files based on different
+ underlying kmer models (with the newer TSV giving failing poly(A) segmentations),
+--- nanopolish.orig/scripts/polya_training/dump_signal.py
+++ nanopolish/scripts/polya_training/dump_signal.py
+@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
+ """
+ dump_signal.py: take a **verbose** polya-call file and dump an HDF5 file with the signal data.
+ 
+--- nanopolish.orig/scripts/polya_training/hmmplot.py
+++ nanopolish/scripts/polya_training/hmmplot.py
+@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
+ """
+ Plot a random segmentation from a dataset.
+ 
+--- nanopolish.orig/scripts/polya_training/retrain_emission.py
+++ nanopolish/scripts/polya_training/retrain_emission.py
+@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
+ """
+ retrain_emission.py: take an HDF5 file and segmentations, and output parameters of a mixture model.
+ """
--- a/debian/patches/series
+++ b/debian/patches/series
 add-shebang-to-script.patch
 reproducible.patch
-2to3.patch
+add_interp
--- a/debian/rules
+++ b/debian/rules
@@ -20,7 +20,7 @@ export EIGEN=external
 export HTS=external

 %:
-	dh $@ --with python2
+	dh $@ --with python3

 override_dh_auto_clean:
 	sed -i~ 's/^.depend: .*/.depend:/' Makefile

--- a/debian/upstream/metadata
+++ b/debian/upstream/metadata
@@ -7,3 +7,4 @@ Registry:
  Entry: SCR_016157
 - Name: conda:bioconda
  Entry: nanopolish
+Repository: https://github.com/jts/nanopolish.git
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -45,18 +45,18 @@ source_suffix = '.rst'
 master_doc = 'index'

 # General information about the project.
-project = u'Nanopolish'
-copyright = u'2017, Simpson Lab'
-author = u'Simpson Lab'
+project = 'Nanopolish'
+copyright = '2017, Simpson Lab'
+author = 'Simpson Lab'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = u'0.8.4'
+version = '0.8.4'
 # The full version, including alpha/beta/rc tags.
-release = u'0.8.4'
+release = '0.8.4'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -138,8 +138,8 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'Nanopolish.tex', u'Nanopolish Documentation',
-     u'Simpson Lab', 'manual'),
+    (master_doc, 'Nanopolish.tex', 'Nanopolish Documentation',
+     'Simpson Lab', 'manual'),
 ]


@@ -148,7 +148,7 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'nanopolish', u'Nanopolish Documentation',
+    (master_doc, 'nanopolish', 'Nanopolish Documentation',
     [author], 1)
 ]

@@ -159,7 +159,7 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'Nanopolish', u'Nanopolish Documentation',
+    (master_doc, 'Nanopolish', 'Nanopolish Documentation',
     author, 'Nanopolish', 'One line description of project.',
     'Miscellaneous'),
 ]

--- a/docs/source/debug.rst
+++ b/docs/source/debug.rst
@@ -91,7 +91,7 @@ Usage example
 """""""""""""""""""""""
 ::

-    python extract_reads_aligned_to_region.py \
+    python3 extract_reads_aligned_to_region.py \
        --reads reads.fasta \
        --genome ecoli.contigs.fasta \
        --bam reads.sorted.bam \

--- a/docs/source/manual.rst
+++ b/docs/source/manual.rst
@@ -94,8 +94,8 @@ Input

 Output
 """"""""
-    * gzipped FASTA format of basecalled reads
-    * index files (fai, gzi, readdb)
+    * gzipped FASTA file of basecalled reads (.index)
+    * index files (.fai, .gzi, .readdb)

 Readdb file format
 """"""""""""""""""""

--- a/docs/source/quickstart_call_methylation.rst
+++ b/docs/source/quickstart_call_methylation.rst
@@ -84,7 +84,7 @@ The output file contains a lot of information including the position of the CG d

 A positive value in the ``log_lik_ratio`` column indicates support for methylation. We have provided a helper script that can be used to calculate how often each reference position was methylated: ::

-	scripts/calculate_methylation_frequency.py -i methylation_calls.tsv > methylation_frequency.tsv
+	scripts/calculate_methylation_frequency.py methylation_calls.tsv > methylation_frequency.tsv

 The output is another tab-separated file, this time summarized by genomic position: ::

@@ -96,7 +96,7 @@ The output is another tab-separated file, this time summarized by genomic positi

 In the example data set we have also included bisulfite data from ENCODE for the same region of chromosome 20. We can use the included ``compare_methylation.py`` helper script to do a quick comparison between the nanopolish methylation output and bisulfite: ::

-    python compare_methylation.py bisulfite.ENCFF835NTC.example.tsv methylation_frequency.tsv > bisulfite_vs_nanopolish.tsv
+    python3 compare_methylation.py bisulfite.ENCFF835NTC.example.tsv methylation_frequency.tsv > bisulfite_vs_nanopolish.tsv

 We can use R to visualize the results - we observe good correlation between the nanopolish methylation calls and bisulfite: ::


--- a/docs/source/quickstart_polya.rst
+++ b/docs/source/quickstart_polya.rst
@@ -21,7 +21,7 @@ Download raw fast5 data and basecall
 Let's start by downloading a dataset of fast5 files from the European Nucleotide Archive. We'll download a tarball of fast5 files containing reads that are known to have polyadenylated tail lengths of 30nt. ::

    mkdir data && mkdir data/fastqs
-    wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA158/ERA1580896/oxfordnanopore_native/30xpolyA.tar.gz -O 30xpolyA.tar.gz && mv 30xpolyA.tar.gz data/
+    wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR276/ERR2764784/30xpolyA.tar.gz -O 30xpolyA.tar.gz && mv 30xpolyA.tar.gz data/
    tar -xzf data/30xpolyA.tar.gz -C data/
    read_fast5_basecaller.py --worker_threads=8 -f FLO-MIN107 -k SQK-RNA001 -q 0 -s data/fastqs -i data/30xpolyA/fast5/pass
    cat data/fastqs/workspace/pass/*.fastq > data/30xpolyA.fastq

--- a/scripts/calculate_methylation_frequency.py
+++ b/scripts/calculate_methylation_frequency.py
-#! /usr/bin/env python
+#! /usr/bin/env python3

-import math
 import sys
 import csv
 import argparse
-from collections import namedtuple
+import gzip

 class SiteStats:
    def __init__(self, g_size, g_seq):
@@ -25,19 +24,18 @@ def update_call_stats(key, num_called_cpg_sites, is_methylated, sequence):

 parser = argparse.ArgumentParser( description='Calculate methylation frequency at genomic CpG sites')
 parser.add_argument('-c', '--call-threshold', type=float, required=False, default=2.5)
-parser.add_argument('-i', '--input', type=str, required=False)
 parser.add_argument('-s', '--split-groups', action='store_true')
-args = parser.parse_args()
+args, input_files = parser.parse_known_args()
 assert(args.call_threshold is not None)

 sites = dict()
-
-if args.input:
-    in_fh = open(args.input)
+# iterate over input files and collect per-site stats
+for f in input_files:
+    if f[-3:] == ".gz":
+        in_fh = gzip.open(f, 'rt')
    else:
-    in_fh = sys.stdin
+        in_fh = open(f)
    csv_reader = csv.DictReader(in_fh, delimiter='\t')
-
    for record in csv_reader:

        num_sites = int(record['num_motifs'])
@@ -71,7 +69,7 @@ for record in csv_reader:
 # header
 print("\t".join(["chromosome", "start", "end", "num_motifs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))

-sorted_keys = sorted(sites.keys(), key = lambda x: x)
+sorted_keys = sorted(list(sites.keys()), key = lambda x: x)

 for key in sorted_keys:
    if sites[key].called_sites > 0:

--- a/scripts/convert_all_models.py
+++ b/scripts/convert_all_models.py
-from __future__ import print_function
+

 import sys
 import os.path
@@ -13,7 +13,7 @@ for model_file in sys.stdin:
    outfile = "builtin_models/%s.inl" % basename
    function_name = "initialize_%s_builtin" % (basename)
    
-    ret = os.system("python scripts/convert_model_to_header.py -i %s -f %s > src/%s" % (model_file, function_name, outfile))
+    ret = os.system("python3 scripts/convert_model_to_header.py -i %s -f %s > src/%s" % (model_file, function_name, outfile))
    if ret != 0:
        sys.stderr.write("Error processing %s\n" % model_file)
        sys.exit(1)

--- a/scripts/convert_model_to_header.py
+++ b/scripts/convert_model_to_header.py
-#! /usr/bin/env python
+#! /usr/bin/env python3
 # Convert a .model file into a .inl file that can be directly compiled into nanopolish
-from __future__ import print_function
+

 import argparse