Commits on Source (4)
# travis.yml for github.com/jts/nanopolish
dist: trusty
services: docker
sudo: false
language: generic
cache: apt
git:
depth: 1
.nanopolish.ci.matrix-definitions:
- &cron-only
if: type = cron OR env(CI_CRON) = true
- &arch
before_install:
- |
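# Point Dockerfile-arm at the requested base image by replacing arm64 in its FROM line with ${ARCH} (arm64 or armhf).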
sed -i "/^FROM / s/arm64/${ARCH}/" Dockerfile-arm
- |
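# Register qemu-user-static binfmt handlers so the foreign-architecture image can run on the x86 host, then build the test image from Dockerfile-arm.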
docker run --rm --privileged \
multiarch/qemu-user-static:register --reset && \
docker build --rm -t nanopolish -f Dockerfile-arm .
script:
- |
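# Run the containerized nanopolish build, forwarding the HDF5 configuration through environment variables.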
docker run --rm -t \
-e HDF5="${HDF5:-install}" \
-e H5_CFLAGS="${H5_CFLAGS}" \
-e HDF5_VERSION="1.10.4" \
-e H5_INCLUDE="${H5_INCLUDE}" \
-e LDFLAGS="${LDFLAGS}" \
nanopolish
matrix:
include:
# Set env for both nanopolish and the dependency hdf5.
- env:
- CC=gcc-4.8
- CXX=g++-4.8
- AR=gcc-ar-4.8
- NM=gcc-nm-4.8
- RANLIB=gcc-ranlib-4.8
- env:
- CC=gcc-8
- CXX=g++-8
- AR=gcc-ar-8
- NM=gcc-nm-8
- RANLIB=gcc-ranlib-8
# aarch64 - ARM 64-bit
- name: aarch64
sudo: required
- os: linux
arch: amd64
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
env: "CC=gcc-4.9 CXX=g++-4.9 AR=gcc-ar-4.9 NM=gcc-nm-4.9 RANLIB=gcc-ranlib-4.9"
- os: linux
arch: amd64
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-8
env: "CC=gcc-8 CXX=g++-8 AR=gcc-ar-8 NM=gcc-nm-8 RANLIB=gcc-ranlib-8"
- os: linux
arch: arm64
addons:
apt:
packages:
- libhdf5-serial-dev
env:
- ARCH=arm64
<<: *arch
<<: *cron-only
- name: aarch64-system-hdf5
sudo: required
env:
- ARCH=arm64
- HDF5="system"
- H5_INCLUDE="-I/usr/include/hdf5/serial"
- LDFLAGS="-L/usr/lib/aarch64-linux-gnu/hdf5/serial"
<<: *arch
# armv7l - ARM 32-bit
- name: armv7l
sudo: required
env:
- ARCH=armhf
<<: *arch
<<: *cron-only
- name: armv7l-system-hdf5
sudo: required
env:
- ARCH=armhf
- HDF5="system"
- H5_INCLUDE="-I/usr/include/hdf5/serial"
- LDFLAGS="-L/usr/lib/arm-linux-gnueabihf/hdf5/serial"
<<: *arch
allow_failures:
# The jobs that install hdf5 from source inside docker finish with an error
# because they exceed the maximum time limit (50 minutes).
- name: aarch64
- name: armv7l
# Install and export newer gcc
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get update -qq
- |
if [[ "${CC}" =~ ^gcc- ]]; then
echo "Installing ${CC}."
sudo apt-get install -qq "${CC}"
fi
- |
if [[ "${CXX}" =~ ^g\+\+- ]]; then
echo "Installing ${CXX}."
sudo apt-get install -qq "${CXX}"
fi
before_script:
# Suppress all compiler warnings for hdf5 Makefile
......
nanopolish (0.11.3-1) unstable; urgency=medium
* Team upload.
* New upstream version
-- Steffen Moeller <moeller@debian.org> Sat, 21 Dec 2019 19:10:21 +0100
nanopolish (0.11.2-1) unstable; urgency=medium
[ Michael R. Crusoe ]
......
From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Add #! /usr/bin/env python3 to the scripts
--- nanopolish.orig/scripts/convert_all_models.py
+++ nanopolish/scripts/convert_all_models.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
import sys
--- nanopolish.orig/scripts/import_ont_model.py
+++ nanopolish/scripts/import_ont_model.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python
+#! /usr/bin/env python3
# This script takes a .model file provided by ONT and adds metadata that allows it
# to be compiled into nanopolish
--- nanopolish.orig/scripts/nanopolish_merge.py
+++ nanopolish/scripts/nanopolish_merge.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
import sys
--- nanopolish.orig/scripts/reestimate_polya_emissions.py
+++ nanopolish/scripts/reestimate_polya_emissions.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
reestimate_polya_emissions.py: given two `polya-samples` TSV files based on different
underlying kmer models (with the newer TSV giving failing poly(A) segmentations),
--- nanopolish.orig/scripts/polya_training/dump_signal.py
+++ nanopolish/scripts/polya_training/dump_signal.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
dump_signal.py: take a **verbose** polya-call file and dump an HDF5 file with the signal data.
--- nanopolish.orig/scripts/polya_training/hmmplot.py
+++ nanopolish/scripts/polya_training/hmmplot.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
Plot a random segmentation from a dataset.
--- nanopolish.orig/scripts/polya_training/retrain_emission.py
+++ nanopolish/scripts/polya_training/retrain_emission.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
retrain_emission.py: take an HDF5 file and segmentations, and output parameters of a mixture model.
"""
add-shebang-to-script.patch
reproducible.patch
add_interp
fix_remaining_python_interpreters.patch
......@@ -27,7 +27,7 @@ In this tutorial we will use a subset of the `NA12878 WGS Consortium data <https
**Details**:
* Sample : Human cell line (NA12878)
-* Basecaller : Albacore v2.0.2
+* Basecaller : Guppy v2.3.5
* Region: chr20:5,000,000-10,000,000
In the extracted example data you should find the following files:
......@@ -38,35 +38,35 @@ In the extracted example data you should find the following files:
The reads were basecalled using this albacore command: ::
-read_fast5_basecaller.py -c r94_450bps_linear.cfg -t 8 -i fast5_files -s basecalled/ -o fastq
+guppy_basecaller -c dna_r9.4.1_450bps_flipflop.cfg -i fast5_files/ -s basecalled/
After the basecaller finished, we merged all of the fastq files together into a single file: ::
-cat basecalled/workspace/pass/*.fastq > albacore_output.fastq
+cat basecalled/*.fastq > output.fastq
Data preprocessing
------------------------------------
nanopolish needs access to the signal-level data measured by the nanopore sequencer. To begin, we need to create an index file that links read ids with their signal-level data in the FAST5 files: ::
-nanopolish index -d fast5_files/ albacore_output.fastq
+nanopolish index -d fast5_files/ output.fastq
-We get the following files: ``albacore_output.fastq.index``, ``albacore_output.fastq.index.fai``, ``albacore_output.fastq.index.gzi``, and ``albacore_output.fastq.index.readdb``.
+We get the following files: ``output.fastq.index``, ``output.fastq.index.fai``, ``output.fastq.index.gzi``, and ``output.fastq.index.readdb``.
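To sanity-check the index, the ``.readdb`` file can be inspected directly. As a rough sketch (assuming its usual layout of a two-column, tab-separated table mapping read id to fast5 path), the first few entries can be printed like this: ::

    #! /usr/bin/env python3
    # Sketch: print the first few read-id -> fast5 mappings from the index built above.
    # Assumes output.fastq.index.readdb is a two-column, tab-separated table (read_id, fast5_path).
    import itertools

    with open("output.fastq.index.readdb") as readdb:
        for line in itertools.islice(readdb, 5):
            read_id, fast5_path = line.rstrip("\n").split("\t")
            print(read_id, "->", fast5_path)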
Aligning reads to the reference genome
--------------------------------------
Next, we need to align the basecalled reads to the reference genome. We use minimap2 as it is fast enough to map reads to the human genome. In this example we'll pipe the output directly into ``samtools sort`` to get a sorted bam file: ::
-minimap2 -a -x map-ont reference.fasta albacore_output.fastq | samtools sort -T tmp -o albacore_output.sorted.bam
-samtools index albacore_output.sorted.bam
+minimap2 -a -x map-ont reference.fasta output.fastq | samtools sort -T tmp -o output.sorted.bam
+samtools index output.sorted.bam
Calling methylation
-------------------
-Now we're ready to use nanopolish to detect methylated bases (in this case 5-methylcytosine in a CpG context). The command is fairly straightforward - we have to tell it what reads to use (``albacore_output.fastq``), where the alignments are (``albacore_output.sorted.bam``), the reference genome (``reference.fasta``) and what region of the genome we're interested in (``chr20:5,000,000-10,000,000``)::
+Now we're ready to use nanopolish to detect methylated bases (in this case 5-methylcytosine in a CpG context). The command is fairly straightforward - we have to tell it what reads to use (``output.fastq``), where the alignments are (``output.sorted.bam``), the reference genome (``reference.fasta``) and what region of the genome we're interested in (``chr20:5,000,000-10,000,000``)::
-nanopolish call-methylation -t 8 -r albacore_output.fastq -b albacore_output.sorted.bam -g reference.fasta -w "chr20:5,000,000-10,000,000" > methylation_calls.tsv
+nanopolish call-methylation -t 8 -r output.fastq -b output.sorted.bam -g reference.fasta -w "chr20:5,000,000-10,000,000" > methylation_calls.tsv
The output file contains a lot of information including the position of the CG dinucleotide on the reference genome, the ID of the read that was used to make the call, and the log-likelihood ratio calculated by our model: ::
......
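As a rough illustration of how the log-likelihood ratio column is typically used (the column names and the 2.0 call threshold below are assumptions, and nanopolish ships ``scripts/calculate_methylation_frequency.py`` for the real aggregation), a minimal per-site summary could look like: ::

    #! /usr/bin/env python3
    # Sketch: per-site methylation frequency from methylation_calls.tsv.
    # Assumptions: the header has chromosome, start and log_lik_ratio columns, and
    # calls with |log_lik_ratio| < 2.0 are treated as ambiguous and skipped.
    import csv
    from collections import defaultdict

    counts = defaultdict(lambda: [0, 0])  # (chromosome, start) -> [methylated, unmethylated]
    with open("methylation_calls.tsv") as tsv:
        for row in csv.DictReader(tsv, delimiter="\t"):
            ratio = float(row["log_lik_ratio"])
            if abs(ratio) < 2.0:
                continue
            counts[(row["chromosome"], row["start"])][0 if ratio > 0 else 1] += 1

    for (chrom, start), (meth, unmeth) in sorted(counts.items(), key=lambda kv: (kv[0][0], int(kv[0][1]))):
        print(chrom, start, round(meth / (meth + unmeth), 3), sep="\t")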
......@@ -23,8 +23,8 @@ Let's start by downloading a dataset of fast5 files from the European Nucleotide
mkdir data && mkdir data/fastqs
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR276/ERR2764784/30xpolyA.tar.gz -O 30xpolyA.tar.gz && mv 30xpolyA.tar.gz data/
tar -xzf data/30xpolyA.tar.gz -C data/
-read_fast5_basecaller.py --worker_threads=8 -f FLO-MIN107 -k SQK-RNA001 -q 0 -s data/fastqs -i data/30xpolyA/fast5/pass
-cat data/fastqs/workspace/pass/*.fastq > data/30xpolyA.fastq
+guppy_basecaller -c rna_r9.4.1_70bps_hac.cfg -i data/30xpolyA/fast5/pass -s data/fastqs
+cat data/fastqs/*.fastq > data/30xpolyA.fastq
In the above, change the value of the `-f` and `-k` arguments based on your flow-cell and sequencing kit, as the basecaller's accuracy is highly dependent upon these settings.
......@@ -57,7 +57,7 @@ Align with minimap2 and format the BAM file
Now we'll align our basecalled mRNA sequences in the fastq file with our reference. First download a reference fasta: ::
wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fas
-wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fai
+wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fas.fai
mv enolase_reference.* data/
Note that our directory structure should now look like this: ::
......
#! /usr/bin/env python3
import sys
......
-#! /usr/bin/python
+#! /usr/bin/env python3
# This script takes a .model file provided by ONT and adds metadata that allows it
# to be compiled into nanopolish
......
#! /usr/bin/env python3
import sys
......
#! /usr/bin/env python3
"""
dump_signal.py: take a **verbose** polya-call file and dump an HDF5 file with the signal data.
......
#! /usr/bin/env python3
"""
Plot a random segmentation from a dataset.
......
#! /usr/bin/env python3
"""
retrain_emission.py: take an HDF5 file and segmentations, and output parameters of a mixture model.
"""
......
#! /usr/bin/env python3
"""
reestimate_polya_emissions.py: given two `polya-samples` TSV files based on different
underlying kmer models (with the newer TSV giving failing poly(A) segmentations),
......
......@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.11.2"
+#define PACKAGE_VERSION "0.11.3"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
......
......@@ -122,7 +122,7 @@ void index_path(ReadDB& read_db, const std::string& path, const std::multimap<st
// recurse
index_path(read_db, full_fn, fast5_to_read_name_map);
} else if (is_fast5) {
-if(!fast5_to_read_name_map.empty()) {
+if(in_map) {
index_file_from_map(read_db, full_fn, fast5_to_read_name_map);
} else {
index_file_from_fast5(read_db, full_fn);
......@@ -176,7 +176,6 @@ void parse_sequencing_summary(const std::string& filename, std::multimap<std::st
std::vector<std::string> fields = split(header, '\t');
const std::string READ_NAME_STR = "read_id";
-const std::string FILENAME_STR = "filename";
size_t filename_idx = -1;
size_t read_name_idx = -1;
......@@ -185,13 +184,14 @@ void parse_sequencing_summary(const std::string& filename, std::multimap<std::st
read_name_idx = i;
}
-if(fields[i] == FILENAME_STR) {
+// 19/11/05: support live basecalling summary files
+if(fields[i] == "filename" || fields[i] == "filename_fast5") {
filename_idx = i;
}
}
if(filename_idx == -1) {
-exit_bad_header(FILENAME_STR, filename);
+exit_bad_header("fast5 filename", filename);
}
if(read_name_idx == -1) {
......@@ -274,6 +274,32 @@ void parse_index_options(int argc, char** argv)
opt::reads_file = argv[optind++];
}
+void clean_fast5_map(std::multimap<std::string, std::string>& mmap)
+{
+std::map<std::string, int> fast5_read_count;
+for(const auto& iter : mmap) {
+fast5_read_count[iter.first]++;
+}
+int EXPECTED_ENTRIES_PER_FAST5 = 4000;
+std::vector<std::string> invalid_entries;
+for(const auto& iter : fast5_read_count) {
+if(iter.second != EXPECTED_ENTRIES_PER_FAST5) {
+//fprintf(stderr, "warning: %s has %d entries in the summary and will be indexed the slow way\n", iter.first.c_str(), iter.second);
+invalid_entries.push_back(iter.first);
+}
+}
+if(invalid_entries.size() > 0) {
+fprintf(stderr, "warning: detected invalid summary file entries for %zu of %zu fast5 files\n", invalid_entries.size(), fast5_read_count.size());
+fprintf(stderr, "These files will be indexed without using the summary file, which is slow.\n");
+}
+for(const auto& fast5_name : invalid_entries) {
+mmap.erase(fast5_name);
+}
+}
int index_main(int argc, char** argv)
{
parse_index_options(argc, argv);
......@@ -288,6 +314,13 @@ int index_main(int argc, char** argv)
parse_sequencing_summary(ss_filename, fast5_to_read_name_map);
}
+// Detect non-unique fast5 file names in the summary file
+// This occurs when a file with the same name (abc_0.fast5) appears in both fast5_pass
+// and fast5_fail. This will be fixed by ONT but in the meantime we check for
+// fast5 files that have a non-standard number of reads (>4000) and remove them
+// from the map. These fast5s will be indexed the slow way.
+clean_fast5_map(fast5_to_read_name_map);
// import read names, and possibly fast5 paths, from the fasta/fastq file
ReadDB read_db;
read_db.build(opt::reads_file);
......
......@@ -59,7 +59,9 @@ SUBPROGRAM " Version " PACKAGE_VERSION "\n"
static const char *PHASE_READS_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa variants.vcf\n"
"Train a duration model\n"
"Output a BAM file where each record shows the combination of alleles from variants.vcf that each read supports.\n"
"variants.vcf can be any VCF file but only SNPs will be phased and variants that have a homozygous reference genotype (0/0)\n"
"will be skipped.\n"
"\n"
" -v, --verbose display verbose output\n"
" --version display version\n"
......
......@@ -237,7 +237,7 @@ int vcf2fasta_main(int argc, char** argv)
// make a vector holding either a literal character or an index to the variant that needs to be applied
std::vector<uint32_t> consensus_record(length);
for(size_t i = 0; i < length; ++i) {
-consensus_record[i] = seq[i];
+consensus_record[i] = toupper(seq[i]);
}
size_t num_skipped = 0;
......