Commits on Source (4)
# travis.yml for github.com/jts/nanopolish
dist: trusty
services: docker
sudo: false
language: generic
cache: apt
git:
depth: 1
.nanopolish.ci.matrix-definitions:
- &cron-only
if: type = cron OR env(CI_CRON) = true
- &arch
before_install:
- |
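# Point Dockerfile-arm at the requested base image by replacing arm64 in its FROM line with ${ARCH} (arm64 or armhf).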
sed -i "/^FROM / s/arm64/${ARCH}/" Dockerfile-arm
- |
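# Register qemu-user-static binfmt handlers so the foreign-architecture image can run on the x86 host, then build the test image from Dockerfile-arm.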
docker run --rm --privileged \
multiarch/qemu-user-static:register --reset && \
docker build --rm -t nanopolish -f Dockerfile-arm .
script:
- |
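# Run the containerized nanopolish build, forwarding the HDF5 configuration through environment variables.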
docker run --rm -t \
-e HDF5="${HDF5:-install}" \
-e H5_CFLAGS="${H5_CFLAGS}" \
-e HDF5_VERSION="1.10.4" \
-e H5_INCLUDE="${H5_INCLUDE}" \
-e LDFLAGS="${LDFLAGS}" \
nanopolish
matrix:
include:
# Set env for both nanopolish and the dependency hdf5.
- env:
- CC=gcc-4.8
- CXX=g++-4.8
- AR=gcc-ar-4.8
- NM=gcc-nm-4.8
- RANLIB=gcc-ranlib-4.8
- env:
- CC=gcc-8
- CXX=g++-8
- AR=gcc-ar-8
- NM=gcc-nm-8
- RANLIB=gcc-ranlib-8
# aarch64 - ARM 64-bit
- name: aarch64
sudo: required
- os: linux
arch: amd64
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
env: "CC=gcc-4.9 CXX=g++-4.9 AR=gcc-ar-4.9 NM=gcc-nm-4.9 RANLIB=gcc-ranlib-4.9"
- os: linux
arch: amd64
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-8
env: "CC=gcc-8 CXX=g++-8 AR=gcc-ar-8 NM=gcc-nm-8 RANLIB=gcc-ranlib-8"
- os: linux
arch: arm64
addons:
apt:
packages:
- libhdf5-serial-dev
env:
- ARCH=arm64
<<: *arch
<<: *cron-only
- name: aarch64-system-hdf5
sudo: required
env:
- ARCH=arm64
- HDF5="system"
- H5_INCLUDE="-I/usr/include/hdf5/serial"
- LDFLAGS="-L/usr/lib/aarch64-linux-gnu/hdf5/serial"
<<: *arch
# armv7l - ARM 32-bit
- name: armv7l
sudo: required
env:
- ARCH=armhf
<<: *arch
<<: *cron-only
- name: armv7l-system-hdf5
sudo: required
env:
- ARCH=armhf
- HDF5="system"
- H5_INCLUDE="-I/usr/include/hdf5/serial"
- LDFLAGS="-L/usr/lib/arm-linux-gnueabihf/hdf5/serial"
<<: *arch
allow_failures:
# The jobs that install hdf5 from source inside docker finish with an error
# because they exceed the maximum time limit (50 minutes).
- name: aarch64
- name: armv7l
# Install and export newer gcc
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get update -qq
- |
if [[ "${CC}" =~ ^gcc- ]]; then
echo "Installing ${CC}."
sudo apt-get install -qq "${CC}"
fi
- |
if [[ "${CXX}" =~ ^g\+\+- ]]; then
echo "Installing ${CXX}."
sudo apt-get install -qq "${CXX}"
fi
before_script:
# Suppress all compiler warnings for hdf5 Makefile
......
nanopolish (0.11.3-1) unstable; urgency=medium
* Team upload.
* New upstream version
-- Steffen Moeller <moeller@debian.org> Sat, 21 Dec 2019 19:10:21 +0100
nanopolish (0.11.2-1) unstable; urgency=medium
[ Michael R. Crusoe ]
......
From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Add #! /usr/bin/env python3 to the scripts
--- nanopolish.orig/scripts/convert_all_models.py
+++ nanopolish/scripts/convert_all_models.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
import sys
--- nanopolish.orig/scripts/import_ont_model.py
+++ nanopolish/scripts/import_ont_model.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python
+#! /usr/bin/env python3
# This script takes a .model file provided by ONT and adds metadata that allows it
# to be compiled into nanopolish
--- nanopolish.orig/scripts/nanopolish_merge.py
+++ nanopolish/scripts/nanopolish_merge.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
import sys
--- nanopolish.orig/scripts/reestimate_polya_emissions.py
+++ nanopolish/scripts/reestimate_polya_emissions.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
reestimate_polya_emissions.py: given two `polya-samples` TSV files based on different
underlying kmer models (with the newer TSV giving failing poly(A) segmentations),
--- nanopolish.orig/scripts/polya_training/dump_signal.py
+++ nanopolish/scripts/polya_training/dump_signal.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
dump_signal.py: take a **verbose** polya-call file and dump an HDF5 file with the signal data.
--- nanopolish.orig/scripts/polya_training/hmmplot.py
+++ nanopolish/scripts/polya_training/hmmplot.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
Plot a random segmentation from a dataset.
--- nanopolish.orig/scripts/polya_training/retrain_emission.py
+++ nanopolish/scripts/polya_training/retrain_emission.py
@@ -1,3 +1,4 @@
+#! /usr/bin/env python3
"""
retrain_emission.py: take an HDF5 file and segmentations, and output parameters of a mixture model.
"""
add-shebang-to-script.patch
reproducible.patch
add_interp
fix_remaining_python_interpreters.patch
......@@ -27,7 +27,7 @@ In this tutorial we will use a subset of the `NA12878 WGS Consortium data <https
**Details**:
* Sample : Human cell line (NA12878)
-* Basecaller : Albacore v2.0.2
+* Basecaller : Guppy v2.3.5
* Region: chr20:5,000,000-10,000,000
In the extracted example data you should find the following files:
......@@ -38,35 +38,35 @@ In the extracted example data you should find the following files:
The reads were basecalled using this albacore command: ::
-read_fast5_basecaller.py -c r94_450bps_linear.cfg -t 8 -i fast5_files -s basecalled/ -o fastq
+guppy_basecaller -c dna_r9.4.1_450bps_flipflop.cfg -i fast5_files/ -s basecalled/
After the basecaller finished, we merged all of the fastq files together into a single file: ::
-cat basecalled/workspace/pass/*.fastq > albacore_output.fastq
+cat basecalled/*.fastq > output.fastq
Data preprocessing
------------------------------------
nanopolish needs access to the signal-level data measured by the nanopore sequencer. To begin, we need to create an index file that links read ids with their signal-level data in the FAST5 files: ::
-nanopolish index -d fast5_files/ albacore_output.fastq
+nanopolish index -d fast5_files/ output.fastq
-We get the following files: ``albacore_output.fastq.index``, ``albacore_output.fastq.index.fai``, ``albacore_output.fastq.index.gzi``, and ``albacore_output.fastq.index.readdb``.
+We get the following files: ``output.fastq.index``, ``output.fastq.index.fai``, ``output.fastq.index.gzi``, and ``output.fastq.index.readdb``.
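To sanity-check the index, the ``.readdb`` file can be inspected directly. As a rough sketch (assuming its usual layout of a two-column, tab-separated table mapping read id to fast5 path), the first few entries can be printed like this: ::

    #! /usr/bin/env python3
    # Sketch: print the first few read-id -> fast5 mappings from the index built above.
    # Assumes output.fastq.index.readdb is a two-column, tab-separated table (read_id, fast5_path).
    import itertools

    with open("output.fastq.index.readdb") as readdb:
        for line in itertools.islice(readdb, 5):
            read_id, fast5_path = line.rstrip("\n").split("\t")
            print(read_id, "->", fast5_path)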
Aligning reads to the reference genome
--------------------------------------
Next, we need to align the basecalled reads to the reference genome. We use minimap2 as it is fast enough to map reads to the human genome. In this example we'll pipe the output directly into ``samtools sort`` to get a sorted bam file: ::
-minimap2 -a -x map-ont reference.fasta albacore_output.fastq | samtools sort -T tmp -o albacore_output.sorted.bam
-samtools index albacore_output.sorted.bam
+minimap2 -a -x map-ont reference.fasta output.fastq | samtools sort -T tmp -o output.sorted.bam
+samtools index output.sorted.bam
Calling methylation
-------------------
-Now we're ready to use nanopolish to detect methylated bases (in this case 5-methylcytosine in a CpG context). The command is fairly straightforward - we have to tell it what reads to use (``albacore_output.fastq``), where the alignments are (``albacore_output.sorted.bam``), the reference genome (``reference.fasta``) and what region of the genome we're interested in (``chr20:5,000,000-10,000,000``)::
+Now we're ready to use nanopolish to detect methylated bases (in this case 5-methylcytosine in a CpG context). The command is fairly straightforward - we have to tell it what reads to use (``output.fastq``), where the alignments are (``output.sorted.bam``), the reference genome (``reference.fasta``) and what region of the genome we're interested in (``chr20:5,000,000-10,000,000``)::
-nanopolish call-methylation -t 8 -r albacore_output.fastq -b albacore_output.sorted.bam -g reference.fasta -w "chr20:5,000,000-10,000,000" > methylation_calls.tsv
+nanopolish call-methylation -t 8 -r output.fastq -b output.sorted.bam -g reference.fasta -w "chr20:5,000,000-10,000,000" > methylation_calls.tsv
The output file contains a lot of information including the position of the CG dinucleotide on the reference genome, the ID of the read that was used to make the call, and the log-likelihood ratio calculated by our model: ::
......
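As a rough illustration of how the log-likelihood ratio column is typically used (the column names and the 2.0 call threshold below are assumptions, and nanopolish ships ``scripts/calculate_methylation_frequency.py`` for the real aggregation), a minimal per-site summary could look like: ::

    #! /usr/bin/env python3
    # Sketch: per-site methylation frequency from methylation_calls.tsv.
    # Assumptions: the header has chromosome, start and log_lik_ratio columns, and
    # calls with |log_lik_ratio| < 2.0 are treated as ambiguous and skipped.
    import csv
    from collections import defaultdict

    counts = defaultdict(lambda: [0, 0])  # (chromosome, start) -> [methylated, unmethylated]
    with open("methylation_calls.tsv") as tsv:
        for row in csv.DictReader(tsv, delimiter="\t"):
            ratio = float(row["log_lik_ratio"])
            if abs(ratio) < 2.0:
                continue
            counts[(row["chromosome"], row["start"])][0 if ratio > 0 else 1] += 1

    for (chrom, start), (meth, unmeth) in sorted(counts.items(), key=lambda kv: (kv[0][0], int(kv[0][1]))):
        print(chrom, start, round(meth / (meth + unmeth), 3), sep="\t")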
......@@ -23,8 +23,8 @@ Let's start by downloading a dataset of fast5 files from the European Nucleotide
mkdir data && mkdir data/fastqs
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR276/ERR2764784/30xpolyA.tar.gz -O 30xpolyA.tar.gz && mv 30xpolyA.tar.gz data/
tar -xzf data/30xpolyA.tar.gz -C data/
-read_fast5_basecaller.py --worker_threads=8 -f FLO-MIN107 -k SQK-RNA001 -q 0 -s data/fastqs -i data/30xpolyA/fast5/pass
-cat data/fastqs/workspace/pass/*.fastq > data/30xpolyA.fastq
+guppy_basecaller -c rna_r9.4.1_70bps_hac.cfg -i data/30xpolyA/fast5/pass -s data/fastqs
+cat data/fastqs/*.fastq > data/30xpolyA.fastq
In the above, change the value of the `-f` and `-k` arguments based on your flow-cell and sequencing kit, as the basecaller's accuracy is highly dependent upon these settings.
......@@ -57,7 +57,7 @@ Align with minimap2 and format the BAM file
Now we'll align our basecalled mRNA sequences in the fastq file with our reference. First download a reference fasta: ::
wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fas
-wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fai
+wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fas.fai
mv enolase_reference.* data/
Note that our directory structure should now look like this: ::
......
#! /usr/bin/env python3
import sys
......
-#! /usr/bin/python
+#! /usr/bin/env python3
# This script takes a .model file provided by ONT and adds metadata that allows it
# to be compiled into nanopolish
......
#! /usr/bin/env python3
import sys
......
#! /usr/bin/env python3
"""
dump_signal.py: take a **verbose** polya-call file and dump an HDF5 file with the signal data.
......
#! /usr/bin/env python3
"""
Plot a random segmentation from a dataset.
......
#! /usr/bin/env python3
"""
retrain_emission.py: take an HDF5 file and segmentations, and output parameters of a mixture model.
"""
......
#! /usr/bin/env python3
"""
reestimate_polya_emissions.py: given two `polya-samples` TSV files based on different
underlying kmer models (with the newer TSV giving failing poly(A) segmentations),
......
......@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.11.2"
+#define PACKAGE_VERSION "0.11.3"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
......
......@@ -122,7 +122,7 @@ void index_path(ReadDB& read_db, const std::string& path, const std::multimap<st
// recurse
index_path(read_db, full_fn, fast5_to_read_name_map);
} else if (is_fast5) {
-if(!fast5_to_read_name_map.empty()) {
+if(in_map) {
index_file_from_map(read_db, full_fn, fast5_to_read_name_map);
} else {
index_file_from_fast5(read_db, full_fn);
......@@ -176,7 +176,6 @@ void parse_sequencing_summary(const std::string& filename, std::multimap<std::st
std::vector<std::string> fields = split(header, '\t');
const std::string READ_NAME_STR = "read_id";
-const std::string FILENAME_STR = "filename";
size_t filename_idx = -1;
size_t read_name_idx = -1;
......@@ -185,13 +184,14 @@ void parse_sequencing_summary(const std::string& filename, std::multimap<std::st
read_name_idx = i;
}
-if(fields[i] == FILENAME_STR) {
+// 19/11/05: support live basecalling summary files
+if(fields[i] == "filename" || fields[i] == "filename_fast5") {
filename_idx = i;
}
}
if(filename_idx == -1) {
-exit_bad_header(FILENAME_STR, filename);
+exit_bad_header("fast5 filename", filename);
}
if(read_name_idx == -1) {
......@@ -274,6 +274,32 @@ void parse_index_options(int argc, char** argv)
opt::reads_file = argv[optind++];
}
+void clean_fast5_map(std::multimap<std::string, std::string>& mmap)
+{
+std::map<std::string, int> fast5_read_count;
+for(const auto& iter : mmap) {
+fast5_read_count[iter.first]++;
+}
+int EXPECTED_ENTRIES_PER_FAST5 = 4000;
+std::vector<std::string> invalid_entries;
+for(const auto& iter : fast5_read_count) {
+if(iter.second != EXPECTED_ENTRIES_PER_FAST5) {
+//fprintf(stderr, "warning: %s has %d entries in the summary and will be indexed the slow way\n", iter.first.c_str(), iter.second);
+invalid_entries.push_back(iter.first);
+}
+}
+if(invalid_entries.size() > 0) {
+fprintf(stderr, "warning: detected invalid summary file entries for %zu of %zu fast5 files\n", invalid_entries.size(), fast5_read_count.size());
+fprintf(stderr, "These files will be indexed without using the summary file, which is slow.\n");
+}
+for(const auto& fast5_name : invalid_entries) {
+mmap.erase(fast5_name);
+}
+}
int index_main(int argc, char** argv)
{
parse_index_options(argc, argv);
......@@ -288,6 +314,13 @@ int index_main(int argc, char** argv)
parse_sequencing_summary(ss_filename, fast5_to_read_name_map);
}
+// Detect non-unique fast5 file names in the summary file
+// This occurs when a file with the same name (abc_0.fast5) appears in both fast5_pass
+// and fast5_fail. This will be fixed by ONT but in the meantime we check for
+// fast5 files that have a non-standard number of reads (>4000) and remove them
+// from the map. These fast5s will be indexed the slow way.
+clean_fast5_map(fast5_to_read_name_map);
// import read names, and possibly fast5 paths, from the fasta/fastq file
ReadDB read_db;
read_db.build(opt::reads_file);
......
......@@ -59,7 +59,9 @@ SUBPROGRAM " Version " PACKAGE_VERSION "\n"
static const char *PHASE_READS_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa variants.vcf\n"
"Train a duration model\n"
"Output a BAM file where each record shows the combination of alleles from variants.vcf that each read supports.\n"
"variants.vcf can be any VCF file but only SNPs will be phased and variants that have a homozygous reference genotype (0/0)\n"
"will be skipped.\n"
"\n"
" -v, --verbose display verbose output\n"
" --version display version\n"
......
......@@ -237,7 +237,7 @@ int vcf2fasta_main(int argc, char** argv)
// make a vector holding either a literal character or an index to the variant that needs to be applied
std::vector<uint32_t> consensus_record(length);
for(size_t i = 0; i < length; ++i) {
-consensus_record[i] = seq[i];
+consensus_record[i] = toupper(seq[i]);
}
size_t num_skipped = 0;
......