# travis.yml for github.com/jts/nanopolish
language: cpp
dist: trusty
sudo: false
language: generic
cache: apt
git:
depth: 1
matrix:
include:
# Set env for both nanopolish and the dependency hdf5.
- env:
- CC=gcc-4.8
- CXX=g++-4.8
- AR=gcc-ar-4.8
- NM=gcc-nm-4.8
- RANLIB=gcc-ranlib-4.8
- env:
- CC=gcc-8
- CXX=g++-8
- AR=gcc-ar-8
- NM=gcc-nm-8
- RANLIB=gcc-ranlib-8
# Install and export newer gcc
before_install:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get update -qq
- sudo apt-get install -qq g++-4.8
- |
if [[ "${CC}" =~ ^gcc- ]]; then
echo "Installing ${CC}."
sudo apt-get install -qq "${CC}"
fi
- |
if [[ "${CXX}" =~ ^g\+\+- ]]; then
echo "Installing ${CXX}."
sudo apt-get install -qq "${CXX}"
fi
script: make CXX=g++-4.8 nanopolish && make CXX=g++-4.8 test
script:
# Suppress all compiler warnings when building hdf5 so the build log can be
# viewed on the Travis log page without downloading the raw log.
# Travis fails the job when the log exceeds its 4 MB length limit.
- export H5_CFLAGS="-w"
- make && make test
@@ -10,8 +10,8 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali
#Basic flags every build needs
LIBS = -lz
CXXFLAGS ?= -g -O3
CXXFLAGS += -std=c++11 -fopenmp -fsigned-char
CFLAGS ?= -O3 -std=c99
CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -D_FILE_OFFSET_BITS=64 # -D_FILE_OFFSET_BITS=64 makes nanopolish work on 32-bit systems
CFLAGS ?= -O3 -std=c99 -fsigned-char -D_FILE_OFFSET_BITS=64
CXX ?= g++
CC ?= gcc
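The `-D_FILE_OFFSET_BITS=64` flag widens `off_t` to 64 bits on 32-bit glibc targets so that files larger than 2 GiB can be addressed. A minimal sketch of the effect (a standalone check assuming a 32-bit Linux host; not part of the build):

    // With _FILE_OFFSET_BITS=64 defined before any system header, glibc
    // switches off_t (and fseeko/ftello, stat, mmap, ...) to 64-bit variants.
    #define _FILE_OFFSET_BITS 64
    #include <sys/types.h>
    #include <cstdio>

    int main() {
        // On a 32-bit host this holds only because of the macro above;
        // without it, off_t is 32 bits and >2 GiB fast5 files cannot be read.
        static_assert(sizeof(off_t) == 8, "off_t should be 64-bit");
        std::printf("sizeof(off_t) = %zu\n", sizeof(off_t));
        return 0;
    }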
@@ -20,6 +20,10 @@ HDF5?=install
EIGEN ?= install
HTS ?= install
HDF5_VERSION ?= 1.8.14
HDF5_CONFIG_ARGS ?= --enable-threadsafe
EIGEN_VERSION ?= 3.2.5
# Check operating system, OSX doesn't have -lrt
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
@@ -73,7 +77,8 @@ CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(NP_INCLUDE) $(EIGEN_
PROGRAM = nanopolish
TEST_PROGRAM = nanopolish_test
all: $(PROGRAM) $(TEST_PROGRAM)
.PHONY: all
all: depend $(PROGRAM)
#
# Build libhts
@@ -85,16 +90,23 @@ htslib/libhts.a:
# If this library is a dependency the user wants HDF5 to be downloaded and built.
#
lib/libhdf5.a:
if [ ! -e hdf5-1.8.14.tar.gz ]; then wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.8/hdf5-1.8.14/src/hdf5-1.8.14.tar.gz; fi
tar -xzf hdf5-1.8.14.tar.gz || exit 255
cd hdf5-1.8.14 && ./configure --enable-threadsafe --prefix=`pwd`/.. && make && make install
if [ ! -e hdf5-$(HDF5_VERSION).tar.gz ]; then \
version_major_minor=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minor}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
fi
tar -xzf hdf5-$(HDF5_VERSION).tar.gz || exit 255
cd hdf5-$(HDF5_VERSION) && \
./configure --enable-threadsafe --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
make && make install
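For example, with the default `HDF5_VERSION=1.8.14` the sed expression strips the trailing `.14`, so `version_major_minor` becomes `1.8` and the tarball is fetched from `.../hdf5-1.8/hdf5-1.8.14/src/hdf5-1.8.14.tar.gz`, matching the hard-coded URL this rule replaces.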
# Download and install eigen if not already downloaded
eigen/INSTALL:
if [ ! -e 3.2.5.tar.bz2 ]; then wget http://bitbucket.org/eigen/eigen/get/3.2.5.tar.bz2; fi
tar -xjvf 3.2.5.tar.bz2 || exit 255
mv eigen-eigen-bdd17ee3b1b3 eigen || exit 255
if [ ! -e $(EIGEN_VERSION).tar.bz2 ]; then \
wget http://bitbucket.org/eigen/eigen/get/$(EIGEN_VERSION).tar.bz2; \
fi
tar -xjf $(EIGEN_VERSION).tar.bz2 || exit 255
mv eigen-eigen-* eigen || exit 255
#
# Source files
@@ -110,15 +122,13 @@ CPP_OBJ=$(CPP_SRC:.cpp=.o)
C_OBJ = $(C_SRC:.c=.o)
# Generate dependencies
PHONY=depend
.PHONY: depend
depend: .depend
.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(EIGEN_CHECK)
rm -f ./.depend
$(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend;
include .depend
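For reference, `-MM` emits one make rule per translation unit listing its non-system header dependencies, so `.depend` contains lines like (paths illustrative):

    nanopolish_fast5_io.o: src/common/nanopolish_fast5_io.cpp \
      src/common/nanopolish_fast5_io.h

The `include .depend` above then lets make rebuild each object whenever one of its headers changes.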
# Compile objects
.cpp.o:
$(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $<
@@ -134,8 +144,11 @@ $(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIG
$(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB)
$(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS)
.PHONY: test
test: $(TEST_PROGRAM)
./$(TEST_PROGRAM)
.PHONY: clean
clean:
rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) src/main/nanopolish.o src/test/nanopolish_test.o
rm -f $(PROGRAM) $(TEST_PROGRAM) $(CPP_OBJ) $(C_OBJ) \
src/main/nanopolish.o src/test/nanopolish_test.o
@@ -6,6 +6,8 @@ Software package for signal-level analysis of Oxford Nanopore sequencing data. N
## Release notes
* 0.11.0: support for multi-fast5 files. `nanopolish methyltrain` now subsamples input data, improving speed and memory usage
* 0.10.2: added new program `nanopolish polya` to estimate the length of poly-A tails on direct RNA reads (by @paultsw)
* 0.10.1: `nanopolish variants --consensus` now only outputs a VCF file instead of a fasta sequence. The VCF file describes the changes that need to be made to turn the draft sequence into the polished assembly. A new program, `nanopolish vcf2fasta`, is provided to generate the polished genome (this replaces `nanopolish_merge.py`, see usage instructions below). This change is to avoid issues when merging segments that end on repeat boundaries (reported by Michael Wykes and Chris Wright).
......
nanopolish (0.10.2-2) UNRELEASED; urgency=medium
nanopolish (0.11.0-1) unstable; urgency=medium
* Team upload.
* Recommends: python-pysam, python-biopython
* debhelper 12
* Standards-Version: 4.3.0
* Fix permissions
* Fix path for Python interpreter
* Better care for Python scripts
-- Andreas Tille <tille@debian.org> Tue, 30 Oct 2018 10:36:30 +0100
-- Andreas Tille <tille@debian.org> Thu, 24 Jan 2019 09:11:19 +0100
nanopolish (0.10.2-1) unstable; urgency=medium
......
@@ -3,12 +3,14 @@ Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.
Uploaders: Afif Elghraoui <afif@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
Build-Depends: debhelper (>= 12~),
dh-python,
python,
zlib1g-dev,
libfast5-dev (>= 0.6.5),
libhts-dev,
libeigen3-dev
Standards-Version: 4.2.1
Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/nanopolish
Vcs-Git: https://salsa.debian.org/med-team/nanopolish.git
Homepage: https://github.com/jts/nanopolish
@@ -17,6 +19,7 @@ Package: nanopolish
Architecture: any-amd64 any-i386
Depends: ${shlibs:Depends},
${misc:Depends},
${python:Depends}
Recommends: python-biopython,
python-pysam
Suggests: perl,
......
From 303971c245d507988132ad8404f840461ff4d87d Mon Sep 17 00:00:00 2001
From: Jared Simpson <jared.simpson@gmail.com>
Date: Sat, 17 Feb 2018 21:48:45 -0500
Subject: [PATCH] check for existence of group before opening it (#335)
---
src/common/nanopolish_fast5_io.cpp | 7 +++++++
1 file changed, 7 insertions(+)
--- a/src/common/nanopolish_fast5_io.cpp
+++ b/src/common/nanopolish_fast5_io.cpp
@@ -215,6 +215,13 @@ std::string fast5_get_fixed_string_attri
return "";
}
+ // according to http://hdf-forum.184993.n3.nabble.com/check-if-dataset-exists-td194725.html
+ // we should use H5Lexists to check for the existence of a group/dataset using an arbitrary path
+ ret = H5Lexists(hdf5_file, group_name.c_str(), H5P_DEFAULT);
+ if(ret <= 0) {
+ return "";
+ }
+
// Open the group /Raw/Reads/Read_nnn
group = H5Gopen(hdf5_file, group_name.c_str(), H5P_DEFAULT);
if(group < 0) {
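The guard this patch adds generalizes to any HDF5 path. A self-contained sketch of the pattern, with hypothetical file and group names:

    #include <hdf5.h>

    int main() {
        hid_t file = H5Fopen("read.fast5", H5F_ACC_RDONLY, H5P_DEFAULT);
        if(file < 0) return 1;

        // Probe the link first: H5Gopen on a missing group dumps an HDF5
        // error stack, while H5Lexists quietly returns <= 0 when absent.
        const char* path = "/Raw/Reads/Read_1234";
        if(H5Lexists(file, path, H5P_DEFAULT) > 0) {
            hid_t group = H5Gopen(file, path, H5P_DEFAULT);
            if(group >= 0) {
                // ... read attributes ...
                H5Gclose(group);
            }
        }
        H5Fclose(file);
        return 0;
    }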
@@ -2,9 +2,9 @@ Description: make build reproducible
This patch enforces stable input file ordering w.r.t. source file gathering
via Make's 'wildcard' function.
Author: Sascha Steinbiss <satta@debian.org>
--- nanopolish.orig/Makefile
+++ nanopolish/Makefile
@@ -101,8 +101,8 @@
--- a/Makefile
+++ b/Makefile
@@ -113,8 +113,8 @@ eigen/INSTALL:
#
# Find the source files by searching subdirectories
......
add-shebang-to-script.patch
reproducible.patch
hdf5-groupcheck.patch
@@ -20,7 +20,7 @@ export EIGEN=external
export HTS=external
%:
dh $@
dh $@ --with python2
override_dh_auto_clean:
sed -i~ 's/^.depend: .*/.depend:/' Makefile
@@ -34,3 +34,10 @@ override_dh_install:
for pl in `find debian -name "*.pl"` ; do \
sed -i '1s?^#!/usr/bin/env.*perl?#!/usr/bin/perl?' $${pl} ; \
done
for pl in `grep -Rl '#![[:space:]]*/usr/bin/env *python' debian/*/usr/*` ; do \
sed -i '1s?^#!.*python?#!/usr/bin/python?' $${pl} ; \
done
override_dh_fixperms:
dh_fixperms
find debian -name '*.fast5' -exec chmod -x \{\} \;
.. _quickstart_polya:
Quickstart - how to estimate poly(A) tail lengths from nanopore native RNA reads
=================================================================================
Owing to homopolymer effects and the proximity to the sequencing adapters, the polyadenylated tails of reads obtained from nanopore native RNA sequencing are improperly basecalled, making their lengths difficult to quantify. We developed the `polya` subprogram to use an alternative statistical model to estimate these tail lengths.
In this quickstart tutorial, we'll show you how to estimate polyadenylated tail lengths step-by-step, starting from nothing but raw fast5 files. We'll basecall the fast5 files with Oxford Nanopore Technologies' *albacore* basecaller, before aligning the resulting reads with *minimap2*, indexing the files with *nanopolish index*, and finally segmenting the reads and calling the tail lengths with *nanopolish polya*.
We'll be following the steps taken in our `benchmarking analysis <https://github.com/paultsw/polya_analysis>`_ workflow that accompanies `our publication <https://www.biorxiv.org/content/early/2018/11/09/459529>`_. In each step below, we'll generate files in the working directory instead of making subdirectories, except in the case of large datasets.
Software requirements
---------------------
* `nanopolish <https://github.com/jts/nanopolish>`_ `>= v0.10.2`
* `minimap2 <https://github.com/lh3/minimap2>`_ `>= v2.12`
* `samtools <http://www.htslib.org/>`_ `>= v1.9`
* `albacore >= v2.3.3` (from Oxford Nanopore's private customer portal)
Download raw fast5 data and basecall
------------------------------------
Let's start by downloading a dataset from the European Nucleotide Archive: a tarball of fast5 files containing reads known to have polyadenylated tail lengths of 30 nt. ::
mkdir data && mkdir data/fastqs
wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA158/ERA1580896/oxfordnanopore_native/30xpolyA.tar.gz -O 30xpolyA.tar.gz && mv 30xpolyA.tar.gz data/
tar -xzf data/30xpolyA.tar.gz -C data/
read_fast5_basecaller.py --worker_threads=8 -f FLO-MIN107 -k SQK-RNA001 -q 0 -s data/fastqs -i data/30xpolyA/fast5/pass
cat data/fastqs/workspace/pass/*.fastq > data/30xpolyA.fastq
In the above, change the values of the `-f` and `-k` arguments to match your flow cell and sequencing kit, as the basecaller's accuracy depends heavily on these settings.
Our directory structure should now look something like this: ::
(current directory)
|- data/
|-- 30xpolyA.fastq
|-- 30xpolyA.tar.gz
|-- 30xpolyA/
|--- fast5/
|---- pass/
|----- raw_read1.fast5
|----- (... more raw fast5's here ...)
|-- fastqs/
|--- workspace/
|---- pass/
Index with nanopolish index
---------------------------
Let's construct an index for our reads with nanopolish's `index` subprogram. This constructs a fast lookup table that tells our program where to find the raw fast5 file for each
basecalled read. ::
nanopolish index --directory=data/30xpolyA/fast5/pass --sequencing-summary=data/fastqs/sequencing_summary.txt data/30xpolyA.fastq
This should generate a collection of index files in the same directory as `30xpolyA.fastq`.
Align with minimap2 and format the BAM file
-------------------------------------------
Now we'll align the basecalled mRNA sequences in our fastq file against a reference. First, download the reference fasta: ::
wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fas
wget https://raw.githubusercontent.com/paultsw/polya_analysis/master/data/references/enolase_reference.fai
mv enolase_reference.* data/
Note that our directory structure should now look like this: ::
(current directory)
|- data/
|-- enolase_reference.fas
|-- enolase_reference.fai
|-- (... same structure as above ...)
Let's run `minimap2` to align our basecalled reads to this reference and generate a collection of SAM/BAM files with `samtools`: ::
minimap2 -a -x map-ont data/enolase_reference.fas data/30xpolyA.fastq | samtools view -b - -o data/30xpolyA.bam
cd data && samtools sort -T tmp -o 30xpolyA.sorted.bam 30xpolyA.bam && samtools index 30xpolyA.sorted.bam && cd ..
Note that we used `-x map-ont`, which is intended for unspliced reads (e.g. genomic DNA) from nanopore sequencers. For typical native mRNA sequencing you should use `-x splice`,
a splice-aware preset that uses a different gap cost in the alignment. We use `map-ont` here because our reads come from a control dataset with no splicing.
There should be three more files in the `data` directory now: `30xpolyA.bam`, `30xpolyA.sorted.bam`, and `30xpolyA.sorted.bam.bai`.
Segmentation and tail length estimation with nanopolish polya
-------------------------------------------------------------
Finally, we can run the polyadenylation estimator: ::
nanopolish polya --threads=8 --reads=data/30xpolyA.fastq --bam=data/30xpolyA.sorted.bam --genome=data/enolase_reference.fas > polya_results.tsv
Set the `--threads` flag to the number of parallel threads you want to use. More threads generally reduce compute time, but with diminishing returns, and performance can even
degrade if your CPU cannot support the requested number of threads; the best setting is highly dependent upon your hardware.
Interpreting the output TSV file
--------------------------------
We'll end this quickstart with a look at the output of the polya program. Here are the first 20 lines of the `polya_results.tsv` file we've just generated: ::
head -20 polya_results.tsv | column -t
readname contig position leader_start adapter_start polya_start transcript_start read_rate polya_length qc_tag
d6f42b79-90c6-4edd-8c8f-8a7ce0ac6ecb YHR174W 0 54.0 3446.0 7216.0 8211.0 130.96 38.22 PASS
453f3f3e-d22f-4d9c-81a6-8576e23390ed YHR174W 0 228.0 5542.0 10298.0 11046.0 130.96 27.48 PASS
e02d9858-0c04-4d86-8dba-18a47d9ac005 YHR174W 0 221.0 1812.0 7715.0 8775.0 97.16 29.16 PASS
b588dee2-2c5b-410c-91e1-fe8140f4f837 YHR174W 0 22.0 8338.0 13432.0 14294.0 130.96 32.43 PASS
af9dfee2-1711-4083-b109-487b99895e0a YHR174W 0 889.0 3679.0 6140.0 7168.0 130.96 39.65 PASS
93f98a86-3b18-48cf-8c4d-15cf277911e2 YHR174W 0 144.0 1464.0 5615.0 6515.0 120.48 30.96 SUFFCLIP
af9dfee2-1711-4083-b109-487b99895e0a YHR174W 0 889.0 3679.0 6140.0 7168.0 130.96 39.65 SUFFCLIP
93f98a86-3b18-48cf-8c4d-15cf277911e2 YHR174W 0 144.0 1464.0 5615.0 6515.0 120.48 30.96 PASS
ca8d4059-9d82-45ee-aa07-4b8b351618b3 YHR174W 0 1.0 2157.0 4255.0 5862.0 111.56 54.48 PASS
3493c123-78d4-4f7c-add0-cbb249aef00a YHR174W 0 78.0 1938.0 4829.0 5491.0 136.91 25.05 PASS
f5ff1802-3fdd-479a-8888-c72de01bbaea YHR174W 0 150.0 3476.0 7233.0 7932.0 130.96 25.35 PASS
bb929728-2ed8-42b0-a5a5-eea4bfd62673 YHR174W 0 91.0 1061.0 6241.0 6910.0 111.56 19.74 PASS
17cf3fef-1acb-4045-8252-e9c00fedfb7c YHR174W 0 447.0 6004.0 20058.0 20964.0 100.40 25.17 ADAPTER
e3e38de6-8a99-4029-a067-261f470517ca YHR174W 0 41.0 1588.0 4057.0 5303.0 130.96 49.13 PASS
66f55b56-c22e-4e6d-999e-50687bed6fb7 YHR174W 0 191.0 3160.0 9218.0 10030.0 125.50 28.79 PASS
56c116d7-9286-4b57-8329-e74928b11b38 YHR174W 0 13.0 1516.0 5845.0 6773.0 130.96 35.30 PASS
5ca1392c-c48f-4135-85d3-271bd4ee7a13 YHR174W 0 1.0 1976.0 4854.0 5947.0 136.91 44.64 PASS
66b5a0ef-b8e6-475e-bf20-04b96154a67f YHR174W 0 98.0 3847.0 7066.0 7925.0 120.48 29.32 PASS
34bf2187-5816-4744-8e6a-3250b5247e02 YHR174W 0 1.0 2897.0 6885.0 7547.0 125.50 22.54 PASS
Each row corresponds to the output for a given read. The columns have the following interpretation:
* `readname` is the unique ID associated with this particular read. This string is also used to look up the corresponding fast5 file, e.g. by looking
through the `readdb` file generated by `nanopolish index`.
* `contig` refers to the reference sequence that this read aligns to, and is taken from the BAM file.
* `position` is the 5' starting position of the alignment to the reference sequence, and also comes from the BAM file.
* `leader_start`, `adapter_start`, `polya_start`, and `transcript_start` are the indices of the signal segmentation generated by the underlying model within
`nanopolish`. Briefly, there are four biologically-meaningful regions of the raw sequence of electrical current readings within each fast5 file; these four
entries denote the starting index of each of these consecutive regions (see the worked example after this list). The indices start from 0 and are oriented
in the 3'-to-5' direction (due to the inherent orientation of the native RNA nanopore sequencing protocol). A full exposition of this segmentation algorithm
is available in the `supplementary note <https://www.biorxiv.org/content/biorxiv/suppl/2018/11/09/459529.DC1/459529-2.pdf>`_ to our associated publication.
* `read_rate` is the estimated translocation rate (in units of nucleotides/second) of the read through the pore. The translocation rate is non-uniform during
the sequencing process of even a single molecule, so this is ultimately a summary statistic of the dynamic, time-varying rate.
* `polya_length` is the estimated polyadenylated tail length, in number of nucleotides. That this value is a float rather than an integer reflects the fact
that our estimated tail length is the output of an estimator based on the translocation rate.
* `qc_tag` is an additional flag used to indicate the validity of the estimate. Generally speaking, you should only use rows of the output file with this value
set to `PASS`; all other rows with (e.g.) the `qc_tag` set to `SUFFCLIP`, `ADAPTER`, etc. display signs of irregularity that indicate that we believe the
estimate to be unreliable. You can easily filter away all rows with the tag set to anything other than `PASS` using a `grep`: ::
grep 'PASS' polya_results.tsv > polya_results.pass_only.tsv
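As a worked example of the segmentation columns, take the first read above (`d6f42b79-...`): the leader region spans samples 54 to 3445, the adapter spans 3446 to 7215, the poly(A) tail spans 7216 to 8210, and the transcript begins at sample 8211, with all indices read in the 3'-to-5' direction.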
#! /usr/bin/env python
from __future__ import print_function
import sys
......
@@ -243,7 +243,11 @@ void emit_tsv_header(FILE* fp)
void emit_sam_header(samFile* fp, const bam_hdr_t* hdr)
{
sam_hdr_write(fp, hdr);
int ret = sam_hdr_write(fp, hdr);
if(ret < 0) {
fprintf(stderr, "error writing sam header\n");
exit(EXIT_FAILURE);
}
}
std::string cigar_ops_to_string(const std::vector<uint32_t>& ops)
@@ -382,7 +386,12 @@ void emit_event_alignment_sam(htsFile* fp,
int stride = alignments.front().event_idx < alignments.back().event_idx ? 1 : -1;
bam_aux_append(event_record, "ES", 'i', 4, reinterpret_cast<uint8_t*>(&stride));
sam_write1(fp, base_hdr, event_record);
int ret = sam_write1(fp, base_hdr, event_record);
if(ret < 0) {
fprintf(stderr, "error writing sam record\n");
exit(EXIT_FAILURE);
}
bam_destroy1(event_record); // automatically frees malloc'd segment
}
......
@@ -94,6 +94,39 @@ const char* MethylCpGAlphabet::_recognition_sites[] = { "CG" };
const char* MethylCpGAlphabet::_recognition_sites_methylated[] = { "MG" };
const char* MethylCpGAlphabet::_recognition_sites_methylated_complement[] = { "GM" };
//
// methyl-cytosine in GC context
//
const uint8_t MethylGpCAlphabet::_rank[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,2,0,0,0,0,0,3,0,0,
0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
const char* MethylGpCAlphabet::_name = "gpc";
const char* MethylGpCAlphabet::_base = "ACGMT";
const char* MethylGpCAlphabet::_complement = "TGCGA";
const uint32_t MethylGpCAlphabet::_size = 5;
const uint32_t MethylGpCAlphabet::_num_recognition_sites = 1;
const uint32_t MethylGpCAlphabet::_recognition_length = 2;
const char* MethylGpCAlphabet::_recognition_sites[] = { "GC" };
const char* MethylGpCAlphabet::_recognition_sites_methylated[] = { "GM" };
const char* MethylGpCAlphabet::_recognition_sites_methylated_complement[] = { "MG" };
//
// Dam methylation: methyl-adenine in GATC context
//
@@ -163,6 +196,7 @@ const char* MethylDcmAlphabet::_recognition_sites_methylated_complement[] = { "G
// Global objects
DNAAlphabet gDNAAlphabet;
MethylCpGAlphabet gMCpGAlphabet;
MethylGpCAlphabet gMethylGpCAlphabet;
MethylDamAlphabet gMethylDamAlphabet;
MethylDcmAlphabet gMethylDcmAlphabet;
UtoTRNAAlphabet gUtoTRNAAlphabet;
@@ -171,6 +205,7 @@ std::vector<const Alphabet*> get_alphabet_list()
{
std::vector<const Alphabet*> list = { &gDNAAlphabet,
&gMCpGAlphabet,
&gMethylGpCAlphabet,
&gMethylDamAlphabet,
&gMethylDcmAlphabet,
&gUtoTRNAAlphabet };
......
@@ -355,6 +355,26 @@ struct MethylCpGAlphabet : public Alphabet
}
};
//
// methyl-cytosine in GC context
//
struct MethylGpCAlphabet : public Alphabet
{
// member variables, expanded by macros
BASIC_MEMBER_BOILERPLATE
METHYLATION_MEMBER_BOILERPLATE
// member functions
BASIC_ACCESSOR_BOILERPLATE
METHYLATION_ACCESSOR_BOILERPLATE
// does this alphabet contain all of the nucleotides in bases?
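// (e.g. strspn("ACGT", _base) == 4 == strlen("ACGT") for _base = "ACGMT", so "ACGT" is contained, while "ACGU" fails at the 'U')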
virtual inline bool contains_all(const char *bases) const
{
return strspn(bases, _base) == strlen(bases);
}
};
//
// Dam methylation: methyl-adenine in GATC context
//
@@ -398,6 +418,7 @@ struct MethylDcmAlphabet : public Alphabet
// Global alphabet objects that can be re-used
extern DNAAlphabet gDNAAlphabet;
extern MethylCpGAlphabet gMCpGAlphabet;
extern MethylGpCAlphabet gMethylGpCAlphabet;
extern MethylDamAlphabet gMethylDamAlphabet;
extern MethylDcmAlphabet gMethylDcmAlphabet;
extern UtoTRNAAlphabet gUtoTRNAAlphabet;
......
@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
#define PACKAGE_VERSION "0.10.2"
#define PACKAGE_VERSION "0.11.0"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
......
@@ -8,65 +8,91 @@
//
#include <string.h>
#include <math.h>
#include <assert.h>
#include "nanopolish_fast5_io.h"
//#define DEBUG_FAST5_IO 1
#define RAW_ROOT "/Raw/Reads/"
#define LEGACY_FAST5_RAW_ROOT "/Raw/Reads/"
int verbose = 0;
//
hid_t fast5_open(const std::string& filename)
fast5_file fast5_open(const std::string& filename)
{
hid_t hdf5file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
return hdf5file;
fast5_file fh;
fh.hdf5_file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
// Check for attribute that indicates whether it is single or multi-fast5
// see: https://community.nanoporetech.com/posts/multi-fast5-format
const std::string indicator_p1 = "/UniqueGlobalKey/";
const std::string indicator_p2 = indicator_p1 + "tracking_id/";
bool has_indicator = H5Lexists(fh.hdf5_file, indicator_p1.c_str(), H5P_DEFAULT) && H5Lexists(fh.hdf5_file, indicator_p2.c_str(), H5P_DEFAULT);
fh.is_multi_fast5 = !has_indicator;
return fh;
}
//
void fast5_close(hid_t hdf5_file)
bool fast5_is_open(fast5_file& fh)
{
H5Fclose(hdf5_file);
return fh.hdf5_file >= 0;
}
//
std::string fast5_get_raw_read_group(hid_t hdf5_file)
void fast5_close(fast5_file& fh)
{
std::string read_name = fast5_get_raw_read_name(hdf5_file);
if(read_name != "") {
return std::string(RAW_ROOT) + read_name;
} else {
return "";
}
H5Fclose(fh.hdf5_file);
}
//
std::string fast5_get_raw_read_name(hid_t hdf5_file)
std::vector<std::string> fast5_get_multi_read_groups(fast5_file& fh)
{
// This code is from scrappie's fast5_interface
std::vector<std::string> out;
size_t buffer_size = 0;
char* buffer = NULL;
// get the number of groups in the root group
H5G_info_t group_info;
int ret = H5Gget_info_by_name(fh.hdf5_file, "/", &group_info, H5P_DEFAULT);
if(ret < 0) {
fprintf(stderr, "error getting group info\n");
exit(EXIT_FAILURE);
}
// retrieve the size of the read name
ssize_t size =
H5Lget_name_by_idx(hdf5_file, RAW_ROOT, H5_INDEX_NAME, H5_ITER_INC, 0, NULL,
0, H5P_DEFAULT);
for(size_t group_idx = 0; group_idx < group_info.nlinks; ++group_idx) {
// retrieve the size of this group name
ssize_t size = H5Lget_name_by_idx(fh.hdf5_file, "/", H5_INDEX_NAME, H5_ITER_INC, group_idx, NULL, 0, H5P_DEFAULT);
if(size < 0) {
return "";
fprintf(stderr, "error getting group name size\n");
exit(EXIT_FAILURE);
}
size += 1; // for null terminator
// copy the read name out of the fast5
char* name = (char*)calloc(1 + size, sizeof(char));
H5Lget_name_by_idx(hdf5_file, RAW_ROOT, H5_INDEX_NAME, H5_ITER_INC, 0, name,
1 + size, H5P_DEFAULT);
if(size > buffer_size) {
buffer = (char*)realloc(buffer, size);
buffer_size = size;
}
// cleanup
std::string out(name);
free(name);
// copy the group name
H5Lget_name_by_idx(fh.hdf5_file, "/", H5_INDEX_NAME, H5_ITER_INC, group_idx, buffer, buffer_size, H5P_DEFAULT);
buffer[size - 1] = '\0'; // size already counts the terminator; writing buffer[size] would overflow the allocation
out.push_back(buffer);
}
free(buffer);
buffer = NULL;
buffer_size = 0;
return out;
}
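To make the new multi-fast5 flow concrete, a hypothetical caller sketched against the functions in this file (the path and the "read_" prefix handling are illustrative, not part of the diff):

    // assumes #include "nanopolish_fast5_io.h"
    void process_multi_fast5(const std::string& path)
    {
        fast5_file fh = fast5_open(path);
        if(!fast5_is_open(fh)) return;

        if(fh.is_multi_fast5) {
            // top-level groups are named "read_<uuid>"
            for(const std::string& group : fast5_get_multi_read_groups(fh)) {
                std::string read_id = group.substr(5); // strip "read_"
                fast5_raw_scaling scaling = fast5_get_channel_params(fh, read_id);
                raw_table samples = fast5_get_raw_samples(fh, read_id, scaling);
                // ... process samples ...
            }
        }
        fast5_close(fh);
    }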
//
std::string fast5_get_read_id(hid_t hdf5_file)
std::string fast5_get_read_id_single_fast5(fast5_file& fh)
{
// this function is not supported for multi-fast5 files
assert(!fh.is_multi_fast5);
int ret;
hid_t read_name_attribute, raw_group, attribute_type;
size_t storage_size = 0;
@@ -75,16 +101,16 @@ std::string fast5_get_read_id(hid_t hdf5_file)
std::string out = "";
// Get the path to the raw read group
std::string raw_read_group = fast5_get_raw_read_group(hdf5_file);
std::string raw_read_group = fast5_get_raw_read_group(fh, "");
if(raw_read_group == "") {
return out;
}
return fast5_get_fixed_string_attribute(hdf5_file, raw_read_group, "read_id");
return fast5_get_string_attribute(fh, raw_read_group, "read_id");
}
//
raw_table fast5_get_raw_samples(hid_t hdf5_file, fast5_raw_scaling scaling)
raw_table fast5_get_raw_samples(fast5_file& fh, const std::string& read_id, fast5_raw_scaling scaling)
{
float* rawptr = NULL;
hid_t space;
@@ -94,12 +120,12 @@ raw_table fast5_get_raw_samples(hid_t hdf5_file, fast5_raw_scaling scaling)
raw_table rawtbl = { 0, 0, 0, NULL };
// mostly from scrappie
std::string raw_read_group = fast5_get_raw_read_group(hdf5_file);
std::string raw_read_group = fast5_get_raw_read_group(fh, read_id);
// Create data set name
std::string signal_path = raw_read_group + "/Signal";
hid_t dset = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT);
hid_t dset = H5Dopen(fh.hdf5_file, signal_path.c_str(), H5P_DEFAULT);
if (dset < 0) {
#ifdef DEBUG_FAST5_IO
fprintf(stderr, "Failed to open dataset '%s' to read raw signal from.\n", signal_path.c_str());
@@ -140,12 +166,27 @@ raw_table fast5_get_raw_samples(hid_t hdf5_file, fast5_raw_scaling scaling)
return rawtbl;
}
std::string fast5_get_context_tags_group(fast5_file& fh, const std::string& read_id)
{
std::string group = fh.is_multi_fast5 ? "/read_" + read_id + "/context_tags"
: "/UniqueGlobalKey/context_tags";
return group;
}
//
std::string fast5_get_experiment_type(hid_t hdf5_file)
std::string fast5_get_sequencing_kit(fast5_file& fh, const std::string& read_id)
{
std::string group = fast5_get_context_tags_group(fh, read_id);
return fast5_get_string_attribute(fh, group.c_str(), "sequencing_kit");
}
std::string fast5_get_experiment_type(fast5_file& fh, const std::string& read_id)
{
return fast5_get_fixed_string_attribute(hdf5_file, "/UniqueGlobalKey/context_tags", "experiment_type");
std::string group = fast5_get_context_tags_group(fh, read_id);
return fast5_get_string_attribute(fh, group.c_str(), "experiment_type");
}
// from scrappie
float fast5_read_float_attribute(hid_t group, const char *attribute) {
float val = NAN;
@@ -171,16 +212,18 @@ float fast5_read_float_attribute(hid_t group, const char *attribute) {
}
//
fast5_raw_scaling fast5_get_channel_params(hid_t hdf5_file)
fast5_raw_scaling fast5_get_channel_params(fast5_file& fh, const std::string& read_id)
{
// from scrappie
fast5_raw_scaling scaling = { NAN, NAN, NAN, NAN };
const char *scaling_path = "/UniqueGlobalKey/channel_id";
hid_t scaling_group = H5Gopen(hdf5_file, scaling_path, H5P_DEFAULT);
std::string scaling_path = fh.is_multi_fast5 ? "/read_" + read_id + "/channel_id"
: "/UniqueGlobalKey/channel_id";
hid_t scaling_group = H5Gopen(fh.hdf5_file, scaling_path.c_str(), H5P_DEFAULT);
if (scaling_group < 0) {
#ifdef DEBUG_FAST5_IO
fprintf(stderr, "Failed to group %s.", scaling_path);
fprintf(stderr, "Failed to open group %s\n", scaling_path.c_str());
#endif
return scaling;
}
@@ -200,23 +243,61 @@ fast5_raw_scaling fast5_get_channel_params(hid_t hdf5_file)
//
//
std::string fast5_get_fixed_string_attribute(hid_t hdf5_file, const std::string& group_name, const std::string& attribute_name)
std::string fast5_get_raw_root(fast5_file& fh, const std::string& read_id)
{
size_t storage_size;
char* buffer;
hid_t group, attribute, attribute_type;
int ret;
return fh.is_multi_fast5 ? "/read_" + read_id + "/Raw" : "/Raw/Reads";
}
//
std::string fast5_get_raw_read_group(fast5_file& fh, const std::string& read_id)
{
if(fh.is_multi_fast5) {
return "/read_" + read_id + "/Raw";
} else {
std::string internal_read_name = fast5_get_raw_read_internal_name(fh);
return internal_read_name != "" ? std::string(LEGACY_FAST5_RAW_ROOT) + "/" + internal_read_name : "";
}
}
//
std::string fast5_get_raw_read_internal_name(fast5_file& fh)
{
// This code is from scrappie's fast5_interface
// retrieve the size of the read name
ssize_t size =
H5Lget_name_by_idx(fh.hdf5_file, LEGACY_FAST5_RAW_ROOT, H5_INDEX_NAME, H5_ITER_INC, 0, NULL, 0, H5P_DEFAULT);
if (size < 0) {
return "";
}
// copy the read name out of the fast5
char* name = (char*)calloc(1 + size, sizeof(char));
H5Lget_name_by_idx(fh.hdf5_file, LEGACY_FAST5_RAW_ROOT, H5_INDEX_NAME, H5_ITER_INC, 0, name, 1 + size, H5P_DEFAULT);
// cleanup
std::string out(name);
free(name);
return out;
}
//
std::string fast5_get_string_attribute(fast5_file& fh, const std::string& group_name, const std::string& attribute_name)
{
hid_t group, attribute, attribute_type, native_type;
std::string out;
// according to http://hdf-forum.184993.n3.nabble.com/check-if-dataset-exists-td194725.html
// we should use H5Lexists to check for the existence of a group/dataset using an arbitrary path
ret = H5Lexists(hdf5_file, group_name.c_str(), H5P_DEFAULT);
// HDF5 1.8 returns 0 on the root group, so we explicitly check for it
int ret = group_name == "/" ? 1 : H5Lexists(fh.hdf5_file, group_name.c_str(), H5P_DEFAULT);
if(ret <= 0) {
return "";
}
// Open the group /Raw/Reads/Read_nnn
group = H5Gopen(hdf5_file, group_name.c_str(), H5P_DEFAULT);
// Open the group containing the attribute we want
group = H5Gopen(fh.hdf5_file, group_name.c_str(), H5P_DEFAULT);
if(group < 0) {
#ifdef DEBUG_FAST5_IO
fprintf(stderr, "could not open group %s\n", group_name.c_str());
@@ -241,13 +322,45 @@ std::string fast5_get_fixed_string_attribute(hid_t hdf5_file, const std::string&
// Get data type and check it is a fixed-length string
attribute_type = H5Aget_type(attribute);
if(H5Tis_variable_str(attribute_type)) {
if(attribute_type < 0) {
#ifdef DEBUG_FAST5_IO
fprintf(stderr, "variable length string detected -- ignoring attribute\n");
fprintf(stderr, "failed to get attribute type %s\n", attribute_name.c_str());
#endif
goto close_type;
}
if(H5Tget_class(attribute_type) != H5T_STRING) {
#ifdef DEBUG_FAST5_IO
fprintf(stderr, "attribute %s is not a string\n", attribute_name.c_str());
#endif
goto close_type;
}
native_type = H5Tget_native_type(attribute_type, H5T_DIR_ASCEND);
if(native_type < 0) {
#ifdef DEBUG_FAST5_IO
fprintf(stderr, "failed to get native type for %s\n", attribute_name.c_str());
#endif
goto close_native_type;
}
if(H5Tis_variable_str(attribute_type) > 0) {
// variable length string
char* buffer;
ret = H5Aread(attribute, native_type, &buffer);
if(ret < 0) {
fprintf(stderr, "error reading attribute %s\n", attribute_name.c_str());
exit(EXIT_FAILURE);
}
out = buffer;
free(buffer);
buffer = NULL;
} else {
// fixed length string
size_t storage_size;
char* buffer;
// Get the storage size and allocate
storage_size = H5Aget_storage_size(attribute);
buffer = (char*)calloc(storage_size + 1, sizeof(char));
@@ -260,6 +373,10 @@ std::string fast5_get_fixed_string_attribute(hid_t hdf5_file, const std::string&
// clean up
free(buffer);
}
close_native_type:
H5Tclose(native_type);
close_type:
H5Tclose(attribute_type);
close_attr:
@@ -269,4 close_group:
return out;
}
@@ -21,45 +21,75 @@ extern "C" {
// From scrappie
typedef struct {
// Information for scaling raw data from ADC values to pA
// Parameters for scaling raw data from ADC values to pA
float digitisation;
float offset;
float range;
float sample_rate;
} fast5_raw_scaling;
//
struct fast5_file
{
hid_t hdf5_file;
bool is_multi_fast5;
};
//
// External API
//
// open the file and return the hdf ID
hid_t fast5_open(const std::string& filename);
//
// Basic I/O
//
// open the file and return handle
fast5_file fast5_open(const std::string& filename);
// check if file is open
bool fast5_is_open(fast5_file& fh);
// close the file
void fast5_close(hid_t hdf5_file);
void fast5_close(fast5_file& fh);
// get the raw samples from this file
raw_table fast5_get_raw_samples(hid_t hdf5_file, fast5_raw_scaling scaling);
//
// Functions to get the names of the read(s) contained in the file
//
// get the name of the raw read in the file (eg Read_1234)
std::string fast5_get_raw_read_name(hid_t hdf5_file);
// Get the identifier of a read from the hdf5 file (eg 00041f-....)
std::string fast5_get_read_id_single_fast5(fast5_file& fh);
// Get a vector of read groups for a multi-fast5 file (eg [read_00041f-..., read_1243fe-....])
std::vector<std::string> fast5_get_multi_read_groups(fast5_file& fh);
// get the name of the raw read group (eg /Raw/Read/Read_1234)
std::string fast5_get_raw_read_group(hid_t hdf5_file);
//
// Functions to get the samples or metadata
//
// Get the identifier of a read from the hdf5 file
std::string fast5_get_read_id(hid_t hdf5_file);
// get the raw samples from this file
raw_table fast5_get_raw_samples(fast5_file& fh, const std::string& read_id, fast5_raw_scaling scaling);
// Get the sequencing kit
std::string fast5_get_sequencing_kit(fast5_file& fh, const std::string& read_id);
// Get the experiment type attribute
std::string fast5_get_experiment_type(hid_t hdf5_file);
std::string fast5_get_experiment_type(fast5_file& fh, const std::string& read_id);
// Get sample rate, and ADC-to-pA scalings
fast5_raw_scaling fast5_get_channel_params(hid_t hdf5_file);
fast5_raw_scaling fast5_get_channel_params(fast5_file& fh, const std::string& read_id);
//
// Internal utility functions
//
std::string fast5_get_fixed_string_attribute(hid_t hdf5_file, const std::string& group_name, const std::string& attribute_name);
// get the name of the raw read in the file (eg Read_1234)
std::string fast5_get_raw_read_internal_name(fast5_file& fh);
// get the name of the raw read group (eg /Raw/Reads/Read_1234 or /read_00041f-.../Raw)
std::string fast5_get_raw_read_group(fast5_file& fh, const std::string& read_id);
//
std::string fast5_get_string_attribute(fast5_file& fh, const std::string& group_name, const std::string& attribute_name);
#endif