Commits on Source (7)
@@ -17,7 +17,7 @@ include/
lib/
share/
hdf5-1.8.14.tar.gz
hdf5-1.*.tar.gz
3.2.5.tar.bz2
eigen/
local*
# travis.yml for github.com/jts/nanopolish
dist: trusty
services: docker
sudo: false
language: generic
cache: apt
git:
depth: 1
.nanopolish.ci.matrix-definitions:
- &cron-only
if: type = cron OR env(CI_CRON) = true
- &arch
before_install:
- |
sed -i "/^FROM / s/arm64/${ARCH}/" Dockerfile-arm
- |
docker run --rm --privileged \
multiarch/qemu-user-static:register --reset && \
docker build --rm -t nanopolish -f Dockerfile-arm .
script:
- |
docker run --rm -t \
-e HDF5="${HDF5:-install}" \
-e H5_CFLAGS="${H5_CFLAGS}" \
-e H5_INCLUDE="${H5_INCLUDE}" \
-e LDFLAGS="${LDFLAGS}" \
nanopolish
matrix:
include:
# Set env for both nanopolish and the dependency hdf5.
@@ -22,6 +43,41 @@ matrix:
- AR=gcc-ar-8
- NM=gcc-nm-8
- RANLIB=gcc-ranlib-8
# aarch64 - ARM 64-bit
- name: aarch64
sudo: required
env:
- ARCH=arm64
<<: *arch
<<: *cron-only
- name: aarch64-system-hdf5
sudo: required
env:
- ARCH=arm64
- HDF5="system"
- H5_INCLUDE="-I/usr/include/hdf5/serial"
- LDFLAGS="-L/usr/lib/aarch64-linux-gnu/hdf5/serial"
<<: *arch
# armv7l - ARM 32-bit
- name: armv7l
sudo: required
env:
- ARCH=armhf
<<: *arch
<<: *cron-only
- name: armv7l-system-hdf5
sudo: required
env:
- ARCH=armhf
- HDF5="system"
- H5_INCLUDE="-I/usr/include/hdf5/serial"
- LDFLAGS="-L/usr/lib/arm-linux-gnueabihf/hdf5/serial"
<<: *arch
allow_failures:
# The jobs that install hdf5 from source in docker finish with an error
# because they exceed the maximum time limit (50 minutes).
- name: aarch64
- name: armv7l
# Install and export newer gcc
before_install:
@@ -38,9 +94,11 @@ before_install:
sudo apt-get install -qq "${CXX}"
fi
script:
before_script:
# Suppress all compiler warnings for the hdf5 Makefile so the log can be
# viewed on the Travis log page without downloading the raw log.
# Travis finishes with an error when the log exceeds the 4 MB length limit.
- export H5_CFLAGS="-w"
script:
- make && make test
FROM multiarch/ubuntu-debootstrap:arm64-bionic
RUN uname -a
RUN apt-get update -qq && \
apt-get install -yq --no-install-suggests --no-install-recommends \
bzip2 \
ca-certificates \
gcc \
g++ \
make \
software-properties-common
RUN add-apt-repository -y universe && \
apt-get update -qq && \
apt-get install -yq libhdf5-dev
RUN find /usr/include -name "hdf5.h" || true
RUN find /usr/lib -name "libhdf5.a" || true
WORKDIR /nanopolish
COPY . .
CMD exec bash -c "make && make test"
@@ -12,6 +12,7 @@ LIBS = -lz
CXXFLAGS ?= -g -O3
CXXFLAGS += -std=c++11 -fopenmp -fsigned-char -D_FILE_OFFSET_BITS=64 # -D_FILE_OFFSET_BITS=64 makes nanopolish work on 32-bit systems
CFLAGS ?= -O3 -std=c99 -fsigned-char -D_FILE_OFFSET_BITS=64
LDFLAGS ?=
CXX ?= g++
CC ?= gcc
@@ -20,8 +21,7 @@ HDF5 ?= install
EIGEN ?= install
HTS ?= install
HDF5_VERSION ?= 1.8.14
HDF5_CONFIG_ARGS ?= --enable-threadsafe
HDF5_VERSION ?= 1.10.4
EIGEN_VERSION ?= 3.2.5
# Check operating system; OSX doesn't have -lrt
@@ -38,7 +38,7 @@ ifeq ($(HDF5), install)
else
# Use system-wide hdf5
H5_LIB =
H5_INCLUDE =
H5_INCLUDE ?=
LIBS += -lhdf5
endif
@@ -91,13 +91,13 @@ htslib/libhts.a:
#
lib/libhdf5.a:
if [ ! -e hdf5-$(HDF5_VERSION).tar.gz ]; then \
version_major_minior=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minior}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
version_major_minor=`echo "$(HDF5_VERSION)" | sed -E 's/\.[0-9]+$$//'`; \
wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-$${version_major_minor}/hdf5-$(HDF5_VERSION)/src/hdf5-$(HDF5_VERSION).tar.gz; \
fi
tar -xzf hdf5-$(HDF5_VERSION).tar.gz || exit 255
cd hdf5-$(HDF5_VERSION) && \
./configure --enable-threadsafe --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
./configure --enable-threadsafe --disable-hl --libdir=`pwd`/../lib --includedir=`pwd`/../include --prefix=`pwd`/.. && \
make && make install
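
The wget URL embeds the version twice: the full version names the tarball, while the major.minor prefix, derived by the sed line whose variable-name typo this hunk fixes, names the release directory. A minimal Python sketch of the same derivation (helper name hypothetical):

    import re

    def hdf5_src_url(version):
        # Strip the trailing patch component: "1.10.4" -> "1.10"
        major_minor = re.sub(r"\.[0-9]+$", "", version)
        return ("https://support.hdfgroup.org/ftp/HDF5/releases/"
                "hdf5-%s/hdf5-%s/src/hdf5-%s.tar.gz"
                % (major_minor, version, version))

    assert hdf5_src_url("1.10.4").endswith(
        "hdf5-1.10/hdf5-1.10.4/src/hdf5-1.10.4.tar.gz")
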
# Download and install eigen if not already downloaded
......
nanopolish (0.11.1-1) unstable; urgency=medium
* Afif Elghraoui removed himself from Uploaders
* Add myself to Uploaders
* Make nanopolish_makerange accessible in /usr/bin
* Asked upstream for Python3 port
https://github.com/jts/nanopolish/issues/626
* New upstream version
-- Andreas Tille <tille@debian.org> Wed, 10 Jul 2019 13:12:23 +0200
nanopolish (0.11.0-2) unstable; urgency=medium
* Team upload.
......
Source: nanopolish
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Andreas Tille <tille@debian.org>
Section: science
Priority: optional
Build-Depends: debhelper (>= 12~),
......
nanopolish usr/bin
nanopolish_test usr/lib/nanopolish
scripts/* usr/lib/nanopolish
debian/scripts/* usr/bin
#!/bin/sh
/usr/bin/python /usr/lib/nanopolish/nanopolish_makerange.py "$@"
@@ -15,6 +15,7 @@ nanopolish
quickstart_consensus
quickstart_eventalign
quickstart_call_methylation
quickstart_polya
debug
manual
......
@@ -11,6 +11,7 @@ Modules available: ::
nanopolish variants --consensus: calculate an improved consensus sequence for a draft genome assembly
nanopolish eventalign: align signal-level events to k-mers of a reference genome
nanopolish phase-reads: Phase reads using heterozygous SNVs with respect to a reference genome
nanopolish polya: Estimate polyadenylated tail lengths on native RNA reads
|
extract
@@ -486,3 +487,63 @@ Usage example
- print out a progress message
polya
--------------------
Overview
"""""""""""""""""""""""
Estimate the number of nucleotides in the poly(A) tails of native RNA reads.
Input
"""""""""""""""""""""""
* basecalled reads
* alignment information
* reference transcripts
Usage example
"""""""""""""""""""""""
::
nanopolish polya [OPTIONS] --reads=reads.fa --bam=alignments.bam --genome=ref.fa
.. list-table::
:widths: 20 10 20 50
:header-rows: 1
* - Argument name(s)
- Required
- Default value
- Description
* - ``-w, --window=STR``
- N
- NA
- Compute only for reads aligning to the reference window STR (format: ctg:start_id-end_id)
* - ``-r, --reads=FILE``
- Y
- NA
- the FAST(A/Q) file of native RNA reads
* - ``-b, --bam=FILE``
- Y
- NA
- the BAM file of alignments between reads and the reference
* - ``-g, --genome=FILE``
- Y
- NA
- the reference transcripts
* - ``-t, --threads=NUM``
- N
- 1
- use NUM threads
* - ``-v, -vv``
- N
- NA
- ``-v`` returns raw sample log-likelihoods, while ``-vv`` returns event durations
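
Downstream analyses usually keep only reads whose QC tag is ``PASS`` before aggregating tail lengths. A hypothetical Python sketch; the output file name and column names (``polya_length``, ``qc_tag``) are assumptions, not documented on this page::

    import csv
    import statistics

    with open("polya_results.tsv") as f:
        lengths = [float(row["polya_length"])
                   for row in csv.DictReader(f, delimiter="\t")
                   if row["qc_tag"] == "PASS"]

    print("median tail length: %.1f nt" % statistics.median(lengths))
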
@@ -6,17 +6,9 @@ import csv
import argparse
from collections import namedtuple
def make_key(c, s, e):
return c + ":" + str(s) + ":" + str(e)
def split_key(k):
f = k.split(":")
return (f[0], int(f[1]), int(f[2]))
class SiteStats:
def __init__(self, g_size, g_seq):
self.num_reads = 0
self.posterior_methylated = 0
self.called_sites = 0
self.called_sites_methylated = 0
self.group_size = g_size
@@ -60,7 +52,7 @@ for record in csv_reader:
# if this is a multi-cpg group and split_groups is set, break up these sites
if args.split_groups and num_sites > 1:
c = record['chromosome']
c = str(record['chromosome'])
s = int(record['start'])
e = int(record['end'])
@@ -69,20 +61,20 @@ for record in csv_reader:
cg_pos = sequence.find("CG")
first_cg_pos = cg_pos
while cg_pos != -1:
key = make_key(c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
key = (c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
update_call_stats(key, 1, is_methylated, "split-group")
cg_pos = sequence.find("CG", cg_pos + 1)
else:
key = make_key(record['chromosome'], record['start'], record['end'])
key = (str(record['chromosome']), int(record['start']), int(record['end']))
update_call_stats(key, num_sites, is_methylated, sequence)
# header
print("\t".join(["chromosome", "start", "end", "num_motifs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))
sorted_keys = sorted(sites.keys(), key = lambda x: split_key(x))
sorted_keys = sorted(sites.keys())
for key in sorted_keys:
if sites[key].called_sites > 0:
(c, s, e) = key.split(":")
(c, s, e) = key
f = float(sites[key].called_sites_methylated) / sites[key].called_sites
print("%s\t%s\t%s\t%d\t%d\t%d\t%.3f\t%s" % (c, s, e, sites[key].group_size, sites[key].called_sites, sites[key].called_sites_methylated, f, sites[key].sequence))
@@ -18,7 +18,7 @@
#include "logsum.h"
#define PACKAGE_NAME "nanopolish"
#define PACKAGE_VERSION "0.11.0"
#define PACKAGE_VERSION "0.11.1"
#define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
//
......
@@ -56,7 +56,7 @@ namespace opt
static std::ostream* os_p;
//
void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::map<std::string, std::string>& fast5_to_read_name_map)
void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::multimap<std::string, std::string>& fast5_to_read_name_map)
{
PROFILE_FUNC("index_file_from_map")
@@ -64,20 +64,16 @@ void index_file_from_map(ReadDB& read_db, const std::string& fn, const std::map<
size_t last_dir_pos = fn.find_last_of("/");
std::string fast5_basename = last_dir_pos == std::string::npos ? fn : fn.substr(last_dir_pos + 1);
const auto& iter = fast5_to_read_name_map.find(fast5_basename);
if(iter != fast5_to_read_name_map.end()) {
auto range = fast5_to_read_name_map.equal_range(fast5_basename);
for(auto iter = range.first; iter != range.second; ++iter) {
if(read_db.has_read(iter->second)) {
read_db.add_signal_path(iter->second.c_str(), fn);
}
} else {
if(opt::verbose > 0) {
fprintf(stderr, "Could not find read %s in sequencing summary file\n", fn.c_str());
}
}
}
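
For context on the map-to-multimap change: a std::map keyed by fast5 filename holds at most one read name per file, so fast5s containing several reads would have all but the last inserted read silently dropped, while equal_range over a std::multimap visits every read recorded for the file. A rough Python analogue of the new lookup, as a sketch with illustrative names:

    from collections import defaultdict

    # dict-of-lists stands in for std::multimap<std::string, std::string>
    fast5_to_read_names = defaultdict(list)

    def insert(fast5_filename, read_name):
        # out_map.insert(std::make_pair(...)) keeps every (file, read) pair
        fast5_to_read_names[fast5_filename].append(read_name)

    def reads_for(fast5_basename):
        # equal_range(...): iterate all reads stored under this filename
        return fast5_to_read_names.get(fast5_basename, [])
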
//
void index_file_from_fast5(ReadDB& read_db, const std::string& fn, const std::map<std::string, std::string>& fast5_to_read_name_map)
void index_file_from_fast5(ReadDB& read_db, const std::string& fn)
{
PROFILE_FUNC("index_file_from_fast5")
@@ -106,7 +102,7 @@ void index_file_from_fast5(ReadDB& read_db, const std::string& fn, const std::ma
}
//
void index_path(ReadDB& read_db, const std::string& path, const std::map<std::string, std::string>& fast5_to_read_name_map)
void index_path(ReadDB& read_db, const std::string& path, const std::multimap<std::string, std::string>& fast5_to_read_name_map)
{
fprintf(stderr, "[readdb] indexing %s\n", path.c_str());
if (is_directory(path)) {
@@ -124,7 +120,7 @@ void index_path(ReadDB& read_db, const std::string& path, const std::map<std::st
if(!fast5_to_read_name_map.empty()) {
index_file_from_map(read_db, full_fn, fast5_to_read_name_map);
} else {
index_file_from_fast5(read_db, full_fn, fast5_to_read_name_map);
index_file_from_fast5(read_db, full_fn);
}
}
}
@@ -160,7 +156,7 @@ void exit_bad_header(const std::string& str, const std::string& filename)
}
//
void parse_sequencing_summary(const std::string& filename, std::map<std::string, std::string>& out_map)
void parse_sequencing_summary(const std::string& filename, std::multimap<std::string, std::string>& out_map)
{
// open
std::ifstream in_file(filename.c_str());
@@ -203,7 +199,7 @@ void parse_sequencing_summary(const std::string& filename, std::map<std::string,
fields = split(line, '\t');
std::string fast5_filename = fields[filename_idx];
std::string read_name = fields[read_name_idx];
out_map[fast5_filename] = read_name;
out_map.insert(std::make_pair(fast5_filename, read_name));
}
}
@@ -279,7 +275,7 @@ int index_main(int argc, char** argv)
// Read a map from fast5 files to read name from the sequencing summary files (if any)
process_summary_fofn();
std::map<std::string, std::string> fast5_to_read_name_map;
std::multimap<std::string, std::string> fast5_to_read_name_map;
for(const auto& ss_filename : opt::sequencing_summary_files) {
if(opt::verbose > 2) {
fprintf(stderr, "summary: %s\n", ss_filename.c_str());
......
@@ -220,15 +220,21 @@ private:
// T -> T (100%)
{0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 1.00f}
};
// 50/50 chance of starting on L or S
float start_probs[HMM_NUM_STATES] = { 0.50f, 0.50f, 0.00f, 0.00f, 0.00f, 0.00f };
// All state sequences must start on S:
float start_probs[HMM_NUM_STATES] = { 1.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f };
// ----- emission parameters:
// emission parameters, from empirical MLE on manually-flagged reads:
// START, LEADER, and POLYA have Gaussian emissions;
// START has a mixture of Gaussian and Uniform emissions;
// LEADER and POLYA have Gaussian emissions;
// ADAPTER, TRANSCRIPT have Gaussian mixture emissions;
// CLIFF has Uniform emissions.
GaussianParameters s_emission = {70.2737f, 3.7743f};
float s_begin = 40.0f;
float s_end = 250.0f;
float s_prob = 0.00476f; // == {1. / (250.0f - 40.0f)}
float s_norm_coeff = 0.50f;
float s_unif_coeff = 0.50f;
GaussianParameters l_emission = {110.973f, 5.237f};
GaussianParameters a0_emission = {79.347f, 8.3702f};
GaussianParameters a1_emission = {63.3126f, 2.7464f};
@@ -265,7 +271,8 @@ private:
float log_probs;
if (state == HMM_START) {
// START state:
log_probs = log_normal_pdf(xx, this->s_emission);
float norm_term = s_norm_coeff * normal_pdf(xx, this->s_emission);
log_probs = std::log(norm_term + s_unif_coeff * s_prob);
}
if (state == HMM_LEADER) {
// LEADER state:
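
The START emission above is now a 50/50 mixture of a Gaussian and a Uniform over [40, 250] pA, whose density 1.0 / (250.0 - 40.0) matches s_prob = 0.00476. A minimal Python sketch of the density being computed:

    import math

    S_MU, S_SIGMA = 70.2737, 3.7743      # s_emission
    S_BEGIN, S_END = 40.0, 250.0         # support of the uniform component
    S_NORM_COEFF = S_UNIF_COEFF = 0.5    # mixture weights

    def start_log_emission(x):
        # log( 0.5 * N(x; mu, sigma) + 0.5 * Uniform(x; 40, 250) )
        gauss = (math.exp(-0.5 * ((x - S_MU) / S_SIGMA) ** 2)
                 / (S_SIGMA * math.sqrt(2.0 * math.pi)))
        unif = 1.0 / (S_END - S_BEGIN)   # == s_prob == 0.00476
        return math.log(S_NORM_COEFF * gauss + S_UNIF_COEFF * unif)
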
@@ -680,14 +687,13 @@ std::string pre_segmentation_qc(uint32_t suffix_clip, uint32_t prefix_clip, doub
// These tests indicate that something went wrong in the segmentation algorithm.
std::string post_segmentation_qc(const Segmentation& region_indices, const SquiggleRead& sr)
{
// fetch sizes of LEADER, ADAPTER, POLYA regions:
double num_leader_samples = region_indices.leader;
// fetch sizes of ADAPTER and POLYA regions:
double num_adapter_samples = (region_indices.adapter+1) - region_indices.leader;
double num_polya_samples = region_indices.polya - (region_indices.adapter+1);
// check for NOREGION:
std::string qc_tag;
if (num_leader_samples < 200.0 || num_adapter_samples < 200.0 || num_polya_samples < 200.0) {
if (num_adapter_samples < 200.0 || num_polya_samples < 200.0) {
qc_tag = "NOREGION";
} else {
qc_tag = "PASS";
@@ -733,6 +739,11 @@ void estimate_polya_for_single_read(const ReadDB& read_db,
int region_start,
int region_end)
{
//----- check if primary or secondary alignment by inspecting FLAG; skip read if secondary:
if (record->core.flag & BAM_FSECONDARY) {
return;
}
//----- load a squiggle read:
std::string read_name = bam_get_qname(record);
std::string ref_name(hdr->target_name[record->core.tid]);
......
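
BAM_FSECONDARY is FLAG bit 0x100; a secondary record is an alternate placement of a read that already has a primary alignment, so estimating a tail from it would count the read twice. The same filter as a pysam sketch (file name illustrative, assumes pysam is installed):

    import pysam

    BAM_FSECONDARY = 0x100  # the FLAG bit tested by the C++ check above

    with pysam.AlignmentFile("alignments.bam", "rb") as bam:
        for record in bam:
            if record.flag & BAM_FSECONDARY:  # or: record.is_secondary
                continue                      # skip secondary alignments
            ...  # estimate the poly(A) tail for this primary alignment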