Commit 354bf44a authored by Andreas Tille's avatar Andreas Tille

New upstream version 0.0+git20170813.e8a621e

parent f3a99ea5
### DeepNano: alternative basecaller for MinION reads
## DeepNano version for R9 and R9.4 chemistry can be found in R9 directory of this repository.
DeepNano is an alternative basecaller for Oxford Nanopore MinION reads
based on deep recurrent neural networks.
......@@ -50,6 +52,8 @@ For SQK-MAP-005 chemistry use:
`OMP_NUM_THREADS=1 python basecall.py --template_net nets_data/map5temp.npz --complement_net nets_data/map5comp.npz --big_net nets_data/map5-2d.npz <list of fast5 files>`
For R9 and R9.4 check r9 directory.
Advanced arguments:
=================
......
......@@ -185,7 +185,8 @@ def basecall(read_file_name, fo):
e += [1] + list(data["comp_events2"][comp_ind])
events_2d.append(e)
events_2d = np.array(events_2d, dtype=np.float32)
predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
if len(events_2d) >= 5:
predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
parser = argparse.ArgumentParser()
parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
......
### DeepNano: alternative basecaller for MinION reads - R9(.4) version
Requirements
================
We use Python 2.7.
Here are versions of Python packages, that we used:
- Cython==0.23.4
- numpy==1.10.2
- h5py==2.5.0
- python-dateutil==2.5.0
Basic usage:
================
`python basecall.py --chemistry <r9/r9.4> <list of fast5 files>`
Advanced arguments:
=================
- `-h` - prints help message
- `--output FILENAME` - output filename
- `--directory DIRECTORY` Directory where read files are stored
- `--cut-data` - sometimes read detection can produce huge regions with no bases and the basecaller gets
confused. This option is a workaround for that issue.
Basecaller will perform event extraction, if events are not present.
from rnnf import Rnn
from qrnnf import Rnn as Rnnq
import h5py
import argparse
import os
import datetime
import numpy as np
from extract_events import extract_events
def scale94(X):
    """Shift/scale raw event features to the r9.4 network's expected ranges.

    Column 0 (event mean) is linearly mapped so its 25th/75th percentiles
    land on the reference values -0.3/0.3, column 1 becomes the square of
    the rescaled mean, and column 2 (event stdv) is multiplied so its
    median matches the reference 0.6103758.  Column 3 (length) is copied
    unchanged.  Returns a new array; X itself is not modified.
    """
    ref_lo, ref_hi, ref_sd = -0.3, 0.3, 0.6103758
    q25 = np.percentile(X[:, 0], 25)
    q75 = np.percentile(X[:, 0], 75)
    sd_med = np.median(X[:, 2])
    out = np.array(X)
    gain = (ref_hi - ref_lo) / (q75 - q25)
    offset = ref_lo - q25 * gain
    out[:, 0] = X[:, 0] * gain + offset
    out[:, 1] = out[:, 0] ** 2
    out[:, 2] = X[:, 2] * (ref_sd / sd_med)
    return out
def scale(X):
    """Normalise raw event features for the r9 network.

    Maps the 25th/75th percentiles of the event means (column 0) onto the
    reference points 0.07499809/0.26622871, recomputes column 1 as the
    square of the rescaled mean, and rescales the stdv column so that its
    median equals 0.6103758.  The length column is left untouched and a
    new array is returned.
    """
    target_q25 = 0.07499809
    target_q75 = 0.26622871
    target_med_sd = 0.6103758
    lo = np.percentile(X[:, 0], 25)
    hi = np.percentile(X[:, 0], 75)
    factor = (target_q75 - target_q25) / (hi - lo)
    rescaled = np.array(X)
    rescaled[:, 0] = X[:, 0] * factor + (target_q25 - lo * factor)
    rescaled[:, 1] = rescaled[:, 0] * rescaled[:, 0]
    rescaled[:, 2] = X[:, 2] * (target_med_sd / np.median(X[:, 2]))
    return rescaled
def get_events(h5):
    """Return the event table for a read, extracting events if necessary.

    Looks for pre-computed events in the two basecaller groups that ONT
    software may have written; when neither exists (or --event-detect was
    given) falls back to raw-signal event detection via extract_events().
    Relies on the module-level `args` namespace.
    """
    if not args.event_detect:
        # h5py raises KeyError for a missing path; only that means "this
        # group is absent".  The original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit and real I/O errors.
        for path in ("Analyses/Basecall_RNN_1D_000/BaseCalled_template/Events",
                     "Analyses/Basecall_1D_000/BaseCalled_template/Events"):
            try:
                return h5[path]
            except KeyError:
                pass
    return extract_events(h5, args.chemistry)
def basecall(filename, output_file):
    # Basecall a single fast5 read and append the FASTA record to
    # output_file.  Returns the number of events processed, or 0 on any
    # failure or skip, so the caller can track throughput.
    try:
        h5 = h5py.File(filename, "r")
        events = get_events(h5)
        if events is None:
            print "No events in file %s" % filename
            h5.close()
            return 0
        if len(events) < 300:
            # Very short reads carry too little signal to call reliably.
            print "Read %s too short, not basecalling" % filename
            h5.close()
            return 0
        # Drop 50 events from each end (edge artifacts) and cap the total.
        events = events[50:-50][:args.max_events]
        mean = events["mean"]
        std = events["stdv"]
        length = events["length"]
        scale_f = scale if args.chemistry == 'r9' else scale94
        # Feature matrix: [mean, mean^2, stdv, length] per event.
        X = np.array(np.vstack([mean, mean*mean, std, length]).T, dtype=np.float32)
        if len(X) < 2500 or args.cut_data == False:
            X = scale_f(X)
            o1, o2 = ntwk.predict(X)
        else:
            # --cut-data: predict in 2500-event windows with stride 2000
            # (500 events of overlap), then trim 250 events from the inner
            # edges of each window so the stitched predictions line up
            # exactly, one prediction row per event.
            preds1 = []
            preds2 = []
            for i in range(0, len(X), 2000):
                o1, o2 = ntwk.predict(scale_f(X[i:i+2500]))
                if i > 0:
                    o1 = o1[250:]
                    o2 = o2[250:]
                if i + 2500 < len(X):
                    o1 = o1[:-250]
                    o2 = o2[:-250]
                preds1.append(o1)
                preds2.append(o2)
            o1 = np.vstack(preds1)
            o2 = np.vstack(preds2)
        # Two softmax heads per event: interleave their argmaxes
        # (head1, head2, head1, ...) and map indices to bases via `alph`,
        # dropping the "no base" symbol N.
        o1m = (np.argmax(o1, 1))
        o2m = (np.argmax(o2, 1))
        om = np.vstack((o1m,o2m)).reshape((-1,),order='F')
        output = "".join(map(lambda x: alph[x], om)).replace("N", "")
        print >>output_file, ">%s_template_deepnano" % filename
        print >>output_file, output
        output_file.flush()
        h5.close()
        return len(events)
    except Exception as e:
        # Best-effort batch processing: report and move on to the next read.
        print "Read %s failed with %s" % (filename, e)
        return 0
# --- command-line interface -------------------------------------------------
# Index -> base mapping for the network's softmax outputs; "N" marks
# "no base here" and is stripped from the final sequence.
alph = "ACGTN"
parser = argparse.ArgumentParser()
parser.add_argument('--chemistry', choices=['r9', 'r9.4'], default='r9.4')
parser.add_argument('--output', type=str, default="output.fasta")
parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
parser.add_argument('--watch', type=str, default='', help='Watched directory')
parser.add_argument('reads', type=str, nargs='*')
parser.add_argument('--debug', dest='debug', action='store_true')
parser.add_argument('--no-debug', dest='debug', action='store_false')
parser.add_argument('--event-detect', dest='event_detect', action='store_true')
parser.add_argument('--max-events', type=int, default=50000, help='Max amount of events to basecall')
parser.add_argument('--cut-data', dest='cut_data', action='store_true', help='Cut data into smaller chunks and basecall them separately. Helps in case of bad preprocessing.')
parser.set_defaults(debug=False)
parser.set_defaults(event_detect=False)
parser.set_defaults(cut_data=False)
args = parser.parse_args()
# Input validation: parser.error() exits with a usage message and, unlike
# the previous `assert`, is not stripped when Python runs with -O.
if not (args.reads or args.directory or args.watch):
    parser.error("Nothing to basecall")
# Pick the network matching the requested chemistry (r9 uses the plain RNN,
# r9.4 the GRU variant from qrnnf) and load its pickled weights.
ntwks = {"r9": os.path.join("networks", "r9.pkl"), "r9.4": os.path.join("networks", "r94.pkl")}
ntwk = Rnn() if args.chemistry == 'r9' else Rnnq()
ntwk.load(ntwks[args.chemistry])
if len(args.reads) or len(args.directory) != 0:
fo = open(args.output, "w")
files = args.reads
if len(args.directory):
files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]
total_events = 0
start_time = datetime.datetime.now()
for i, read in enumerate(files):
current_events = basecall(read, fo)
if args.debug:
total_events += current_events
time_diff = (datetime.datetime.now() - start_time).total_seconds() + 0.000001
print "Basecalled %d events in %f (%f ev/s)" % (total_events, time_diff, total_events / time_diff)
fo.close()
if len(args.watch) != 0:
try:
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
except:
print "Please install watchdog to watch directories"
sys.exit()
class Fast5Handler(PatternMatchingEventHandler):
"""Class for handling creation fo fast5-files"""
patterns = ["*.fast5"]
def on_created(self, event):
print "Calling", event
file_name = str(os.path.basename(event.src_path))
fasta_file_name = os.path.splitext(event.src_path)[0] + '.fasta'
with open(fasta_file_name, "w") as fo:
basecall(event.src_path, fo)
print('Watch dir: ' + args.watch)
observer = Observer()
print('Starting Observerer')
# start watching directory for fast5-files
observer.start()
observer.schedule(Fast5Handler(), path=args.watch)
try:
while True:
time.sleep(1)
# quit script using ctrl+c
except KeyboardInterrupt:
observer.stop()
observer.join()
import numpy as np
import sys
import datetime
# Per-chemistry event-detection parameters:
#   window_lengths - short/long t-statistic window sizes, in raw samples
#   thresholds     - acceptance thresholds for the short/long t-stats
#   peak_height    - minimum rise of a t-stat peak above nearby minima
defs = {
    'r9.4': {
        'ed_params': {
            'window_lengths':[3, 6], 'thresholds':[1.4, 1.1],
            'peak_height':0.2
        }
    },
    'r9': {
        'ed_params': {
            'window_lengths':[5, 10], 'thresholds':[2.0, 1.1],
            'peak_height':1.2
        }
    }
}
def get_raw(h5):
    """Return (signal_in_pA, sampling_rate) for the single read in an open fast5.

    The raw ADC samples are converted to picoamps using the channel's
    offset and range/digitisation attributes.
    """
    # list(...) works both where .keys() returns a list (Python 2) and a
    # view object (Python 3 / newer h5py); bare .keys()[0] breaks on the
    # latter.
    read_key = list(h5["Raw/Reads"].keys())[0]
    raw = h5["Raw/Reads"][read_key]["Signal"]
    meta = h5["UniqueGlobalKey/channel_id"].attrs
    offset = meta["offset"]
    raw_unit = meta['range'] / meta['digitisation']
    # Convert ADC counts to current in pA.
    raw = (raw + offset) * raw_unit
    sl = meta["sampling_rate"]
    return raw, sl
def find_stall(events, threshold):
    """Locate the end of the initial stall region in an event stream.

    Within the first 100 events, finds the start of the first run of two
    consecutive events whose mean exceeds `threshold`; from there scans
    forward for the first run of three consecutive events at or below the
    threshold and returns the index where that run begins.  Returns 0 when
    no such run is found.
    """
    run_start = 0
    above = 0
    for pos, ev in enumerate(events[:100]):
        above = above + 1 if ev['mean'] > threshold else 0
        if above == 2:
            run_start = pos - 1
            break
    below = 0
    for pos in range(run_start, len(events)):
        below = below + 1 if events['mean'][pos] <= threshold else 0
        if below == 3:
            return pos - 2
    return 0
def get_tstat(s, s2, wl):
    """Sliding-window t-statistic computed from prefix sums.

    `s` and `s2` are prefix sums of the signal and of its squares; `wl` is
    the window length.  Returns an array aligned with the signal (zero
    padded at both ends) whose value at position i compares the two
    adjacent `wl`-sample windows around i.
    """
    EPS = 1e-100  # floor for variances, avoids division by zero
    win_sum = np.concatenate([[s[wl - 1]], s[wl:] - s[:-wl]])
    win_sumsq = np.concatenate([[s2[wl - 1]], s2[wl:] - s2[:-wl]])
    win_mean = win_sum / wl
    win_var = np.maximum(win_sumsq / wl - win_mean ** 2, EPS)
    mean_diff = win_mean[wl:] - win_mean[:-wl]
    pooled = (win_var[wl:] + win_var[:-wl]) / wl
    tstat = np.abs(mean_diff / np.sqrt(pooled))
    return np.concatenate([np.zeros(wl), tstat, np.zeros(wl - 1)])
def extract_events(h5, chem):
    """Run event detection on the raw signal of an open fast5 file.

    `chem` selects the chemistry-specific detection parameters from `defs`.
    Returns the detected events with the initial stall region trimmed off.
    """
    print "ed"
    raw, sl = get_raw(h5)
    events = event_detect(raw, sl, **defs[chem]["ed_params"])
    # Robust location/spread estimate from the last 100 events, which are
    # assumed to be past any initial stall.
    med, mad = med_mad(events['mean'][-100:])
    # NOTE(review): `med + 1.48 * 2 + mad` adds the constant 2.96 regardless
    # of signal spread; the usual robust 2-sigma cut would be
    # `med + 1.48 * 2 * mad` (1.48*MAD ~ one std).  Confirm intent before
    # changing -- kept as-is here.
    max_thresh = med + 1.48 * 2 + mad
    first_event = find_stall(events, max_thresh)
    return events[first_event:]
def med_mad(data):
    """Return the median and the median absolute deviation of `data`."""
    center = np.median(data)
    spread = np.median(abs(data - center))
    return center, spread
def compute_prefix_sums(data):
    """Return cumulative sums of `data` and of its element-wise squares."""
    return np.cumsum(data), np.cumsum(data * data)
def peak_detect(short_data, long_data, short_window, long_window, short_threshold, long_threshold,
                peak_height):
    # Two-scale peak detector over the short- and long-window t-statistics.
    # Each scale runs a small state machine: while "disarmed"
    # (peak_pos == NO_PEAK_POS) it tracks a running minimum and arms once
    # the signal rises peak_height above it; while armed it tracks the
    # running maximum and emits that maximum as a peak once the signal has
    # dropped peak_height below it, the maximum cleared the threshold, and
    # at least half a window has passed.  A strong short-scale peak masks
    # the long detector for short_window samples so both scales do not
    # report the same boundary.  Returns the list of peak positions.
    long_mask = 0
    NO_PEAK_POS = -1
    NO_PEAK_VAL = 1e100
    peaks = []
    short_peak_pos = NO_PEAK_POS
    short_peak_val = NO_PEAK_VAL
    short_found_peak = False
    long_peak_pos = NO_PEAK_POS
    long_peak_val = NO_PEAK_VAL
    long_found_peak = False
    for i in range(len(short_data)):
        val = short_data[i]
        if short_peak_pos == NO_PEAK_POS:
            # Disarmed: follow the running minimum; arm once the signal has
            # climbed peak_height above it.
            if val < short_peak_val:
                short_peak_val = val
            elif val - short_peak_val > peak_height:
                short_peak_val = val
                short_peak_pos = i
        else:
            # Armed: follow the running maximum of the candidate peak.
            if val > short_peak_val:
                short_peak_pos = i
                short_peak_val = val
                if short_peak_val > short_threshold:
                    # Strong short peak: suppress the long detector nearby
                    # and reset its state.
                    long_mask = short_peak_pos + short_window
                    long_peak_pos = NO_PEAK_POS
                    long_peak_val = NO_PEAK_VAL
                    long_found_peak = False
            if short_peak_val - val > peak_height and short_peak_val > short_threshold:
                short_found_peak = True
            if short_found_peak and (i - short_peak_pos) > short_window / 2:
                # Clearly past the maximum: record it and re-disarm.
                peaks.append(short_peak_pos)
                short_peak_pos = NO_PEAK_POS
                short_peak_val = val
                short_found_peak = False
        if i <= long_mask:
            continue
        # Same state machine at the long scale.
        val = long_data[i]
        if long_peak_pos == NO_PEAK_POS:
            if val < long_peak_val:
                long_peak_val = val
            elif val - long_peak_val > peak_height:
                long_peak_val = val
                long_peak_pos = i
        else:
            if val > long_peak_val:
                long_peak_pos = i
                long_peak_val = val
            if long_peak_val - val > peak_height and long_peak_val > long_threshold:
                long_found_peak = True
            if long_found_peak and (i - long_peak_pos) > long_window / 2:
                peaks.append(long_peak_pos)
                long_peak_pos = NO_PEAK_POS
                long_peak_val = val
                long_found_peak = False
    return peaks
def generate_events(ss1, ss2, peaks, sample_rate):
    """Turn peak positions into an event table.

    `ss1`/`ss2` are prefix sums of the raw signal and of its squares;
    `peaks` holds the boundary sample indices between events.  Each event
    gets its start time and length (converted to seconds) plus the mean
    and standard deviation of its samples.  Unlike the previous version,
    the caller's `peaks` list is no longer mutated.
    """
    # Work on a copy extended with the final boundary so the last event
    # runs to the end of the signal.
    boundaries = list(peaks) + [len(ss1)]
    events = np.empty(len(boundaries), dtype=[('start', float), ('length', float),
                                              ('mean', float), ('stdv', float)])
    s = 0
    s1 = 0
    s2 = 0
    for i, e in enumerate(boundaries):
        events[i]["start"] = s
        l = e - s
        events[i]["length"] = l
        # Mean/variance from prefix-sum differences over [s, e).
        m = (ss1[e - 1] - s1) / l
        events[i]["mean"] = m
        # Clamp at 0 to guard against tiny negative values from rounding.
        v = max(0.0, (ss2[e - 1] - s2) / l - m * m)
        events[i]["stdv"] = np.sqrt(v)
        s = e
        s1 = ss1[e - 1]
        s2 = ss2[e - 1]
    # Convert sample units to seconds.
    events["start"] /= sample_rate
    events["length"] /= sample_rate
    return events
def event_detect(raw_data, sample_rate, window_lengths=[16, 40], thresholds=[8.0, 4.0], peak_height = 1.0):
    """Basic, standard event detection using two t-tests.

    :param raw_data: ADC values
    :param sample_rate: sampling rate of the data in Hz
    :param window_lengths: length-2 list of window lengths across the raw
        data from which the t-statistics are derived
    :param thresholds: length-2 list of thresholds on the t-statistics
    :param peak_height: absolute height a peak in the t-statistic must rise
        above the surrounding minima to be considered relevant
    """
    prefix, prefix_sq = compute_prefix_sums(raw_data)
    short_stat, long_stat = [get_tstat(prefix, prefix_sq, w) for w in window_lengths]
    boundaries = peak_detect(short_stat, long_stat, window_lengths[0], window_lengths[1],
                             thresholds[0], thresholds[1], peak_height)
    return generate_events(prefix, prefix_sq, boundaries, sample_rate)
This diff is collapsed.
This diff is collapsed.
import numpy as np
import pickle
def sigmoid(x):
    """Element-wise logistic function, 1 / (1 + e^-x)."""
    z = np.exp(-x)
    return 1.0 / (1.0 + z)
class OutLayer:
    """Softmax output layer; params = [weight matrix, bias vector]."""

    def __init__(self):
        self.n_params = 2
        self.params = [None, None]

    def calc(self, input):
        """Return the row-wise softmax of input . W + b."""
        logits = np.dot(input, self.params[0]) + self.params[1]
        # Subtract the row max before exponentiating for numerical stability.
        shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
        return shifted / shifted.sum(axis=1, keepdims=True)
class SimpleLayer:
    """Convolution-gated recurrent layer (rnnf variant).

    params: [candidate conv weights, candidate bias, forget-gate conv
    weights, forget-gate bias, initial state].
    """

    def __init__(self):
        self.n_params = 5
        self.params = [None for i in range(5)]

    def conv(self, input, w, b, width=7):
        """1-D convolution of `input` (time x features) with kernel `w`.

        `w` has shape (out, in, width, 1); `b` is the output bias.  Border
        positions are implicitly zero padded by only accumulating taps that
        fall inside the sequence.
        """
        output = np.zeros((input.shape[0], w.shape[0]))
        # Floor division: plain `width / 2` yields a float on Python 3 and
        # breaks the slice arithmetic below (same value on Python 2).
        mid = width // 2
        for i in range(width):
            wx = w[:, :, i, 0].T
            oo = np.dot(input, wx)
            if i < mid:
                # Tap left of centre: shift contributions back in time.
                output[:-(mid - i)] += oo[mid - i:]
            elif i == mid:
                output += oo
            else:
                # Tap right of centre: shift contributions forward in time.
                output[i - mid:] += oo[:-(i - mid)]
        output += b
        return output

    def calc(self, input):
        """Run the layer over `input`; returns the per-step state sequence."""
        state = self.params[4]
        c1 = np.tanh(self.conv(input, self.params[0], self.params[1]))
        f1 = sigmoid(self.conv(input, self.params[2], self.params[3]))
        output = np.zeros((len(input), self.params[4].shape[0]), dtype=np.float32)
        for i in range(len(input)):
            # Leaky update: the gate interpolates old state and candidate.
            state = f1[i] * state + (1 - f1[i]) * c1[i]
            output[i] = state
        return np.array(output)
class BiSimpleLayer:
    """Bidirectional wrapper: concatenates a forward pass with a reversed
    backward pass of two independent SimpleLayers."""

    def __init__(self):
        self.fwd = SimpleLayer()
        self.bwd = SimpleLayer()

    def calc(self, input):
        forward = self.fwd.calc(input)
        backward = self.bwd.calc(input[::-1])[::-1]
        return np.concatenate([forward, backward], axis=1)
class Rnn:
    """Four-layer bidirectional RNN with two softmax output heads (rnnf).

    Layers are created and populated by load() from a flat pickle stream
    rather than in __init__.
    """

    def __init__(self):
        pass

    def _features(self, input):
        # Shared forward pass through all four bidirectional layers.
        x = input
        for layer in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = layer.calc(x)
        return x

    def tester(self, input):
        """Like predict() but takes and returns singleton-batched lists."""
        features = self._features(input[0])
        return [self.output1.calc(features)], [self.output2.calc(features)]

    def predict(self, input):
        """Return the two per-position softmax distributions for `input`."""
        features = self._features(input)
        return self.output1.calc(features), self.output2.calc(features)

    def debug(self, input):
        """Return the activations of the first three layers for inspection."""
        l1 = self.layer1.calc(input)
        l2 = self.layer2.calc(l1)
        l3 = self.layer3.calc(l2)
        return l1, l2, l3

    def load(self, fn):
        """Load pickled weights from `fn`.

        Stream order: for each of the four layers, 5 forward then 5
        backward parameters; then 2 parameters per output head.
        """
        def read(f, holder, count):
            # Parameters are stored sequentially in the pickle stream.
            for j in range(count):
                holder.params[j] = pickle.load(f)

        with open(fn, "rb") as f:
            for name in ("layer1", "layer2", "layer3", "layer4"):
                layer = BiSimpleLayer()
                read(f, layer.fwd, 5)
                read(f, layer.bwd, 5)
                setattr(self, name, layer)
            self.output1 = OutLayer()
            self.output2 = OutLayer()
            read(f, self.output1, 2)
            read(f, self.output2, 2)
import numpy as np
import pickle
def sigmoid(x):
    """Logistic function: sigma(x) = 1 / (1 + exp(-x)), element-wise."""
    return np.reciprocal(np.exp(-x) + 1.0)
class OutLayer:
    """Affine transform followed by a numerically-stable softmax."""

    def __init__(self):
        self.n_params = 2
        self.params = [None, None]

    def calc(self, input):
        """Softmax(input . W + b), computed row by row."""
        z = np.dot(input, self.params[0]) + self.params[1]
        z = z - z.max(axis=1, keepdims=True)  # guard against exp overflow
        expz = np.exp(z)
        return expz / expz.sum(axis=1, keepdims=True)
class SimpleLayer:
    """GRU-style recurrent layer driven by a flat 10-element params list.

    Index map: 0/1/2 candidate (input weights, state weights, bias);
    3/5/7 reset gate (input weights, state weights, bias); 4/6/8 update
    gate (input weights, state weights, bias); 9 initial hidden state.
    """

    def __init__(self):
        self.n_params = 10
        self.params = [None for i in range(10)]

    def calc(self, input):
        """Run the GRU over `input`; returns the per-step hidden states."""
        (w_new_in, w_new_state, b_new,
         w_reset_in, w_update_in,
         w_reset_state, w_update_state,
         b_reset, b_update, init_state) = self.params
        hidden = init_state
        out = np.zeros((len(input), b_new.shape[0]), dtype=np.float32)
        for t in range(len(input)):
            x = input[t]
            z = sigmoid(np.dot(hidden, w_update_state) + np.dot(x, w_update_in) + b_update)
            r = sigmoid(np.dot(hidden, w_reset_state) + np.dot(x, w_reset_in) + b_reset)
            cand = np.tanh(np.dot(x, w_new_in) + r * np.dot(hidden, w_new_state) + b_new)
            # Gated interpolation between previous state and candidate.
            hidden = z * hidden + (1 - z) * cand
            out[t] = hidden
        return np.array(out)
class BiSimpleLayer:
    """Runs one SimpleLayer left-to-right and a second one right-to-left
    over the input, stacking their outputs feature-wise."""

    def __init__(self):
        self.fwd = SimpleLayer()
        self.bwd = SimpleLayer()

    def calc(self, input):
        rev = input[::-1]
        parts = [self.fwd.calc(input), self.bwd.calc(rev)[::-1]]
        return np.concatenate(parts, axis=1)
class Rnn:
    """Three-layer bidirectional GRU network with two softmax heads (qrnnf)."""

    def __init__(self):
        pass

    def predict(self, input):
        """Map `input` through all layers; return both head distributions."""
        hidden = input
        for lay in (self.layer1, self.layer2, self.layer3):
            hidden = lay.calc(hidden)
        return self.output1.calc(hidden), self.output2.calc(hidden)

    def debug(self, input):
        """Return each layer's activations for inspection."""
        a1 = self.layer1.calc(input)
        a2 = self.layer2.calc(a1)
        a3 = self.layer3.calc(a2)
        return a1, a2, a3

    def load(self, fn):
        """Restore weights from the flat pickle stream `fn`.

        Stream order: for each of the three layers, 10 forward then 10
        backward parameters; then 2 parameters for each output head.
        """
        with open(fn, "rb") as f:
            loaded = []
            for _ in range(3):
                lay = BiSimpleLayer()
                # List comprehensions evaluate left-to-right, preserving
                # the sequential pickle order.
                lay.fwd.params = [pickle.load(f) for _ in range(10)]
                lay.bwd.params = [pickle.load(f) for _ in range(10)]
                loaded.append(lay)
            self.layer1, self.layer2, self.layer3 = loaded
            self.output1 = OutLayer()
            self.output2 = OutLayer()
            self.output1.params = [pickle.load(f) for _ in range(2)]
            self.output2.params = [pickle.load(f) for _ in range(2)]
You need to have theano and sklearn installed for training.
Input files should have following format:
<reference sequence>
<mean of first event> <std of first event> <length of first event>
<mean of second event> <std of second event> <length of second event>
...
See examples.
Get alignment of events:
python realign.py ../networks/r94.pkl <list of files>
For each file with <name> it creates a file <name>r, so for file 1.txt it will create 1.txtr.
For training:
python train.py <optionally name of starting network like ../networks/r94.pkl> <list of files>
Remember to put files ending with r into training.
After each epoch the training will output:
<number of epoch> <current training log likelihood> <accuracy of first softmax> <accuracy of second
softmax> <approximate overall sequence identity> <time>
<distribution of outputs from first softmax>
<distribution of outputs from second softmax>
<confusion matrix for first softmax>
<confusion matrix for second softmax>
In a folder named with the timestamp of the training start there will be a file named latest.pkl, and
every 20 epochs a snapshot of the current network will be saved.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
import theano as th
import theano.tensor as T
from theano.tensor.nnet import sigmoid
from theano.tensor.nnet import conv2d
import numpy as np
from theano_toolkit import updates
import pickle
hidden_size = 128
def orthonormal_wts(n, m):
    """Return an (n, m) slice of a random orthonormal matrix.

    The left singular vectors of a random square Gaussian matrix form an
    orthonormal basis; the result is cast to Theano's configured float
    type.
    """
    dim = max(n, m)
    basis = np.linalg.svd(np.random.randn(dim, dim))[0]
    return basis.astype(th.config.floatX)[:n, :m]
def init_wts(*argv):