### DeepNano: alternative basecaller for MinION reads
## A DeepNano version for the R9 and R9.4 chemistries can be found in the r9 directory of this repository.
DeepNano is an alternative basecaller for Oxford Nanopore MinION reads,
based on deep recurrent neural networks.
@@ -50,6 +52,8 @@ For SQK-MAP-005 chemistry use:
`OMP_NUM_THREADS=1 python basecall.py --template_net nets_data/map5temp.npz --complement_net nets_data/map5comp.npz --big_net nets_data/map5-2d.npz <list of fast5 files>`
For R9 and R9.4, see the r9 directory.
Advanced arguments:
=================
@@ -185,7 +185,8 @@ def basecall(read_file_name, fo):
e += [1] + list(data["comp_events2"][comp_ind])
events_2d.append(e)
events_2d = np.array(events_2d, dtype=np.float32)
-    predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
+    if len(events_2d) >= 5:
+      predict_and_write(events_2d, big_net, fo, "%s_2d_rnn" % basename)
parser = argparse.ArgumentParser()
parser.add_argument('--template_net', type=str, default="nets_data/map6temp.npz")
-deepnano (0.0+20160706-2) UNRELEASED; urgency=medium
+deepnano (0.0+git20170813.e8a621e-1) unstable; urgency=medium
+  [ Steffen Moeller ]
   * Added debian/upstream/metadata, paper and registries referenced
- -- Steffen Moeller <moeller@debian.org> Sat, 18 Nov 2017 23:01:06 +0100
+  [ Andreas Tille ]
+  * New upstream commit
+  * d/watch: Parse upstream Git
+  * Standards-Version: 4.1.4
+  * Point Vcs-fields to Salsa
+  * debhelper 11
+  * Build-Depends: python-theano to enable migration to testing
+ -- Andreas Tille <tille@debian.org> Fri, 27 Apr 2018 09:27:06 +0200
deepnano (0.0+20160706-1) unstable; urgency=medium
@@ -4,13 +4,14 @@ Uploaders: Çağrı ULAŞ <cagriulas@gmail.com>,
Andreas Tille <tille@debian.org>
Section: science
Priority: optional
-Build-Depends: debhelper (>= 9),
+Build-Depends: debhelper (>= 11~),
python-all,
dh-python,
-               python-markdown
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/deepnano.git
-Vcs-Git: https://anonscm.debian.org/git/debian-med/deepnano.git
+               python-markdown,
+               python-theano
+Standards-Version: 4.1.4
+Vcs-Browser: https://salsa.debian.org/med-team/deepnano
+Vcs-Git: https://salsa.debian.org/med-team/deepnano.git
Homepage: https://bitbucket.org/vboza/deepnano
Package: deepnano
@@ -33,7 +34,7 @@ Description: alternative basecaller for MinION reads of genomic sequences
Package: deepnano-data
Architecture: all
-Depends: ${misc:Depends},
+Depends: ${misc:Depends}
Suggests: deepnano
Description: alternative basecaller for MinION reads of genomic sequences (data)
 DeepNano is an alternative basecaller for Oxford Nanopore MinION reads
+version=4
+opts="mode=git,pretty=0.0+git%cd.%h" \
+https://bitbucket.org/vboza/deepnano.git HEAD
-# version=3
-# upstream has not proper versioning
### DeepNano: alternative basecaller for MinION reads - R9(.4) version
Requirements
================
We use Python 2.7.
These are the versions of the Python packages that we used:
- Cython==0.23.4
- numpy==1.10.2
- h5py==2.5.0
- python-dateutil==2.5.0
Basic usage:
================
`python basecall.py --chemistry <r9/r9.4> <list of fast5 files>`
Advanced arguments:
=================
- `-h` - prints the help message
- `--output FILENAME` - output filename
- `--directory DIRECTORY` - directory where the read files are stored
- `--cut-data` - sometimes read detection produces huge regions with no bases and the basecaller gets
confused; this option works around that issue.

The basecaller will perform event extraction if events are not present.
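For example, to basecall every read in a directory with the r9.4 network (a hypothetical `reads/` directory and output name; the flags are the ones defined by basecall.py below):

`python basecall.py --chemistry r9.4 --directory reads/ --output reads.fasta`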
from rnnf import Rnn
from qrnnf import Rnn as Rnnq
import h5py
import argparse
import os
# sys and time are needed by the --watch mode below
import sys
import time
import datetime
import numpy as np
from extract_events import extract_events
def scale94(X):
  # Robust normalisation for r9.4 reads: map the 25th/75th percentiles of
  # the event means and the median stdv onto the values the network expects.
  m25 = np.percentile(X[:,0], 25)
  m75 = np.percentile(X[:,0], 75)
  s50 = np.median(X[:,2])
  me25 = -0.3
  me75 = 0.3
  se50 = 0.6103758
ret = np.array(X)
scale = (me75 - me25) / (m75 - m25)
m25 *= scale
shift = me25 - m25
ret[:,0] = X[:,0] * scale + shift
ret[:,1] = ret[:,0]**2
sscale = se50 / s50
ret[:,2] = X[:,2] * sscale
return ret
def scale(X):
m25 = np.percentile(X[:,0], 25)
m75 = np.percentile(X[:,0], 75)
s50 = np.median(X[:,2])
me25 = 0.07499809
me75 = 0.26622871
se50 = 0.6103758
ret = np.array(X)
scale = (me75 - me25) / (m75 - m25)
m25 *= scale
shift = me25 - m25
ret[:,0] = X[:,0] * scale + shift
ret[:,1] = ret[:,0]**2
sscale = se50 / s50
ret[:,2] = X[:,2] * sscale
return ret
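# A quick sanity check (hypothetical data): scale() linearly maps the 25th
# and 75th percentiles of the mean column onto the targets above, e.g.
#   X = np.random.randn(1000, 4).astype(np.float32)
#   np.percentile(scale(X)[:,0], 25)  # ~0.07499809
#   np.percentile(scale(X)[:,0], 75)  # ~0.26622871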
def get_events(h5):
  if not args.event_detect:
    try:
      return h5["Analyses/Basecall_RNN_1D_000/BaseCalled_template/Events"]
    except KeyError:
      pass
    try:
      return h5["Analyses/Basecall_1D_000/BaseCalled_template/Events"]
    except KeyError:
      pass
  return extract_events(h5, args.chemistry)
def basecall(filename, output_file):
try:
h5 = h5py.File(filename, "r")
events = get_events(h5)
if events is None:
print "No events in file %s" % filename
h5.close()
return 0
if len(events) < 300:
print "Read %s too short, not basecalling" % filename
h5.close()
return 0
events = events[50:-50][:args.max_events]
mean = events["mean"]
std = events["stdv"]
length = events["length"]
scale_f = scale if args.chemistry == 'r9' else scale94
X = np.array(np.vstack([mean, mean*mean, std, length]).T, dtype=np.float32)
    if len(X) < 2500 or not args.cut_data:
      X = scale_f(X)
      o1, o2 = ntwk.predict(X)
    else:
      # Predict overlapping chunks of 2500 events with stride 2000 and trim
      # 250 events from each interior edge, so the trimmed chunks tile the
      # read without seams: [0, 2250), [2250, 4250), ...
      preds1 = []
      preds2 = []
      for i in range(0, len(X), 2000):
        o1, o2 = ntwk.predict(scale_f(X[i:i+2500]))
        if i > 0:
          o1 = o1[250:]
          o2 = o2[250:]
        if i + 2500 < len(X):
          o1 = o1[:-250]
          o2 = o2[:-250]
        preds1.append(o1)
        preds2.append(o2)
      o1 = np.vstack(preds1)
      o2 = np.vstack(preds2)
    o1m = np.argmax(o1, 1)
    o2m = np.argmax(o2, 1)
    # interleave the two softmax streams (column-major order) and drop 'N's
    om = np.vstack((o1m, o2m)).reshape((-1,), order='F')
    output = "".join(map(lambda x: alph[x], om)).replace("N", "")
print >>output_file, ">%s_template_deepnano" % filename
print >>output_file, output
output_file.flush()
h5.close()
return len(events)
except Exception as e:
print "Read %s failed with %s" % (filename, e)
return 0
alph = "ACGTN"
parser = argparse.ArgumentParser()
parser.add_argument('--chemistry', choices=['r9', 'r9.4'], default='r9.4')
parser.add_argument('--output', type=str, default="output.fasta")
parser.add_argument('--directory', type=str, default='', help="Directory where read files are stored")
parser.add_argument('--watch', type=str, default='', help='Watched directory')
parser.add_argument('reads', type=str, nargs='*')
parser.add_argument('--debug', dest='debug', action='store_true')
parser.add_argument('--no-debug', dest='debug', action='store_false')
parser.add_argument('--event-detect', dest='event_detect', action='store_true')
parser.add_argument('--max-events', type=int, default=50000, help='Max amount of events to basecall')
parser.add_argument('--cut-data', dest='cut_data', action='store_true', help='Cut data into smaller chunks and basecall them separately. Helps in case of bad preprocessing.')
parser.set_defaults(debug=False)
parser.set_defaults(event_detect=False)
parser.set_defaults(cut_data=False)
args = parser.parse_args()
assert len(args.reads) != 0 or len(args.directory) != 0 or len(args.watch) != 0, "Nothing to basecall"
ntwks = {"r9": os.path.join("networks", "r9.pkl"), "r9.4": os.path.join("networks", "r94.pkl")}
ntwk = Rnn() if args.chemistry == 'r9' else Rnnq()
ntwk.load(ntwks[args.chemistry])
if len(args.reads) != 0 or len(args.directory) != 0:
fo = open(args.output, "w")
files = args.reads
if len(args.directory):
files += [os.path.join(args.directory, x) for x in os.listdir(args.directory)]
total_events = 0
start_time = datetime.datetime.now()
for i, read in enumerate(files):
current_events = basecall(read, fo)
if args.debug:
total_events += current_events
time_diff = (datetime.datetime.now() - start_time).total_seconds() + 0.000001
print "Basecalled %d events in %f (%f ev/s)" % (total_events, time_diff, total_events / time_diff)
fo.close()
if len(args.watch) != 0:
try:
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
  except ImportError:
    print "Please install watchdog to watch directories"
    sys.exit()
  class Fast5Handler(PatternMatchingEventHandler):
    """Class for handling creation of fast5 files"""
    patterns = ["*.fast5"]
    def on_created(self, event):
      print "Calling", event
      fasta_file_name = os.path.splitext(event.src_path)[0] + '.fasta'
      with open(fasta_file_name, "w") as fo:
        basecall(event.src_path, fo)
  print('Watch dir: ' + args.watch)
  observer = Observer()
  print('Starting Observer')
  # watch the directory for newly created fast5 files
  observer.schedule(Fast5Handler(), path=args.watch)
  observer.start()
try:
while True:
time.sleep(1)
# quit script using ctrl+c
except KeyboardInterrupt:
observer.stop()
observer.join()
import numpy as np
import sys
import datetime
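# Per-chemistry event-detection parameters: the two t-test window lengths,
# their thresholds, and the minimal peak height (see event_detect below).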
defs = {
'r9.4': {
'ed_params': {
'window_lengths':[3, 6], 'thresholds':[1.4, 1.1],
'peak_height':0.2
}
},
'r9': {
'ed_params': {
'window_lengths':[5, 10], 'thresholds':[2.0, 1.1],
'peak_height':1.2
}
}
}
def get_raw(h5):
rk = h5["Raw/Reads"].keys()[0]
raw = h5["Raw/Reads"][rk]["Signal"]
meta = h5["UniqueGlobalKey/channel_id"].attrs
offset = meta["offset"]
raw_unit = meta['range'] / meta['digitisation']
raw = (raw + offset) * raw_unit
sl = meta["sampling_rate"]
return raw, sl
def find_stall(events, threshold):
count_above = 0
start_ev_ind = 0
for ev_ind, event in enumerate(events[:100]):
if event['mean'] <= threshold:
count_above = 0
else:
count_above += 1
if count_above == 2:
start_ev_ind = ev_ind - 1
break
new_start = 0
count = 0
for idx in range(start_ev_ind, len(events)):
if events['mean'][idx] > threshold:
count = 0
else:
count += 1
if count == 3:
new_start = idx - 2
break
return new_start
def get_tstat(s, s2, wl):
eta = 1e-100
windows1 = np.concatenate([[s[wl-1]], s[wl:] - s[:-wl]])
windows2 = np.concatenate([[s2[wl-1]], s2[wl:] - s2[:-wl]])
means = windows1 / wl
variances = windows2 / wl - means * means
variances = np.maximum(variances, eta)
delta = means[wl:] - means[:-wl]
deltav = (variances[wl:] + variances[:-wl]) / wl
return np.concatenate([np.zeros(wl), np.abs(delta / np.sqrt(deltav)), np.zeros(wl-1)])
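# For a boundary index i >= wl, t[i] above is a Welch-style t-statistic
# between the two adjacent windows (hypothetical spelled-out equivalent):
#   t[i] = |mean(x[i:i+wl]) - mean(x[i-wl:i])|
#          / sqrt((var(x[i-wl:i]) + var(x[i:i+wl])) / wl)
# with each window variance floored at eta for numerical stability.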
def extract_events(h5, chem):
  print "ed"
  raw, sl = get_raw(h5)
  events = event_detect(raw, sl, **defs[chem]["ed_params"])
  med, mad = med_mad(events['mean'][-100:])
  # robust ~2-sigma cutoff above the trailing median (sigma ~ 1.48 * MAD)
  max_thresh = med + 1.48 * 2 * mad
  first_event = find_stall(events, max_thresh)
  return events[first_event:]
def med_mad(data):
dmed = np.median(data)
dmad = np.median(abs(data - dmed))
return dmed, dmad
def compute_prefix_sums(data):
data_sq = data * data
return np.cumsum(data), np.cumsum(data_sq)
def peak_detect(short_data, long_data, short_window, long_window, short_threshold, long_threshold,
peak_height):
long_mask = 0
NO_PEAK_POS = -1
NO_PEAK_VAL = 1e100
peaks = []
short_peak_pos = NO_PEAK_POS
short_peak_val = NO_PEAK_VAL
short_found_peak = False
long_peak_pos = NO_PEAK_POS
long_peak_val = NO_PEAK_VAL
long_found_peak = False
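  # Scan both t-statistic traces for local maxima that exceed their
  # thresholds and then fall by at least peak_height; a confirmed
  # short-window peak masks the long-window detector for short_window
  # samples to avoid reporting the same boundary twice.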
for i in range(len(short_data)):
val = short_data[i]
if short_peak_pos == NO_PEAK_POS:
if val < short_peak_val:
short_peak_val = val
elif val - short_peak_val > peak_height:
short_peak_val = val
short_peak_pos = i
else:
if val > short_peak_val:
short_peak_pos = i
short_peak_val = val
if short_peak_val > short_threshold:
long_mask = short_peak_pos + short_window
long_peak_pos = NO_PEAK_POS
long_peak_val = NO_PEAK_VAL
long_found_peak = False
if short_peak_val - val > peak_height and short_peak_val > short_threshold:
short_found_peak = True
if short_found_peak and (i - short_peak_pos) > short_window / 2:
peaks.append(short_peak_pos)
short_peak_pos = NO_PEAK_POS
short_peak_val = val
short_found_peak = False
if i <= long_mask:
continue
val = long_data[i]
if long_peak_pos == NO_PEAK_POS:
if val < long_peak_val:
long_peak_val = val
elif val - long_peak_val > peak_height:
long_peak_val = val
long_peak_pos = i
else:
if val > long_peak_val:
long_peak_pos = i
long_peak_val = val
if long_peak_val - val > peak_height and long_peak_val > long_threshold:
long_found_peak = True
if long_found_peak and (i - long_peak_pos) > long_window / 2:
peaks.append(long_peak_pos)
long_peak_pos = NO_PEAK_POS
long_peak_val = val
long_found_peak = False
return peaks
def generate_events(ss1, ss2, peaks, sample_rate):
peaks.append(len(ss1))
events = np.empty(len(peaks), dtype=[('start', float), ('length', float),
('mean', float), ('stdv', float)])
s = 0
s1 = 0
s2 = 0
for i, e in enumerate(peaks):
events[i]["start"] = s
l = e - s
events[i]["length"] = l
m = (ss1[e-1] - s1) / l
events[i]["mean"] = m
v = max(0.0, (ss2[e-1] - s2) / l - m*m)
events[i]["stdv"] = np.sqrt(v)
s = e
s1 = ss1[e-1]
s2 = ss2[e-1]
events["start"] /= sample_rate
events["length"] /= sample_rate
return events
def event_detect(raw_data, sample_rate, window_lengths=[16, 40], thresholds=[8.0, 4.0], peak_height=1.0):
  """Basic, standard event detection using two t-tests

  :param raw_data: ADC values
  :param sample_rate: Sampling rate of data in Hz
  :param window_lengths: Length 2 list of window lengths across
      raw data from which `t_stats` are derived
  :param thresholds: Length 2 list of thresholds on t-statistics
  :param peak_height: Absolute height a peak in the signal must rise
      above the previous and following minima to be considered relevant
  """
sums, sumsqs = compute_prefix_sums(raw_data)
tstats = []
for i, w_len in enumerate(window_lengths):
tstat = get_tstat(sums, sumsqs, w_len)
tstats.append(tstat)
peaks = peak_detect(tstats[0], tstats[1], window_lengths[0], window_lengths[1], thresholds[0],
thresholds[1], peak_height)
events = generate_events(sums, sumsqs, peaks, sample_rate)
return events
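A minimal way to exercise the detector on synthetic data (hypothetical values; with `event_detect` as defined above, a single step in the signal should yield two long events around the jump):

```python
import numpy as np

raw = np.concatenate([np.random.normal(0.0, 0.1, 2000),
                      np.random.normal(2.0, 0.1, 2000)])
events = event_detect(raw, sample_rate=4000.0)
print len(events), events["mean"][:5]
```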
import numpy as np
import pickle
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class OutLayer:
def __init__(self):
self.n_params = 2
self.params = [None, None]
def calc(self, input):
otmp = np.dot(input, self.params[0]) + self.params[1]
e_x = np.exp(otmp - otmp.max(axis=1, keepdims=True))
return e_x / e_x.sum(axis=1, keepdims=True)
class SimpleLayer:
def __init__(self):
self.n_params = 5
self.params = [None for i in range(5)]
  def conv(self, input, w, b, width=7):
    # 1D convolution along the time axis, implemented as `width` shifted
    # matrix products; mid is the half-width of the zero-padded window.
    output = np.zeros((input.shape[0], w.shape[0]))
    mid = width / 2
for i in range(width):
wx = w[:,:,i,0].T
oo = np.dot(input, wx)
if i < mid:
output[:-(mid-i)] += oo[mid-i:]
elif i == mid:
output += oo
else:
output[i-mid:] += oo[:-(i-mid)]
output += b
return output
  def calc(self, input):
    # compute candidate activations c1 and forget gates f1 for the whole
    # sequence, then run a simple gated recurrence:
    #   state = f * state + (1 - f) * c
    state = self.params[4]
    c1 = np.tanh(self.conv(input, self.params[0], self.params[1]))
    f1 = sigmoid(self.conv(input, self.params[2], self.params[3]))
output = np.zeros((len(input), self.params[4].shape[0]), dtype=np.float32)
for i in range(len(input)):
state = f1[i] * state + (1 - f1[i]) * c1[i]
output[i] = state
return np.array(output)
class BiSimpleLayer:
def __init__(self):
self.fwd = SimpleLayer()
self.bwd = SimpleLayer()
def calc(self, input):
return np.concatenate([self.fwd.calc(input), self.bwd.calc(input[::-1])[::-1]],
axis=1)
class Rnn:
def __init__(self):
pass
def tester(self, input):
input = input[0]
l1 = self.layer1.calc(input)
l2 = self.layer2.calc(l1)
l3 = self.layer3.calc(l2)
l4 = self.layer4.calc(l3)
return [self.output1.calc(l4)], [self.output2.calc(l4)]
def predict(self, input):
l1 = self.layer1.calc(input)
l2 = self.layer2.calc(l1)
l3 = self.layer3.calc(l2)
l4 = self.layer4.calc(l3)
return self.output1.calc(l4), self.output2.calc(l4)
def debug(self, input):
l1 = self.layer1.calc(input)
l2 = self.layer2.calc(l1)
l3 = self.layer3.calc(l2)
return l1, l2, l3
def load(self, fn):
with open(fn, "rb") as f:
self.layer1 = BiSimpleLayer()
for i in range(5):
self.layer1.fwd.params[i] = pickle.load(f)
for i in range(5):
self.layer1.bwd.params[i] = pickle.load(f)
self.layer2 = BiSimpleLayer()
for i in range(5):
self.layer2.fwd.params[i] = pickle.load(f)
for i in range(5):
self.layer2.bwd.params[i] = pickle.load(f)
self.layer3 = BiSimpleLayer()
for i in range(5):
self.layer3.fwd.params[i] = pickle.load(f)
for i in range(5):
self.layer3.bwd.params[i] = pickle.load(f)
self.layer4 = BiSimpleLayer()
for i in range(5):
self.layer4.fwd.params[i] = pickle.load(f)
for i in range(5):
self.layer4.bwd.params[i] = pickle.load(f)
self.output1 = OutLayer()
self.output2 = OutLayer()
for i in range(2):
self.output1.params[i] = pickle.load(f)
for i in range(2):
self.output2.params[i] = pickle.load(f)
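The `.pkl` network file is just a stream of pickled numpy arrays read back in a fixed order. A minimal sketch of the matching writer (a hypothetical helper, assuming the same parameter order as `Rnn.load` above):

```python
import pickle

def save_rnn(rnn, fn):
    # dump parameters in exactly the order Rnn.load() reads them back
    with open(fn, "wb") as f:
        for layer in [rnn.layer1, rnn.layer2, rnn.layer3, rnn.layer4]:
            for p in layer.fwd.params:
                pickle.dump(p, f)
            for p in layer.bwd.params:
                pickle.dump(p, f)
        for p in rnn.output1.params:
            pickle.dump(p, f)
        for p in rnn.output2.params:
            pickle.dump(p, f)
```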
import numpy as np
import pickle
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class OutLayer:
def __init__(self):
self.n_params = 2
self.params = [None, None]
def calc(self, input):
otmp = np.dot(input, self.params[0]) + self.params[1]
e_x = np.exp(otmp - otmp.max(axis=1, keepdims=True))
return e_x / e_x.sum(axis=1, keepdims=True)
class SimpleLayer:
def __init__(self):
self.n_params = 10
self.params = [None for i in range(10)]
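  # calc() below implements a standard GRU cell: params[0..2] produce the
  # candidate activation, params[3..8] the reset and update gates, and
  # params[9] is the initial hidden state.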
def calc(self, input):
state = self.params[9]
# output = []
output = np.zeros((len(input), self.params[2].shape[0]), dtype=np.float32)
for i in range(len(input)):
update_gate = sigmoid(np.dot(state, self.params[6]) +
np.dot(input[i], self.params[4]) +
self.params[8])
reset_gate = sigmoid(np.dot(state, self.params[5]) +
np.dot(input[i], self.params[3]) +
self.params[7])
new_val = np.tanh(np.dot(input[i], self.params[0]) +
reset_gate * np.dot(state, self.params[1]) +
self.params[2])
state = update_gate * state + (1 - update_gate) * new_val
output[i] = state
return np.array(output)
class BiSimpleLayer:
def __init__(self):
self.fwd = SimpleLayer()
self.bwd = SimpleLayer()
def calc(self, input):
return np.concatenate([self.fwd.calc(input), self.bwd.calc(input[::-1])[::-1]],
axis=1)
class Rnn:
def __init__(self):
pass
def predict(self, input):
l1 = self.layer1.calc(input)
l2 = self.layer2.calc(l1)
l3 = self.layer3.calc(l2)
return self.output1.calc(l3), self.output2.calc(l3)
def debug(self, input):
l1 = self.layer1.calc(input)
l2 = self.layer2.calc(l1)
l3 = self.layer3.calc(l2)
return l1, l2, l3
def load(self, fn):
with open(fn, "rb") as f:
self.layer1 = BiSimpleLayer()
for i in range(10):
self.layer1.fwd.params[i] = pickle.load(f)
for i in range(10):
self.layer1.bwd.params[i] = pickle.load(f)
self.layer2 = BiSimpleLayer()
for i in range(10):
self.layer2.fwd.params[i] = pickle.load(f)
for i in range(10):
self.layer2.bwd.params[i] = pickle.load(f)
self.layer3 = BiSimpleLayer()
for i in range(10):
self.layer3.fwd.params[i] = pickle.load(f)
for i in range(10):
self.layer3.bwd.params[i] = pickle.load(f)
self.output1 = OutLayer()
self.output2 = OutLayer()
for i in range(2):
self.output1.params[i] = pickle.load(f)
for i in range(2):
self.output2.params[i] = pickle.load(f)
You need to have theano and sklearn installed for training.
Input files should have the following format:
<reference sequence>
<mean of first event> <std of first event> <length of first event>
<mean of second event> <std of second event> <length of second event>
...
See examples.
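For instance, a file might begin like this (illustrative values only):

GATTACA
102.3 1.9 0.0042
98.7 2.3 0.0031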
Get alignment of events:
python realign.py ../networks/r94.pkl <list of files>
For each file <name> it creates a file <name>r; for example, for 1.txt it will create 1.txtr.
For training:
python train.py <optionally name of starting network like ../networks/r94.pkl> <list of files>
Remember to pass the files ending with r to training.
After each epoch the training will output:
<number of epoch> <current training log likelihood> <accuracy of first softmax> <accuracy of second
softmax> <approximate overall sequence identity> <time>
<distribution of outputs from first softmax>
<distribution of outputs from second softmax>
<confusion matrix for first softmax>
<confusion matrix for second softmax>
A folder named with the timestamp of the training start will contain a file
named latest.pkl, and every 20 epochs the current snapshot of the network will be saved there.
import theano as th
import theano.tensor as T
from theano.tensor.nnet import sigmoid
from theano.tensor.nnet import conv2d
import numpy as np
from theano_toolkit import updates
import pickle
hidden_size = 128
def orthonormal_wts(n, m):
nm = max(n, m)
svd_u = np.linalg.svd(np.random.randn(nm, nm))[0]
return svd_u.astype(th.config.floatX)[:n, :m]
def init_wts(*argv):
return 0.1 * (np.random.rand(*argv) - 0.5)
def share(array, dtype=th.config.floatX, name=None):
return th.shared(value=np.asarray(array, dtype=dtype), name=name)
class ConvLayer:
def __init__(self, input, nin, nunits, width=7):
id = str(np.random.randint(0, 10000000))
wc = share(init_wts(nunits, nin, width, 1), name="wc"+id)
bc = share(np.zeros(nunits), name="bc"+id)
input = input.dimshuffle(1, 0, 2)
inpr = input.reshape((input.shape[0], input.shape[1], input.shape[2], 1)).dimshuffle(0, 2, 1, 3)
c1 = conv2d(input=inpr, filters=wc, border_mode='half') + bc.reshape((1, bc.shape[0], 1, 1))
self.output = T.tanh(c1.flatten(3).dimshuffle(0, 2, 1).dimshuffle(1, 0, 2))
self.params = [wc, bc]
class BatchLayer:
def __init__(self, input, nin, nunits, width=7):
id = str(np.random.randint(0, 10000000))
wc = share(init_wts(nunits, nin, width, 1), name="wc"+id)
bc = share(np.zeros(nunits), name="bc"+id)
wf = share(init_wts(nunits, nin, width, 1), name="wf"+id)
bf = share(np.zeros(nunits), name="bf"+id)
h0 = share(np.zeros(nunits), name="h0"+id)
input = input.dimshuffle(1, 0, 2)
inpr = input.reshape((input.shape[0], input.shape[1], input.shape[2], 1)).dimshuffle(0, 2, 1, 3)
c1 = conv2d(input=inpr, filters=wc, border_mode='half') + bc.reshape((1, bc.shape[0], 1, 1))
c1 = T.tanh(c1.flatten(3).dimshuffle(0, 2, 1).dimshuffle(1, 0, 2))
f1 = conv2d(input=inpr, filters=wf, border_mode='half') + bf.reshape((1, bf.shape[0], 1, 1))
f1 = T.nnet.sigmoid(f1.flatten(3).dimshuffle(0, 2, 1).dimshuffle(1, 0, 2))
self.dbg = f1
def step(in_t, in_g, out_tm1):
# return in_t
return in_g * out_tm1 + (1 - in_g) * in_t
self.output, _ = th.scan(
step, sequences=[c1, f1],
outputs_info=[T.alloc(h0, input.shape[0], nunits)])
self.params = [wc, bc, wf, bf, h0]
class BiBatchLayer():
def __init__(self, input, nin, nunits):
fwd = BatchLayer(input, nin, nunits)
bwd = BatchLayer(input[::-1], nin, nunits)
self.params = fwd.params + bwd.params
self.output = T.concatenate([fwd.output, bwd.output[::-1]], axis=2)
self.dbg = fwd.dbg
class OutBatchLayer:
def __init__(self, input, in_size, n_classes):
id = str(np.random.randint(0, 10000000))
w = share(init_wts(in_size, n_classes), name=id+"w")
b = share(np.zeros(n_classes), name=id+"b")
otmp = T.dot(input, w) + b
e_x = T.exp(otmp - otmp.max(axis=2, keepdims=True))
o2 = e_x / e_x.sum(axis=2, keepdims=True)
eps = 0.00000001
self.output = T.clip(o2, eps, 1-eps)
self.params = [w, b]
class BatchNet:
def __init__(self, in_size, n_classes, only_test=False, with_jacob=False, use_first=True,
use_second=True):
np.random.seed(49)
self.input = T.ftensor3()
self.targets = T.imatrix()
self.targets2 = T.imatrix()
self.layer = BiBatchLayer(self.input.dimshuffle(1, 0, 2), in_size, hidden_size)
self.layer2 = BiBatchLayer(self.layer.output, 2*hidden_size, hidden_size)
# self.layer2 = ConvLayer(self.layer.output, 2*hidden_size, 2*hidden_size)
self.layer3 = BiBatchLayer(self.layer2.output, 2*hidden_size, hidden_size)
self.layer4 = BiBatchLayer(self.layer3.output, 2*hidden_size, hidden_size)
# self.layer4 = ConvLayer(self.layer3.output, 2*hidden_size, 2*hidden_size)
self.out_layer = OutBatchLayer(self.layer4.output, 2*hidden_size, n_classes)
self.output = self.out_layer.output.dimshuffle(1, 0, 2)
self.output_flat = T.reshape(self.output, (self.output.shape[0] * self.output.shape[1], -1))
self.targets_flat = self.targets.flatten(ndim=1)
self.cost = 0
if use_first:
self.cost = -T.mean(T.log(self.output_flat)[T.arange(self.targets_flat.shape[0]), self.targets_flat])
self.out_layer2 = OutBatchLayer(self.layer4.output, 2*hidden_size, n_classes)
self.output2 = self.out_layer2.output.dimshuffle(1, 0, 2)
self.output2_flat = T.reshape(self.output2, (self.output2.shape[0] * self.output2.shape[1], -1))
self.targets2_flat = self.targets2.flatten(ndim=1)
if use_second:
self.cost += -T.mean(T.log(self.output2_flat)[T.arange(self.targets2_flat.shape[0]), self.targets2_flat])
# self.coster = th.function(inputs=[self.input, self.targets, self.targets2], outputs=[self.cost])
#self.predict = th.function(inputs=[self.input], outputs=[self.output])
self.tester = th.function(inputs=[self.input], outputs=[self.output, self.output2])
self.debug = th.function(inputs=[self.input], outputs=[self.layer.output, self.layer2.output,
self.layer3.output])
self.layers = [self.layer, self.layer2, self.layer3, self.layer4, self.out_layer, self.out_layer2]
self.params = [p for l in self.layers for p in l.params]
self.grad_params = [p for l in self.layers[:-2] for p in l.params]
if use_first:
self.grad_params += self.out_layer.params
if use_second:
self.grad_params += self.out_layer2.params
# self.top_prob_sum = T.mean(T.log(T.max(self.output[0], axis=1))) +\
# T.mean(T.log(T.max(self.output2[0], axis=1)))
# self.grad_in = th.function(inputs=[self.input], outputs=T.grad(self.top_prob_sum, self.input))
if not only_test:
self.lr = T.fscalar()
inputs = [self.input]
if use_first:
inputs.append(self.targets)
if use_second:
inputs.append(self.targets2)
# inputs.append(self.lr)
self.trainer = th.function(
inputs=inputs,
outputs=[self.cost, self.output, self.output2],
updates=updates.adam(self.grad_params, (T.grad(self.cost, self.grad_params)),
learning_rate=0.0001, epsilon=1e-6))
def save(self, fn):
with open(fn, "wb") as f:
for p in self.params:
pickle.dump(p.get_value(), f)
def load(self, fn):
with open(fn, "rb") as f:
for p in self.params:
p.set_value(pickle.load(f))
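A hypothetical smoke test of the training graph (not part of the repository; assumes `theano.config.floatX` is float32 so the `ftensor3` input matches):

```python
import numpy as np
import theano as th
from qrnn import BatchNet

net = BatchNet(in_size=4, n_classes=5)
x = np.zeros((1, 2000, 4), dtype=th.config.floatX)  # (batch, time, features)
y1 = np.zeros((1, 2000), dtype=np.int32)            # targets for first softmax
y2 = np.zeros((1, 2000), dtype=np.int32)            # targets for second softmax
cost, out1, out2 = net.trainer(x, y1, y2)
```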
#include <cstdio>
#include <vector>
#include <cmath>
#include <map>
#include <string>
#include <algorithm>
using namespace std;
class string2 {
public:
string2() {}
string2(const char*x) {
x_[0] = 0;
x_[1] = 0;
x_[2] = 0;
for (int i = 0; i < 2 && x[i]; i++) {
x_[i] = x[i];
}
}
string2 operator=(const char*x) {
x_[0] = 0;
x_[1] = 0;
x_[2] = 0;
for (int i = 0; i < 2 && x[i]; i++) {
x_[i] = x[i];
}
return *this;
}
string2 operator=(const string2 x) {
x_[0]= x.x_[0];
x_[1]= x.x_[1];
x_[2]= x.x_[2];
return *this;
}
string2 operator=(const string& x) {
x_[0] = 0;
x_[1] = 0;
x_[2] = 0;
for (int i = 0; i < 2 && i < x.size(); i++) {
x_[i] = x[i];
}
return *this;
}
char& operator[](int i) {
return x_[i];
}
char x_[3];
};
int main() {
char ss[100000];
scanf("%s\n", ss);
string ref(ss);
int mapping[256];
mapping['A'] = 0;
mapping['C'] = 1;
mapping['G'] = 2;
mapping['T'] = 3;
int range = 50;
vector<vector<double>> probs;
while (true) {
double a, c, g, t, n;
if (scanf("%lf %lf %lf %lf %lf", &a, &c, &g, &t, &n)<5) {
break;
}
probs.push_back(vector<double>({log(a), log(c), log(g), log(t), log(n)}));
}
int limit = min(100, (int)ref.size()/2);
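  // Banded dynamic programme over (output pair i, reference position j):
  // every two consecutive softmax outputs emit two symbols, each either
  // the next reference base or 'N' (no base). poses[i][j] holds the best
  // log probability of explaining the first i outputs with the first j
  // reference bases; prevs[i][j] records which of NN / NX / XX was taken.
  // The band of width `range` around last_bp is doubled until it fits.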
while (range < 30000) {
vector<vector<double>> poses(probs.size()+1);
vector<vector<string2>> prevs(probs.size()+1);
for (int i = 0; i < poses.size(); i+=2) {
poses[i] = vector<double>(ref.size()+1, -1e30);
prevs[i] = vector<string2>(ref.size()+1, "");
}
poses[0][0] = 0;
for (int i = 0; i < poses[0].size() && i < limit; i++) {
poses[0][i] = 0;
}
int last_bp = 50;
for (int i = 2; i <= probs.size(); i+=2) {
for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
// NN
if (i > 2) {
double np = poses[i-2][j] + probs[i-2][4] + probs[i-1][4];
if (np > poses[i][j]) {
poses[i][j] = np;
prevs[i][j] = "NN";
}
}
// NX
double np = poses[i-2][j-1] + probs[i-2][4] + probs[i-1][mapping[ref[j-1]]];
if (np > poses[i][j]) {
poses[i][j] = np;
prevs[i][j] = string("N") + ref[j-1];
}
//XX
if (j > 1) {
double np = poses[i-2][j-2] + probs[i-2][mapping[ref[j-2]]] + probs[i-1][mapping[ref[j-1]]];
if (np > poses[i][j]) {
poses[i][j] = np;
prevs[i][j] = ref.substr(j-2, 2);
}
}
}
int cur_bp = max(1, last_bp - range);
for (int j = max(1, last_bp - range); j <= ref.size() && j <= last_bp + range; j++) {
if (poses[i][j] > poses[i][cur_bp]) {
cur_bp = j;
}
}
last_bp = cur_bp;
}
// fprintf(stderr, "back size %d last_bp %d\n", poses.back().size(), last_bp);
int best_pos = poses.back().size()-40;
for (int i = poses.back().size()-limit; i < poses.back().size(); i++) {
if (poses.back()[i] > poses.back()[best_pos]) {
best_pos = i;
}
}
/*int total = 0;
int rel = 0;
int better = 0;
for (int i = 0; i < poses.size(); i += 2) {
for (int j = 0; j < poses[i].size(); j++) {
total += 1;
if (poses[i][j] > -1e29) {
rel += 1;
}
if (poses[i][j] > poses.back()[best_pos]) {
better += 1;
}
}
}
fprintf(stderr, "total %d rel %d better %d\n", total, rel, better);*/
if (poses.back()[best_pos] / probs.size() * 2 > -10) {
fprintf(stderr, "best pos %d %lf %d\n", best_pos, poses.back()[best_pos] / probs.size() * 2, range);
int ipos = poses.size()-1;
int jpos = best_pos;
vector<string2> out;
while (ipos > 0) {
auto back = prevs[ipos][jpos];
out.push_back(back);
if (back[0] == 'N' && back[1] == 'N') {
ipos -= 2;
}
else if (back[0] == 'N' && back[1] != 'N') {
ipos -= 2;
jpos -= 1;
}
else if (back[0] != 'N' && back[1] != 'N') {
ipos -= 2;
jpos -= 2;
}
}
reverse(out.begin(), out.end());
for (auto &o: out) {
printf("%s\n", o.x_);
}
fprintf(stderr, "start pos %d\n", jpos);
if (best_pos == poses.back().size() - limit || jpos == limit - 1) {
return 1;
}
return 47;
}
range *= 2;
}
return 1;
}
from qrnn import BatchNet
import pickle
import sys
import numpy as np
import datetime
from collections import defaultdict
import os
from sklearn.metrics import confusion_matrix
import theano as th
from multiprocessing import Pool
import subprocess
def scale(X):
m25 = np.percentile(X[:,0], 25)
m75 = np.percentile(X[:,0], 75)
s50 = np.median(X[:,2])
me25 = -0.3
  me75 = 0.3
se50 = 0.6103758
ret = np.array(X)
scale = (me75 - me25) / (m75 - m25)
m25 *= scale
shift = me25 - m25
ret[:,0] = X[:,0] * scale + shift
ret[:,1] = ret[:,0]**2
sscale = se50 / s50
ret[:,2] = X[:,2] * sscale
return ret
def print_stats(o):
stats = defaultdict(int)
for x in o:
stats[x] += 1
print (stats)
def flatten2(x):
return x.reshape((x.shape[0] * x.shape[1], -1))
def realign(s):
ps = s
o1, o2 = ntwk.tester([scale(data_x[ps])])
o1m = (np.argmax(o1[0], 1))
o2m = (np.argmax(o2[0], 1))
alph = "ACGTN"
# fo = open(base_dir+"output%s.fasta" % ps, "w")
# print (">xx", file=fo)
# for a, b in zip(o1m, o2m):
# if a < 4:
# fo.write(alph[a])
# if b < 4:
# fo.write(alph[b])
# fo.close()
f = open(base_dir+"tmpb-%s.in" % s, "w")
print >>f, refs[ps]
for a, b in zip(o1[0], o2[0]):
print >>f, " ".join(map(str, a))
print >>f, " ".join(map(str, b))
f.close()
print "s", s
ret = subprocess.call("ulimit -v 32000000; ./realign <%stmpb-%s.in >%stmpb-%s.out" % (base_dir, s, base_dir, s), shell=True)
if ret != 47:
print "fail", s
return
f = open(base_dir+"tmpb-%s.out" % s)
for i, l in enumerate(f):
data_y[ps][i] = mapping[l[0]]
data_y2[ps][i] = mapping[l[1]]
fo = open(names[s] + "r", "w")
print >>fo, refs[s]
for x, y, y2 in zip(data_x[s], data_y[s], data_y2[s]):
print >>fo, " ".join(map(str, x)), "%c%c" % (alph[y], alph[y2])
fo.close()
if __name__ == '__main__':
chars = "ACGT"
mapping = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
n_dims = 4
n_classes = 5
data_x = []
data_y = []
data_y2 = []
refs = []
names = []
for fn in sys.argv[2:]:
f = open(fn)
ref = f.readline()
if len(ref) > 30000:
print "out", len(ref)
continue
refs.append(ref.strip())
names.append(fn)
X = []
Y = []
Y2 = []
print "\rfn", fn,
sys.stdout.flush()
for l in f:
its = l.strip().split()
mean = float(its[0])
std = float(its[1])
length = float(its[2])
X.append([mean, mean*mean, std, length])
Y.append(4)
Y2.append(4)
data_x.append(np.array(X, dtype=th.config.floatX))
data_y.append(np.array(Y, dtype=np.int32))
data_y2.append(np.array(Y2, dtype=np.int32))
print ("done", sum(len(x) for x in refs))
sys.stdout.flush()
ntwk = BatchNet(n_dims, n_classes, only_test=True)
ntwk.load(sys.argv[1])
base_dir = str(datetime.datetime.now())
base_dir = base_dir.replace(' ', '_')
os.mkdir(base_dir)
base_dir += "/"
p = Pool(8)
p.map(realign, range(len(data_x)))
Helper methods for Theano
=========================
Some helper methods for working with Theano for neural networks.
## `hinton.py`
![Hinton Diagrams](https://blog.wtf.sg/wp-content/uploads/2014/05/Screenshot-from-2014-05-04-013804.png)
Quick visualisation method of numpy matrices in the terminal. See the [blog post](https://blog.wtf.sg/2014/05/04/terminal-hinton-diagrams/).
## `utils.py`
Miscellaneous helper functions for initialising weight matrices, vector softmax, etc.
## `parameters.py`
Syntax sugar when declaring parameters.
```python
import theano_toolkit.parameters as Parameters
P = Parameters()
P.W = np.zeros((10, 10))
P.b = np.zeros(10)
# build model here.
params = P.values()
gradients = T.grad(cost, wrt=params)
```
#### Experimental
More syntax sugar for initialising parameters.
```python
P = Parameters()
with P:
    W = np.zeros((10, 10))
    b = np.zeros(10)
# use W and b to define the model instead of P.W and P.b
params = P.values()
```
# coding=utf-8
from __future__ import print_function
import numpy as np
chars = [" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"]
class BarHack(str):
def __str__(self):
return self.internal
def __len__(self):
return 1
def plot(arr, max_val=None):
if max_val is None:
max_arr = arr
max_val = max(abs(np.max(max_arr)), abs(np.min(max_arr)))
opts = np.get_printoptions()
np.set_printoptions(edgeitems=500)
print(np.array2string(arr,
formatter={
'float_kind': lambda x: visual(x, max_val),
'int_kind': lambda x: visual(x, max_val)},
max_line_width=5000
))
np.set_printoptions(**opts)
def visual(val, max_val):
if abs(val) == max_val:
step = len(chars) - 1
else:
step = int(abs(float(val) / max_val) * len(chars))
colourstart = ""
colourend = ""
if val < 0:
colourstart, colourend = '\033[90m', '\033[0m'
return colourstart + chars[step] + colourend
import theano.tensor as T
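# log_sum_exp subtracts the running maximum k so every exponential stays in
# (0, 1] (no overflow); k is added back at the end.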
def log_sum_exp(x, axis=None, keepdims=False):
k = T.max(x, axis=axis, keepdims=True)
sum_x_ = T.log(T.sum(T.exp(x - k), axis=axis, keepdims=keepdims))
return sum_x_ + k.reshape(sum_x_.shape)
def log_softmax(x):
return x - log_sum_exp(x, axis=-1, keepdims=True)
def log_sigmoid(x):
return -T.nnet.softplus(-x)
def softmax(x):
e_x = T.exp(x - T.max(x, axis=-1, keepdims=True))
out = e_x / T.sum(e_x, axis=-1, keepdims=True)
return out
def log_add(x, y):
k = T.maximum(x, y)
return T.log(T.exp(x - k) + T.exp(y - k)) + k
def binary_crossentropy(sigmoid_x, y):
if sigmoid_x.owner.op == T.nnet.sigmoid:
def softplus(x):
return T.switch(x > 20, x, T.nnet.softplus(x))
x = sigmoid_x.owner.inputs[0]
return - (y * -softplus(-x) + (1 - y) * -softplus(x))
else:
return T.nnet.binary_crossentropy(sigmoid_x, y)
import theano
import numpy as np
import cPickle as pickle
from functools import reduce
import inspect
import sys
class Parameters():
def __init__(self):
self.__dict__['params'] = {}
def __setattr__(self, name, array):
params = self.__dict__['params']
if name not in params:
params[name] = theano.shared(
value=np.asarray(
array,
dtype=theano.config.floatX
),
borrow=True,
name=name
)
else:
print >> sys.stderr, "%s already assigned" % name
params[name].set_value(np.asarray(
array,
dtype=theano.config.floatX
))
def __setitem__(self, name, array):
self.__setattr__(name, array)
def __getitem__(self, name):
return self.__getattr__(name)
def __getattr__(self, name):
params = self.__dict__['params']
return params[name]
def remove(self, name):
del self.__dict__['params'][name]
def values(self):
params = self.__dict__['params']
return params.values()
def save(self, filename):
params = self.__dict__['params']
with open(filename, 'wb') as f:
pickle.dump({p.name: p.get_value() for p in params.values()}, f, 2)
def load(self, filename):
params = self.__dict__['params']
loaded = pickle.load(open(filename, 'rb'))
for k in params:
if k in loaded:
params[k].set_value(loaded[k])
else:
print >> sys.stderr, "%s does not exist." % k
def __enter__(self):
_, _, _, env_locals = inspect.getargvalues(
inspect.currentframe().f_back)
self.__dict__['_env_locals'] = env_locals.keys()
def __exit__(self, type, value, traceback):
_, _, _, env_locals = inspect.getargvalues(
inspect.currentframe().f_back)
prev_env_locals = self.__dict__['_env_locals']
del self.__dict__['_env_locals']
for k in env_locals.keys():
if k not in prev_env_locals:
self.__setattr__(k, env_locals[k])
env_locals[k] = self.__getattr__(k)
return True
def parameter_count(self):
import operator
params = self.__dict__['params']