Skip to content
Commits on Source (6)
python-treetime (0.5.3-1) unstable; urgency=medium
* New upstream version
* debhelper 12
* Standards-Version: 4.3.0
-- Andreas Tille <tille@debian.org> Fri, 25 Jan 2019 18:21:37 +0100
python-treetime (0.5.2-2) unstable; urgency=medium
* Re-upload due to accidental removal of the package (see #916906)
......
......@@ -4,7 +4,7 @@ Uploaders: Andreas Tille <tille@debian.org>
Section: science
Testsuite: autopkgtest-pkg-python
Priority: optional
Build-Depends: debhelper (>= 11~),
Build-Depends: debhelper (>= 12~),
dh-python,
python3-all,
python3-biopython,
......@@ -12,7 +12,7 @@ Build-Depends: debhelper (>= 11~),
python3-pandas,
python3-scipy,
python3-setuptools
Standards-Version: 4.2.1
Standards-Version: 4.3.0
Vcs-Browser: https://salsa.debian.org/med-team/python-treetime
Vcs-Git: https://salsa.debian.org/med-team/python-treetime.git
Homepage: https://github.com/neherlab/treetime
......
......@@ -6,4 +6,4 @@ from .treetime import ttconf as treetime_conf
from .gtr import GTR
from .merger_models import Coalescent
from .treeregression import TreeRegression
version="0.5.2"
version="0.5.3"
......@@ -118,7 +118,7 @@ class BranchLenInterpolator (Distribution):
@gamma.setter
def gamma(self, value):
self._gamma = value
self._gamma = max(ttconf.TINY_NUMBER, value)
@property
def merger_cost(self):
......
......@@ -720,12 +720,13 @@ class ClockTree(TreeAnc):
return ttconf.ERROR
rate_std = np.sqrt(self.clock_model['cov'][0,0])
current_rate = self.clock_model['slope']
current_rate = np.abs(self.clock_model['slope'])
upper_rate = self.clock_model['slope'] + rate_std
lower_rate = max(0.1*current_rate, self.clock_model['slope'] - rate_std)
for n in self.tree.find_clades():
if n.up:
n.branch_length_interpolator.gamma*=upper_rate/current_rate
n._orig_gamma = n.branch_length_interpolator.gamma
n.branch_length_interpolator.gamma = n._orig_gamma*upper_rate/current_rate
self.logger("###ClockTree.calc_rate_susceptibility: run with upper bound of rate estimate", 1)
self.make_time_tree(**params)
......@@ -733,7 +734,7 @@ class ClockTree(TreeAnc):
for n in self.tree.find_clades():
n.numdate_rate_variation = [(upper_rate, n.numdate)]
if n.up:
n.branch_length_interpolator.gamma*=lower_rate/upper_rate
n.branch_length_interpolator.gamma = n._orig_gamma*lower_rate/current_rate
self.logger("###ClockTree.calc_rate_susceptibility: run with lower bound of rate estimate", 1)
self.make_time_tree(**params)
......@@ -741,7 +742,7 @@ class ClockTree(TreeAnc):
for n in self.tree.find_clades():
n.numdate_rate_variation.append((lower_rate, n.numdate))
if n.up:
n.branch_length_interpolator.gamma*=current_rate/lower_rate
n.branch_length_interpolator.gamma = n._orig_gamma
self.logger("###ClockTree.calc_rate_susceptibility: run with central rate estimate", 1)
self.make_time_tree(**params)
......
......@@ -27,7 +27,7 @@ MIN_INTEGRATION_PEAK = 0.001
# clocktree parameters
BRANCH_LEN_PENALTY = 0
MAX_BRANCH_LENGTH = 1.5 # only relevant for time trees - upper boundary of interpolator objects
MAX_BRANCH_LENGTH = 4.0 # only relevant for branch length optimization and time trees - upper boundary of interpolator objects
NINTEGRAL = 300
REL_TOL_PRUNE = 0.01
REL_TOL_REFINE = 0.05
......
......@@ -17,7 +17,7 @@ class Distribution(object):
"""
@staticmethod
def calc_fwhm(distribution, is_log=True):
def calc_fwhm(distribution, is_neg_log=True):
"""
Assess the width of the probability distribution. This returns
full-width-half-max
......@@ -25,29 +25,34 @@ class Distribution(object):
if isinstance(distribution, interp1d):
if is_log:
if is_neg_log:
ymin = distribution.y.min()
prob = np.exp(-(distribution.y-ymin))
log_prob = distribution.y-ymin
else:
prob = distribution.y
log_prob = -np.log(distribution.y)
log_prob -= log_prob.min()
xvals = distribution.x
elif isinstance(distribution, Distribution):
# Distribution always stores log-prob
# Distribution always stores neg log-prob with the peak value subtracted
xvals = distribution._func.x
prob = distribution.prob_relative(xvals)
log_prob = distribution._func.y
else:
raise TypeError("Error in computing the FWHM for the distribution. "
" The input should be either Distribution or interpolation object");
x_idxs = binary_dilation(prob >= 0.4*(prob.max() - prob.min())+prob.min(), iterations=1)
xs = xvals[x_idxs]
if xs.shape[0] < 2:
L = xvals.shape[0]
# 0.69... is log(2), there is always one value for which this is true since
# the minimum is subtracted
tmp = np.where(log_prob < 0.693147)[0]
x_l, x_u = tmp[0], tmp[-1]
if L < 2:
print ("Not enough points to compute FWHM: returning zero")
return min(TINY_NUMBER, distribution.xmax - distribution.xmin)
else:
return xs.max() - xs.min()
# need to guard against out-of-bounds errors
return max(TINY_NUMBER, xvals[min(x_u+1,L-1)] - xvals[max(0,x_l-1)])
@classmethod
......@@ -60,10 +65,12 @@ class Distribution(object):
distribution.weight = weight
return distribution
@classmethod
def shifted_x(cls, dist, delta_x):
return Distribution(dist.x+delta_x, dist.y, kind=dist.kind)
@staticmethod
def multiply(dists):
'''
......@@ -102,12 +109,13 @@ class Distribution(object):
res = Distribution.delta_function(x_vals[0])
else:
res = Distribution(x_vals[ind], y_vals[ind], is_log=True,
min_width=min_width, kind='linear')
min_width=min_width, kind='linear', assume_sorted=True)
return res
def __init__(self, x, y, is_log=True, min_width = MIN_INTEGRATION_PEAK, kind='linear'):
def __init__(self, x, y, is_log=True, min_width = MIN_INTEGRATION_PEAK,
kind='linear', assume_sorted=False):
"""
Create Distribution instance
......@@ -117,8 +125,11 @@ class Distribution(object):
if isinstance(x, Iterable) and isinstance (y, Iterable):
self._delta = False # NOTE in classmethod this value is set explicitly to True.
xvals, yvals = np.array(sorted(zip(x,y))).T
# first, prepare x, y values
if assume_sorted:
xvals, yvals = x,y
else:
xvals, yvals = np.array(sorted(zip(x,y))).T
if not is_log:
yvals = -np.log(yvals)
# just for safety
......@@ -135,7 +146,8 @@ class Distribution(object):
yvals -= self._peak_val
self._ymax = yvals.max()
# store the interpolation object
self._func= interp1d(xvals, yvals, kind=kind, fill_value=BIG_NUMBER, bounds_error=False)
self._func= interp1d(xvals, yvals, kind=kind, fill_value=BIG_NUMBER,
bounds_error=False, assume_sorted=True)
self._fwhm = Distribution.calc_fwhm(self)
elif np.isscalar(x):
......
......@@ -54,6 +54,7 @@ class GTR(object):
else:
self.profile_map = prof_map
if logger is None:
def logger_default(*args,**kwargs):
"""standard logging function if none provided"""
......@@ -62,30 +63,17 @@ class GTR(object):
self.logger = logger_default
else:
self.logger = logger
n_states = len(self.alphabet)
self.logger("GTR: with alphabet: "+str(self.alphabet),1)
# determine if a character exists that corresponds to no info, i.e. all one profile
if any([x.sum()==n_states for x in self.profile_map.values()]):
amb_states = [c for c,x in self.profile_map.items() if x.sum()==n_states]
self.ambiguous = 'N' if 'N' in amb_states else amb_states[0]
self.logger("GTR: ambiguous character: "+self.ambiguous,2)
else:
self.ambiguous = None
# check for a gap symbol
try:
self.gap_index = list(self.alphabet).index('-')
except:
self.logger("GTR: no gap symbol!", 4, warn=True)
self.gap_index=-1
self.gap_index = None
self.n_states = len(self.alphabet)
self.assign_gap_and_ambiguous()
# NEEDED TO BREAK RATE MATRIX DEGENERACY AND FORCE NP TO RETURN REAL ORTHONORMAL EIGENVECTORS
# ugly hack, but works and shouldn't affect results
tmp_rng_state = np.random.get_state()
np.random.seed(12345)
self.break_degen = np.random.random(size=(n_states, n_states))*1e-6
self.break_degen = np.random.random(size=(self.n_states, self.n_states))*1e-6
np.random.set_state(tmp_rng_state)
# init all matrices with dummy values
......@@ -96,10 +84,29 @@ class GTR(object):
self.assign_rates()
def assign_gap_and_ambiguous(self):
n_states = len(self.alphabet)
self.logger("GTR: with alphabet: "+str(self.alphabet),1)
# determine if a character exists that corresponds to no info, i.e. all one profile
if any([x.sum()==n_states for x in self.profile_map.values()]):
amb_states = [c for c,x in self.profile_map.items() if x.sum()==n_states]
self.ambiguous = 'N' if 'N' in amb_states else amb_states[0]
self.logger("GTR: ambiguous character: "+self.ambiguous,2)
else:
self.ambiguous=None
# check for a gap symbol
try:
self.gap_index = list(self.alphabet).index('-')
except:
self.logger("GTR: no gap symbol!", 4, warn=True)
self.gap_index=None
@property
def Q(self):
"""function that return the product of the transtiion matrix
and the equilibrium frequencies to option the rate matrix
"""function that return the product of the transition matrix
and the equilibrium frequencies to obtain the rate matrix
of the GTR model
"""
return (self.W*self.Pi).T
......@@ -113,8 +120,14 @@ class GTR(object):
'''
String representation of the GTR model for pretty printing
'''
multi_site = len(self.Pi.shape)==2
if multi_site:
eq_freq_str = "Average substitution rate (mu): "+str(np.round(self.average_rate,6))+'\n'
else:
eq_freq_str = "Substitution rate (mu): "+str(np.round(self.mu,6))+'\n'
if not multi_site:
eq_freq_str += "\nEquilibrium frequencies (pi_i):\n"
for a,p in zip(self.alphabet, self.Pi):
eq_freq_str+=' '+str(a)+': '+str(np.round(p,4))+'\n'
......@@ -124,6 +137,7 @@ class GTR(object):
for a,Wi in zip(self.alphabet, self.W):
W_str+= ' '+str(a)+'\t'+'\t'.join([str(np.round(max(0,p),4)) for p in Wi])+'\n'
if not multi_site:
Q_str = "\nActual rates from j->i (Q_ij):\n"
Q_str+='\t'+'\t'.join(map(str, self.alphabet))+'\n'
for a,Qi in zip(self.alphabet, self.Q):
......@@ -131,6 +145,7 @@ class GTR(object):
return eq_freq_str + W_str + Q_str
def assign_rates(self, mu=1.0, pi=None, W=None):
"""
Overwrite the GTR model given the provided data
......@@ -402,7 +417,7 @@ class GTR(object):
nij : nxn matrix
The number of times a change in character state is observed
between state i and j
between state j and i
Ti :n vector
The time spent in each character state
......@@ -451,11 +466,7 @@ class GTR(object):
+ ttconf.TINY_NUMBER + 2*pc_mat)
np.fill_diagonal(W_ij, 0)
Wdiag = (((W_ij.T*pi).T).sum(axis=0)+ttconf.TINY_NUMBER)/(pi+ttconf.TINY_NUMBER)
np.fill_diagonal(W_ij, Wdiag)
Q1 = np.diag(pi).dot(W_ij)
scale_factor = np.sum(np.diagonal(Q1*np.diag(pi)))
np.fill_diagonal(W_ij, 0)
scale_factor = np.einsum('i,ij,j',pi,W_ij,pi)
W_ij = W_ij/scale_factor
if fixed_pi is None:
......@@ -468,7 +479,7 @@ class GTR(object):
gtr.logger('the iterative scheme has not converged',3,warn=True)
elif np.abs(1-np.max(pi.sum(axis=0))) > dp:
gtr.logger('the iterative scheme has converged, but proper normalization was not reached',3,warn=True)
if gtr.gap_index>=0:
if gtr.gap_index is not None:
if pi[gtr.gap_index]<gap_limit:
gtr.logger('The model allows for gaps which are estimated to occur at a low fraction of %1.3e'%pi[gtr.gap_index]+
'\n\t\tthis can potentially result in artificats.'+
......@@ -517,7 +528,23 @@ class GTR(object):
self.v = np.real(eigvecs)
self.v_inv = np.linalg.inv(self.v)
self.eigenvals = np.real(eigvals)
return
def _eig_sym(self):
"""
Perform eigendecompositon of the rate matrix and stores the left- and right-
matrices to convert the sequence profiles to the GTR matrix eigenspace
and hence to speed-up the computations.
"""
# eigendecomposition of the rate matrix
tmpp = np.sqrt(self.Pi)
symQ = self.W*np.outer(tmpp, tmpp)
eigvals, eigvecs = np.linalg.eigh(symQ)
tmp_v = eigvecs.T*tmpp
one_norm = np.sum(np.abs(tmp_v), axis=1)
self.v = tmp_v.T/one_norm
self.v_inv = (eigvecs*one_norm).T/tmpp
self.eigenvals = eigvals
def compress_sequence_pair(self, seq_p, seq_ch, pattern_multiplicity=None,
......@@ -572,9 +599,9 @@ class GTR(object):
bs.append(seq==nuc)
for n1,nuc1 in enumerate(self.alphabet):
if (n1!=self.gap_index or (not ignore_gaps)):
if (self.gap_index is None) or (not ignore_gaps) or (n1!=self.gap_index):
for n2,nuc2 in enumerate(self.alphabet):
if (n2!=self.gap_index or (not ignore_gaps)):
if (self.gap_index is None) or (not ignore_gaps) or (n2!=self.gap_index):
count = ((bool_seqs_p[n1]&bool_seqs_ch[n2])*pattern_multiplicity).sum()
if count: pair_count.append(((n1,n2), count))
else: # enumerate state pairs of the sequence for large alphabets
......@@ -637,7 +664,8 @@ class GTR(object):
return logP if return_log else np.exp(logP)
def prob_t(self, seq_p, seq_ch, t, pattern_multiplicity = None, return_log=False, ignore_gaps=True):
def prob_t(self, seq_p, seq_ch, t, pattern_multiplicity = None,
return_log=False, ignore_gaps=True):
"""
Compute the probability to observe seq_ch (child sequence) after time t starting from seq_p
(parent sequence).
......@@ -703,7 +731,7 @@ class GTR(object):
return self.optimal_t_compressed(seq_pair, multiplicity)
def optimal_t_compressed(self, seq_pair, multiplicity, profiles=False):
def optimal_t_compressed(self, seq_pair, multiplicity, profiles=False, tol=1e-10):
"""
Find the optimal distance between the two sequences, for compressed sequences
......@@ -756,7 +784,8 @@ class GTR(object):
to be separated by the time t.
"""
if profiles:
return -1.0*self.prob_t_profiles(seq_pair, multiplicity,t**2, return_log=True)
res = -1.0*self.prob_t_profiles(seq_pair, multiplicity,t**2, return_log=True)
return res
else:
return -1.0*self.prob_t_compressed(seq_pair, multiplicity,t**2, return_log=True)
......@@ -764,7 +793,7 @@ class GTR(object):
from scipy.optimize import minimize_scalar
opt = minimize_scalar(_neg_prob,
bounds=[-np.sqrt(ttconf.MAX_BRANCH_LENGTH),np.sqrt(ttconf.MAX_BRANCH_LENGTH)],
args=(seq_pair, multiplicity), tol=1e-10)
args=(seq_pair, multiplicity), tol=tol)
new_len = opt["x"]**2
if 'success' not in opt:
opt['success'] = True
......@@ -789,7 +818,8 @@ class GTR(object):
return new_len
def prob_t_profiles(self, profile_pair, multiplicity, t, return_log=False, ignore_gaps=True):
def prob_t_profiles(self, profile_pair, multiplicity, t,
return_log=False, ignore_gaps=True):
'''
Calculate the probability of observing a node pair at a distance t
......@@ -816,15 +846,17 @@ class GTR(object):
if t<0:
logP = -ttconf.BIG_NUMBER
else:
Qt = self.expQt(t).T
res = profile_pair[0].dot(Qt)
overlap = np.sum(res*profile_pair[1], axis=1)
if ignore_gaps: # calculate the probability that neither outgroup/node has a gap
Qt = self.expQt(t)
if len(Qt.shape)==3:
res = np.einsum('ai,ija,aj->a', profile_pair[1], Qt, profile_pair[0])
else:
res = np.einsum('ai,ij,aj->a', profile_pair[1], Qt, profile_pair[0])
if ignore_gaps and (self.gap_index is not None): # calculate the probability that neither outgroup/node has a gap
non_gap_frac = (1-profile_pair[0][:,self.gap_index])*(1-profile_pair[1][:,self.gap_index])
# weigh log LH by the non-gap probability
logP = np.sum(multiplicity*np.log(overlap)*non_gap_frac)
logP = np.sum(multiplicity*np.log(res)*non_gap_frac)
else:
logP = np.sum(multiplicity*np.log(overlap))
logP = np.sum(multiplicity*np.log(res))
return logP if return_log else np.exp(logP)
......@@ -862,6 +894,37 @@ class GTR(object):
return np.log(res) if return_log else res
def evolve(self, profile, t, return_log=False):
"""
Compute the probability of the sequence state of the child
at time t later, given the parent profile.
Parameters
----------
profile : numpy.array
Sequence profile. Shape = (L, a),
where L - sequence length, a - alphabet size.
t : double
Time to propagate
return_log: bool
If True, return log-probability
Returns
-------
res : np.array
Profile of the sequence after time t in the future.
Shape = (L, a), where L - sequence length, a - alphabet size.
"""
Qt = self.expQt(t).T
res = profile.dot(Qt)
return np.log(res) if return_log else res
def _exp_lt(self, t):
"""
Parameters
......@@ -952,6 +1015,8 @@ class GTR(object):
return np.sum([np.sum((seq==state)*pattern_multiplicity*np.log(self.Pi[si]))
for si,state in enumerate(self.alphabet)])
def average_rate(self):
return self.mu*np.einsum('i,ij,j',self.Pi, self.W, self.Pi)
def save_to_npz(self, outfile):
full_gtr = self.mu * np.dot(self.Pi, self.W)
......
from __future__ import division, print_function, absolute_import
from collections import defaultdict
import numpy as np
from treetime import config as ttconf
from .seq_utils import alphabets, profile_maps, alphabet_synonyms
from .gtr import GTR
class GTR_site_specific(GTR):
"""
Defines General-Time-Reversible model of character evolution.
"""
def __init__(self, *args, **kwargs):
if 'seq_len' in kwargs:
self.seq_len = kwargs['seq_len']
kwargs.pop('seq_len')
else:
self.seq_len = 1
super(GTR_site_specific, self).__init__(**kwargs)
@property
def Q(self):
"""function that return the product of the transition matrix
and the equilibrium frequencies to obtain the rate matrix
of the GTR model
"""
tmp = np.einsum('ia,ij->ija', self.Pi, self.W)
diag_vals = np.sum(tmp, axis=0)
for x in range(tmp.shape[-1]):
np.fill_diagonal(tmp[:,:,x], -diag_vals[:,x])
return tmp
def assign_rates(self, mu=1.0, pi=None, W=None):
"""
Overwrite the GTR model given the provided data
Parameters
----------
mu : float
Substitution rate
W : nxn matrix
Substitution matrix
pi : n vector
Equilibrium frequencies
"""
n = len(self.alphabet)
self.mu = np.copy(mu)
if pi is not None and pi.shape[0]==n:
self.seq_len = pi.shape[-1]
Pi = np.copy(pi)
else:
if pi is not None and len(pi)!=n:
self.logger("length of equilibrium frequency vector does not match alphabet length", 4, warn=True)
self.logger("Ignoring input equilibrium frequencies", 4, warn=True)
Pi = np.ones(shape=(n,self.seq_len))
self.Pi = Pi/np.sum(Pi, axis=0)
if W is None or W.shape!=(n,n):
if (W is not None) and W.shape!=(n,n):
self.logger("Substitution matrix size does not match alphabet size", 4, warn=True)
self.logger("Ignoring input substitution matrix", 4, warn=True)
# flow matrix
W = np.ones((n,n))
else:
W=0.5*(np.copy(W)+np.copy(W).T)
np.fill_diagonal(W,0)
avg_pi = self.Pi.mean(axis=-1)
average_rate = W.dot(avg_pi).dot(avg_pi)
self.W = W/average_rate
self.mu *=average_rate
self._eig()
@classmethod
def random(cls, L=1, avg_mu=1.0, alphabet='nuc', pi_dirichlet_alpha=1,
W_dirichlet_alpha=3.0, mu_gamma_alpha=3.0):
"""
Creates a random GTR model
Parameters
----------
mu : float
Substitution rate
alphabet : str
Alphabet name (should be standard: 'nuc', 'nuc_gap', 'aa', 'aa_gap')
"""
from scipy.stats import gamma
alphabet=alphabets[alphabet]
gtr = cls(alphabet=alphabet, seq_len=L)
n = gtr.alphabet.shape[0]
if pi_dirichlet_alpha:
pi = 1.0*gamma.rvs(pi_dirichlet_alpha, size=(n,L))
else:
pi = np.ones((n,L))
pi /= pi.sum(axis=0)
if W_dirichlet_alpha:
tmp = 1.0*gamma.rvs(W_dirichlet_alpha, size=(n,n))
else:
tmp = np.ones((n,n))
tmp = np.tril(tmp,k=-1)
W = tmp + tmp.T
if mu_gamma_alpha:
mu = gamma.rvs(mu_gamma_alpha, size=(L,))
else:
mu = np.ones(L)
gtr.assign_rates(mu=mu, pi=pi, W=W)
gtr.mu *= avg_mu/np.mean(gtr.mu)
return gtr
@classmethod
def custom(cls, mu=1.0, pi=None, W=None, **kwargs):
"""
Create a GTR model by specifying the matrix explicitly
Parameters
----------
mu : float
Substitution rate
W : nxn matrix
Substitution matrix
pi : n vector
Equilibrium frequencies
**kwargs:
Key word arguments to be passed
Keyword Args
------------
alphabet : str
Specify alphabet when applicable. If the alphabet specification is
required, but no alphabet is specified, the nucleotide alphabet will be used as
default.
"""
gtr = cls(**kwargs)
gtr.assign_rates(mu=mu, pi=pi, W=W)
return gtr
@classmethod
def infer(cls, sub_ija, T_ia, root_state, pc=0.01,
gap_limit=0.01, Nit=30, dp=1e-5, **kwargs):
"""
Infer a GTR model by specifying the number of transitions and time spent in each
character. The basic equation that is being solved is
:math:`n_{ij} = pi_i W_{ij} T_j`
where :math:`n_{ij}` are the transitions, :math:`pi_i` are the equilibrium
state frequencies, :math:`W_{ij}` is the "substitution attempt matrix",
while :math:`T_i` is the time on the tree spent in character state
:math:`i`. To regularize the process, we add pseudocounts and also need
to account for the fact that the root of the tree is in a particular
state. the modified equation is
:math:`n_{ij} + pc = pi_i W_{ij} (T_j+pc+root\_state)`
Parameters
----------
nija : nxn matrix
The number of times a change in character state is observed
between state j and i at position a
Tia :n vector
The time spent in each character state at position a
root_state : n vector
The number of characters in state i in the sequence
of the root node.
pc : float
Pseudocounts, this determines the lower cutoff on the rate when
no substitutions are observed
**kwargs:
Key word arguments to be passed
Keyword Args
------------
alphabet : str
Specify alphabet when applicable. If the alphabet specification
is required, but no alphabet is specified, the nucleotide alphabet will be used as default.
"""
from scipy import linalg as LA
gtr = cls(**kwargs)
gtr.logger("GTR: model inference ",1)
q = len(gtr.alphabet)
L = sub_ija.shape[-1]
n_iter = 0
n_ija = np.copy(sub_ija)
n_ija[range(q),range(q),:] = 0
n_ij = n_ija.sum(axis=-1)
m_ia = np.sum(n_ija,axis=1) + root_state + pc
n_a = n_ija.sum(axis=1).sum(axis=0) + pc
Lambda = np.sum(root_state,axis=0) + q*pc
p_ia_old=np.zeros((q,L))
p_ia = np.ones((q,L))/q
mu_a = np.ones(L)
W_ij = np.ones((q,q)) - np.eye(q)
while (LA.norm(p_ia_old-p_ia)>dp) and n_iter<Nit:
n_iter += 1
p_ia_old = np.copy(p_ia)
S_ij = np.einsum('a,ia,ja',mu_a, p_ia, T_ia)
W_ij = (n_ij + n_ij.T + pc)/(S_ij + S_ij.T + pc)
avg_pi = p_ia.mean(axis=-1)
average_rate = W_ij.dot(avg_pi).dot(avg_pi)
W_ij = W_ij/average_rate
mu_a *=average_rate
p_ia = m_ia/(mu_a*np.dot(W_ij,T_ia)+Lambda)
p_ia = p_ia/p_ia.sum(axis=0)
mu_a = n_a/(pc+np.einsum('ia,ij,ja->a', p_ia, W_ij, T_ia))
if n_iter >= Nit:
gtr.logger('WARNING: maximum number of iterations has been reached in GTR inference',3, warn=True)
if LA.norm(p_ia_old-p_ia) > dp:
gtr.logger('the iterative scheme has not converged',3,warn=True)
if gtr.gap_index is not None:
for p in range(p_ia.shape[-1]):
if p_ia[gtr.gap_index,p]<gap_limit:
gtr.logger('The model allows for gaps which are estimated to occur at a low fraction of %1.3e'%p_ia[gtr.gap_index]+
'\n\t\tthis can potentially result in artificats.'+
'\n\t\tgap fraction will be set to %1.4f'%gap_limit,2,warn=True)
p_ia[gtr.gap_index,p] = gap_limit
p_ia[:,p] /= p_ia[:,p].sum()
gtr.assign_rates(mu=mu_a, W=W_ij, pi=p_ia)
return gtr
def _eig_single_site(self, W, p):
tmpp = np.sqrt(p)
symQ = W*np.outer(tmpp, tmpp)
np.fill_diagonal(symQ, -np.sum(W*p, axis=1))
eigvals, eigvecs = np.linalg.eigh(symQ)
tmp_v = eigvecs.T*tmpp
one_norm = np.sum(np.abs(tmp_v), axis=1)
return eigvals, tmp_v.T/one_norm, (eigvecs*one_norm).T/tmpp
def _eig(self):
eigvals, vec, vec_inv = [], [], []
for pi in range(self.seq_len):
if len(self.W.shape)>2:
W = np.copy(self.W[:,:,pi])
np.fill_diagonal(W, 0)
elif pi==0:
np.fill_diagonal(self.W, 0)
W=self.W
ev, evec, evec_inv = self._eig_single_site(W,self.Pi[:,pi])
eigvals.append(ev)
vec.append(evec)
vec_inv.append(evec_inv)
self.eigenvals = np.array(eigvals).T
self.v = np.swapaxes(vec,0,-1)
self.v_inv = np.swapaxes(vec_inv, 0,-1)
def expQt(self, t):
# this is currently very slow.
eLambdaT = np.exp(self.mu*t*self.eigenvals)
return np.einsum('jia,ja,kja->ika', self.v, eLambdaT, self.v_inv)
def prop_t_compressed(self, seq_pair, multiplicity, t, return_log=False):
print("NOT IMPEMENTED")
def propagate_profile(self, profile, t, return_log=False):
"""
Compute the probability of the sequence state of the parent
at time (t+t0, backwards), given the sequence state of the
child (profile) at time t0.
Parameters
----------
profile : numpy.array
Sequence profile. Shape = (L, a),
where L - sequence length, a - alphabet size.
t : double
Time to propagate
return_log: bool
If True, return log-probability
Returns
-------
res : np.array
Profile of the sequence after time t in the past.
Shape = (L, a), where L - sequence length, a - alphabet size.
"""
Qt = self.expQt(t)
res = np.einsum('ai,ija->aj', profile, Qt)
return np.log(res) if return_log else res
def evolve(self, profile, t, return_log=False):
"""
Compute the probability of the sequence state of the child
at time t later, given the parent profile.
Parameters
----------
profile : numpy.array
Sequence profile. Shape = (L, a),
where L - sequence length, a - alphabet size.
t : double
Time to propagate
return_log: bool
If True, return log-probability
Returns
-------
res : np.array
Profile of the sequence after time t in the future.
Shape = (L, a), where L - sequence length, a - alphabet size.
"""
Qt = self.expQt(t)
res = np.einsum('ai,jia->aj', profile, Qt)
return np.log(res) if return_log else res
def prob_t(self, seq_p, seq_ch, t, pattern_multiplicity = None,
return_log=False, ignore_gaps=True):
"""
Compute the probability to observe seq_ch (child sequence) after time t starting from seq_p
(parent sequence).
Parameters
----------
seq_p : character array
Parent sequence
seq_c : character array
Child sequence
t : double
Time (branch len) separating the profiles.
pattern_multiplicity : numpy array
If sequences are reduced by combining identical alignment patterns,
these multplicities need to be accounted for when counting the number
of mutations across a branch. If None, all pattern are assumed to
occur exactly once.
return_log : bool
It True, return log-probability.
Returns
-------
prob : np.array
Resulting probability
"""
if t<0:
logP = -ttconf.BIG_NUMBER
else:
tmp_eQT = self.expQt(t)
bad_indices=(tmp_eQT==0)
logQt = np.log(tmp_eQT + ttconf.TINY_NUMBER*(bad_indices))
logQt[np.isnan(logQt) | np.isinf(logQt) | bad_indices] = -ttconf.BIG_NUMBER
seq_indices_c = np.zeros(len(seq_ch), dtype=int)
seq_indices_p = np.zeros(len(seq_p), dtype=int)
for ai, a in self.alphabet:
seq_indices_p[seq_p==a] = ai
seq_indices_c[seq_ch==a] = ai
if len(logQt.shape)==2:
logP = np.sum(logQt[seq_indices_p, seq_indices_c]*pattern_multiplicity)
else:
logP = np.sum(logQt[seq_indices_p, seq_indices_c, np.arange(len(seq_c))]*pattern_multiplicity)
return logP if return_log else np.exp(logP)
def average_rate(self):
return np.einsum('a,ia,ij,ja->a',self.mu, self.Pi, self.W, self.Pi)
......@@ -80,7 +80,7 @@ def _convolution_integrand(t_val, f, g,
# create the interpolation object on this grid
FG = Distribution(tau, fg, is_log=True, min_width = np.max([f.min_width, g.min_width]),
kind='linear')
kind='linear', assume_sorted=True)
return FG
......@@ -258,7 +258,6 @@ class NodeInterpolator (Distribution):
insert_point_idx[1:] = refine_factor
insert_point_idx[:-1] += refine_factor
# add additional points if there are any to add
if np.sum(insert_point_idx):
add_x = np.concatenate([np.linspace(t1,t2,n+2)[1:-1] for t1,t2,n in
zip(t_grid_0[1:-2], t_grid_0[2:-1], insert_point_idx) if n>0])
......
......@@ -89,31 +89,29 @@ profile_maps = {
},
'aa_nogap':{
'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Alanine Ala
'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Cysteine Cys
'D': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Aspartic AciD Asp
'E': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamic Acid Glu
'F': np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Phenylalanine Phe
'G': np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glycine Gly
'H': np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Histidine His
'I': np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Isoleucine Ile
'K': np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Lysine Lys
'L': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Leucine Leu
'M': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Methionine Met
'N': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #AsparagiNe Asn
'P': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Proline Pro
'Q': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine Gln
'R': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #ARginine Arg
'S': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Serine Ser
'T': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype='float'), #Threonine Thr
'V': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype='float'), #Valine Val
'W': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype='float'), #Tryptophan Trp
'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype='float'), #Tyrosine Tyr
'*': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #stop
'-': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #gap
'X': np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='float'), #not specified/any
'B': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Asparagine/Aspartic Acid Asx
'Z': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine/Glutamic Acid Glx
'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Alanine Ala
'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Cysteine Cys
'D': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Aspartic AciD Asp
'E': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamic Acid Glu
'F': np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Phenylalanine Phe
'G': np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glycine Gly
'H': np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Histidine His
'I': np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Isoleucine Ile
'K': np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Lysine Lys
'L': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Leucine Leu
'M': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Methionine Met
'N': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #AsparagiNe Asn
'P': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Proline Pro
'Q': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine Gln
'R': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype='float'), #ARginine Arg
'S': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype='float'), #Serine Ser
'T': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype='float'), #Threonine Thr
'V': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype='float'), #Valine Val
'W': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype='float'), #Tryptophan Trp
'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype='float'), #Tyrosine Tyr
'X': np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='float'), #not specified/any
'B': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Asparagine/Aspartic Acid Asx
'Z': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine/Glutamic Acid Glx
}
}
......@@ -172,20 +170,11 @@ def seq2prof(seq, profile_map):
Profile for the character. Zero array if the character not found
"""
plength = np.unique([len(x) for x in profile_map.values()])
if len(plength)==1:
n_states = plength[0]
else:
print("profile contains arrays of different length!")
return None
prof = np.array([profile_map[k] if k in profile_map
else np.ones(n_states) for k in seq ])
return np.array([profile_map[k] for k in seq])
bad_indices=(prof.sum(1) < 1e-5)
return prof[~bad_indices]
def prof2seq(profile, gtr, sample_from_prof=False):
def prof2seq(profile, gtr, sample_from_prof=False, normalize=True):
"""
Convert profile to sequence and normalize profile across sites.
......@@ -215,7 +204,10 @@ def prof2seq(profile, gtr, sample_from_prof=False):
"""
# normalize profile such that probabilities at each site sum to one
tmp_profile=(profile.T/profile.sum(axis=1)).T
if normalize:
tmp_profile, pre=normalize_profile(profile, return_offset=False)
else:
tmp_profile = profile
# sample sequence according to the probabilities in the profile
# (sampling from cumulative distribution over the different states)
......@@ -231,3 +223,30 @@ def prof2seq(profile, gtr, sample_from_prof=False):
return seq, prof_values, idx
def normalize_profile(in_profile, log=False, return_offset = True):
"""return a normalized version of a profile matrix
Parameters
----------
in_profile : np.array
shape Lxq, will be normalized to one across each row
log : bool, optional
treat the input as log probabilities
return_offset : bool, optional
return the log of the scale factor for each row
Returns
-------
tuple
normalized profile (fresh np object) and offset (if return_offset==True)
"""
if log:
tmp_prof = np.exp(in_profile) # - tmp_prefactor)
else:
tmp_prof = in_profile
norm_vector = tmp_prof.sum(axis=1)
return (np.copy(np.einsum('ai,a->ai',tmp_prof,1.0/norm_vector)),
np.log(norm_vector) if return_offset else None)
from __future__ import division, print_function, absolute_import
from collections import defaultdict
import numpy as np
from treetime import config as ttconf
from .seq_utils import alphabets, profile_maps, alphabet_synonyms, seq2array, seq2prof
from .gtr import GTR
from .treeanc import TreeAnc
class SeqGen(TreeAnc):
def __init__(self, *args, **kwargs):
super(SeqGen, self).__init__(reduce_alignment=False, **kwargs)
def sample_from_profile(self, p):
cum_p = p.cumsum(axis=1).T
prand = np.random.random(p.shape[0])
seq = self.gtr.alphabet[np.argmax(cum_p>prand, axis=0)]
return seq
def evolve(self, root_seq=None):
self.seq_len = self.gtr.seq_len
if root_seq:
self.tree.root.sequence = seq2array(root_seq)
else:
self.tree.root.sequence = self.sample_from_profile(self.gtr.Pi.T)
for n in self.tree.get_nonterminals(order='preorder'):
profile_p = seq2prof(n.sequence, self.gtr.profile_map)
for c in n:
profile = self.gtr.evolve(profile_p, c.branch_length)
c.sequence = self.sample_from_profile(profile)
self.make_reduced_alignment()
for n in self.tree.find_clades():
if n==self.tree.root:
n.mutations=[]
else:
n.mutations = self.get_mutations(n)
def get_aln(self, internal=False):
from Bio import SeqRecord, Seq
from Bio.Align import MultipleSeqAlignment
tmp = []
for n in self.tree.get_terminals():
if n.is_terminal() or internal:
tmp.append(SeqRecord.SeqRecord(id=n.name, name=n.name, description='', seq=Seq.Seq(''.join(n.sequence))))
return MultipleSeqAlignment(tmp)
This diff is collapsed.
......@@ -7,6 +7,7 @@ from .utils import tree_layout
from .clock_tree import ClockTree
rerooting_mechanisms = ["min_dev", "min_dev_ML", "best", "least-squares", "ML"]
deprecated_rerooting_mechanisms = {"residual":"least-squares", "res":"least-squares"}
class TreeTime(ClockTree):
"""
......@@ -158,7 +159,8 @@ class TreeTime(ClockTree):
else:
plot_rtt=False
reroot_mechanism = 'least-squares' if root=='clock_filter' else root
self.clock_filter(reroot=reroot_mechanism, n_iqd=n_iqd, plot=plot_rtt)
if self.clock_filter(reroot=reroot_mechanism, n_iqd=n_iqd, plot=plot_rtt)==ttconf.ERROR:
return ttconf.ERROR
elif root is not None:
if self.reroot(root=root)==ttconf.ERROR:
return ttconf.ERROR
......@@ -200,6 +202,7 @@ class TreeTime(ClockTree):
# estimate a relaxed molecular clock
if relaxed_clock:
print("relaxed_clock", relaxed_clock)
self.relaxed_clock(**relaxed_clock)
need_new_time_tree = True
......@@ -323,7 +326,8 @@ class TreeTime(ClockTree):
if reroot:
if reroot.startswith("ML"):
self.logger("TreeTime.ClockFilter: filtering with covariance aware methods is not recommended.", 0, warn=True)
self.reroot(root='least-squares' if reroot=='best' else reroot)
if self.reroot(root='least-squares' if reroot=='best' else reroot)==ttconf.ERROR:
return ttconf.ERROR
else:
self.get_clock_model(covariation=False)
......@@ -344,8 +348,8 @@ class TreeTime(ClockTree):
node.bad_branch=False
# redo root estimation after outlier removal
if reroot:
self.reroot(root=reroot)
if reroot and self.reroot(root=reroot)==ttconf.ERROR:
return ttconf.ERROR
if plot:
self.plot_root_to_tip()
......@@ -405,7 +409,12 @@ class TreeTime(ClockTree):
for n in self.tree.find_clades():
n.branch_length=n.mutation_length
if root in rerooting_mechanisms:
if root in rerooting_mechanisms or root in deprecated_rerooting_mechanisms:
if root in deprecated_rerooting_mechanisms:
self.logger('TreeTime.reroot: rerooting mechanisms %s has been renamed to %s'
%(root, deprecated_rerooting_mechanisms[root]), 1, warn=True)
root = deprecated_rerooting_mechanisms[root]
new_root = self._find_best_root(covariation='ML' in root,
force_positive=force_positive and (not root.startswith('min_dev')))
else:
......@@ -420,7 +429,7 @@ class TreeTime(ClockTree):
if n.raw_date_constraint is not None],
key=lambda x:np.mean(x.raw_date_constraint))[0]
else:
self.logger('TreeTime.reroot -- WARNING: unsupported rooting mechanisms or root not found',2,warn=True)
self.logger('TreeTime.reroot -- ERROR: unsupported rooting mechanisms or root not found',0,warn=True)
return ttconf.ERROR
#this forces a bifurcating root, as we want. Branch lengths will be reoptimized anyway.
......@@ -733,13 +742,13 @@ class TreeTime(ClockTree):
for node in self.tree.find_clades(order='preorder'):
if node.up is None:
node.gamma =- 0.5*node._k1/node._k2
node.gamma = max(0.1, -0.5*node._k1/node._k2)
else:
if node.up.up is None:
g_up = node.up.gamma
else:
g_up = node.up.branch_length_interpolator.gamma
node.branch_length_interpolator.gamma = (coupling*g_up - 0.5*node._k1)/(coupling+node._k2)
node.branch_length_interpolator.gamma = max(0.1,(coupling*g_up - 0.5*node._k1)/(coupling+node._k2))
###############################################################################
### rerooting
......
......@@ -538,6 +538,7 @@ def timetree(params):
branch_length_mode = branch_length_mode,
fixed_pi=fixed_pi)
if success==ttconf.ERROR: # if TreeTime.run failed, exit
print("\nTreeTime run FAILED: please check above for errors and/or rerun with --verbose 4.\n")
return 1
###########################################################################
......