Skip to content
Commits on Source (6)
bali-phy (3.0.2+dfsg-2) UNRELEASED; urgency=medium
bali-phy (3.0.3+dfsg-1) unstable; urgency=medium
* Team upload.
* Require version 0.45 of meson now
* New upstream version
* Standards-Version: 4.1.4
-- Andreas Tille <tille@debian.org> Fri, 20 Apr 2018 15:45:36 +0200
-- Andreas Tille <tille@debian.org> Fri, 20 Apr 2018 15:52:59 +0200
bali-phy (3.0.2+dfsg-1) unstable; urgency=medium
......
......@@ -15,7 +15,7 @@ Build-Depends: debhelper (>= 11),
libboost-system-dev,
libboost-filesystem-dev,
libboost-chrono-dev
Standards-Version: 4.1.3
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/bali-phy
Vcs-Git: https://salsa.debian.org/med-team/bali-phy.git
Homepage: http://www.bali-phy.org
......
This diff is collapsed.
......@@ -160,7 +160,15 @@ However, note that this might conflict with R installed from other places, such
</section>
<section><info><title>Install on Linux</title></info>
<section><info><title>Install BAli-Phy using executables from website</title></info>
<section><info><title>Install BAli-Phy using <command>apt-get</command> (recommended if using <emphasis>Debian: unstable</emphasis>)</title></info>
<screen><prompt>%</prompt> <userinput>sudo apt-get install bali-phy</userinput></screen>
Check that the executable runs:
<screen><prompt>%</prompt> <userinput>bali-phy --version</userinput></screen>
If you install with <command>apt-get</command>, you don't need to do anything extra to put bali-phy in your PATH.
</section>
<section><info><title>Install BAli-Phy using executables from website (alternative)</title></info>
<para>First install <command>wget</command>. If you have Debian or Ubuntu Linux, type:
<screen><prompt>%</prompt> <userinput>sudo apt-get install wget</userinput></screen>
......@@ -196,7 +204,7 @@ However, note that this might conflict with R installed from other places, such
<section><title>Is bali-phy in your PATH already?</title>
<para> First check if the executable is in your PATH.
<screen><prompt>%</prompt> <userinput>bali-phy --version</userinput></screen>
If this shows version info, then <command>bali-phy</command> is already in your PATH and you can skip this section. This should be true if you installed <command>bali-phy</command> using homebrew, or if you've already added it to your PATH.</para>
If this shows version info, then <command>bali-phy</command> is already in your PATH and you can skip this section. This should be true if you installed <command>bali-phy</command> using a package manager such as homebrew or apt, or if you've already added it to your PATH.</para>
<para>If bali-phy is not in your path, then you should see:
<screen><prompt>%</prompt> <userinput>bali-phy --version</userinput>
bali-phy: command not found.</screen>
......
No preview for this file type
This diff is collapsed.
......@@ -8,60 +8,63 @@
# SYNOPSIS
**alignment-thin** _alignment-file_
**alignment-thin** _alignment-file_ [OPTIONS]
# DESCRIPTION
Remove sequences or columns from an alignment.
# ALLOWED OPTIONS:
# GENERAL OPTIONS:
**-h**, **--help**
: produce help message
: Print usage information.
**--align** _arg_
: file with sequences and initial alignment
**-v**, **--verbose**
: Output more log messages on stderr.
**--find-dups** _arg_
: for each other sequence, find the closest sequence
# SEQUENCE FILTERING OPTIONS:
**--cutoff** _arg_
: only leave taxa with more mismatches than this value
: Keep only sequence with more mismatches than _arg_.
**--longer-than** _arg_
: only leave taxa w/ sequences longer than this
: Keep only sequences longer than _arg_.
**--shorter-than** _arg_
: only leave taxa w/ sequences shorter than this
**--down-to** _arg_
: number of taxa to keep
: Keep only sequence sequences shorter than _arg_.
**--keep** _arg_
: comma-separated list of taxon names to keep - remove others
: Keep only sequences in comma-separated list _arg_.
**--min-letters** _arg_
: Remove columns with fewer letters.
**--remove** _arg_
: Remove sequences in comma-separated list _arg_.
**-v**, **--verbose**
: Output more log messages on stderr.
**--down-to** _arg_
: Remove similar sequences down to _arg_ sequences.
**--show-lengths**
: just print out sequence lengths
**--remove-crazy** _arg_
: Remove _arg_ sequences that are missing too many conserved sites.
**--conserved** _arg_ (=0.75)
: Fraction of sequences that must contain a letter for it to be considered conserved.
**--sort**
: Sort partially ordered columns to minimize the number of visible indels.
# COLUMN FILTERING OPTIONS:
**--min-letters** _arg_
: Remove columns with fewer than _arg_ letters.
**--remove-unique** _arg_
: Remove insertions in a single sequence if longer than this many letters
: Remove insertions in a single sequence if longer than _arg_ letters
**--remove-crazy** _arg_
: Remove sequence that have deleted conserved sites
**--remove** _arg_
: comma-separated list of taxon names to remove
# OUTPUT OPTIONS:
**--sort**
: Sort partially ordered columns to group similar gaps.
**--conserved-fraction** _arg_ (=0.75)
: Fraction of sequences that must contain a letter for it to be considered conserved.
**--show-lengths**
: Just print out sequence lengths.
**--find-dups** _arg_
: For each sequence, find the closest other sequence.
# REPORTING BUGS:
......
% alignment-align(1)
% alignments-diff(1)
% Benjamin Redelings
% Feb 2018
# NAME
**alignment-align** -- Align two alignments for comparison.
**alignments-diff** -- Align two alignments for comparison.
# SYNOPSIS
**alignment-align** alignment-file1 alignment-file2 ... [OPTIONS] < alignments-file
**alignments-diff** alignment-file1 alignment-file2 [OPTIONS]
# DESCRIPTION
Align two alignments for comparison.
# ALLOWED OPTIONS:
# GENERAL OPTIONS:
**-h**, **--help**
: produce help message
**--alignment1** _arg_
: First alignment
**--alignment2** _arg_
: Second alignment
# INPUT OPTIONS:
**--alphabet** _arg_
: set to 'Codons' to prefer codon alphabets
# OUTPUT OPTIONS:
**--merge**
: Stack the two alignments into one alignment with duplicate names
......
......@@ -17,8 +17,7 @@ bali-phy -- Bayesian Alignment and Phylogeny estimation using MCMC
**bali-phy** estimates multiple sequence alignments and evolutionary trees
from DNA, amino acid, or codon sequences. BAli-Phy uses MCMC and Bayesian
methods to estimate evolutionary trees, positive selection, and branch
lengths while averaging over alternative alignments. BAli-Phy can display
alignment ambiguity graphically in an alignment uncertainty (AU) plot.
lengths while averaging over alternative alignments.
BAli-Phy can also estimate phylogenies from a fixed alignment (like MrBayes
and BEAST) using substitution models like GTR+gamma. BAli-Phy automatically
......
#!/bin/sh
mkdir -p man
for i in *.md ; do
pandoc -s $i --css ../man.css -o man/${i%%.md}.html
done
......@@ -102,6 +102,10 @@ for option_group in help['options']:
value = value if value else ''
has_short = (len(option) == 5)
description = option[-1].replace('[','\[').replace(']','\]')
# Change <arg> to _arg>
description = re.sub(r'<([^\s>][^>]*[^\s>])>',r'_\1_',description)
if has_short:
print("**-{}**{}{}, ".format(option[1],arg,value),end="")
print("**--{}**{}{}\n: {}\n".format(option[0],arg,value,description))
......
......@@ -4,7 +4,7 @@
# NAME
**trees-bootstrap** -- Usage: trees-bootstrap <file1> [<file2> ... ] --predicates <predicate file> [OPTIONS]
**trees-bootstrap** -- Compare support for partitions between different files.
# SYNOPSIS
......@@ -12,7 +12,7 @@
# DESCRIPTION
Usage: trees-bootstrap <file1> [<file2> ... ] --predicates <predicate file> [OPTIONS]
Compare support for partitions between different files.
# INPUT OPTIONS:
**-h**, **--help**
......
......@@ -18,7 +18,7 @@ the other parameters.
# Examples:
bali-phy bglobin.fasta --smodel=M8a_Test[freq_model=MG94] --Rao-Blackwellize=M8a_Test:posSelection
bali-phy bglobin.fasta --smodel=M8a\_Test[freq\_model=MG94] --Rao-Blackwellize=M8a_Test:posSelection
# See also:
......
......@@ -10,7 +10,7 @@ be referenced as the variable T.
The default branch-length prior is:
~iid[num_branches[T],Gamma[0.5,Div[2,num_branches[T]]]]
~iid[num\_branches[T],Gamma[0.5,Div[2,num\_branches[T]]]]
Since the expected total tree length is 1 under this prior, the
expected total tree length for each partition is given by the
......@@ -18,6 +18,6 @@ scale factor.
# Examples:
-B ~iid[num_branches[T],Gamma[0.5,Div[2,num_branches[T]]]]
-B ~iid[num\_branches[T],Gamma[0.5,Div[2,num\_branches[T]]]]
--branch-length iid[num_branches[T],~Exponential[Div[1,num_branches[T]]]]
--branch-length iid[num\_branches[T],~Exponential[Div[1,num\_branches[T]]]]
......@@ -14,7 +14,7 @@ MCMC moves have attributes that be used to disable them:
* topology: changes the topology
* lengths: changes the branch lengths
* alignment: changes the alignment
* alignment_branch: change the alignment of leaf sequences
* alignment\_branch: change the alignment of leaf sequences
This allows one to fix the topology by writing `--disable=topology`.
......
......@@ -19,7 +19,7 @@ However, enabling moves by attribute is probably a bad idea.
# Examples:
# Enable a disabled MCMC move
bali-phy dna.fasta --enable=sample_tri_branch
bali-phy dna.fasta --enable=sample\_tri\_branch
# See also:
......
......@@ -7,7 +7,7 @@ Pass a key-value pair into BAli-Phy. The <value> is a json value.
# Examples:
# Work twice at hard at resampling the alignment.
bali-phy dna.fasta --set alignment_sampling_factor=2.0
bali-phy dna.fasta --set alignment\_sampling\_factor=2.0
# Stop enforcing low gap probabilities after 10 iterations
bali-phy dna.fasta --set alignment-burnin=10
......
# https://github.com/mesonbuild/meson/blob/master/docs/markdown/Porting-from-autotools.md
# http://mesonbuild.com/Syntax.html
# http://mesonbuild.com/howtox.html#set-default-cc-language-version
project('bali-phy', ['cpp','c'],
version: '3.0.1',
version: '3.0.3',
default_options : [
'buildtype=release',
'cpp_std=c++14'
......
......@@ -11,6 +11,7 @@
#include <unsupported/Eigen/MatrixFunctions>
using std::vector;
using std::pair;
typedef Eigen::MatrixXd EMatrix;
double cdf(double eta, double t)
......@@ -74,6 +75,9 @@ Matrix JC_transition_p(double t)
// We need emission probabilities for 2*t, since t is the depth of the tree,
// but the distance between the tips is 2*t.
// So... this should be 0.25*JC_transition_p( ), but maybe leaving out the
// 0.25 doesn't hurt, since its a constant factor.
vector<Matrix> get_emission_probabilities(const vector<double>& t)
{
vector<Matrix> E(t.size());
......@@ -308,10 +312,177 @@ Matrix get_transition_probabilities(const vector<double>& B, const vector<double
return P;
}
EMatrix get_no_snp_matrix(const Matrix& T, const vector<Matrix>& emission)
{
// This is only valid for Jukes-Cantor
int n_bins = T.size1();
EMatrix M(n_bins,n_bins);
for(int j=0;j<n_bins;j++)
for(int k=0; k<n_bins; k++)
M(j,k) = T(j,k) * emission[k](0,0);
return M;
}
EMatrix get_snp_matrix(const Matrix& T, const vector<Matrix>& emission)
{
// This is only valid for Jukes-Cantor
int n_bins = T.size1();
EMatrix M(n_bins,n_bins);
for(int j=0;j<n_bins;j++)
for(int k=0; k<n_bins; k++)
M(j,k) = T(j,k) * emission[k](0,1);
return M;
}
EMatrix get_missing_matrix(const Matrix& T)
{
// This is only valid for Jukes-Cantor
int n_bins = T.size1();
EMatrix M(n_bins,n_bins);
for(int j=0;j<n_bins;j++)
for(int k=0; k<n_bins; k++)
M(j,k) = T(j,k);
return M;
}
constexpr double scale_factor = 115792089237316195423570985008687907853269984665640564039457584007913129639936e0;
constexpr double scale_min = 1.0/scale_factor;
constexpr double log_scale_min = -177.445678223345999210811423093293201427328034396225345054e0;
enum class site_t {unknown=0,poly,mono,missing};
inline site_t classify_site(int x1, int x2)
{
if (x1 < 0 or x2< 0)
return site_t::missing;
else if (x1 == x2)
return site_t::mono;
else
return site_t::poly;
}
void rescale(vector<double>& L, int& scale)
{
const int n_bins = L.size();
// Check if we need to scale the likelihoods
bool need_scale = true;
for(int k=0; k < n_bins; k++)
{
need_scale = need_scale and (L[k] < scale_min);
assert(0 <= L[k] and L[k] <= 1.0);
}
// Scale likelihoods if necessary
if (need_scale)
{
scale++;
for(int k=0; k < n_bins; k++)
L[k] *= scale_factor;
}
}
bool too_small(const EMatrix& M)
{
for(int j=0;j<M.rows();j++)
{
double max_k = 0;
for(int k=0;k<M.cols();k++)
max_k = std::max(max_k,M(j,k));
if (max_k < scale_min)
return true;
}
return false;
}
EMatrix square(const EMatrix& M)
{
return M*M;
}
int silly_log_2(int i)
{
assert(i > 0);
int count = 0;
while (i>>1)
{
i>>=1;
count++;
};
return count;
}
int silly_power_2(int i)
{
return (1<<i);
}
vector<EMatrix> matrix_binary_power(const EMatrix& M, int L)
{
vector<EMatrix> P {M};
do {
P.push_back(square(P.back()));
if (too_small(P.back()))
{
P.pop_back();
break;
}
} while(std::pow(2,P.size()) < L);
return P;
}
vector<pair<int,site_t>> classify_sites(const alignment& A)
{
vector<pair<int,site_t>> sites;
for(int l=1; l < A.length();)
{
site_t s = classify_site(A(l,0),A(l,1));
int count = 0;
do
{
l++;
count++;
}
while(l < A.length() and classify_site(A(l,0),A(l,1)) == s);
sites.push_back({count,s});
}
return sites;
}
void smc_group(vector<double>& L, vector<double>& L2, int& scale, const vector<EMatrix>& matrices, int length)
{
const int n_bins = L.size();
for(int i=0;i<length;)
{
int left = length - i;
int x = silly_log_2(left);
x = std::min<int>(x, matrices.size()-1);
int taking = silly_power_2(x);
auto& M = matrices[x];
for(int k=0; k < n_bins; k++)
{
double temp = 0;
for(int j=0;j<n_bins; j++)
temp += L[j] * M(j,k);
L2[k] = temp;
}
i += taking;
// Scale likelihoods if necessary
rescale(L2, scale);
// Swap current & next likelihoods
std::swap(L, L2);
}
}
log_double_t smc(double theta, double rho, const alignment& A)
{
assert(rho >= 0);
......@@ -330,48 +501,38 @@ log_double_t smc(double theta, double rho, const alignment& A)
// # Compute the likelihoods for the first column
const auto pi = get_equilibrium(bin_boundaries, 2.0/theta);
vector<double> L(n_bins);
vector<double> L2(n_bins);
int scale = 0;
// FIXME: I think we should be able to start at site -1 with L[i] = pi[i], if pi[i] is the equilibrium of T(j,k)
for(int i=0;i< n_bins; i++)
L[i] = pi[i] * emission_probabilities[i](A(0,0), A(0,1));
// # Iteratively compute likelihoods for remaining columns
const auto transition = get_transition_probabilities(bin_boundaries, bin_times, theta, rho);
for(int l=1; l < A.length(); l++)
{
bool need_scale = true;
vector<EMatrix> no_snp = matrix_binary_power(get_no_snp_matrix(transition, emission_probabilities), A.length());
int x0 = A(l,0);
int x1 = A(l,1);
vector<EMatrix> missing = matrix_binary_power(get_missing_matrix(transition), A.length());
for(int k=0; k < n_bins; k++)
{
double temp = 0;
for(int j=0;j<n_bins; j++)
temp += L[j] * transition(j,k);
temp *= emission_probabilities[k]( x0, x1 );
L2[k] = temp;
vector<EMatrix> snp = matrix_binary_power(get_snp_matrix(transition, emission_probabilities), A.length());
need_scale = need_scale and (temp < scale_min);
assert(0 <= L2[k] and L2[k] <= 1.0);
}
if (need_scale)
for(auto& group: classify_sites(A))
{
scale++;
for(int k=0; k < n_bins; k++)
L2[k] *= scale_factor;
}
std::swap(L, L2);
if (group.second == site_t::missing)
smc_group(L, L2, scale, missing, group.first);
else if (group.second == site_t::mono)
smc_group(L, L2, scale, no_snp, group.first);
else if (group.second == site_t::poly)
smc_group(L, L2, scale, snp, group.first);
else
std::abort();
}
// # Compute the total likelihood
log_double_t Pr = 1;
for(int i=0; i < n_bins; i++)
Pr += L[i];
log_double_t Pr (sum(L));
Pr.log() += log_scale_min * scale;
return Pr;
}
......
......@@ -52,7 +52,7 @@ public:
void resize(int s) {states_.resize(s);};
void reserve(int s) {states_.reserve(s);};
void push_back(char state) {states_.push_back(state);};
void push_back(char state) { assert(state != A2::states::E); states_.push_back(state);};
void push_back_match() {states_.push_back(A2::states::M);};
void push_back_insert() {states_.push_back(A2::states::G1);};
void push_back_delete() {states_.push_back(A2::states::G2);};
......@@ -105,6 +105,8 @@ namespace A2 {
pairwise_alignment_t get_pairwise_alignment(const matrix<int>& A, int n1, int n2);
pairwise_alignment_t get_pairwise_alignment_from_path(const std::vector<int>& path);
std::vector<int> get_path_from_pairwise_alignment(const pairwise_alignment_t& A);
}
pairwise_alignment_t get_pairwise_alignment_from_bits(const std::vector<HMM::bitmask_t>& bit_path, int n1, int n2);
......