Skip to content
Commits on Source (8)
......@@ -2,6 +2,17 @@
All notable changes to `libpll` will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [0.3.2] - 2017-07-12
### Added
- Optional per-rate category scalers for protein and generic kernels
- Hardware detection for APPLE builds defaults to assembly code
### Fixed
- Improved fix for negative p-matrix values.
- Derivatives computation for Lewis/Felsenstein ascertainment bias correction
- set_tipchars() for ascertainment bias correction with non-DNA sequences
- Excessive memory allocation when compressing site patterns
- Issue with PHYLIP parsing when header ends with CRLF
## [0.3.1] - 2017-05-17
### Added
- Checks for older versions of clang and gcc to use assembly instructions
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.63])
AC_INIT([libpll], [0.3.1], [Tomas.Flouri@h-its.org])
AC_INIT([libpll], [0.3.2], [Tomas.Flouri@h-its.org])
AM_INIT_AUTOMAKE([subdir-objects])
AC_LANG([C])
AC_CONFIG_SRCDIR([src/pll.c])
......
libpll (0.3.2-1) unstable; urgency=medium
* New upstream version
* debhelper 11
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
* Add symbols file
-- Andreas Tille <tille@debian.org> Mon, 28 May 2018 21:21:16 +0200
libpll (0.3.1-1) unstable; urgency=medium
* new upstream version incorporating all patches
......
Source: libpll
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Andreas Tille <tille@debian.org>
Build-Depends: debhelper (>= 10),
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
d-shlibs,
bison,
flex,
autoconf-archive
Standards-Version: 3.9.8
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/libpll.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/libpll.git
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/libpll
Vcs-Git: https://salsa.debian.org/med-team/libpll.git
Homepage: http://www.libpll.org/
Package: libpll0
Section: libs
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}
Section: libs
Depends: ${shlibs:Depends},
${misc:Depends}
Description: Phylogenetic Likelihood Library
PLL is a highly optimized, parallelized software library to ease the
development of new software tools dealing with phylogenetic inference.
......@@ -32,9 +33,11 @@ Description: Phylogenetic Likelihood Library
This package contains the dynamic library.
Package: libpll-dev
Section: libdevel
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, libpll0 (= ${binary:Version})
Section: libdevel
Depends: ${shlibs:Depends},
${misc:Depends},
libpll0 (= ${binary:Version})
Description: Phylogenetic Likelihood Library (development)
PLL is a highly optimized, parallelized software library to ease the
development of new software tools dealing with phylogenetic inference.
......
libpll.so.0 libpll0 #MINVER#
pll_aa_freqs_blosum62@Base 0.3.2
pll_aa_freqs_cprev@Base 0.3.2
pll_aa_freqs_dayhoff@Base 0.3.2
pll_aa_freqs_dcmut@Base 0.3.2
pll_aa_freqs_flu@Base 0.3.2
pll_aa_freqs_hivb@Base 0.3.2
pll_aa_freqs_hivw@Base 0.3.2
pll_aa_freqs_jtt@Base 0.3.2
pll_aa_freqs_jttdcmut@Base 0.3.2
pll_aa_freqs_lg4m@Base 0.3.2
pll_aa_freqs_lg4x@Base 0.3.2
pll_aa_freqs_lg@Base 0.3.2
pll_aa_freqs_mtart@Base 0.3.2
pll_aa_freqs_mtmam@Base 0.3.2
pll_aa_freqs_mtrev@Base 0.3.2
pll_aa_freqs_mtzoa@Base 0.3.2
pll_aa_freqs_pmb@Base 0.3.2
pll_aa_freqs_rtrev@Base 0.3.2
pll_aa_freqs_stmtrev@Base 0.3.2
pll_aa_freqs_vt@Base 0.3.2
pll_aa_freqs_wag@Base 0.3.2
pll_aa_rates_blosum62@Base 0.3.2
pll_aa_rates_cprev@Base 0.3.2
pll_aa_rates_dayhoff@Base 0.3.2
pll_aa_rates_dcmut@Base 0.3.2
pll_aa_rates_flu@Base 0.3.2
pll_aa_rates_hivb@Base 0.3.2
pll_aa_rates_hivw@Base 0.3.2
pll_aa_rates_jtt@Base 0.3.2
pll_aa_rates_jttdcmut@Base 0.3.2
pll_aa_rates_lg4m@Base 0.3.2
pll_aa_rates_lg4x@Base 0.3.2
pll_aa_rates_lg@Base 0.3.2
pll_aa_rates_mtart@Base 0.3.2
pll_aa_rates_mtmam@Base 0.3.2
pll_aa_rates_mtrev@Base 0.3.2
pll_aa_rates_mtzoa@Base 0.3.2
pll_aa_rates_pmb@Base 0.3.2
pll_aa_rates_rtrev@Base 0.3.2
pll_aa_rates_stmtrev@Base 0.3.2
pll_aa_rates_vt@Base 0.3.2
pll_aa_rates_wag@Base 0.3.2
pll_aligned_alloc@Base 0.3.2
pll_aligned_free@Base 0.3.2
pll_compress_site_patterns@Base 0.3.2
pll_compute_edge_loglikelihood@Base 0.3.2
pll_compute_gamma_cats@Base 0.3.2
pll_compute_likelihood_derivatives@Base 0.3.2
pll_compute_root_loglikelihood@Base 0.3.2
pll_core_create_lookup@Base 0.3.2
pll_core_create_lookup_20x20_avx@Base 0.3.2
pll_core_create_lookup_4x4@Base 0.3.2
pll_core_create_lookup_4x4_avx@Base 0.3.2
pll_core_create_lookup_4x4_sse@Base 0.3.2
pll_core_create_lookup_avx@Base 0.3.2
pll_core_create_lookup_sse@Base 0.3.2
pll_core_edge_loglikelihood_ii@Base 0.3.2
pll_core_edge_loglikelihood_ii_4x4_avx@Base 0.3.2
pll_core_edge_loglikelihood_ii_4x4_sse@Base 0.3.2
pll_core_edge_loglikelihood_ii_avx2@Base 0.3.2
pll_core_edge_loglikelihood_ii_avx@Base 0.3.2
pll_core_edge_loglikelihood_ii_sse@Base 0.3.2
pll_core_edge_loglikelihood_ti@Base 0.3.2
pll_core_edge_loglikelihood_ti_20x20_avx2@Base 0.3.2
pll_core_edge_loglikelihood_ti_20x20_avx@Base 0.3.2
pll_core_edge_loglikelihood_ti_4x4@Base 0.3.2
pll_core_edge_loglikelihood_ti_4x4_avx@Base 0.3.2
pll_core_edge_loglikelihood_ti_4x4_sse@Base 0.3.2
pll_core_edge_loglikelihood_ti_avx@Base 0.3.2
pll_core_edge_loglikelihood_ti_sse@Base 0.3.2
pll_core_likelihood_derivatives@Base 0.3.2
pll_core_likelihood_derivatives_avx2@Base 0.3.2
pll_core_likelihood_derivatives_avx@Base 0.3.2
pll_core_root_loglikelihood@Base 0.3.2
pll_core_root_loglikelihood_4x4_avx@Base 0.3.2
pll_core_root_loglikelihood_4x4_sse@Base 0.3.2
pll_core_root_loglikelihood_avx2@Base 0.3.2
pll_core_root_loglikelihood_avx@Base 0.3.2
pll_core_root_loglikelihood_sse@Base 0.3.2
pll_core_update_partial_ii@Base 0.3.2
pll_core_update_partial_ii_4x4_avx@Base 0.3.2
pll_core_update_partial_ii_4x4_sse@Base 0.3.2
pll_core_update_partial_ii_avx2@Base 0.3.2
pll_core_update_partial_ii_avx@Base 0.3.2
pll_core_update_partial_ii_sse@Base 0.3.2
pll_core_update_partial_ti@Base 0.3.2
pll_core_update_partial_ti_20x20_avx2@Base 0.3.2
pll_core_update_partial_ti_20x20_avx@Base 0.3.2
pll_core_update_partial_ti_4x4@Base 0.3.2
pll_core_update_partial_ti_4x4_avx@Base 0.3.2
pll_core_update_partial_ti_4x4_sse@Base 0.3.2
pll_core_update_partial_ti_avx2@Base 0.3.2
pll_core_update_partial_ti_avx@Base 0.3.2
pll_core_update_partial_ti_sse@Base 0.3.2
pll_core_update_partial_tt@Base 0.3.2
pll_core_update_partial_tt_4x4@Base 0.3.2
pll_core_update_partial_tt_4x4_avx@Base 0.3.2
pll_core_update_partial_tt_4x4_sse@Base 0.3.2
pll_core_update_partial_tt_avx@Base 0.3.2
pll_core_update_partial_tt_sse@Base 0.3.2
pll_core_update_pmatrix@Base 0.3.2
pll_core_update_pmatrix_20x20_avx2@Base 0.3.2
pll_core_update_pmatrix_20x20_avx@Base 0.3.2
pll_core_update_pmatrix_4x4_avx@Base 0.3.2
pll_core_update_pmatrix_4x4_sse@Base 0.3.2
pll_core_update_sumtable_ii@Base 0.3.2
pll_core_update_sumtable_ii_avx2@Base 0.3.2
pll_core_update_sumtable_ii_avx@Base 0.3.2
pll_core_update_sumtable_ii_sse@Base 0.3.2
pll_core_update_sumtable_ti@Base 0.3.2
pll_core_update_sumtable_ti_4x4@Base 0.3.2
pll_core_update_sumtable_ti_avx2@Base 0.3.2
pll_core_update_sumtable_ti_avx@Base 0.3.2
pll_core_update_sumtable_ti_sse@Base 0.3.2
pll_count_invariant_sites@Base 0.3.2
pll_dlist_append@Base 0.3.2
pll_dlist_prepend@Base 0.3.2
pll_dlist_remove@Base 0.3.2
pll_errmsg@Base 0.3.2
pll_errno@Base 0.3.2
pll_fasta_close@Base 0.3.2
pll_fasta_getfilepos@Base 0.3.2
pll_fasta_getfilesize@Base 0.3.2
pll_fasta_getnext@Base 0.3.2
pll_fasta_open@Base 0.3.2
pll_fasta_rewind@Base 0.3.2
pll_fastparsimony_edge_score@Base 0.3.2
pll_fastparsimony_edge_score_4x4@Base 0.3.2
pll_fastparsimony_edge_score_4x4_avx2@Base 0.3.2
pll_fastparsimony_edge_score_4x4_avx@Base 0.3.2
pll_fastparsimony_edge_score_4x4_sse@Base 0.3.2
pll_fastparsimony_edge_score_avx2@Base 0.3.2
pll_fastparsimony_edge_score_avx@Base 0.3.2
pll_fastparsimony_edge_score_sse@Base 0.3.2
pll_fastparsimony_init@Base 0.3.2
pll_fastparsimony_root_score@Base 0.3.2
pll_fastparsimony_stepwise@Base 0.3.2
pll_fastparsimony_update_vector@Base 0.3.2
pll_fastparsimony_update_vector_4x4@Base 0.3.2
pll_fastparsimony_update_vector_4x4_avx2@Base 0.3.2
pll_fastparsimony_update_vector_4x4_avx@Base 0.3.2
pll_fastparsimony_update_vector_4x4_sse@Base 0.3.2
pll_fastparsimony_update_vector_avx2@Base 0.3.2
pll_fastparsimony_update_vector_avx@Base 0.3.2
pll_fastparsimony_update_vector_sse@Base 0.3.2
pll_fastparsimony_update_vectors@Base 0.3.2
pll_hardware@Base 0.3.2
pll_hardware_dump@Base 0.3.2
pll_hardware_ignore@Base 0.3.2
pll_hardware_probe@Base 0.3.2
pll_initstate_r@Base 0.3.2
pll_map_aa@Base 0.3.2
pll_map_bin@Base 0.3.2
pll_map_fasta@Base 0.3.2
pll_map_nt@Base 0.3.2
pll_map_phylip@Base 0.3.2
pll_msa_destroy@Base 0.3.2
pll_parsimony_build@Base 0.3.2
pll_parsimony_create@Base 0.3.2
pll_parsimony_destroy@Base 0.3.2
pll_parsimony_reconstruct@Base 0.3.2
pll_parsimony_score@Base 0.3.2
pll_partition_create@Base 0.3.2
pll_partition_destroy@Base 0.3.2
pll_phylip_close@Base 0.3.2
pll_phylip_open@Base 0.3.2
pll_phylip_parse_interleaved@Base 0.3.2
pll_phylip_parse_sequential@Base 0.3.2
pll_phylip_rewind@Base 0.3.2
pll_random_r@Base 0.3.2
pll_rtree__create_buffer@Base 0.3.2
pll_rtree__delete_buffer@Base 0.3.2
pll_rtree__flex_debug@Base 0.3.2
pll_rtree__flush_buffer@Base 0.3.2
pll_rtree__scan_buffer@Base 0.3.2
pll_rtree__scan_bytes@Base 0.3.2
pll_rtree__scan_string@Base 0.3.2
pll_rtree__switch_to_buffer@Base 0.3.2
pll_rtree_alloc@Base 0.3.2
pll_rtree_char@Base 0.3.2
pll_rtree_colend@Base 0.3.2
pll_rtree_colstart@Base 0.3.2
pll_rtree_create_operations@Base 0.3.2
pll_rtree_create_pars_buildops@Base 0.3.2
pll_rtree_create_pars_recops@Base 0.3.2
pll_rtree_destroy@Base 0.3.2
pll_rtree_export_newick@Base 0.3.2
pll_rtree_free@Base 0.3.2
pll_rtree_get_debug@Base 0.3.2
pll_rtree_get_in@Base 0.3.2
pll_rtree_get_leng@Base 0.3.2
pll_rtree_get_lineno@Base 0.3.2
pll_rtree_get_out@Base 0.3.2
pll_rtree_get_text@Base 0.3.2
pll_rtree_graph_destroy@Base 0.3.2
pll_rtree_in@Base 0.3.2
pll_rtree_leng@Base 0.3.2
pll_rtree_lex@Base 0.3.2
pll_rtree_lex_destroy@Base 0.3.2
pll_rtree_lineno@Base 0.3.2
pll_rtree_lval@Base 0.3.2
pll_rtree_nerrs@Base 0.3.2
pll_rtree_out@Base 0.3.2
pll_rtree_parse@Base 0.3.2
pll_rtree_parse_newick@Base 0.3.2
pll_rtree_parse_newick_string@Base 0.3.2
pll_rtree_pop_buffer_state@Base 0.3.2
pll_rtree_push_buffer_state@Base 0.3.2
pll_rtree_realloc@Base 0.3.2
pll_rtree_reset_template_indices@Base 0.3.2
pll_rtree_restart@Base 0.3.2
pll_rtree_set_debug@Base 0.3.2
pll_rtree_set_in@Base 0.3.2
pll_rtree_set_lineno@Base 0.3.2
pll_rtree_set_out@Base 0.3.2
pll_rtree_show_ascii@Base 0.3.2
pll_rtree_text@Base 0.3.2
pll_rtree_traverse@Base 0.3.2
pll_rtree_unroot@Base 0.3.2
pll_rtree_wraptree@Base 0.3.2
pll_set_asc_bias_type@Base 0.3.2
pll_set_asc_state_weights@Base 0.3.2
pll_set_category_rates@Base 0.3.2
pll_set_category_weights@Base 0.3.2
pll_set_frequencies@Base 0.3.2
pll_set_parsimony_sequence@Base 0.3.2
pll_set_pattern_weights@Base 0.3.2
pll_set_subst_params@Base 0.3.2
pll_set_tip_clv@Base 0.3.2
pll_set_tip_states@Base 0.3.2
pll_setstate_r@Base 0.3.2
pll_show_clv@Base 0.3.2
pll_show_pmatrix@Base 0.3.2
pll_srandom_r@Base 0.3.2
pll_svg_attrib_create@Base 0.3.2
pll_svg_attrib_destroy@Base 0.3.2
pll_update_eigen@Base 0.3.2
pll_update_invariant_sites@Base 0.3.2
pll_update_invariant_sites_proportion@Base 0.3.2
pll_update_partials@Base 0.3.2
pll_update_prob_matrices@Base 0.3.2
pll_update_sumtable@Base 0.3.2
pll_utree__create_buffer@Base 0.3.2
pll_utree__delete_buffer@Base 0.3.2
pll_utree__flex_debug@Base 0.3.2
pll_utree__flush_buffer@Base 0.3.2
pll_utree__scan_buffer@Base 0.3.2
pll_utree__scan_bytes@Base 0.3.2
pll_utree__scan_string@Base 0.3.2
pll_utree__switch_to_buffer@Base 0.3.2
pll_utree_alloc@Base 0.3.2
pll_utree_char@Base 0.3.2
pll_utree_check_integrity@Base 0.3.2
pll_utree_clone@Base 0.3.2
pll_utree_colend@Base 0.3.2
pll_utree_colstart@Base 0.3.2
pll_utree_create_operations@Base 0.3.2
pll_utree_create_pars_buildops@Base 0.3.2
pll_utree_destroy@Base 0.3.2
pll_utree_every@Base 0.3.2
pll_utree_every_const@Base 0.3.2
pll_utree_export_newick@Base 0.3.2
pll_utree_export_svg@Base 0.3.2
pll_utree_free@Base 0.3.2
pll_utree_get_debug@Base 0.3.2
pll_utree_get_in@Base 0.3.2
pll_utree_get_leng@Base 0.3.2
pll_utree_get_lineno@Base 0.3.2
pll_utree_get_out@Base 0.3.2
pll_utree_get_text@Base 0.3.2
pll_utree_graph_clone@Base 0.3.2
pll_utree_graph_destroy@Base 0.3.2
pll_utree_in@Base 0.3.2
pll_utree_leng@Base 0.3.2
pll_utree_lex@Base 0.3.2
pll_utree_lex_destroy@Base 0.3.2
pll_utree_lineno@Base 0.3.2
pll_utree_lval@Base 0.3.2
pll_utree_nerrs@Base 0.3.2
pll_utree_nni@Base 0.3.2
pll_utree_out@Base 0.3.2
pll_utree_parse@Base 0.3.2
pll_utree_parse_newick@Base 0.3.2
pll_utree_parse_newick_string@Base 0.3.2
pll_utree_pop_buffer_state@Base 0.3.2
pll_utree_push_buffer_state@Base 0.3.2
pll_utree_realloc@Base 0.3.2
pll_utree_reset_template_indices@Base 0.3.2
pll_utree_restart@Base 0.3.2
pll_utree_rollback@Base 0.3.2
pll_utree_set_debug@Base 0.3.2
pll_utree_set_in@Base 0.3.2
pll_utree_set_lineno@Base 0.3.2
pll_utree_set_out@Base 0.3.2
pll_utree_show_ascii@Base 0.3.2
pll_utree_spr@Base 0.3.2
pll_utree_spr_safe@Base 0.3.2
pll_utree_text@Base 0.3.2
pll_utree_traverse@Base 0.3.2
pll_utree_wraptree@Base 0.3.2
......@@ -228,23 +228,23 @@ int main(int argc, char * argv[])
/* set the 5 tip CLVs, and use the pll_map_nt map for converting
the sequences to CLVs */
double * enc = encode_tacg("WAAAAB");
assert(pll_set_tip_clv(partition, 0, enc));
assert(pll_set_tip_clv(partition, 0, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("CACACD");
assert(pll_set_tip_clv(partition, 1, enc));
assert(pll_set_tip_clv(partition, 1, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("AGGACA");
assert(pll_set_tip_clv(partition, 2, enc));
assert(pll_set_tip_clv(partition, 2, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("CGTAGT");
assert(pll_set_tip_clv(partition, 3, enc));
assert(pll_set_tip_clv(partition, 3, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("CGAATT");
assert(pll_set_tip_clv(partition, 4, enc));
assert(pll_set_tip_clv(partition, 4, enc, PLL_FALSE));
free(enc);
/* update two probability matrices for the corresponding branch lengths */
......
.\" -*- coding: utf-8 -*-
.\" ============================================================================
.TH libpll 3 "May 17, 2017" "libpll 0.3.1" "Library Functions Manual"
.TH libpll 3 "July 12, 2017" "libpll 0.3.2" "Library Functions Manual"
.\" ============================================================================
.SH NAME
libpll \(em Phylogenetic Likelihood Library
......@@ -544,7 +544,7 @@ Source code and binaries are available at
<https://github.com/xflouris/libpll>.
.\" ============================================================================
.SH COPYRIGHT
Copyright (C) 2015-2016, Tomas Flouri, Diego Darriba
Copyright (C) 2015-2017, Tomas Flouri, Diego Darriba
.PP
All rights reserved.
.PP
......@@ -581,7 +581,7 @@ First public release.
Added faster vectorizations for 20-state and arbitrary-state models, unweighted
parsimony functions, randomized stepwise addition, portable functions for
parsing trees from C-strings, per-rate category scalers for preventing
numberical underflows. Modified newick exporting function to accept callbacks
numerical underflows. Modified newick exporting function to accept callbacks
for custom printing. Fixed derivatives computation, parsing of branch lengths,
invariant sites computation, log-likelihood computation for cases where we have
scaling and patterns, ascertainment bias computation, per-site log-likelihood
......@@ -591,5 +591,12 @@ computation, memory leaks. Added run-time detection of hardware.
Correct updating of padded eigen-decomposition arrays for models with a number
of states not being a power of two. Added portable hardware detection for clang
and GCC.
.TP
.BR v0.3.2\~ "released July 12th, 2017"
Added optional per-rate category scalers for protein and generic kernels.
Improved fix for negative transition probability matrices caused by numerics.
Fixed initialization of tip CLVs when using ascertainment bias correction with
non-DNA sequences. Fixed excessive memory allocation when compressing site
patterns and issue with PHYLIP parsing when header ends with CRLF.
.RE
.LP
......@@ -188,9 +188,9 @@ PLL_EXPORT unsigned int * pll_compress_site_patterns(char ** sequence,
}
/* allocate memory for the alignment */
memptr = column[0] = (char *)malloc((size_t)((*length)+1) *
(size_t)count *
sizeof(char *));
memptr = column[0] = (char *)malloc((size_t)(*length) *
(size_t)(count+1) *
sizeof(char));
if (!memptr)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
......
......@@ -371,9 +371,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
attrib);
}
unsigned int min_scaler;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
/* build sumtable: non-vectorized version, general case */
for (n = 0; n < sites; n++)
{
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
for (i = 0; i < rate_cats; ++i)
{
t_eigenvecs = eigenvecs[i];
......@@ -393,12 +430,18 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
tipstate >>= 1;
}
sum[j] = lefterm * righterm;
if (rate_scalings && rate_scalings[i] > 0)
sum[j] *= scale_minlh[rate_scalings[i]-1];
}
t_clvc += states_padded;
sum += states_padded;
}
}
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -652,6 +695,8 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states,
switch(asc_bias_type)
{
/* NOTE: since we compute derivatives of -logL, the signs below are flipped
* compared to RAxML ("+" for LEWIS and "-" for FELSENSTEIN) */
case PLL_ATTRIB_AB_LEWIS:
{
// TODO: pattern_weight_sum should be stored somewhere!
......@@ -660,16 +705,16 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states,
pattern_weight_sum += pattern_weights[n];
/* derivatives of log(1.0 - (sum Li(s) over states 's')) */
*d_f -= pattern_weight_sum * (asc_Lk[1] / (asc_Lk[0] - 1.0));
*dd_f -= pattern_weight_sum *
*d_f += pattern_weight_sum * (asc_Lk[1] / (asc_Lk[0] - 1.0));
*dd_f += pattern_weight_sum *
(((asc_Lk[0] - 1.0) * asc_Lk[2] - asc_Lk[1] * asc_Lk[1]) /
((asc_Lk[0] - 1.0) * (asc_Lk[0] - 1.0)));
}
break;
case PLL_ATTRIB_AB_FELSENSTEIN:
/* derivatives of log(sum Li(s) over states 's') */
*d_f += sum_w_inv * (asc_Lk[1] / asc_Lk[0]);
*dd_f += sum_w_inv *
*d_f -= sum_w_inv * (asc_Lk[1] / asc_Lk[0]);
*dd_f -= sum_w_inv *
(((asc_Lk[2] * asc_Lk[0]) - asc_Lk[1] * asc_Lk[1]) /
(asc_Lk[0] * asc_Lk[0]));
break;
......
......@@ -45,6 +45,7 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites,
unsigned int states = 4;
/* scaling stuff*/
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
......@@ -55,6 +56,13 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites,
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
......@@ -238,6 +246,32 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
/* scaling stuff */
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
/* padded eigenvecs */
double * tt_eigenvecs = (double *) pll_aligned_alloc (
(states_padded * states_padded * rate_cats) * sizeof(double),
......@@ -282,6 +316,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
/* vectorized loop from update_sumtable() */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
const double * c_eigenvecs = tt_eigenvecs;
const double * ct_inv_eigenvecs = tt_inv_eigenvecs;
for (i = 0; i < rate_cats; ++i)
......@@ -381,6 +435,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
/* update sum */
__m256d v_prod = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_prod = _mm256_mul_pd(v_prod, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd (sum + j, v_prod);
}
......@@ -392,6 +453,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
pll_aligned_free (tt_inv_eigenvecs);
pll_aligned_free (tt_eigenvecs);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -426,6 +489,13 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites,
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
......@@ -568,7 +638,6 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites,
pll_aligned_free(eigenvecs_trans);
pll_aligned_free(precomp_left);
if (rate_scalings)
free(rate_scalings);
......@@ -614,6 +683,32 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
const double * t_clvc = parent_clv;
const double * t_eigenvecs_padded;
/* scaling stuff */
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
double * eigenvecs_padded = (double *) pll_aligned_alloc (
(states_padded * states_padded * rate_cats) * sizeof(double),
PLL_ALIGNMENT_AVX);
......@@ -688,6 +783,25 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
/* build sumtable */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
tipstate = (unsigned int) left_tipchars[n];
unsigned int loffset = tipstate * span;
......@@ -757,6 +871,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
__m256d v_lefterm = _mm256_load_pd(t_precomp + j);
__m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd(sum + j, v_sum);
}
......@@ -768,6 +889,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
pll_aligned_free(eigenvecs_padded);
pll_aligned_free(precomp_left);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......
......@@ -18,7 +18,7 @@
Exelixis Lab, Heidelberg Institute for Theoretical Studies
Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
*/
#include <limits.h>
#include "pll.h"
PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
......@@ -63,6 +63,32 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
/* scaling stuff */
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
/* padded eigenvecs */
double * tt_eigenvecs = (double *) pll_aligned_alloc (
(states_padded * states_padded * rate_cats) * sizeof(double),
......@@ -107,6 +133,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
/* vectorized loop from update_sumtable() */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
const double * c_eigenvecs = tt_eigenvecs;
const double * ct_inv_eigenvecs = tt_inv_eigenvecs;
for (i = 0; i < rate_cats; ++i)
......@@ -201,6 +247,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
/* update sum */
__m256d v_prod = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_prod = _mm256_mul_pd(v_prod, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd (sum + j, v_prod);
}
......@@ -212,6 +265,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
pll_aligned_free (tt_inv_eigenvecs);
pll_aligned_free (tt_eigenvecs);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -255,6 +310,31 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
unsigned int i, j, k, n;
unsigned int tipstate;
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
double * sum = sumtable;
const double * t_clvc = parent_clv;
const double * t_eigenvecs_padded;
......@@ -332,6 +412,25 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
/* build sumtable */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
tipstate = (unsigned int) left_tipchars[n];
unsigned int loffset = tipstate * span;
......@@ -397,6 +496,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
__m256d v_lefterm = _mm256_load_pd(t_precomp + j);
__m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd(sum + j, v_sum);
}
......@@ -408,6 +514,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
pll_aligned_free(eigenvecs_padded);
pll_aligned_free(precomp_left);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......
......@@ -271,9 +271,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states,
unsigned int states_padded = (states+1) & 0xFFFFFFFE;
unsigned int min_scaler;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
/* build sumtable: non-vectorized version, general case */
for (n = 0; n < sites; n++)
{
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
for (i = 0; i < rate_cats; ++i)
{
ev = eigenvecs[i];
......@@ -293,11 +330,18 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states,
tipstate >>= 1;
}
sum[j] = lterm*rterm;
if (rate_scalings && rate_scalings[i] > 0)
sum[j] *= scale_minlh[rate_scalings[i]-1];
}
clvc += states_padded;
sum += states_padded;
}
}
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -439,7 +439,6 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
double terma, terma_r, termb;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int cstate;
unsigned int states_padded = states;
......@@ -480,7 +479,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -523,7 +523,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
else
{
......@@ -541,7 +542,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -584,7 +586,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
else
{
......@@ -602,7 +605,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -610,8 +614,50 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
}
#endif
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
pmat = pmatrix;
terma = 0;
for (i = 0; i < rate_cats; ++i)
......@@ -633,6 +679,12 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
pmat += states_padded;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
if (prop_invar > 0)
{
......@@ -649,13 +701,10 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
clvp += states_padded;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -667,6 +716,10 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
tipchars++;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -741,7 +794,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -785,7 +839,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -829,7 +884,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -857,9 +913,6 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
......@@ -886,6 +939,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
pmat = pmatrix;
terma = 0;
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......
......@@ -420,7 +420,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,m;
double logl = 0;
......@@ -434,7 +435,6 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states = 20;
unsigned int states_padded = states;
......@@ -445,6 +445,32 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
unsigned int span = states_padded * rate_cats;
unsigned int maxstates = tipmap_size;
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
/* precompute a lookup table of four values per entry (one for each state),
for all 16 states (including ambiguities) and for each rate category. */
double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double),
......@@ -510,6 +536,30 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
cstate = (unsigned int) tipchars[n];
unsigned int loffset = cstate*span;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
xmm1 = _mm256_setzero_pd();
......@@ -534,6 +584,12 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
xmm0 = _mm256_hadd_pd(xmm1,xmm1);
terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[2];
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -551,13 +607,11 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -569,6 +623,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
}
pll_aligned_free(lookup);
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -588,7 +644,8 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -602,7 +659,6 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
__m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
......@@ -610,6 +666,32 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
......@@ -617,6 +699,30 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
cstate = tipmap[tipchars[n]];
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -706,6 +812,12 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
clvp += 4;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -723,13 +835,11 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -739,6 +849,10 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -757,7 +871,8 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -771,17 +886,69 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
double terma, terma_r;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
__m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for precomputation.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -864,6 +1031,12 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
clvp += 4;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -882,14 +1055,11 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
clvc += states_padded;
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
scale_factors += (child_scaler) ? child_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -899,6 +1069,10 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......
......@@ -19,6 +19,7 @@
Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
*/
#include <limits.h>
#include "pll.h"
PLL_EXPORT double pll_core_root_loglikelihood_avx2(unsigned int states,
......@@ -122,7 +123,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,m = 0;
double logl = 0;
......@@ -136,7 +138,6 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states = 20;
unsigned int states_padded = states;
......@@ -144,6 +145,32 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
unsigned int span = states_padded * rate_cats;
unsigned int maxstates = tipmap_size;
......@@ -212,6 +239,30 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
cstate = (unsigned int) tipchars[n];
unsigned int loffset = cstate*span;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
xmm1 = _mm256_setzero_pd();
......@@ -234,6 +285,12 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
xmm0 = _mm256_hadd_pd(xmm1,xmm1);
terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[2];
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -251,13 +308,11 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -269,6 +324,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
}
pll_aligned_free(lookup);
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -288,7 +345,8 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -302,17 +360,69 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
double terma, terma_r;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
__m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -391,6 +501,12 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
clvp += 4;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -409,14 +525,11 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
clvc += states_padded;
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
scale_factors += (child_scaler) ? child_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -426,5 +539,9 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -202,7 +202,8 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -216,7 +217,6 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states_padded = (states+1) & 0xFFFFFFFE;
__m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
......@@ -225,6 +225,32 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
xmm5 = _mm_setzero_pd();
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
......@@ -232,6 +258,30 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
cstate = tipmap[tipchars[n]];
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -294,6 +344,12 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
clvp += 2;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -311,13 +367,11 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -327,6 +381,10 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -345,7 +403,8 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -359,17 +418,69 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
double terma, terma_r;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int states_padded = (states+1) & 0xFFFFFFFE;
__m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -424,6 +535,12 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
clvp += 2;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -442,14 +559,11 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
clvc += states_padded;
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
scale_factors += (child_scaler) ? child_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -459,6 +573,10 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......
......@@ -377,9 +377,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites,
{
unsigned int states = 4;
unsigned int n,k,i;
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
const double * lmat;
const double * rmat;
......@@ -389,6 +386,14 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites,
unsigned int span = states * rate_cats;
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
if (!parent_scaler)
{
/* scaling disabled / not required */
......@@ -404,9 +409,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites,
fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
for (n = 0; n < sites; ++n)
{
lmat = left_matrix;
......@@ -556,8 +558,11 @@ PLL_EXPORT void pll_core_update_partial_tt_avx(unsigned int states,
return;
}
size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ?
sites*rate_cats : sites;
if (parent_scaler)
memset(parent_scaler, 0, sizeof(unsigned int) * sites);
memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size);
for (n = 0; n < sites; ++n)
{
......@@ -627,7 +632,6 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -666,18 +670,36 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
right_matrix,
right_scaler,
tipmap,
tipmap_size);
tipmap_size,
attrib);
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
size_t displacement = (states_padded - states) * (states_padded);
__m256i mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
/* compute CLV */
for (n = 0; n < sites; ++n)
......@@ -685,16 +707,17 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = tipmap[left_tipchars[n]];
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_terma0 = _mm256_setzero_pd();
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_terma1 = _mm256_setzero_pd();
......@@ -820,14 +843,32 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -837,12 +878,11 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
parent_clv += states_padded;
right_clv += states_padded;
}
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......@@ -1064,25 +1104,25 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
const double * right_matrix,
const unsigned int * right_scaler,
const unsigned int * tipmap,
unsigned int tipmap_size)
unsigned int tipmap_size,
unsigned int attrib)
{
unsigned int states = 20;
unsigned int states_padded = states;
unsigned int maxstates = tipmap_size;
unsigned int scaling;
unsigned int i,j,k,n,m;
const double * lmat;
const double * rmat;
unsigned int span = states_padded * rate_cats;
unsigned int span_padded = states_padded * rate_cats;
unsigned int lstate;
__m256d xmm0,xmm1,xmm2,xmm3;
/* precompute a lookup table of four values per entry (one for each state),
for all 16 states (including ambiguities) and for each rate category. */
double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double),
double * lookup = pll_aligned_alloc(maxstates*span_padded*sizeof(double),
PLL_ALIGNMENT_AVX);
if (!lookup)
{
......@@ -1136,31 +1176,48 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
}
}
/* update the parent scaler with the scaler of the right child */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* iterate over sites and compute CLV entries */
for (n = 0; n < sites; ++n)
{
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = (unsigned int) left_tipchar[n];
unsigned int loffset = lstate*span;
unsigned int loffset = lstate*span_padded;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_termb1 = _mm256_setzero_pd();
__m256d v_termb2 = _mm256_setzero_pd();
......@@ -1230,13 +1287,31 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -1248,18 +1323,16 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span;
for (i = 0; i < span; i += 4)
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod,v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_clv += span;
parent_clv += span_padded;
parent_scaler[n] += 1;
}
}
......@@ -1280,7 +1353,6 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -1305,27 +1377,44 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* compute CLV */
for (n = 0; n < sites; ++n)
{
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_terma0 = _mm256_setzero_pd();
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_terma1 = _mm256_setzero_pd();
......@@ -1436,13 +1525,31 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -1456,10 +1563,8 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......
......@@ -21,7 +21,7 @@
#include "pll.h"
static void fill_parent_scaler(unsigned int sites,
static void fill_parent_scaler(unsigned int scaler_size,
unsigned int * parent_scaler,
const unsigned int * left_scaler,
const unsigned int * right_scaler)
......@@ -29,19 +29,19 @@ static void fill_parent_scaler(unsigned int sites,
unsigned int i;
if (!left_scaler && !right_scaler)
memset(parent_scaler, 0, sizeof(unsigned int) * sites);
memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size);
else if (left_scaler && right_scaler)
{
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites);
for (i = 0; i < sites; ++i)
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size);
for (i = 0; i < scaler_size; ++i)
parent_scaler[i] += right_scaler[i];
}
else
{
if (left_scaler)
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites);
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size);
else
memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * sites);
memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * scaler_size);
}
}
......@@ -61,7 +61,6 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -101,18 +100,36 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
right_matrix,
right_scaler,
tipmap,
tipmap_size);
tipmap_size,
attrib);
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
size_t displacement = (states_padded - states) * (states_padded);
__m256i mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
/* compute CLV */
for (n = 0; n < sites; ++n)
......@@ -120,12 +137,14 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = tipmap[left_tipchars[n]];
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
......@@ -251,14 +270,32 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -268,12 +305,11 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
parent_clv += states_padded;
right_clv += states_padded;
}
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......@@ -298,25 +334,25 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
const double * right_matrix,
const unsigned int * right_scaler,
const unsigned int * tipmap,
unsigned int tipmap_size)
unsigned int tipmap_size,
unsigned int attrib)
{
unsigned int states = 20;
unsigned int states_padded = states;
unsigned int maxstates = tipmap_size;
unsigned int scaling;
unsigned int i,j,k,n,m;
const double * lmat;
const double * rmat;
unsigned int span = states_padded * rate_cats;
unsigned int span_padded = states_padded * rate_cats;
unsigned int lstate;
__m256d xmm0,xmm1,xmm2,xmm3;
/* precompute a lookup table of four values per entry (one for each state),
for all 16 states (including ambiguities) and for each rate category. */
double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double),
double * lookup = pll_aligned_alloc(maxstates*span_padded*sizeof(double),
PLL_ALIGNMENT_AVX);
if (!lookup)
{
......@@ -370,31 +406,48 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
}
}
/* update the parent scaler with the scaler of the right child */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* iterate over sites and compute CLV entries */
for (n = 0; n < sites; ++n)
{
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = (unsigned int) left_tipchar[n];
unsigned int loffset = lstate*span;
unsigned int loffset = lstate*span_padded;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_termb1 = _mm256_setzero_pd();
__m256d v_termb2 = _mm256_setzero_pd();
......@@ -460,13 +513,31 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -478,18 +549,16 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span;
for (i = 0; i < span; i += 4)
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod,v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_clv += span;
parent_clv += span_padded;
parent_scaler[n] += 1;
}
}
......@@ -510,7 +579,6 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -536,27 +604,44 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* compute CLV */
for (n = 0; n < sites; ++n)
{
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_terma0 = _mm256_setzero_pd();
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_terma1 = _mm256_setzero_pd();
......@@ -664,13 +749,31 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -684,10 +787,8 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......