Skip to content
Commits on Source (8)
......@@ -2,6 +2,17 @@
All notable changes to `libpll` will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [0.3.2] - 2017-07-12
### Added
- Optional per-rate category scalers for protein and generic kernels
- Hardware detection for APPLE builds defaults to assembly code
### Fixed
- Improved fix for negative p-matrix values.
- Derivatives computation for Lewis/Felsenstein ascertainment bias correction
- set_tipchars() for ascertainment bias correction with non-DNA sequences
- Excessive memory allocation when compressing site patterns
- Issue with PHYLIP parsing when header ends with CRLF
## [0.3.1] - 2017-05-17
### Added
- Checks for older versions of clang and gcc to use assembly instructions
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.63])
AC_INIT([libpll], [0.3.1], [Tomas.Flouri@h-its.org])
AC_INIT([libpll], [0.3.2], [Tomas.Flouri@h-its.org])
AM_INIT_AUTOMAKE([subdir-objects])
AC_LANG([C])
AC_CONFIG_SRCDIR([src/pll.c])
......
libpll (0.3.2-1) unstable; urgency=medium
* New upstream version
* debhelper 11
* Point Vcs fields to salsa.debian.org
* Standards-Version: 4.1.4
* Add symbols file
-- Andreas Tille <tille@debian.org> Mon, 28 May 2018 21:21:16 +0200
libpll (0.3.1-1) unstable; urgency=medium
* new upstream version incorporating all patches
......
Source: libpll
Section: science
Priority: optional
Maintainer: Debian Med Packaging Team <debian-med-packaging@lists.alioth.debian.org>
Uploaders: Andreas Tille <tille@debian.org>
Build-Depends: debhelper (>= 10),
Section: science
Priority: optional
Build-Depends: debhelper (>= 11~),
d-shlibs,
bison,
flex,
autoconf-archive
Standards-Version: 3.9.8
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/libpll.git
Vcs-Git: https://anonscm.debian.org/git/debian-med/libpll.git
Standards-Version: 4.1.4
Vcs-Browser: https://salsa.debian.org/med-team/libpll
Vcs-Git: https://salsa.debian.org/med-team/libpll.git
Homepage: http://www.libpll.org/
Package: libpll0
Section: libs
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}
Section: libs
Depends: ${shlibs:Depends},
${misc:Depends}
Description: Phylogenetic Likelihood Library
PLL is a highly optimized, parallelized software library to ease the
development of new software tools dealing with phylogenetic inference.
......@@ -32,9 +33,11 @@ Description: Phylogenetic Likelihood Library
This package contains the dynamic library.
Package: libpll-dev
Section: libdevel
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, libpll0 (= ${binary:Version})
Section: libdevel
Depends: ${shlibs:Depends},
${misc:Depends},
libpll0 (= ${binary:Version})
Description: Phylogenetic Likelihood Library (development)
PLL is a highly optimized, parallelized software library to ease the
development of new software tools dealing with phylogenetic inference.
......
libpll.so.0 libpll0 #MINVER#
pll_aa_freqs_blosum62@Base 0.3.2
pll_aa_freqs_cprev@Base 0.3.2
pll_aa_freqs_dayhoff@Base 0.3.2
pll_aa_freqs_dcmut@Base 0.3.2
pll_aa_freqs_flu@Base 0.3.2
pll_aa_freqs_hivb@Base 0.3.2
pll_aa_freqs_hivw@Base 0.3.2
pll_aa_freqs_jtt@Base 0.3.2
pll_aa_freqs_jttdcmut@Base 0.3.2
pll_aa_freqs_lg4m@Base 0.3.2
pll_aa_freqs_lg4x@Base 0.3.2
pll_aa_freqs_lg@Base 0.3.2
pll_aa_freqs_mtart@Base 0.3.2
pll_aa_freqs_mtmam@Base 0.3.2
pll_aa_freqs_mtrev@Base 0.3.2
pll_aa_freqs_mtzoa@Base 0.3.2
pll_aa_freqs_pmb@Base 0.3.2
pll_aa_freqs_rtrev@Base 0.3.2
pll_aa_freqs_stmtrev@Base 0.3.2
pll_aa_freqs_vt@Base 0.3.2
pll_aa_freqs_wag@Base 0.3.2
pll_aa_rates_blosum62@Base 0.3.2
pll_aa_rates_cprev@Base 0.3.2
pll_aa_rates_dayhoff@Base 0.3.2
pll_aa_rates_dcmut@Base 0.3.2
pll_aa_rates_flu@Base 0.3.2
pll_aa_rates_hivb@Base 0.3.2
pll_aa_rates_hivw@Base 0.3.2
pll_aa_rates_jtt@Base 0.3.2
pll_aa_rates_jttdcmut@Base 0.3.2
pll_aa_rates_lg4m@Base 0.3.2
pll_aa_rates_lg4x@Base 0.3.2
pll_aa_rates_lg@Base 0.3.2
pll_aa_rates_mtart@Base 0.3.2
pll_aa_rates_mtmam@Base 0.3.2
pll_aa_rates_mtrev@Base 0.3.2
pll_aa_rates_mtzoa@Base 0.3.2
pll_aa_rates_pmb@Base 0.3.2
pll_aa_rates_rtrev@Base 0.3.2
pll_aa_rates_stmtrev@Base 0.3.2
pll_aa_rates_vt@Base 0.3.2
pll_aa_rates_wag@Base 0.3.2
pll_aligned_alloc@Base 0.3.2
pll_aligned_free@Base 0.3.2
pll_compress_site_patterns@Base 0.3.2
pll_compute_edge_loglikelihood@Base 0.3.2
pll_compute_gamma_cats@Base 0.3.2
pll_compute_likelihood_derivatives@Base 0.3.2
pll_compute_root_loglikelihood@Base 0.3.2
pll_core_create_lookup@Base 0.3.2
pll_core_create_lookup_20x20_avx@Base 0.3.2
pll_core_create_lookup_4x4@Base 0.3.2
pll_core_create_lookup_4x4_avx@Base 0.3.2
pll_core_create_lookup_4x4_sse@Base 0.3.2
pll_core_create_lookup_avx@Base 0.3.2
pll_core_create_lookup_sse@Base 0.3.2
pll_core_edge_loglikelihood_ii@Base 0.3.2
pll_core_edge_loglikelihood_ii_4x4_avx@Base 0.3.2
pll_core_edge_loglikelihood_ii_4x4_sse@Base 0.3.2
pll_core_edge_loglikelihood_ii_avx2@Base 0.3.2
pll_core_edge_loglikelihood_ii_avx@Base 0.3.2
pll_core_edge_loglikelihood_ii_sse@Base 0.3.2
pll_core_edge_loglikelihood_ti@Base 0.3.2
pll_core_edge_loglikelihood_ti_20x20_avx2@Base 0.3.2
pll_core_edge_loglikelihood_ti_20x20_avx@Base 0.3.2
pll_core_edge_loglikelihood_ti_4x4@Base 0.3.2
pll_core_edge_loglikelihood_ti_4x4_avx@Base 0.3.2
pll_core_edge_loglikelihood_ti_4x4_sse@Base 0.3.2
pll_core_edge_loglikelihood_ti_avx@Base 0.3.2
pll_core_edge_loglikelihood_ti_sse@Base 0.3.2
pll_core_likelihood_derivatives@Base 0.3.2
pll_core_likelihood_derivatives_avx2@Base 0.3.2
pll_core_likelihood_derivatives_avx@Base 0.3.2
pll_core_root_loglikelihood@Base 0.3.2
pll_core_root_loglikelihood_4x4_avx@Base 0.3.2
pll_core_root_loglikelihood_4x4_sse@Base 0.3.2
pll_core_root_loglikelihood_avx2@Base 0.3.2
pll_core_root_loglikelihood_avx@Base 0.3.2
pll_core_root_loglikelihood_sse@Base 0.3.2
pll_core_update_partial_ii@Base 0.3.2
pll_core_update_partial_ii_4x4_avx@Base 0.3.2
pll_core_update_partial_ii_4x4_sse@Base 0.3.2
pll_core_update_partial_ii_avx2@Base 0.3.2
pll_core_update_partial_ii_avx@Base 0.3.2
pll_core_update_partial_ii_sse@Base 0.3.2
pll_core_update_partial_ti@Base 0.3.2
pll_core_update_partial_ti_20x20_avx2@Base 0.3.2
pll_core_update_partial_ti_20x20_avx@Base 0.3.2
pll_core_update_partial_ti_4x4@Base 0.3.2
pll_core_update_partial_ti_4x4_avx@Base 0.3.2
pll_core_update_partial_ti_4x4_sse@Base 0.3.2
pll_core_update_partial_ti_avx2@Base 0.3.2
pll_core_update_partial_ti_avx@Base 0.3.2
pll_core_update_partial_ti_sse@Base 0.3.2
pll_core_update_partial_tt@Base 0.3.2
pll_core_update_partial_tt_4x4@Base 0.3.2
pll_core_update_partial_tt_4x4_avx@Base 0.3.2
pll_core_update_partial_tt_4x4_sse@Base 0.3.2
pll_core_update_partial_tt_avx@Base 0.3.2
pll_core_update_partial_tt_sse@Base 0.3.2
pll_core_update_pmatrix@Base 0.3.2
pll_core_update_pmatrix_20x20_avx2@Base 0.3.2
pll_core_update_pmatrix_20x20_avx@Base 0.3.2
pll_core_update_pmatrix_4x4_avx@Base 0.3.2
pll_core_update_pmatrix_4x4_sse@Base 0.3.2
pll_core_update_sumtable_ii@Base 0.3.2
pll_core_update_sumtable_ii_avx2@Base 0.3.2
pll_core_update_sumtable_ii_avx@Base 0.3.2
pll_core_update_sumtable_ii_sse@Base 0.3.2
pll_core_update_sumtable_ti@Base 0.3.2
pll_core_update_sumtable_ti_4x4@Base 0.3.2
pll_core_update_sumtable_ti_avx2@Base 0.3.2
pll_core_update_sumtable_ti_avx@Base 0.3.2
pll_core_update_sumtable_ti_sse@Base 0.3.2
pll_count_invariant_sites@Base 0.3.2
pll_dlist_append@Base 0.3.2
pll_dlist_prepend@Base 0.3.2
pll_dlist_remove@Base 0.3.2
pll_errmsg@Base 0.3.2
pll_errno@Base 0.3.2
pll_fasta_close@Base 0.3.2
pll_fasta_getfilepos@Base 0.3.2
pll_fasta_getfilesize@Base 0.3.2
pll_fasta_getnext@Base 0.3.2
pll_fasta_open@Base 0.3.2
pll_fasta_rewind@Base 0.3.2
pll_fastparsimony_edge_score@Base 0.3.2
pll_fastparsimony_edge_score_4x4@Base 0.3.2
pll_fastparsimony_edge_score_4x4_avx2@Base 0.3.2
pll_fastparsimony_edge_score_4x4_avx@Base 0.3.2
pll_fastparsimony_edge_score_4x4_sse@Base 0.3.2
pll_fastparsimony_edge_score_avx2@Base 0.3.2
pll_fastparsimony_edge_score_avx@Base 0.3.2
pll_fastparsimony_edge_score_sse@Base 0.3.2
pll_fastparsimony_init@Base 0.3.2
pll_fastparsimony_root_score@Base 0.3.2
pll_fastparsimony_stepwise@Base 0.3.2
pll_fastparsimony_update_vector@Base 0.3.2
pll_fastparsimony_update_vector_4x4@Base 0.3.2
pll_fastparsimony_update_vector_4x4_avx2@Base 0.3.2
pll_fastparsimony_update_vector_4x4_avx@Base 0.3.2
pll_fastparsimony_update_vector_4x4_sse@Base 0.3.2
pll_fastparsimony_update_vector_avx2@Base 0.3.2
pll_fastparsimony_update_vector_avx@Base 0.3.2
pll_fastparsimony_update_vector_sse@Base 0.3.2
pll_fastparsimony_update_vectors@Base 0.3.2
pll_hardware@Base 0.3.2
pll_hardware_dump@Base 0.3.2
pll_hardware_ignore@Base 0.3.2
pll_hardware_probe@Base 0.3.2
pll_initstate_r@Base 0.3.2
pll_map_aa@Base 0.3.2
pll_map_bin@Base 0.3.2
pll_map_fasta@Base 0.3.2
pll_map_nt@Base 0.3.2
pll_map_phylip@Base 0.3.2
pll_msa_destroy@Base 0.3.2
pll_parsimony_build@Base 0.3.2
pll_parsimony_create@Base 0.3.2
pll_parsimony_destroy@Base 0.3.2
pll_parsimony_reconstruct@Base 0.3.2
pll_parsimony_score@Base 0.3.2
pll_partition_create@Base 0.3.2
pll_partition_destroy@Base 0.3.2
pll_phylip_close@Base 0.3.2
pll_phylip_open@Base 0.3.2
pll_phylip_parse_interleaved@Base 0.3.2
pll_phylip_parse_sequential@Base 0.3.2
pll_phylip_rewind@Base 0.3.2
pll_random_r@Base 0.3.2
pll_rtree__create_buffer@Base 0.3.2
pll_rtree__delete_buffer@Base 0.3.2
pll_rtree__flex_debug@Base 0.3.2
pll_rtree__flush_buffer@Base 0.3.2
pll_rtree__scan_buffer@Base 0.3.2
pll_rtree__scan_bytes@Base 0.3.2
pll_rtree__scan_string@Base 0.3.2
pll_rtree__switch_to_buffer@Base 0.3.2
pll_rtree_alloc@Base 0.3.2
pll_rtree_char@Base 0.3.2
pll_rtree_colend@Base 0.3.2
pll_rtree_colstart@Base 0.3.2
pll_rtree_create_operations@Base 0.3.2
pll_rtree_create_pars_buildops@Base 0.3.2
pll_rtree_create_pars_recops@Base 0.3.2
pll_rtree_destroy@Base 0.3.2
pll_rtree_export_newick@Base 0.3.2
pll_rtree_free@Base 0.3.2
pll_rtree_get_debug@Base 0.3.2
pll_rtree_get_in@Base 0.3.2
pll_rtree_get_leng@Base 0.3.2
pll_rtree_get_lineno@Base 0.3.2
pll_rtree_get_out@Base 0.3.2
pll_rtree_get_text@Base 0.3.2
pll_rtree_graph_destroy@Base 0.3.2
pll_rtree_in@Base 0.3.2
pll_rtree_leng@Base 0.3.2
pll_rtree_lex@Base 0.3.2
pll_rtree_lex_destroy@Base 0.3.2
pll_rtree_lineno@Base 0.3.2
pll_rtree_lval@Base 0.3.2
pll_rtree_nerrs@Base 0.3.2
pll_rtree_out@Base 0.3.2
pll_rtree_parse@Base 0.3.2
pll_rtree_parse_newick@Base 0.3.2
pll_rtree_parse_newick_string@Base 0.3.2
pll_rtree_pop_buffer_state@Base 0.3.2
pll_rtree_push_buffer_state@Base 0.3.2
pll_rtree_realloc@Base 0.3.2
pll_rtree_reset_template_indices@Base 0.3.2
pll_rtree_restart@Base 0.3.2
pll_rtree_set_debug@Base 0.3.2
pll_rtree_set_in@Base 0.3.2
pll_rtree_set_lineno@Base 0.3.2
pll_rtree_set_out@Base 0.3.2
pll_rtree_show_ascii@Base 0.3.2
pll_rtree_text@Base 0.3.2
pll_rtree_traverse@Base 0.3.2
pll_rtree_unroot@Base 0.3.2
pll_rtree_wraptree@Base 0.3.2
pll_set_asc_bias_type@Base 0.3.2
pll_set_asc_state_weights@Base 0.3.2
pll_set_category_rates@Base 0.3.2
pll_set_category_weights@Base 0.3.2
pll_set_frequencies@Base 0.3.2
pll_set_parsimony_sequence@Base 0.3.2
pll_set_pattern_weights@Base 0.3.2
pll_set_subst_params@Base 0.3.2
pll_set_tip_clv@Base 0.3.2
pll_set_tip_states@Base 0.3.2
pll_setstate_r@Base 0.3.2
pll_show_clv@Base 0.3.2
pll_show_pmatrix@Base 0.3.2
pll_srandom_r@Base 0.3.2
pll_svg_attrib_create@Base 0.3.2
pll_svg_attrib_destroy@Base 0.3.2
pll_update_eigen@Base 0.3.2
pll_update_invariant_sites@Base 0.3.2
pll_update_invariant_sites_proportion@Base 0.3.2
pll_update_partials@Base 0.3.2
pll_update_prob_matrices@Base 0.3.2
pll_update_sumtable@Base 0.3.2
pll_utree__create_buffer@Base 0.3.2
pll_utree__delete_buffer@Base 0.3.2
pll_utree__flex_debug@Base 0.3.2
pll_utree__flush_buffer@Base 0.3.2
pll_utree__scan_buffer@Base 0.3.2
pll_utree__scan_bytes@Base 0.3.2
pll_utree__scan_string@Base 0.3.2
pll_utree__switch_to_buffer@Base 0.3.2
pll_utree_alloc@Base 0.3.2
pll_utree_char@Base 0.3.2
pll_utree_check_integrity@Base 0.3.2
pll_utree_clone@Base 0.3.2
pll_utree_colend@Base 0.3.2
pll_utree_colstart@Base 0.3.2
pll_utree_create_operations@Base 0.3.2
pll_utree_create_pars_buildops@Base 0.3.2
pll_utree_destroy@Base 0.3.2
pll_utree_every@Base 0.3.2
pll_utree_every_const@Base 0.3.2
pll_utree_export_newick@Base 0.3.2
pll_utree_export_svg@Base 0.3.2
pll_utree_free@Base 0.3.2
pll_utree_get_debug@Base 0.3.2
pll_utree_get_in@Base 0.3.2
pll_utree_get_leng@Base 0.3.2
pll_utree_get_lineno@Base 0.3.2
pll_utree_get_out@Base 0.3.2
pll_utree_get_text@Base 0.3.2
pll_utree_graph_clone@Base 0.3.2
pll_utree_graph_destroy@Base 0.3.2
pll_utree_in@Base 0.3.2
pll_utree_leng@Base 0.3.2
pll_utree_lex@Base 0.3.2
pll_utree_lex_destroy@Base 0.3.2
pll_utree_lineno@Base 0.3.2
pll_utree_lval@Base 0.3.2
pll_utree_nerrs@Base 0.3.2
pll_utree_nni@Base 0.3.2
pll_utree_out@Base 0.3.2
pll_utree_parse@Base 0.3.2
pll_utree_parse_newick@Base 0.3.2
pll_utree_parse_newick_string@Base 0.3.2
pll_utree_pop_buffer_state@Base 0.3.2
pll_utree_push_buffer_state@Base 0.3.2
pll_utree_realloc@Base 0.3.2
pll_utree_reset_template_indices@Base 0.3.2
pll_utree_restart@Base 0.3.2
pll_utree_rollback@Base 0.3.2
pll_utree_set_debug@Base 0.3.2
pll_utree_set_in@Base 0.3.2
pll_utree_set_lineno@Base 0.3.2
pll_utree_set_out@Base 0.3.2
pll_utree_show_ascii@Base 0.3.2
pll_utree_spr@Base 0.3.2
pll_utree_spr_safe@Base 0.3.2
pll_utree_text@Base 0.3.2
pll_utree_traverse@Base 0.3.2
pll_utree_wraptree@Base 0.3.2
......@@ -228,23 +228,23 @@ int main(int argc, char * argv[])
/* set the 5 tip CLVs, and use the pll_map_nt map for converting
the sequences to CLVs */
double * enc = encode_tacg("WAAAAB");
assert(pll_set_tip_clv(partition, 0, enc));
assert(pll_set_tip_clv(partition, 0, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("CACACD");
assert(pll_set_tip_clv(partition, 1, enc));
assert(pll_set_tip_clv(partition, 1, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("AGGACA");
assert(pll_set_tip_clv(partition, 2, enc));
assert(pll_set_tip_clv(partition, 2, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("CGTAGT");
assert(pll_set_tip_clv(partition, 3, enc));
assert(pll_set_tip_clv(partition, 3, enc, PLL_FALSE));
free(enc);
enc = encode_tacg("CGAATT");
assert(pll_set_tip_clv(partition, 4, enc));
assert(pll_set_tip_clv(partition, 4, enc, PLL_FALSE));
free(enc);
/* update two probability matrices for the corresponding branch lengths */
......
.\" -*- coding: utf-8 -*-
.\" ============================================================================
.TH libpll 3 "May 17, 2017" "libpll 0.3.1" "Library Functions Manual"
.TH libpll 3 "July 12, 2017" "libpll 0.3.2" "Library Functions Manual"
.\" ============================================================================
.SH NAME
libpll \(em Phylogenetic Likelihood Library
......@@ -544,7 +544,7 @@ Source code and binaries are available at
<https://github.com/xflouris/libpll>.
.\" ============================================================================
.SH COPYRIGHT
Copyright (C) 2015-2016, Tomas Flouri, Diego Darriba
Copyright (C) 2015-2017, Tomas Flouri, Diego Darriba
.PP
All rights reserved.
.PP
......@@ -581,7 +581,7 @@ First public release.
Added faster vectorizations for 20-state and arbitrary-state models, unweighted
parsimony functions, randomized stepwise addition, portable functions for
parsing trees from C-strings, per-rate category scalers for preventing
numberical underflows. Modified newick exporting function to accept callbacks
numerical underflows. Modified newick exporting function to accept callbacks
for custom printing. Fixed derivatives computation, parsing of branch lengths,
invariant sites computation, log-likelihood computation for cases where we have
scaling and patterns, ascertainment bias computation, per-site log-likelihood
......@@ -591,5 +591,12 @@ computation, memory leaks. Added run-time detection of hardware.
Correct updating of padded eigen-decomposition arrays for models with a number
of states not being a power of two. Added portable hardware detection for clang
and GCC.
.TP
.BR v0.3.2\~ "released July 12th, 2017"
Added optional per-rate category scalers for protein and generic kernels.
Improved fix for negative transition probability matrices caused by numerics.
Fixed initialization of tip CLVs when using ascertainment bias correction with
non-DNA sequences. Fixed excessive memory allocation when compressing site
patterns and issue with PHYLIP parsing when header ends with CRLF.
.RE
.LP
......@@ -188,9 +188,9 @@ PLL_EXPORT unsigned int * pll_compress_site_patterns(char ** sequence,
}
/* allocate memory for the alignment */
memptr = column[0] = (char *)malloc((size_t)((*length)+1) *
(size_t)count *
sizeof(char *));
memptr = column[0] = (char *)malloc((size_t)(*length) *
(size_t)(count+1) *
sizeof(char));
if (!memptr)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
......
......@@ -371,9 +371,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
attrib);
}
unsigned int min_scaler;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
/* build sumtable: non-vectorized version, general case */
for (n = 0; n < sites; n++)
{
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
for (i = 0; i < rate_cats; ++i)
{
t_eigenvecs = eigenvecs[i];
......@@ -393,12 +430,18 @@ PLL_EXPORT int pll_core_update_sumtable_ti(unsigned int states,
tipstate >>= 1;
}
sum[j] = lefterm * righterm;
if (rate_scalings && rate_scalings[i] > 0)
sum[j] *= scale_minlh[rate_scalings[i]-1];
}
t_clvc += states_padded;
sum += states_padded;
}
}
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -652,6 +695,8 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states,
switch(asc_bias_type)
{
/* NOTE: since we compute derivatives of -logL, the signs below are flipped
* compared to RAxML ("+" for LEWIS and "-" for FELSENSTEIN) */
case PLL_ATTRIB_AB_LEWIS:
{
// TODO: pattern_weight_sum should be stored somewhere!
......@@ -660,16 +705,16 @@ PLL_EXPORT int pll_core_likelihood_derivatives(unsigned int states,
pattern_weight_sum += pattern_weights[n];
/* derivatives of log(1.0 - (sum Li(s) over states 's')) */
*d_f -= pattern_weight_sum * (asc_Lk[1] / (asc_Lk[0] - 1.0));
*dd_f -= pattern_weight_sum *
*d_f += pattern_weight_sum * (asc_Lk[1] / (asc_Lk[0] - 1.0));
*dd_f += pattern_weight_sum *
(((asc_Lk[0] - 1.0) * asc_Lk[2] - asc_Lk[1] * asc_Lk[1]) /
((asc_Lk[0] - 1.0) * (asc_Lk[0] - 1.0)));
}
break;
case PLL_ATTRIB_AB_FELSENSTEIN:
/* derivatives of log(sum Li(s) over states 's') */
*d_f += sum_w_inv * (asc_Lk[1] / asc_Lk[0]);
*dd_f += sum_w_inv *
*d_f -= sum_w_inv * (asc_Lk[1] / asc_Lk[0]);
*dd_f -= sum_w_inv *
(((asc_Lk[2] * asc_Lk[0]) - asc_Lk[1] * asc_Lk[1]) /
(asc_Lk[0] * asc_Lk[0]));
break;
......
......@@ -45,6 +45,7 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites,
unsigned int states = 4;
/* scaling stuff*/
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
......@@ -55,6 +56,13 @@ static int core_update_sumtable_ii_4x4_avx(unsigned int sites,
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
......@@ -238,6 +246,32 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
/* scaling stuff */
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
/* padded eigenvecs */
double * tt_eigenvecs = (double *) pll_aligned_alloc (
(states_padded * states_padded * rate_cats) * sizeof(double),
......@@ -282,6 +316,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
/* vectorized loop from update_sumtable() */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
const double * c_eigenvecs = tt_eigenvecs;
const double * ct_inv_eigenvecs = tt_inv_eigenvecs;
for (i = 0; i < rate_cats; ++i)
......@@ -381,6 +435,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
/* update sum */
__m256d v_prod = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_prod = _mm256_mul_pd(v_prod, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd (sum + j, v_prod);
}
......@@ -392,6 +453,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx(unsigned int states,
pll_aligned_free (tt_inv_eigenvecs);
pll_aligned_free (tt_eigenvecs);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -426,6 +489,13 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites,
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
......@@ -568,7 +638,6 @@ static int core_update_sumtable_ti_4x4_avx(unsigned int sites,
pll_aligned_free(eigenvecs_trans);
pll_aligned_free(precomp_left);
if (rate_scalings)
free(rate_scalings);
......@@ -614,6 +683,32 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
const double * t_clvc = parent_clv;
const double * t_eigenvecs_padded;
/* scaling stuff */
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
double * eigenvecs_padded = (double *) pll_aligned_alloc (
(states_padded * states_padded * rate_cats) * sizeof(double),
PLL_ALIGNMENT_AVX);
......@@ -688,6 +783,25 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
/* build sumtable */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
tipstate = (unsigned int) left_tipchars[n];
unsigned int loffset = tipstate * span;
......@@ -757,6 +871,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
__m256d v_lefterm = _mm256_load_pd(t_precomp + j);
__m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd(sum + j, v_sum);
}
......@@ -768,6 +889,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx(unsigned int states,
pll_aligned_free(eigenvecs_padded);
pll_aligned_free(precomp_left);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......
......@@ -18,7 +18,7 @@
Exelixis Lab, Heidelberg Institute for Theoretical Studies
Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
*/
#include <limits.h>
#include "pll.h"
PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
......@@ -63,6 +63,32 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
/* scaling stuff */
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
/* padded eigenvecs */
double * tt_eigenvecs = (double *) pll_aligned_alloc (
(states_padded * states_padded * rate_cats) * sizeof(double),
......@@ -107,6 +133,26 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
/* vectorized loop from update_sumtable() */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
const double * c_eigenvecs = tt_eigenvecs;
const double * ct_inv_eigenvecs = tt_inv_eigenvecs;
for (i = 0; i < rate_cats; ++i)
......@@ -201,6 +247,13 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
/* update sum */
__m256d v_prod = _mm256_mul_pd (v_lefterm_sum, v_righterm_sum);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_prod = _mm256_mul_pd(v_prod, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd (sum + j, v_prod);
}
......@@ -212,6 +265,8 @@ PLL_EXPORT int pll_core_update_sumtable_ii_avx2(unsigned int states,
pll_aligned_free (tt_inv_eigenvecs);
pll_aligned_free (tt_eigenvecs);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -255,6 +310,31 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
unsigned int i, j, k, n;
unsigned int tipstate;
unsigned int min_scaler = 0;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
__m256d v_scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf (pll_errmsg, 200, "Cannot allocate memory for rate scalers");
return PLL_FAILURE;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
v_scale_minlh[i] = _mm256_set1_pd(scale_factor);
}
}
double * sum = sumtable;
const double * t_clvc = parent_clv;
const double * t_eigenvecs_padded;
......@@ -332,6 +412,25 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
/* build sumtable */
for (n = 0; n < sites; n++)
{
/* compute per-rate scalers and obtain minimum value (within site) */
if (per_rate_scaling)
{
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
tipstate = (unsigned int) left_tipchars[n];
unsigned int loffset = tipstate * span;
......@@ -397,6 +496,13 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
__m256d v_lefterm = _mm256_load_pd(t_precomp + j);
__m256d v_sum = _mm256_mul_pd(v_lefterm, v_righterm);
/* apply per-rate scalers */
if (rate_scalings && rate_scalings[i] > 0)
{
v_sum = _mm256_mul_pd(v_sum, v_scale_minlh[rate_scalings[i]-1]);
}
_mm256_store_pd(sum + j, v_sum);
}
......@@ -408,6 +514,8 @@ PLL_EXPORT int pll_core_update_sumtable_ti_avx2(unsigned int states,
pll_aligned_free(eigenvecs_padded);
pll_aligned_free(precomp_left);
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......
......@@ -271,9 +271,46 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states,
unsigned int states_padded = (states+1) & 0xFFFFFFFE;
unsigned int min_scaler;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
/* build sumtable: non-vectorized version, general case */
for (n = 0; n < sites; n++)
{
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
min_scaler = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < min_scaler)
min_scaler = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - min_scaler,
PLL_SCALE_RATE_MAXDIFF);
}
}
for (i = 0; i < rate_cats; ++i)
{
ev = eigenvecs[i];
......@@ -293,11 +330,18 @@ PLL_EXPORT int pll_core_update_sumtable_ti_sse(unsigned int states,
tipstate >>= 1;
}
sum[j] = lterm*rterm;
if (rate_scalings && rate_scalings[i] > 0)
sum[j] *= scale_minlh[rate_scalings[i]-1];
}
clvc += states_padded;
sum += states_padded;
}
}
if (rate_scalings)
free(rate_scalings);
return PLL_SUCCESS;
}
......@@ -439,7 +439,6 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
double terma, terma_r, termb;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int cstate;
unsigned int states_padded = states;
......@@ -480,7 +479,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -523,7 +523,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
else
{
......@@ -541,7 +542,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -584,7 +586,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
else
{
......@@ -602,7 +605,8 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -610,8 +614,50 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
}
#endif
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
pmat = pmatrix;
terma = 0;
for (i = 0; i < rate_cats; ++i)
......@@ -633,6 +679,12 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
pmat += states_padded;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
if (prop_invar > 0)
{
......@@ -649,13 +701,10 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
clvp += states_padded;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -667,6 +716,10 @@ double pll_core_edge_loglikelihood_ti(unsigned int states,
tipchars++;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -741,7 +794,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -785,7 +839,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -829,7 +884,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
invar_proportion,
invar_indices,
freqs_indices,
persite_lnl);
persite_lnl,
attrib);
}
/* this line is never called, but should we disable the else case above,
then states_padded must be set to this value */
......@@ -857,9 +913,6 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
......@@ -886,6 +939,8 @@ double pll_core_edge_loglikelihood_ii(unsigned int states,
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
pmat = pmatrix;
terma = 0;
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......
......@@ -420,7 +420,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,m;
double logl = 0;
......@@ -434,7 +435,6 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states = 20;
unsigned int states_padded = states;
......@@ -445,6 +445,32 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
unsigned int span = states_padded * rate_cats;
unsigned int maxstates = tipmap_size;
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
/* precompute a lookup table of four values per entry (one for each state),
for all 16 states (including ambiguities) and for each rate category. */
double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double),
......@@ -510,6 +536,30 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
cstate = (unsigned int) tipchars[n];
unsigned int loffset = cstate*span;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
xmm1 = _mm256_setzero_pd();
......@@ -534,6 +584,12 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
xmm0 = _mm256_hadd_pd(xmm1,xmm1);
terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[2];
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -551,13 +607,11 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -569,6 +623,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx(unsigned int sites,
}
pll_aligned_free(lookup);
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -588,7 +644,8 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -602,7 +659,6 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
__m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
......@@ -610,6 +666,32 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
......@@ -617,6 +699,30 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
cstate = tipmap[tipchars[n]];
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -706,6 +812,12 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
clvp += 4;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -723,13 +835,11 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -739,6 +849,10 @@ double pll_core_edge_loglikelihood_ti_avx(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -757,7 +871,8 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -771,17 +886,69 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
double terma, terma_r;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
__m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for precomputation.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -864,6 +1031,12 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
clvp += 4;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -882,14 +1055,11 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
clvc += states_padded;
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
scale_factors += (child_scaler) ? child_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -899,6 +1069,10 @@ double pll_core_edge_loglikelihood_ii_avx(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......
......@@ -19,6 +19,7 @@
Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
*/
#include <limits.h>
#include "pll.h"
PLL_EXPORT double pll_core_root_loglikelihood_avx2(unsigned int states,
......@@ -122,7 +123,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,m = 0;
double logl = 0;
......@@ -136,7 +138,6 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states = 20;
unsigned int states_padded = states;
......@@ -144,6 +145,32 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
unsigned int span = states_padded * rate_cats;
unsigned int maxstates = tipmap_size;
......@@ -212,6 +239,30 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
cstate = (unsigned int) tipchars[n];
unsigned int loffset = cstate*span;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
xmm1 = _mm256_setzero_pd();
......@@ -234,6 +285,12 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
xmm0 = _mm256_hadd_pd(xmm1,xmm1);
terma_r = ((double *)&xmm0)[0] + ((double *)&xmm0)[2];
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -251,13 +308,11 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -269,6 +324,8 @@ double pll_core_edge_loglikelihood_ti_20x20_avx2(unsigned int sites,
}
pll_aligned_free(lookup);
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -288,7 +345,8 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -302,17 +360,69 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
double terma, terma_r;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int states_padded = (states+3) & 0xFFFFFFFC;
__m256d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -391,6 +501,12 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
clvp += 4;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -409,14 +525,11 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
clvc += states_padded;
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
scale_factors += (child_scaler) ? child_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -426,5 +539,9 @@ double pll_core_edge_loglikelihood_ii_avx2(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -202,7 +202,8 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -216,7 +217,6 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
double site_lk, inv_site_lk;
unsigned int cstate;
unsigned int scale_factors;
unsigned int states_padded = (states+1) & 0xFFFFFFFE;
__m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
......@@ -225,6 +225,32 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
xmm5 = _mm_setzero_pd();
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
......@@ -232,6 +258,30 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
cstate = tipmap[tipchars[n]];
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -294,6 +344,12 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
clvp += 2;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -311,13 +367,11 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -327,6 +381,10 @@ double pll_core_edge_loglikelihood_ti_sse(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......@@ -345,7 +403,8 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
const double * invar_proportion,
const int * invar_indices,
const unsigned int * freqs_indices,
double * persite_lnl)
double * persite_lnl,
unsigned int attrib)
{
unsigned int n,i,j,k;
double logl = 0;
......@@ -359,17 +418,69 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
double terma, terma_r;
double site_lk, inv_site_lk;
unsigned int scale_factors;
unsigned int states_padded = (states+1) & 0xFFFFFFFE;
__m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
size_t displacement = (states_padded - states) * (states_padded);
/* scaling stuff */
unsigned int site_scalings;
unsigned int * rate_scalings = NULL;
int per_rate_scaling = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 1 : 0;
/* powers of scale threshold for undoing the scaling */
double scale_minlh[PLL_SCALE_RATE_MAXDIFF];
if (per_rate_scaling)
{
rate_scalings = (unsigned int*) calloc(rate_cats, sizeof(unsigned int));
if (!rate_scalings)
{
pll_errno = PLL_ERROR_MEM_ALLOC;
snprintf(pll_errmsg, 200, "Cannot allocate space for rate scalers.");
return -INFINITY;
}
double scale_factor = 1.0;
for (i = 0; i < PLL_SCALE_RATE_MAXDIFF; ++i)
{
scale_factor *= PLL_SCALE_THRESHOLD;
scale_minlh[i] = scale_factor;
}
}
for (n = 0; n < sites; ++n)
{
pmat = pmatrix;
terma = 0;
if (per_rate_scaling)
{
/* compute minimum per-rate scaler -> common per-site scaler */
site_scalings = UINT_MAX;
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = (parent_scaler) ? parent_scaler[n*rate_cats+i] : 0;
rate_scalings[i] += (child_scaler) ? child_scaler[n*rate_cats+i] : 0;
if (rate_scalings[i] < site_scalings)
site_scalings = rate_scalings[i];
}
/* compute relative capped per-rate scalers */
for (i = 0; i < rate_cats; ++i)
{
rate_scalings[i] = PLL_MIN(rate_scalings[i] - site_scalings,
PLL_SCALE_RATE_MAXDIFF);
}
}
else
{
/* count number of scaling factors to account for */
site_scalings = (parent_scaler) ? parent_scaler[n] : 0;
site_scalings += (child_scaler) ? child_scaler[n] : 0;
}
for (i = 0; i < rate_cats; ++i)
{
freqs = frequencies[freqs_indices[i]];
......@@ -424,6 +535,12 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
clvp += 2;
}
/* apply per-rate scalers, if necessary */
if (rate_scalings && rate_scalings[i] > 0)
{
terma_r *= scale_minlh[rate_scalings[i]-1];
}
/* account for invariant sites */
prop_invar = invar_proportion ? invar_proportion[freqs_indices[i]] : 0;
if (prop_invar > 0)
......@@ -442,14 +559,11 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
clvc += states_padded;
pmat -= displacement;
}
/* count number of scaling factors to account for */
scale_factors = (parent_scaler) ? parent_scaler[n] : 0;
scale_factors += (child_scaler) ? child_scaler[n] : 0;
/* compute site log-likelihood and scale if necessary */
site_lk = log(terma);
if (scale_factors)
site_lk += scale_factors * log(PLL_SCALE_THRESHOLD);
if (site_scalings)
site_lk += site_scalings * log(PLL_SCALE_THRESHOLD);
site_lk *= pattern_weights[n];
......@@ -459,6 +573,10 @@ double pll_core_edge_loglikelihood_ii_sse(unsigned int states,
logl += site_lk;
}
if (rate_scalings)
free(rate_scalings);
return logl;
}
......
......@@ -377,9 +377,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites,
{
unsigned int states = 4;
unsigned int n,k,i;
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
const double * lmat;
const double * rmat;
......@@ -389,6 +386,14 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites,
unsigned int span = states * rate_cats;
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
if (!parent_scaler)
{
/* scaling disabled / not required */
......@@ -404,9 +409,6 @@ PLL_EXPORT void pll_core_update_partial_ii_4x4_avx(unsigned int sites,
fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
for (n = 0; n < sites; ++n)
{
lmat = left_matrix;
......@@ -556,8 +558,11 @@ PLL_EXPORT void pll_core_update_partial_tt_avx(unsigned int states,
return;
}
size_t scaler_size = (attrib & PLL_ATTRIB_RATE_SCALERS) ?
sites*rate_cats : sites;
if (parent_scaler)
memset(parent_scaler, 0, sizeof(unsigned int) * sites);
memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size);
for (n = 0; n < sites; ++n)
{
......@@ -627,7 +632,6 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -666,18 +670,36 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
right_matrix,
right_scaler,
tipmap,
tipmap_size);
tipmap_size,
attrib);
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
size_t displacement = (states_padded - states) * (states_padded);
__m256i mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
/* compute CLV */
for (n = 0; n < sites; ++n)
......@@ -685,16 +707,17 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = tipmap[left_tipchars[n]];
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_terma0 = _mm256_setzero_pd();
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_terma1 = _mm256_setzero_pd();
......@@ -820,14 +843,32 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -837,12 +878,11 @@ PLL_EXPORT void pll_core_update_partial_ti_avx(unsigned int states,
parent_clv += states_padded;
right_clv += states_padded;
}
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......@@ -1064,25 +1104,25 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
const double * right_matrix,
const unsigned int * right_scaler,
const unsigned int * tipmap,
unsigned int tipmap_size)
unsigned int tipmap_size,
unsigned int attrib)
{
unsigned int states = 20;
unsigned int states_padded = states;
unsigned int maxstates = tipmap_size;
unsigned int scaling;
unsigned int i,j,k,n,m;
const double * lmat;
const double * rmat;
unsigned int span = states_padded * rate_cats;
unsigned int span_padded = states_padded * rate_cats;
unsigned int lstate;
__m256d xmm0,xmm1,xmm2,xmm3;
/* precompute a lookup table of four values per entry (one for each state),
for all 16 states (including ambiguities) and for each rate category. */
double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double),
double * lookup = pll_aligned_alloc(maxstates*span_padded*sizeof(double),
PLL_ALIGNMENT_AVX);
if (!lookup)
{
......@@ -1136,31 +1176,48 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
}
}
/* update the parent scaler with the scaler of the right child */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* iterate over sites and compute CLV entries */
for (n = 0; n < sites; ++n)
{
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = (unsigned int) left_tipchar[n];
unsigned int loffset = lstate*span;
unsigned int loffset = lstate*span_padded;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_termb1 = _mm256_setzero_pd();
__m256d v_termb2 = _mm256_setzero_pd();
......@@ -1230,13 +1287,31 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -1248,18 +1323,16 @@ PLL_EXPORT void pll_core_update_partial_ti_20x20_avx(unsigned int sites,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span;
for (i = 0; i < span; i += 4)
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod,v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_clv += span;
parent_clv += span_padded;
parent_scaler[n] += 1;
}
}
......@@ -1280,7 +1353,6 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -1305,27 +1377,44 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* compute CLV */
for (n = 0; n < sites; ++n)
{
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_terma0 = _mm256_setzero_pd();
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_terma1 = _mm256_setzero_pd();
......@@ -1436,13 +1525,31 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -1456,10 +1563,8 @@ PLL_EXPORT void pll_core_update_partial_ii_avx(unsigned int states,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......
......@@ -21,7 +21,7 @@
#include "pll.h"
static void fill_parent_scaler(unsigned int sites,
static void fill_parent_scaler(unsigned int scaler_size,
unsigned int * parent_scaler,
const unsigned int * left_scaler,
const unsigned int * right_scaler)
......@@ -29,19 +29,19 @@ static void fill_parent_scaler(unsigned int sites,
unsigned int i;
if (!left_scaler && !right_scaler)
memset(parent_scaler, 0, sizeof(unsigned int) * sites);
memset(parent_scaler, 0, sizeof(unsigned int) * scaler_size);
else if (left_scaler && right_scaler)
{
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites);
for (i = 0; i < sites; ++i)
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size);
for (i = 0; i < scaler_size; ++i)
parent_scaler[i] += right_scaler[i];
}
else
{
if (left_scaler)
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * sites);
memcpy(parent_scaler, left_scaler, sizeof(unsigned int) * scaler_size);
else
memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * sites);
memcpy(parent_scaler, right_scaler, sizeof(unsigned int) * scaler_size);
}
}
......@@ -61,7 +61,6 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -101,18 +100,36 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
right_matrix,
right_scaler,
tipmap,
tipmap_size);
tipmap_size,
attrib);
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
size_t displacement = (states_padded - states) * (states_padded);
__m256i mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
/* compute CLV */
for (n = 0; n < sites; ++n)
......@@ -120,12 +137,14 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = tipmap[left_tipchars[n]];
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
......@@ -251,14 +270,32 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -268,12 +305,11 @@ PLL_EXPORT void pll_core_update_partial_ti_avx2(unsigned int states,
parent_clv += states_padded;
right_clv += states_padded;
}
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......@@ -298,25 +334,25 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
const double * right_matrix,
const unsigned int * right_scaler,
const unsigned int * tipmap,
unsigned int tipmap_size)
unsigned int tipmap_size,
unsigned int attrib)
{
unsigned int states = 20;
unsigned int states_padded = states;
unsigned int maxstates = tipmap_size;
unsigned int scaling;
unsigned int i,j,k,n,m;
const double * lmat;
const double * rmat;
unsigned int span = states_padded * rate_cats;
unsigned int span_padded = states_padded * rate_cats;
unsigned int lstate;
__m256d xmm0,xmm1,xmm2,xmm3;
/* precompute a lookup table of four values per entry (one for each state),
for all 16 states (including ambiguities) and for each rate category. */
double * lookup = pll_aligned_alloc(maxstates*span*sizeof(double),
double * lookup = pll_aligned_alloc(maxstates*span_padded*sizeof(double),
PLL_ALIGNMENT_AVX);
if (!lookup)
{
......@@ -370,31 +406,48 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
}
}
/* update the parent scaler with the scaler of the right child */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, NULL, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, NULL, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* iterate over sites and compute CLV entries */
for (n = 0; n < sites; ++n)
{
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
lstate = (unsigned int) left_tipchar[n];
unsigned int loffset = lstate*span;
unsigned int loffset = lstate*span_padded;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_termb1 = _mm256_setzero_pd();
__m256d v_termb2 = _mm256_setzero_pd();
......@@ -460,13 +513,31 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -478,18 +549,16 @@ void pll_core_update_partial_ti_20x20_avx2(unsigned int sites,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span;
for (i = 0; i < span; i += 4)
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod,v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_clv += span;
parent_clv += span_padded;
parent_scaler[n] += 1;
}
}
......@@ -510,7 +579,6 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
unsigned int attrib)
{
unsigned int i,j,k,n;
unsigned int scaling;
const double * lmat;
const double * rmat;
......@@ -536,27 +604,44 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
return;
}
/* add up the scale vector of the two children if available */
if (parent_scaler)
fill_parent_scaler(sites, parent_scaler, left_scaler, right_scaler);
/* scaling-related stuff */
unsigned int scale_mode; /* 0 = none, 1 = per-site, 2 = per-rate */
unsigned int scale_mask;
unsigned int init_mask;
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
size_t displacement = (states_padded - states) * (states_padded);
if (!parent_scaler)
{
/* scaling disabled / not required */
scale_mode = init_mask = 0;
}
else
{
/* determine the scaling mode and init the vars accordingly */
scale_mode = (attrib & PLL_ATTRIB_RATE_SCALERS) ? 2 : 1;
init_mask = (scale_mode == 1) ? 0xF : 0;
const size_t scaler_size = (scale_mode == 2) ? sites * rate_cats : sites;
/* add up the scale vector of the two children if available */
fill_parent_scaler(scaler_size, parent_scaler, left_scaler, right_scaler);
}
__m256d v_scale_threshold = _mm256_set1_pd(PLL_SCALE_THRESHOLD);
size_t displacement = (states_padded - states) * (states_padded);
/* compute CLV */
for (n = 0; n < sites; ++n)
{
lmat = left_matrix;
rmat = right_matrix;
scaling = (parent_scaler) ? 0xF : 0;
scale_mask = init_mask;
for (k = 0; k < rate_cats; ++k)
{
unsigned int rate_mask = 0xF;
/* iterate over quadruples of rows */
for (i = 0; i < states_padded; i += 4)
{
__m256d v_terma0 = _mm256_setzero_pd();
__m256d v_termb0 = _mm256_setzero_pd();
__m256d v_terma1 = _mm256_setzero_pd();
......@@ -664,13 +749,31 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
__m256d v_prod = _mm256_mul_pd(v_terma_sum,v_termb_sum);
/* check for scaling */
/* check if scaling is needed for the current rate category */
__m256d v_cmp = _mm256_cmp_pd(v_prod, v_scale_threshold, _CMP_LT_OS);
scaling = scaling & _mm256_movemask_pd(v_cmp);
rate_mask = rate_mask & _mm256_movemask_pd(v_cmp);
_mm256_store_pd(parent_clv+i, v_prod);
}
if (scale_mode == 2)
{
/* PER-RATE SCALING: if *all* entries of the *rate* CLV were below
* the threshold then scale (all) entries by PLL_SCALE_FACTOR */
if (rate_mask == 0xF)
{
for (i = 0; i < states_padded; i += 4)
{
__m256d v_prod = _mm256_load_pd(parent_clv + i);
v_prod = _mm256_mul_pd(v_prod, v_scale_factor);
_mm256_store_pd(parent_clv + i, v_prod);
}
parent_scaler[n*rate_cats + k] += 1;
}
}
else
scale_mask = scale_mask & rate_mask;
/* reset pointers to point to the start of the next p-matrix, as the
vectorization assumes a square states_padded * states_padded matrix,
even though the real matrix is states * states_padded */
......@@ -684,10 +787,8 @@ PLL_EXPORT void pll_core_update_partial_ii_avx2(unsigned int states,
/* if *all* entries of the site CLV were below the threshold then scale
(all) entries by PLL_SCALE_FACTOR */
if (scaling == 0xF)
if (scale_mask == 0xF)
{
__m256d v_scale_factor = _mm256_set1_pd(PLL_SCALE_FACTOR);
parent_clv -= span_padded;
for (i = 0; i < span_padded; i += 4)
{
......