Dylan Aïssi · Dylan Aïssi · 2d8a2e32 · 2d8a2e32 · 2d8a2e32 · 2d8a2e32
--- a/Makefile.src
+++ b/Makefile.src
@@ -5,11 +5,11 @@ CXXWARN = ${CWARN} -Wold-style-cast
 CWARN2 = -Wall -Wextra -Wshadow -Wformat-security
 CXXWARN2 = ${CWARN2} -Wold-style-cast

-CSRC = SFMT.c libdeflate/lib/adler32.c libdeflate/lib/aligned_malloc.c libdeflate/lib/crc32.c libdeflate/lib/deflate_compress.c libdeflate/lib/deflate_decompress.c libdeflate/lib/gzip_compress.c libdeflate/lib/gzip_decompress.c libdeflate/lib/zlib_compress.c libdeflate/lib/zlib_decompress.c libdeflate/lib/x86/cpu_features.c
+CSRC = include/SFMT.c libdeflate/lib/adler32.c libdeflate/lib/aligned_malloc.c libdeflate/lib/crc32.c libdeflate/lib/deflate_compress.c libdeflate/lib/deflate_decompress.c libdeflate/lib/gzip_compress.c libdeflate/lib/gzip_decompress.c libdeflate/lib/zlib_compress.c libdeflate/lib/zlib_decompress.c libdeflate/lib/x86/cpu_features.c

 ZCSRC = zstd/lib/common/debug.c zstd/lib/common/entropy_common.c zstd/lib/common/zstd_common.c zstd/lib/common/error_private.c zstd/lib/common/xxhash.c zstd/lib/common/fse_decompress.c zstd/lib/common/pool.c zstd/lib/common/threading.c zstd/lib/compress/fse_compress.c zstd/lib/compress/hist.c zstd/lib/compress/huf_compress.c zstd/lib/compress/zstd_double_fast.c zstd/lib/compress/zstd_fast.c zstd/lib/compress/zstd_lazy.c zstd/lib/compress/zstd_ldm.c zstd/lib/compress/zstd_opt.c zstd/lib/compress/zstd_compress.c zstd/lib/compress/zstd_compress_literals.c zstd/lib/compress/zstd_compress_sequences.c zstd/lib/compress/zstdmt_compress.c zstd/lib/decompress/huf_decompress.c zstd/lib/decompress/zstd_decompress.c zstd/lib/decompress/zstd_ddict.c zstd/lib/decompress/zstd_decompress_block.c

-CCSRC = plink2_base.cc pgenlib_misc.cc pgenlib_read.cc pgenlib_write.cc plink2.cc plink2_adjust.cc plink2_bgzf.cc plink2_cmdline.cc plink2_common.cc plink2_compress_stream.cc plink2_data.cc plink2_decompress.cc plink2_export.cc plink2_fasta.cc plink2_filter.cc plink2_glm.cc plink2_help.cc plink2_import.cc plink2_ld.cc plink2_matrix.cc plink2_matrix_calc.cc plink2_misc.cc plink2_psam.cc plink2_pvar.cc plink2_random.cc plink2_set.cc plink2_stats.cc plink2_string.cc plink2_text.cc plink2_thread.cc plink2_zstfile.cc
+CCSRC = include/plink2_base.cc include/plink2_bits.cc include/pgenlib_misc.cc include/pgenlib_read.cc include/pgenlib_write.cc include/plink2_bgzf.cc include/plink2_stats.cc include/plink2_string.cc include/plink2_text.cc include/plink2_thread.cc include/plink2_zstfile.cc plink2.cc plink2_adjust.cc plink2_cmdline.cc plink2_common.cc plink2_compress_stream.cc plink2_data.cc plink2_decompress.cc plink2_export.cc plink2_fasta.cc plink2_filter.cc plink2_glm.cc plink2_help.cc plink2_import.cc plink2_ld.cc plink2_matrix.cc plink2_matrix_calc.cc plink2_misc.cc plink2_psam.cc plink2_pvar.cc plink2_random.cc plink2_set.cc

 OBJ_NO_ZSTD = $(CSRC:.c=.o) $(CCSRC:.cc=.o)
 OBJ = $(CSRC:.c=.o) $(ZCSRC:.c=.o) $(CCSRC:.cc=.o)
@@ -26,9 +26,9 @@ CINCLUDE2 = -I../libdeflate -I../libdeflate/common
 ZSTD_INCLUDE = -Izstd/lib -Izstd/lib/common
 ZSTD_INCLUDE2 = -I../zstd/lib -I../zstd/lib/common

-PGCSRC = plink2_base.cc pgenlib_misc.cc pgenlib_read.cc pgenlib_write.cc pgen_compress.cc
+PGCSRC = include/plink2_base.cc include/plink2_bits.cc include/pgenlib_misc.cc include/pgenlib_read.cc include/pgenlib_write.cc pgen_compress.cc
 PGCOBJ = $(PGCSRC:.c=.o)
 PGCSRC2 = $(foreach fname,$(PGCSRC),../$(fname))

-CLEAN = *.o libdeflate/lib/*.o libdeflate/lib/x86/*.o zstd/lib/common/*.o zstd/lib/compress/*.o zstd/lib/decompress/*.o bin/plink2 bin/pgen_compress
+CLEAN = *.o include/*.o libdeflate/lib/*.o libdeflate/lib/x86/*.o zstd/lib/common/*.o zstd/lib/compress/*.o zstd/lib/decompress/*.o bin/plink2 bin/pgen_compress
 CLEAN3 = $(foreach expr,$(CLEAN),../$(expr))
--- a/Python/pgenlib.pyx
+++ b/Python/pgenlib.pyx
--- a/Python/setup.py
+++ b/Python/setup.py
@@ -8,7 +8,7 @@ import numpy as np

 ext_modules = [
    Extension('pgenlib',
-              sources = ['pgenlib.pyx', '../pgenlib_ffi_support.cc', '../pgenlib_misc.cc', '../pgenlib_read.cc', '../pgenlib_write.cc', '../plink2_base.cc'],
+              sources = ['pgenlib.pyx', '../pgenlib_ffi_support.cc', '../include/pgenlib_misc.cc', '../include/pgenlib_read.cc', '../include/pgenlib_write.cc', '../include/plink2_base.cc', '../include/plink2_bits.cc'],
              language = "c++",
              # do not compile as c++11, since cython doesn't yet support
              # overload of uint32_t operator
@@ -21,6 +21,6 @@ ext_modules = [
    ]

 setup(name = 'Pgenlib',
-      version = '0.71',
+      version = '0.73',
      description = "Wrapper for pgenlib's basic reader and writer.",
      ext_modules = cythonize(ext_modules))
--- a/cindex/DESCRIPTION
+++ b/cindex/DESCRIPTION
+Package: cindex
+Type: Package
+Title: C-index
+Version: 0.1
+Date: 2019-12-12
+Author: Christopher Chang
+Maintainer: Christopher Chang <chrchang@alumni.caltech.edu>
+Description: Supports fast concordance-index computation.
+License: LGPL (>= 3)
+Imports: Rcpp (>= 1.0.1)
+LinkingTo: Rcpp
--- a/cindex/NAMESPACE
+++ b/cindex/NAMESPACE
+useDynLib(cindex, .registration=TRUE)
+exportPattern("^[[:alpha:]]+")
+importFrom(Rcpp, evalCpp)
--- a/cindex/R/RcppExports.R
+++ b/cindex/R/RcppExports.R
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+CIndex <- function(yhat, y, status) {
+    .Call(`_cindex_CIndex`, yhat, y, status)
+}
+
--- a/cindex/R/old.R
+++ b/cindex/R/old.R
+old_cindex=function(yhat,y,status,w=rep(1,length(y))){
+  # Rob's implementation of C-index
+  #note:  this function gives identical results to the summary function from coxph, and the concordance.index function in survcomp.
+  #  (with their default settings), and no ties in yhat. Works with ties in y. But does not agree with latter when yhat has ties. There are conflicting definitions for c-index in this case
+  #
+  #formula used  is
+  #  Concordance = (#all concordant pairs + #tied pairs/2)/(#total pairs including ties).
+  # with w,  weights used are ave wts for each pair
+  time.computeC.start <- Sys.time()
+
+  risksets=which(status==1)
+  w=length(w)*w/sum(w)
+  fun=function(riskset,y,yhat,w){
+    total=concordant=0
+    i=riskset
+    rest=which(y>y[i])
+    if(length(rest)>0){
+      ww=(w[rest]+w[i])/2
+      total=sum(ww)
+      concordant = 1.0*sum(ww*(yhat[rest]<yhat[i]))+0.5*sum(ww*(yhat[rest]==yhat[i]))
+    }
+    return(c(concordant,total))
+  }
+  out=sapply(risksets,fun,y,yhat,w)
+  cindex=sum(out[1,])/sum(out[2,])
+
+  return(cindex)
+}
--- a/cindex/src/Makevars
+++ b/cindex/src/Makevars
+OBJECTS = include/plink2_base.o cindex.o RcppExports.o
+PKG_CPPFLAGS = -march=native
--- a/cindex/src/RcppExports.cpp
+++ b/cindex/src/RcppExports.cpp
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// CIndex
+double CIndex(NumericVector yhat, NumericVector y, SEXP status);
+RcppExport SEXP _cindex_CIndex(SEXP yhatSEXP, SEXP ySEXP, SEXP statusSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< NumericVector >::type yhat(yhatSEXP);
+    Rcpp::traits::input_parameter< NumericVector >::type y(ySEXP);
+    Rcpp::traits::input_parameter< SEXP >::type status(statusSEXP);
+    rcpp_result_gen = Rcpp::wrap(CIndex(yhat, y, status));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+static const R_CallMethodDef CallEntries[] = {
+    {"_cindex_CIndex", (DL_FUNC) &_cindex_CIndex, 3},
+    {NULL, NULL, 0}
+};
+
+RcppExport void R_init_cindex(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
--- a/cindex/src/cindex.cpp
+++ b/cindex/src/cindex.cpp
+#include "include/plink2_base.h"
+#include <algorithm>
+
+#include <Rcpp.h>
+using namespace Rcpp;
+
+typedef struct CIndexRecStruct {
+  double key;  // yhat on first sort, y on second sort
+  uint32_t status;  // original pos on first sort
+  uint32_t yhat_int;
+  bool operator<(const struct CIndexRecStruct& rhs) const {
+    return (key < rhs.key);
+  }
+} CIndexRec;
+
+uint32_t PopcountLookup(const uintptr_t* new_yhat_present, const uint16_t* new_yhat_64b_popcounts, const uint16_t* new_yhat_2048b_popcounts, const uint32_t* new_yhat_64kib_popcounts, uint32_t widx);
+
+double CIndexTieheavyMain(const CIndexRec* recs, uintptr_t size, unsigned char* wkspace, uintptr_t* new_yhat_present, uint16_t* new_yhat_64b_popcounts, uint16_t* new_yhat_2048b_popcounts, uint32_t* new_yhat_64kib_popcounts);
+
+// [[Rcpp::export]]
+double CIndex(NumericVector yhat, NumericVector y, SEXP status) {
+  if ((TYPEOF(status) != INTSXP) && (TYPEOF(status) != REALSXP)) {
+    // could support LGLSXP too
+    stop("Unsupported status type");
+  }
+  const uintptr_t size = yhat.size();
+  if (size == 0) {
+    stop("yhat cannot be empty");
+  }
+  if (size != static_cast<uintptr_t>(y.size())) {
+    stop("yhat and y must have the same length");
+  }
+
+  // We define C-index as:
+  //   sum_{all k where status[k]=1} [#(m: y[m]>y[j], yhat[m]<yhat[j]) +
+  //                                  0.5 * #(m: y[m]>y[j], yhat[m]==yhat[j])]
+  //   ------------------------------------------------------------------------
+  //   sum_{all k where status[k]=1} #(m: y[m]>y[j])
+  //
+  // The computation is organized as follows:
+  // 1. Replace yhat with 0-based ranks ('yhat_int'); note that this has no
+  //    effect on the expression above.  Ties round down.
+  // 2. Sort records in order of nondecreasing y.
+  // 3. Iterate through records, starting from largest y.  Incrementally update
+  //    a data structure tracking which yhat_int values have been seen so far,
+  //    and in case of ties, their multiplicity.  This data structure supports
+  //    fast insertion and number-of-smaller{-or-equal}-elements queries, which
+  //    covers the operations we need to compute the numerator.  Formally, the
+  //    total query time is still O(N^2) if status==1 isn't sparse, but the
+  //    constant multiplier is so small that the std::sort calls practically
+  //    always dominate for R's N<2^31 domain.  (Not literally true for current
+  //    implementation, but it's trivial to branch on N and use another
+  //    popcount index-levels for very large N once any application cares.)
+  // More precisely, the new_yhat data structure consists of a bitarray, and
+  // three index-arrays containing partial popcounts for intervals of 2^9,
+  // 2^14, and 2^19 bits.
+
+  // Intermediate data structures (all cacheline-aligned except maybe recs,
+  // which is vector-aligned):
+  // - recs: N CIndexRec
+  // - new_yhat_present: N bits
+  // - new_yhat_64b_popcounts: DivUp(N, 512) uint16s
+  // - new_yhat_2048b_popcounts: DivUp(N, 512*32) uint16s
+  // - new_yhat_64kib_popcounts: DivUp(N, 512*32*32) uint16s
+  const uintptr_t recs_vec_ct = plink2::DivUp(sizeof(CIndexRec) * size, plink2::kBytesPerVec);
+  const uintptr_t new_yhat_present_cacheline_ct = plink2::DivUp(size, plink2::kBitsPerCacheline);
+  const uintptr_t new_yhat_present_vec_ct = new_yhat_present_cacheline_ct * plink2::kVecsPerCacheline;
+  const uintptr_t new_yhat_64b_popcounts_cacheline_ct = plink2::DivUp(new_yhat_present_cacheline_ct, plink2::kInt16PerCacheline);
+  const uintptr_t new_yhat_64b_popcounts_vec_ct = new_yhat_64b_popcounts_cacheline_ct * plink2::kVecsPerCacheline;
+  const uintptr_t new_yhat_2048b_popcounts_cacheline_ct = plink2::DivUp(new_yhat_64b_popcounts_cacheline_ct, plink2::kInt16PerCacheline);
+  const uintptr_t new_yhat_2048b_popcounts_vec_ct = new_yhat_2048b_popcounts_cacheline_ct * plink2::kVecsPerCacheline;
+  const uintptr_t new_yhat_64kib_popcounts_vec_ct = plink2::DivUp(new_yhat_2048b_popcounts_cacheline_ct, plink2::kInt32PerVec);
+  const uintptr_t vec_ct = recs_vec_ct + new_yhat_present_vec_ct + new_yhat_64b_popcounts_vec_ct + new_yhat_2048b_popcounts_vec_ct + new_yhat_64kib_popcounts_vec_ct;
+  unsigned char* wkspace;
+  if (plink2::cachealigned_malloc(vec_ct * plink2::kBytesPerVec, &wkspace)) {
+    stop("Out of memory");
+  }
+  memset(wkspace, 0, (vec_ct - recs_vec_ct) * plink2::kBytesPerVec);
+  unsigned char* wkspace_iter = wkspace;
+  // Put new_yhat arrays first to guarantee cacheline alignment, since it
+  // matters there.
+  uintptr_t* new_yhat_present = reinterpret_cast<uintptr_t*>(wkspace_iter);
+  wkspace_iter = &(wkspace_iter[new_yhat_present_vec_ct * plink2::kBytesPerVec]);
+  uint16_t* new_yhat_64b_popcounts = reinterpret_cast<uint16_t*>(wkspace_iter);
+  wkspace_iter = &(wkspace_iter[new_yhat_64b_popcounts_vec_ct * plink2::kBytesPerVec]);
+  uint16_t* new_yhat_2048b_popcounts = reinterpret_cast<uint16_t*>(wkspace_iter);
+  wkspace_iter = &(wkspace_iter[new_yhat_2048b_popcounts_vec_ct * plink2::kBytesPerVec]);
+  uint32_t* new_yhat_64kib_popcounts = reinterpret_cast<uint32_t*>(wkspace_iter);
+  wkspace_iter = &(wkspace_iter[new_yhat_64kib_popcounts_vec_ct * plink2::kBytesPerVec]);
+  CIndexRec* recs = reinterpret_cast<CIndexRec*>(wkspace_iter);
+
+  uint64_t yhat_int_sum = 0;
+  {
+    const double* yhatd = &(yhat[0]);
+    for (uintptr_t ulii = 0; ulii != size; ++ulii) {
+      recs[ulii].key = yhatd[ulii];
+      recs[ulii].status = ulii;
+    }
+    std::sort(recs, &(recs[size]));
+    recs[recs[0].status].yhat_int = 0;
+    double prev_yhat = recs[0].key;
+    uint32_t prev_idx = 0;
+    for (uintptr_t ulii = 1; ulii != size; ++ulii) {
+      const double cur_yhat = recs[ulii].key;
+      if (cur_yhat != prev_yhat) {
+        prev_idx = ulii;
+        prev_yhat = cur_yhat;
+      }
+      recs[recs[ulii].status].yhat_int = prev_idx;
+      yhat_int_sum += prev_idx;
+    }
+  }
+  const double* yd = &(y[0]);
+  if (TYPEOF(status) == INTSXP) {
+    IntegerVector status_iv = as<IntegerVector>(status);
+    if (size != static_cast<uintptr_t>(status_iv.size())) {
+      plink2::aligned_free(wkspace);
+      stop("y and status must have the same length");
+    }
+    const int32_t* status_ivi = &(status_iv[0]);
+    for (uintptr_t ulii = 0; ulii != size; ++ulii) {
+      recs[ulii].key = yd[ulii];
+      const uint32_t uii = status_ivi[ulii];
+      if (uii & (~1)) {
+        plink2::aligned_free(wkspace);
+        stop("all status values must be 0 or 1");
+      }
+      recs[ulii].status = uii;
+    }
+  } else {
+    NumericVector status_nv = as<NumericVector>(status);
+    if (size != static_cast<uintptr_t>(status_nv.size())) {
+      plink2::aligned_free(wkspace);
+      stop("y and status must have the same length");
+    }
+    const double* status_nvd = &(status_nv[0]);
+    for (uintptr_t ulii = 0; ulii != size; ++ulii) {
+      recs[ulii].key = yd[ulii];
+      const double dxx = status_nvd[ulii];
+      if (dxx == 0.0) {
+        recs[ulii].status = 0;
+      } else if (dxx == 1.0) {
+        recs[ulii].status = 1;
+      } else {
+        plink2::aligned_free(wkspace);
+        stop("all status values must be 0 or 1");
+      }
+    }
+  }
+  std::sort(recs, &(recs[size]));
+
+  // With no ties, yhat_int_sum would be (size * (size - 1)) / 2.
+  // k-way ties decrease the sum by (k * (k-1)) / 2, which is conveniently
+  // proportional to the burden imposed by the ties (sequential scanning for
+  // the next unset bit in new_yhat_present) on our usual algorithm.
+  // With high enough tie-burden (threshold proportional to n log n), we switch
+  // to a slightly different approach, which is usually slower but avoids
+  // sequential scans in case of ties.
+  const uint64_t tie_burden = (static_cast<uint64_t>(size) * (size - 1)) / 2 - yhat_int_sum;
+  if (tie_burden > 512 * static_cast<uint64_t>(size)) {
+    return CIndexTieheavyMain(recs, size, wkspace, new_yhat_present, new_yhat_64b_popcounts, new_yhat_2048b_popcounts, new_yhat_64kib_popcounts);
+  }
+
+  // Main loop.
+  double prev_key = recs[size - 1].key;
+  uint32_t recs_block_end = size;
+  uint32_t new_yhat_entry_ct = 0;
+  uint64_t numer_x2 = 0;
+  uint64_t denom = 0;
+  for (uint32_t recs_idx = size; recs_idx; ) {
+    --recs_idx;
+    const double cur_key = recs[recs_idx].key;
+    if (cur_key != prev_key) {
+      const uint32_t next_block_end = recs_idx + 1;
+      for (uint32_t uii = next_block_end; uii != recs_block_end; ++uii) {
+        const uint32_t yhat_int = recs[uii].yhat_int;
+        // This is essentially an inlined AdvTo0Bit() call.
+        uintptr_t widx = yhat_int / plink2::kBitsPerWord;
+        {
+          const uintptr_t remainder_bit = plink2::k1LU << (yhat_int % plink2::kBitsPerWord);
+          uintptr_t cur_word = new_yhat_present[widx];
+          if (!(remainder_bit & cur_word)) {
+            // common case
+            new_yhat_present[widx] = cur_word | remainder_bit;
+          } else {
+            // There's a tie, and we've already seen at least one previous
+            // instance.
+            // Find the rightmost clear bit in cur_word after remainder_bit, if
+            // there is one.  We use the bitarray property that cur_word +
+            // <bit> sets the first 0-bit in cur_word at or past the added bit,
+            // may clear some other bits, and doesn't do anything else.
+            uintptr_t ulii = cur_word + remainder_bit;
+            if (ulii < remainder_bit) {
+              // Addition 'carry' needed.
+              do {
+                cur_word = new_yhat_present[++widx];
+              } while (cur_word == ~plink2::k0LU);
+              ulii = cur_word + 1;
+            }
+            const uintptr_t new_word = ulii | cur_word;
+            new_yhat_present[widx] = new_word;
+          }
+        }
+        new_yhat_64b_popcounts[widx / plink2::kWordsPerCacheline] += 1;
+        new_yhat_2048b_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline)] += 1;
+        new_yhat_64kib_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline * plink2::kInt16PerCacheline)] += 1;
+      }
+      recs_block_end = next_block_end;
+      new_yhat_entry_ct = size - recs_block_end;
+      prev_key = cur_key;
+    }
+    if (!recs[recs_idx].status) {
+      continue;
+    }
+    denom += new_yhat_entry_ct;
+    const uint32_t yhat_int = recs[recs_idx].yhat_int;
+    // numer_x2 += 2 * <# of set bits before yhat_int> + <# of ties>
+    uint32_t widx = yhat_int / plink2::kBitsPerWord;
+    uint32_t n_set_bits_before = PopcountLookup(new_yhat_present, new_yhat_64b_popcounts, new_yhat_2048b_popcounts, new_yhat_64kib_popcounts, widx);
+    uint32_t remainder = yhat_int % plink2::kBitsPerWord;
+    uintptr_t cur_word = new_yhat_present[widx];
+    uintptr_t remainder_bit = plink2::k1LU << remainder;
+    n_set_bits_before += plink2::PopcountWord(cur_word & (remainder_bit - 1));
+    numer_x2 += 2 * n_set_bits_before;
+
+    if (!(cur_word & remainder_bit)) {
+      // common case
+      continue;
+    }
+    // Similar to the insertion tie-handling code above.
+    numer_x2 -= remainder;
+    uintptr_t ulii = cur_word + remainder_bit;
+    if (ulii < remainder_bit) {
+      uint32_t widx2 = widx;
+      do {
+        cur_word = new_yhat_present[++widx2];
+      } while (cur_word == ~plink2::k0LU);
+      numer_x2 += (widx2 - widx) * plink2::kBitsPerWord;
+      ulii = cur_word + 1;
+    }
+    numer_x2 += plink2::ctzw(ulii & (~cur_word));
+  }
+  plink2::aligned_free(wkspace);
+  return plink2::u63tod(numer_x2) / plink2::u63tod(denom * 2);
+}
+
+uint32_t PopcountLookup(const uintptr_t* new_yhat_present, const uint16_t* new_yhat_64b_popcounts, const uint16_t* new_yhat_2048b_popcounts, const uint32_t* new_yhat_64kib_popcounts, uint32_t widx) {
+  uint32_t result = 0;
+  const uint32_t idx_64kib = widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline * plink2::kInt16PerCacheline);
+  for (uint32_t uii = 0; uii != idx_64kib; ++uii) {
+    result += new_yhat_64kib_popcounts[uii];
+  }
+  const uint32_t idx_2048b = widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline);
+  for (uint32_t uii = idx_64kib * plink2::kInt16PerCacheline; uii != idx_2048b; ++uii) {
+    result += new_yhat_2048b_popcounts[uii];
+  }
+
+  // Each entry in the bottom-level-index array is <= 512, and we're adding
+  // at most 16 of them even when compiling as 32-bit, so there's no overflow
+  // danger if we add entire words at a time (effectively performing 2-4
+  // uint16 additions in parallel).
+  const uintptr_t* walias_64b_popcounts = reinterpret_cast<const uintptr_t*>(new_yhat_64b_popcounts);
+  const uint32_t widx_64b = widx / (plink2::kWordsPerCacheline * plink2::kInt16PerWord);
+  uintptr_t acc = 0;
+  for (uint32_t uii = idx_2048b * plink2::kWordsPerCacheline; uii != widx_64b; ++uii) {
+    acc += walias_64b_popcounts[uii];
+  }
+  const uint32_t idx_64b = widx / plink2::kWordsPerCacheline;
+  uint32_t remainder = idx_64b % plink2::kInt16PerWord;
+  if (remainder) {
+    acc += plink2::bzhi(walias_64b_popcounts[widx_64b], remainder * 16);
+  }
+  result += (acc * plink2::kMask0001) >> (plink2::kBitsPerWord - 16);
+
+  for (uint32_t uii = idx_64b * plink2::kWordsPerCacheline; uii != widx; ++uii) {
+    result += plink2::PopcountWord(new_yhat_present[uii]);
+  }
+  return result;
+}
+
+double CIndexTieheavyMain(const CIndexRec* recs, uintptr_t size, unsigned char* wkspace, uintptr_t* new_yhat_present, uint16_t* new_yhat_64b_popcounts, uint16_t* new_yhat_2048b_popcounts, uint32_t* new_yhat_64kib_popcounts) {
+  // Alternate tie-handling strategy: maintain an additional array which tracks
+  // the number of times we've seen each yhat_int value.
+  uint32_t* yhat_int_multiplicities = static_cast<uint32_t*>(calloc(size, sizeof(int32_t)));
+  if (!yhat_int_multiplicities) {
+    plink2::aligned_free(wkspace);
+    stop("Out of memory");
+  }
+
+  double prev_key = recs[size - 1].key;
+  uint32_t recs_block_end = size;
+  uint32_t new_yhat_entry_ct = 0;
+  uint64_t numer_x2 = 0;
+  uint64_t denom = 0;
+  for (uint32_t recs_idx = size; recs_idx; ) {
+    --recs_idx;
+    const double cur_key = recs[recs_idx].key;
+    if (cur_key != prev_key) {
+      const uint32_t next_block_end = recs_idx + 1;
+      for (uint32_t uii = next_block_end; uii != recs_block_end; ++uii) {
+        const uint32_t yhat_int = recs[uii].yhat_int;
+        const uint32_t multiplicity = yhat_int_multiplicities[yhat_int];
+        const uint32_t adj_yhat_int = yhat_int + multiplicity;
+        yhat_int_multiplicities[yhat_int] = multiplicity + 1;
+        const uintptr_t widx = adj_yhat_int / plink2::kBitsPerWord;
+        const uintptr_t remainder_bit = plink2::k1LU << (adj_yhat_int % plink2::kBitsPerWord);
+        new_yhat_present[widx] |= remainder_bit;
+        new_yhat_64b_popcounts[widx / plink2::kWordsPerCacheline] += 1;
+        new_yhat_2048b_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline)] += 1;
+        new_yhat_64kib_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline * plink2::kInt16PerCacheline)] += 1;
+      }
+      recs_block_end = next_block_end;
+      new_yhat_entry_ct = size - recs_block_end;
+      prev_key = cur_key;
+    }
+    if (!recs[recs_idx].status) {
+      continue;
+    }
+    denom += new_yhat_entry_ct;
+    const uint32_t yhat_int = recs[recs_idx].yhat_int;
+    uint32_t widx = yhat_int / plink2::kBitsPerWord;
+    uint32_t n_set_bits_before = PopcountLookup(new_yhat_present, new_yhat_64b_popcounts, new_yhat_2048b_popcounts, new_yhat_64kib_popcounts, widx);
+    uint32_t remainder = yhat_int % plink2::kBitsPerWord;
+    uintptr_t cur_word = new_yhat_present[widx];
+    uintptr_t remainder_bit = plink2::k1LU << remainder;
+    n_set_bits_before += plink2::PopcountWord(cur_word & (remainder_bit - 1));
+    numer_x2 += 2 * n_set_bits_before + yhat_int_multiplicities[yhat_int];
+  }
+  free(yhat_int_multiplicities);
+  plink2::aligned_free(wkspace);
+  return plink2::u63tod(numer_x2) / plink2::u63tod(denom * 2);
+}
--- a/SFMT.c
+++ b/SFMT.c
--- a/SFMT.h
+++ b/SFMT.h
--- a/pgenlib_misc.cc
+++ b/pgenlib_misc.cc
-// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
+// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
 // Christopher Chang.
 //
 // This library is free software: you can redistribute it and/or modify it

--- a/pgenlib_misc.h
+++ b/pgenlib_misc.h
 #ifndef __PGENLIB_MISC_H__
 #define __PGENLIB_MISC_H__

-// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
+// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
 // Christopher Chang.
 //
 // This library is free software: you can redistribute it and/or modify it
@@ -74,12 +74,12 @@
 //   on the fly, since that tends to be faster than having to access twice as
 //   much memory.

-#include "plink2_base.h"
+#include "plink2_bits.h"

 // 10000 * major + 100 * minor + patch
 // Exception to CONSTI32, since we want the preprocessor to have access to this
 // value.  Named with all caps as a consequence.
-#define PGENLIB_INTERNAL_VERNUM 1401
+#define PGENLIB_INTERNAL_VERNUM 1501

 #ifdef __cplusplus
 namespace plink2 {

--- a/pgenlib_read.cc
+++ b/pgenlib_read.cc
--- a/pgenlib_read.h
+++ b/pgenlib_read.h
--- a/pgenlib_write.cc
+++ b/pgenlib_write.cc
-// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
+// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
 // Christopher Chang.
 //
 // This library is free software: you can redistribute it and/or modify it

--- a/pgenlib_write.h
+++ b/pgenlib_write.h
 #ifndef __PGENLIB_WRITE_H__
 #define __PGENLIB_WRITE_H__

-// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
+// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
 // Christopher Chang.
 //
 // This library is free software: you can redistribute it and/or modify it

--- a/include/plink2_base.cc
+++ b/include/plink2_base.cc
+// This library is part of PLINK 2, copyright (C) 2005-2020 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_base.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+uintptr_t g_failed_alloc_attempt_size = 0;
+
+#if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
+BoolErr pgl_malloc(uintptr_t size, void* pp) {
+  *S_CAST(unsigned char**, pp) = S_CAST(unsigned char*, malloc(size));
+  if (likely(*S_CAST(unsigned char**, pp))) {
+    return 0;
+  }
+  g_failed_alloc_attempt_size = size;
+  return 1;
+}
+#endif
+
+BoolErr fwrite_checked(const void* buf, uintptr_t len, FILE* outfile) {
+  while (len > kMaxBytesPerIO) {
+    // OS X fwrite() doesn't support 2GiB+ writes
+    // typical disk block size is 4kb, so 0x7ffff000 is the largest sensible
+    // write size
+    // bugfix (9 Mar 2018): forgot a 'not' here...
+    if (unlikely(!fwrite_unlocked(buf, kMaxBytesPerIO, 1, outfile))) {
+      return 1;
+    }
+    buf = &(S_CAST(const unsigned char*, buf)[kMaxBytesPerIO]);
+    len -= kMaxBytesPerIO;
+  }
+  uintptr_t written_byte_ct = fwrite_unlocked(buf, 1, len, outfile);
+  // must do the right thing when len == 0
+  return (written_byte_ct != len);
+}
+
+/*
+IntErr fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
+  uintptr_t bytes_read = 0;
+  while (len > kMaxBytesPerIO) {
+    const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
+    bytes_read += cur_bytes_read;
+    if (cur_bytes_read != kMaxBytesPerIO) {
+      *bytes_read_ptr = bytes_read;
+      return ferror_unlocked(infile);
+    }
+    buf = &(((char*)buf)[kMaxBytesPerIO]);
+    len -= kMaxBytesPerIO;
+  }
+  bytes_read += fread_unlocked(buf, 1, len, infile);
+  *bytes_read_ptr = bytes_read;
+  // could skip ferror_unlocked call if bytes_read == original len
+  return ferror_unlocked(infile);
+}
+*/
+
+BoolErr fread_checked(void* buf, uintptr_t len, FILE* infile) {
+  while (len > kMaxBytesPerIO) {
+    const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
+    if (unlikely(cur_bytes_read != kMaxBytesPerIO)) {
+      return 1;
+    }
+    buf = &(S_CAST(unsigned char*, buf)[kMaxBytesPerIO]);
+    len -= kMaxBytesPerIO;
+  }
+  const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, len, infile);
+  return (cur_bytes_read != len);
+}
+
+#ifdef __LP64__
+static inline BoolErr ScanUintCappedFinish(const char* str_iter, uint64_t cap, uint32_t* valp) {
+  uint64_t val = *valp;
+  while (1) {
+    // a little bit of unrolling seems to help
+    const uint64_t cur_digit = ctou64(*str_iter++) - 48;
+    if (cur_digit >= 10) {
+      break;
+    }
+    // val = val * 10 + cur_digit;
+    const uint64_t cur_digit2 = ctou64(*str_iter++) - 48;
+    if (cur_digit2 >= 10) {
+      val = val * 10 + cur_digit;
+      if (unlikely(val > cap)) {
+        return 1;
+      }
+      break;
+    }
+    val = val * 100 + cur_digit * 10 + cur_digit2;
+    if (unlikely(val > cap)) {
+      return 1;
+    }
+  }
+  *valp = val;
+  return 0;
+}
+
+BoolErr ScanPosintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
+  // '0' has ascii code 48
+  assert(ctou32(str_iter[0]) > 32);
+  *valp = ctou32(*str_iter++) - 48;
+  if (*valp >= 10) {
+    // permit leading '+' (ascii 43), but not '++' or '+-'
+    // reasonable to use unlikely() here since these functions aren't used for
+    // numeric vs. non-numeric classification anyway due to erroring out on
+    // overflow
+    if (unlikely(*valp != 0xfffffffbU)) {
+      return 1;
+    }
+    *valp = ctou32(*str_iter++) - 48;
+    if (unlikely(*valp >= 10)) {
+      return 1;
+    }
+  }
+  while (!(*valp)) {
+    *valp = ctou32(*str_iter++) - 48;
+    if (unlikely((*valp) >= 10)) {
+      return 1;
+    }
+  }
+  return ScanUintCappedFinish(str_iter, cap, valp);
+}
+
+// Note that NumericRangeListToBitarr() can call this in an ignore-overflow
+// mode.  If similar logic ever goes into an inner loop, remove all unlikely()
+// annotations in this function and its children.
+BoolErr ScanUintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
+  // Reads an integer in [0, cap].  Assumes first character is nonspace.
+  assert(ctou32(str_iter[0]) > 32);
+  uint32_t val = ctou32(*str_iter++) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
+      if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
+        return 1;
+      }
+      // accept "-0", "-00", etc.
+      while (*(++str_iter) == '0');
+      *valp = 0;
+      return (ctou32(*str_iter) - 48) < 10;
+    }
+    // accept leading '+'
+    val = ctou32(*str_iter++) - 48;
+    if (unlikely(val >= 10)) {
+      return 1;
+    }
+  }
+  *valp = val;
+  return ScanUintCappedFinish(str_iter, cap, valp);
+}
+
+BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp) {
+  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
+  assert(ctou32(str_iter[0]) > 32);
+  *valp = ctou32(*str_iter++) - 48;
+  int32_t sign = 1;
+  if (ctou32(*valp) >= 10) {
+    if (*valp == -3) {
+      sign = -1;
+    } else if (*valp != -5) {
+      return 1;
+    }
+    *valp = ctou32(*str_iter++) - 48;
+    if (unlikely(ctou32(*valp) >= 10)) {
+      return 1;
+    }
+  }
+  if (unlikely(ScanUintCappedFinish(str_iter, bound, R_CAST(uint32_t*, valp)))) {
+    return 1;
+  }
+  *valp *= sign;
+  return 0;
+}
+#else  // not __LP64__
+BoolErr ScanPosintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
+  // '0' has ascii code 48
+  assert(ctou32(str_iter[0]) > 32);
+  uint32_t val = ctou32(*str_iter++) - 48;
+  if (val >= 10) {
+    if (unlikely(val != 0xfffffffbU)) {
+      return 1;
+    }
+    val = ctou32(*str_iter++) - 48;
+    if (unlikely(val >= 10)) {
+      return 1;
+    }
+  }
+  while (!val) {
+    val = ctou32(*str_iter++) - 48;
+    if (unlikely(val >= 10)) {
+      return 1;
+    }
+  }
+  for (; ; ++str_iter) {
+    const uint32_t cur_digit = ctou32(*str_iter) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      return 0;
+    }
+    // avoid integer overflow in middle of computation
+    if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+
+BoolErr ScanUintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
+  // Reads an integer in [0, cap].  Assumes first character is nonspace.
+  assert(ctou32(str_iter[0]) > 32);
+  uint32_t val = ctou32(*str_iter++) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
+        return 1;
+      }
+      while (*(++str_iter) == '0');
+      *valp = 0;
+      return (ctou32(*str_iter) - 48) < 10;
+    }
+    val = ctou32(*str_iter++) - 48;
+    if (unlikely(val >= 10)) {
+      return 1;
+    }
+  }
+  for (; ; ++str_iter) {
+    const uint32_t cur_digit = ctou32(*str_iter) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      return 0;
+    }
+    if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+
+BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
+  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
+  assert(ctou32(str_iter[0]) > 32);
+  uint32_t val = ctou32(*str_iter++) - 48;
+  int32_t sign = 1;
+  if (val >= 10) {
+    if (val == 0xfffffffdU) {
+      sign = -1;
+    } else if (val != 0xfffffffbU) {
+      return 1;
+    }
+    val = ctou32(*str_iter++) - 48;
+    if (unlikely(val >= 10)) {
+      return 1;
+    }
+  }
+  for (; ; ++str_iter) {
+    const uint32_t cur_digit = ctou32(*str_iter) - 48;
+    if (cur_digit >= 10) {
+      *valp = sign * S_CAST(int32_t, val);
+      return 0;
+    }
+    if (unlikely((val >= bound_div_10) && ((val > bound_div_10) || (cur_digit > bound_mod_10)))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+#endif
+
+BoolErr aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp) {
+  // Assumes malloc returns word-aligned addresses.
+  assert(alignment);
+  assert(!(alignment % kBytesPerWord));
+  uintptr_t malloc_addr;
+  if (unlikely(pgl_malloc(size + alignment, &malloc_addr))) {
+    return 1;
+  }
+  assert(!(malloc_addr % kBytesPerWord));
+  uintptr_t** casted_aligned_pp = S_CAST(uintptr_t**, aligned_pp);
+  *casted_aligned_pp = R_CAST(uintptr_t*, RoundDownPow2(malloc_addr + alignment, alignment));
+  (*casted_aligned_pp)[-1] = malloc_addr;
+  return 0;
+}
+
+#ifdef __LP64__
+int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
+  const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
+  const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
+  if (byte_ct < 16 + (kBytesPerVec / 2)) {
+    if (byte_ct < kBytesPerWord) {
+      if (byte_ct < 4) {
+        if (byte_ct < 2) {
+          return (!byte_ct) || (m1_uc[0] == m2_uc[0]);
+        }
+        if ((*S_CAST(const uint16_t*, m1)) != (*S_CAST(const uint16_t*, m2))) {
+          return 0;
+        }
+        if ((byte_ct == 3) && (m1_uc[2] != m2_uc[2])) {
+          return 0;
+        }
+        return 1;
+      }
+      if ((*R_CAST(const uint32_t*, m1_uc)) != (*R_CAST(const uint32_t*, m2_uc))) {
+        return 0;
+      }
+      if (byte_ct > 4) {
+        const uintptr_t final_offset = byte_ct - 4;
+        if ((*R_CAST(const uint32_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uint32_t*, &(m2_uc[final_offset])))) {
+          return 0;
+        }
+      }
+      return 1;
+    }
+    const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
+    const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
+    if (m1_alias[0] != m2_alias[0]) {
+      return 0;
+    }
+    if (byte_ct >= 16) {
+      if (m1_alias[1] != m2_alias[1]) {
+        return 0;
+      }
+#  ifdef USE_AVX2
+      if (byte_ct >= 24) {
+        if (m1_alias[2] != m2_alias[2]) {
+          return 0;
+        }
+      }
+#  endif
+    }
+    if (byte_ct % kBytesPerWord) {
+      const uintptr_t final_offset = byte_ct - kBytesPerWord;
+      if ((*R_CAST(const uintptr_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uintptr_t*, &(m2_uc[final_offset])))) {
+        return 0;
+      }
+    }
+    return 1;
+  }
+  // Don't use VecW since _mm_cmpeq_epi64() not defined until SSE4.1.
+  const VecUc* m1_alias = S_CAST(const VecUc*, m1);
+  const VecUc* m2_alias = S_CAST(const VecUc*, m2);
+  const uintptr_t vec_ct = byte_ct / kBytesPerVec;
+  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
+    // tried unrolling this, doesn't make a difference
+    const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
+    const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
+    if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
+      return 0;
+    }
+  }
+  if (byte_ct % kBytesPerVec) {
+    // put this last instead of first, for better behavior when inputs are
+    // aligned
+    const uintptr_t final_offset = byte_ct - kBytesPerVec;
+    const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
+    const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
+    if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+// clang/gcc memcmp is not that well-optimized for the short strings we usually
+// compare.
+int32_t Memcmp(const void* m1, const void* m2, uintptr_t byte_ct) {
+  const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
+  const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
+  // tried larger crossover threshold, doesn't help
+  if (byte_ct < kBytesPerVec) {
+    if (byte_ct < kBytesPerWord) {
+      if (byte_ct < 4) {
+        for (uintptr_t pos = 0; pos != byte_ct; ++pos) {
+          const unsigned char ucc1 = m1_uc[pos];
+          const unsigned char ucc2 = m2_uc[pos];
+          if (ucc1 != ucc2) {
+            return (ucc1 < ucc2)? -1 : 1;
+          }
+        }
+        return 0;
+      }
+      uint32_t m1_u32 = *S_CAST(const uint32_t*, m1);
+      uint32_t m2_u32 = *S_CAST(const uint32_t*, m2);
+      if (m1_u32 != m2_u32) {
+        return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
+      }
+      if (byte_ct > 4) {
+        const uintptr_t final_offset = byte_ct - 4;
+        m1_u32 = *R_CAST(const uint32_t*, &(m1_uc[final_offset]));
+        m2_u32 = *R_CAST(const uint32_t*, &(m2_uc[final_offset]));
+        if (m1_u32 != m2_u32) {
+          return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
+        }
+      }
+      return 0;
+    }
+    const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
+    const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
+    uintptr_t m1_word = m1_alias[0];
+    uintptr_t m2_word = m2_alias[0];
+    if (m1_word != m2_word) {
+      return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
+    }
+#  ifdef USE_AVX2
+    if (byte_ct >= 16) {
+      m1_word = m1_alias[1];
+      m2_word = m2_alias[1];
+      if (m1_word != m2_word) {
+        return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
+      }
+      if (byte_ct >= 24) {
+        m1_word = m1_alias[2];
+        m2_word = m2_alias[2];
+        if (m1_word != m2_word) {
+          return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
+        }
+      }
+    }
+#  endif
+    if (byte_ct % kBytesPerWord) {
+      const uintptr_t final_offset = byte_ct - kBytesPerWord;
+      m1_word = *R_CAST(const uintptr_t*, &(m1_uc[final_offset]));
+      m2_word = *R_CAST(const uintptr_t*, &(m2_uc[final_offset]));
+      if (m1_word != m2_word) {
+        return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
+      }
+    }
+    return 0;
+  }
+  const VecUc* m1_alias = S_CAST(const VecUc*, m1);
+  const VecUc* m2_alias = S_CAST(const VecUc*, m2);
+  const uintptr_t fullvec_ct = byte_ct / kBytesPerVec;
+  // uh, clang/LLVM -O2 optimizes this better when comparison is != instead of
+  // <?  ugh, time to change all of the for loops...
+  // (and yes, both -O3 configurations generate worse code here)
+  // at least for loop is better than do-while loop even when 1 iteration is
+  // guaranteed...
+  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
+    const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
+    const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
+    // is this even worthwhile now in non-AVX2 case?
+    const uint32_t movemask_result = vecuc_movemask(v1 == v2);
+    if (movemask_result != kVec8thUintMax) {
+      const uintptr_t diff_pos = vidx * kBytesPerVec + ctzu32(~movemask_result);
+      return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
+    }
+  }
+  if (byte_ct % kBytesPerVec) {
+    const uintptr_t final_offset = byte_ct - kBytesPerVec;
+    const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
+    const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
+    const uint32_t movemask_result = vecuc_movemask(v1 == v2);
+    if (movemask_result != kVec8thUintMax) {
+      const uintptr_t diff_pos = final_offset + ctzu32(~movemask_result);
+      return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
+    }
+  }
+  return 0;
+}
+#endif
+
+#ifdef __cplusplus
+}  // namespace plink2
+#endif
--- a/plink2_base.h
+++ b/plink2_base.h