Skip to content
Commits on Source (2)
......@@ -5,11 +5,11 @@ CXXWARN = ${CWARN} -Wold-style-cast
CWARN2 = -Wall -Wextra -Wshadow -Wformat-security
CXXWARN2 = ${CWARN2} -Wold-style-cast
CSRC = SFMT.c libdeflate/lib/adler32.c libdeflate/lib/aligned_malloc.c libdeflate/lib/crc32.c libdeflate/lib/deflate_compress.c libdeflate/lib/deflate_decompress.c libdeflate/lib/gzip_compress.c libdeflate/lib/gzip_decompress.c libdeflate/lib/zlib_compress.c libdeflate/lib/zlib_decompress.c libdeflate/lib/x86/cpu_features.c
CSRC = include/SFMT.c libdeflate/lib/adler32.c libdeflate/lib/aligned_malloc.c libdeflate/lib/crc32.c libdeflate/lib/deflate_compress.c libdeflate/lib/deflate_decompress.c libdeflate/lib/gzip_compress.c libdeflate/lib/gzip_decompress.c libdeflate/lib/zlib_compress.c libdeflate/lib/zlib_decompress.c libdeflate/lib/x86/cpu_features.c
ZCSRC = zstd/lib/common/debug.c zstd/lib/common/entropy_common.c zstd/lib/common/zstd_common.c zstd/lib/common/error_private.c zstd/lib/common/xxhash.c zstd/lib/common/fse_decompress.c zstd/lib/common/pool.c zstd/lib/common/threading.c zstd/lib/compress/fse_compress.c zstd/lib/compress/hist.c zstd/lib/compress/huf_compress.c zstd/lib/compress/zstd_double_fast.c zstd/lib/compress/zstd_fast.c zstd/lib/compress/zstd_lazy.c zstd/lib/compress/zstd_ldm.c zstd/lib/compress/zstd_opt.c zstd/lib/compress/zstd_compress.c zstd/lib/compress/zstd_compress_literals.c zstd/lib/compress/zstd_compress_sequences.c zstd/lib/compress/zstdmt_compress.c zstd/lib/decompress/huf_decompress.c zstd/lib/decompress/zstd_decompress.c zstd/lib/decompress/zstd_ddict.c zstd/lib/decompress/zstd_decompress_block.c
CCSRC = plink2_base.cc pgenlib_misc.cc pgenlib_read.cc pgenlib_write.cc plink2.cc plink2_adjust.cc plink2_bgzf.cc plink2_cmdline.cc plink2_common.cc plink2_compress_stream.cc plink2_data.cc plink2_decompress.cc plink2_export.cc plink2_fasta.cc plink2_filter.cc plink2_glm.cc plink2_help.cc plink2_import.cc plink2_ld.cc plink2_matrix.cc plink2_matrix_calc.cc plink2_misc.cc plink2_psam.cc plink2_pvar.cc plink2_random.cc plink2_set.cc plink2_stats.cc plink2_string.cc plink2_text.cc plink2_thread.cc plink2_zstfile.cc
CCSRC = include/plink2_base.cc include/plink2_bits.cc include/pgenlib_misc.cc include/pgenlib_read.cc include/pgenlib_write.cc include/plink2_bgzf.cc include/plink2_stats.cc include/plink2_string.cc include/plink2_text.cc include/plink2_thread.cc include/plink2_zstfile.cc plink2.cc plink2_adjust.cc plink2_cmdline.cc plink2_common.cc plink2_compress_stream.cc plink2_data.cc plink2_decompress.cc plink2_export.cc plink2_fasta.cc plink2_filter.cc plink2_glm.cc plink2_help.cc plink2_import.cc plink2_ld.cc plink2_matrix.cc plink2_matrix_calc.cc plink2_misc.cc plink2_psam.cc plink2_pvar.cc plink2_random.cc plink2_set.cc
OBJ_NO_ZSTD = $(CSRC:.c=.o) $(CCSRC:.cc=.o)
OBJ = $(CSRC:.c=.o) $(ZCSRC:.c=.o) $(CCSRC:.cc=.o)
......@@ -26,9 +26,9 @@ CINCLUDE2 = -I../libdeflate -I../libdeflate/common
ZSTD_INCLUDE = -Izstd/lib -Izstd/lib/common
ZSTD_INCLUDE2 = -I../zstd/lib -I../zstd/lib/common
PGCSRC = plink2_base.cc pgenlib_misc.cc pgenlib_read.cc pgenlib_write.cc pgen_compress.cc
PGCSRC = include/plink2_base.cc include/plink2_bits.cc include/pgenlib_misc.cc include/pgenlib_read.cc include/pgenlib_write.cc pgen_compress.cc
PGCOBJ = $(PGCSRC:.c=.o)
PGCSRC2 = $(foreach fname,$(PGCSRC),../$(fname))
CLEAN = *.o libdeflate/lib/*.o libdeflate/lib/x86/*.o zstd/lib/common/*.o zstd/lib/compress/*.o zstd/lib/decompress/*.o bin/plink2 bin/pgen_compress
CLEAN = *.o include/*.o libdeflate/lib/*.o libdeflate/lib/x86/*.o zstd/lib/common/*.o zstd/lib/compress/*.o zstd/lib/decompress/*.o bin/plink2 bin/pgen_compress
CLEAN3 = $(foreach expr,$(CLEAN),../$(expr))
This diff is collapsed.
......@@ -8,7 +8,7 @@ import numpy as np
ext_modules = [
Extension('pgenlib',
sources = ['pgenlib.pyx', '../pgenlib_ffi_support.cc', '../pgenlib_misc.cc', '../pgenlib_read.cc', '../pgenlib_write.cc', '../plink2_base.cc'],
sources = ['pgenlib.pyx', '../pgenlib_ffi_support.cc', '../include/pgenlib_misc.cc', '../include/pgenlib_read.cc', '../include/pgenlib_write.cc', '../include/plink2_base.cc', '../include/plink2_bits.cc'],
language = "c++",
# do not compile as c++11, since cython doesn't yet support
# overload of uint32_t operator
......@@ -21,6 +21,6 @@ ext_modules = [
]
setup(name = 'Pgenlib',
version = '0.71',
version = '0.73',
description = "Wrapper for pgenlib's basic reader and writer.",
ext_modules = cythonize(ext_modules))
Package: cindex
Type: Package
Title: C-index
Version: 0.1
Date: 2019-12-12
Author: Christopher Chang
Maintainer: Christopher Chang <chrchang@alumni.caltech.edu>
Description: Supports fast concordance-index computation.
License: LGPL (>= 3)
Imports: Rcpp (>= 1.0.1)
LinkingTo: Rcpp
useDynLib(cindex, .registration=TRUE)
exportPattern("^[[:alpha:]]+")
importFrom(Rcpp, evalCpp)
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
CIndex <- function(yhat, y, status) {
.Call(`_cindex_CIndex`, yhat, y, status)
}
old_cindex=function(yhat,y,status,w=rep(1,length(y))){
# Rob's implementation of C-index
#note: this function gives identical results to the summary function from coxph, and the concordance.index function in survcomp.
# (with their default settings), and no ties in yhat. Works with ties in y. But does not agree with latter when yhat has ties. There are conflicting definitions for c-index in this case
#
#formula used is
# Concordance = (#all concordant pairs + #tied pairs/2)/(#total pairs including ties).
# with w, weights used are ave wts for each pair
time.computeC.start <- Sys.time()
risksets=which(status==1)
w=length(w)*w/sum(w)
fun=function(riskset,y,yhat,w){
total=concordant=0
i=riskset
rest=which(y>y[i])
if(length(rest)>0){
ww=(w[rest]+w[i])/2
total=sum(ww)
concordant = 1.0*sum(ww*(yhat[rest]<yhat[i]))+0.5*sum(ww*(yhat[rest]==yhat[i]))
}
return(c(concordant,total))
}
out=sapply(risksets,fun,y,yhat,w)
cindex=sum(out[1,])/sum(out[2,])
return(cindex)
}
OBJECTS = include/plink2_base.o cindex.o RcppExports.o
PKG_CPPFLAGS = -march=native
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include <Rcpp.h>
using namespace Rcpp;
// CIndex
double CIndex(NumericVector yhat, NumericVector y, SEXP status);
RcppExport SEXP _cindex_CIndex(SEXP yhatSEXP, SEXP ySEXP, SEXP statusSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< NumericVector >::type yhat(yhatSEXP);
Rcpp::traits::input_parameter< NumericVector >::type y(ySEXP);
Rcpp::traits::input_parameter< SEXP >::type status(statusSEXP);
rcpp_result_gen = Rcpp::wrap(CIndex(yhat, y, status));
return rcpp_result_gen;
END_RCPP
}
static const R_CallMethodDef CallEntries[] = {
{"_cindex_CIndex", (DL_FUNC) &_cindex_CIndex, 3},
{NULL, NULL, 0}
};
RcppExport void R_init_cindex(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
#include "include/plink2_base.h"
#include <algorithm>
#include <Rcpp.h>
using namespace Rcpp;
typedef struct CIndexRecStruct {
double key; // yhat on first sort, y on second sort
uint32_t status; // original pos on first sort
uint32_t yhat_int;
bool operator<(const struct CIndexRecStruct& rhs) const {
return (key < rhs.key);
}
} CIndexRec;
uint32_t PopcountLookup(const uintptr_t* new_yhat_present, const uint16_t* new_yhat_64b_popcounts, const uint16_t* new_yhat_2048b_popcounts, const uint32_t* new_yhat_64kib_popcounts, uint32_t widx);
double CIndexTieheavyMain(const CIndexRec* recs, uintptr_t size, unsigned char* wkspace, uintptr_t* new_yhat_present, uint16_t* new_yhat_64b_popcounts, uint16_t* new_yhat_2048b_popcounts, uint32_t* new_yhat_64kib_popcounts);
// [[Rcpp::export]]
double CIndex(NumericVector yhat, NumericVector y, SEXP status) {
if ((TYPEOF(status) != INTSXP) && (TYPEOF(status) != REALSXP)) {
// could support LGLSXP too
stop("Unsupported status type");
}
const uintptr_t size = yhat.size();
if (size == 0) {
stop("yhat cannot be empty");
}
if (size != static_cast<uintptr_t>(y.size())) {
stop("yhat and y must have the same length");
}
// We define C-index as:
// sum_{all k where status[k]=1} [#(m: y[m]>y[j], yhat[m]<yhat[j]) +
// 0.5 * #(m: y[m]>y[j], yhat[m]==yhat[j])]
// ------------------------------------------------------------------------
// sum_{all k where status[k]=1} #(m: y[m]>y[j])
//
// The computation is organized as follows:
// 1. Replace yhat with 0-based ranks ('yhat_int'); note that this has no
// effect on the expression above. Ties round down.
// 2. Sort records in order of nondecreasing y.
// 3. Iterate through records, starting from largest y. Incrementally update
// a data structure tracking which yhat_int values have been seen so far,
// and in case of ties, their multiplicity. This data structure supports
// fast insertion and number-of-smaller{-or-equal}-elements queries, which
// covers the operations we need to compute the numerator. Formally, the
// total query time is still O(N^2) if status==1 isn't sparse, but the
// constant multiplier is so small that the std::sort calls practically
// always dominate for R's N<2^31 domain. (Not literally true for current
// implementation, but it's trivial to branch on N and use another
// popcount index-levels for very large N once any application cares.)
// More precisely, the new_yhat data structure consists of a bitarray, and
// three index-arrays containing partial popcounts for intervals of 2^9,
// 2^14, and 2^19 bits.
// Intermediate data structures (all cacheline-aligned except maybe recs,
// which is vector-aligned):
// - recs: N CIndexRec
// - new_yhat_present: N bits
// - new_yhat_64b_popcounts: DivUp(N, 512) uint16s
// - new_yhat_2048b_popcounts: DivUp(N, 512*32) uint16s
// - new_yhat_64kib_popcounts: DivUp(N, 512*32*32) uint16s
const uintptr_t recs_vec_ct = plink2::DivUp(sizeof(CIndexRec) * size, plink2::kBytesPerVec);
const uintptr_t new_yhat_present_cacheline_ct = plink2::DivUp(size, plink2::kBitsPerCacheline);
const uintptr_t new_yhat_present_vec_ct = new_yhat_present_cacheline_ct * plink2::kVecsPerCacheline;
const uintptr_t new_yhat_64b_popcounts_cacheline_ct = plink2::DivUp(new_yhat_present_cacheline_ct, plink2::kInt16PerCacheline);
const uintptr_t new_yhat_64b_popcounts_vec_ct = new_yhat_64b_popcounts_cacheline_ct * plink2::kVecsPerCacheline;
const uintptr_t new_yhat_2048b_popcounts_cacheline_ct = plink2::DivUp(new_yhat_64b_popcounts_cacheline_ct, plink2::kInt16PerCacheline);
const uintptr_t new_yhat_2048b_popcounts_vec_ct = new_yhat_2048b_popcounts_cacheline_ct * plink2::kVecsPerCacheline;
const uintptr_t new_yhat_64kib_popcounts_vec_ct = plink2::DivUp(new_yhat_2048b_popcounts_cacheline_ct, plink2::kInt32PerVec);
const uintptr_t vec_ct = recs_vec_ct + new_yhat_present_vec_ct + new_yhat_64b_popcounts_vec_ct + new_yhat_2048b_popcounts_vec_ct + new_yhat_64kib_popcounts_vec_ct;
unsigned char* wkspace;
if (plink2::cachealigned_malloc(vec_ct * plink2::kBytesPerVec, &wkspace)) {
stop("Out of memory");
}
memset(wkspace, 0, (vec_ct - recs_vec_ct) * plink2::kBytesPerVec);
unsigned char* wkspace_iter = wkspace;
// Put new_yhat arrays first to guarantee cacheline alignment, since it
// matters there.
uintptr_t* new_yhat_present = reinterpret_cast<uintptr_t*>(wkspace_iter);
wkspace_iter = &(wkspace_iter[new_yhat_present_vec_ct * plink2::kBytesPerVec]);
uint16_t* new_yhat_64b_popcounts = reinterpret_cast<uint16_t*>(wkspace_iter);
wkspace_iter = &(wkspace_iter[new_yhat_64b_popcounts_vec_ct * plink2::kBytesPerVec]);
uint16_t* new_yhat_2048b_popcounts = reinterpret_cast<uint16_t*>(wkspace_iter);
wkspace_iter = &(wkspace_iter[new_yhat_2048b_popcounts_vec_ct * plink2::kBytesPerVec]);
uint32_t* new_yhat_64kib_popcounts = reinterpret_cast<uint32_t*>(wkspace_iter);
wkspace_iter = &(wkspace_iter[new_yhat_64kib_popcounts_vec_ct * plink2::kBytesPerVec]);
CIndexRec* recs = reinterpret_cast<CIndexRec*>(wkspace_iter);
uint64_t yhat_int_sum = 0;
{
const double* yhatd = &(yhat[0]);
for (uintptr_t ulii = 0; ulii != size; ++ulii) {
recs[ulii].key = yhatd[ulii];
recs[ulii].status = ulii;
}
std::sort(recs, &(recs[size]));
recs[recs[0].status].yhat_int = 0;
double prev_yhat = recs[0].key;
uint32_t prev_idx = 0;
for (uintptr_t ulii = 1; ulii != size; ++ulii) {
const double cur_yhat = recs[ulii].key;
if (cur_yhat != prev_yhat) {
prev_idx = ulii;
prev_yhat = cur_yhat;
}
recs[recs[ulii].status].yhat_int = prev_idx;
yhat_int_sum += prev_idx;
}
}
const double* yd = &(y[0]);
if (TYPEOF(status) == INTSXP) {
IntegerVector status_iv = as<IntegerVector>(status);
if (size != static_cast<uintptr_t>(status_iv.size())) {
plink2::aligned_free(wkspace);
stop("y and status must have the same length");
}
const int32_t* status_ivi = &(status_iv[0]);
for (uintptr_t ulii = 0; ulii != size; ++ulii) {
recs[ulii].key = yd[ulii];
const uint32_t uii = status_ivi[ulii];
if (uii & (~1)) {
plink2::aligned_free(wkspace);
stop("all status values must be 0 or 1");
}
recs[ulii].status = uii;
}
} else {
NumericVector status_nv = as<NumericVector>(status);
if (size != static_cast<uintptr_t>(status_nv.size())) {
plink2::aligned_free(wkspace);
stop("y and status must have the same length");
}
const double* status_nvd = &(status_nv[0]);
for (uintptr_t ulii = 0; ulii != size; ++ulii) {
recs[ulii].key = yd[ulii];
const double dxx = status_nvd[ulii];
if (dxx == 0.0) {
recs[ulii].status = 0;
} else if (dxx == 1.0) {
recs[ulii].status = 1;
} else {
plink2::aligned_free(wkspace);
stop("all status values must be 0 or 1");
}
}
}
std::sort(recs, &(recs[size]));
// With no ties, yhat_int_sum would be (size * (size - 1)) / 2.
// k-way ties decrease the sum by (k * (k-1)) / 2, which is conveniently
// proportional to the burden imposed by the ties (sequential scanning for
// the next unset bit in new_yhat_present) on our usual algorithm.
// With high enough tie-burden (threshold proportional to n log n), we switch
// to a slightly different approach, which is usually slower but avoids
// sequential scans in case of ties.
const uint64_t tie_burden = (static_cast<uint64_t>(size) * (size - 1)) / 2 - yhat_int_sum;
if (tie_burden > 512 * static_cast<uint64_t>(size)) {
return CIndexTieheavyMain(recs, size, wkspace, new_yhat_present, new_yhat_64b_popcounts, new_yhat_2048b_popcounts, new_yhat_64kib_popcounts);
}
// Main loop.
double prev_key = recs[size - 1].key;
uint32_t recs_block_end = size;
uint32_t new_yhat_entry_ct = 0;
uint64_t numer_x2 = 0;
uint64_t denom = 0;
for (uint32_t recs_idx = size; recs_idx; ) {
--recs_idx;
const double cur_key = recs[recs_idx].key;
if (cur_key != prev_key) {
const uint32_t next_block_end = recs_idx + 1;
for (uint32_t uii = next_block_end; uii != recs_block_end; ++uii) {
const uint32_t yhat_int = recs[uii].yhat_int;
// This is essentially an inlined AdvTo0Bit() call.
uintptr_t widx = yhat_int / plink2::kBitsPerWord;
{
const uintptr_t remainder_bit = plink2::k1LU << (yhat_int % plink2::kBitsPerWord);
uintptr_t cur_word = new_yhat_present[widx];
if (!(remainder_bit & cur_word)) {
// common case
new_yhat_present[widx] = cur_word | remainder_bit;
} else {
// There's a tie, and we've already seen at least one previous
// instance.
// Find the rightmost clear bit in cur_word after remainder_bit, if
// there is one. We use the bitarray property that cur_word +
// <bit> sets the first 0-bit in cur_word at or past the added bit,
// may clear some other bits, and doesn't do anything else.
uintptr_t ulii = cur_word + remainder_bit;
if (ulii < remainder_bit) {
// Addition 'carry' needed.
do {
cur_word = new_yhat_present[++widx];
} while (cur_word == ~plink2::k0LU);
ulii = cur_word + 1;
}
const uintptr_t new_word = ulii | cur_word;
new_yhat_present[widx] = new_word;
}
}
new_yhat_64b_popcounts[widx / plink2::kWordsPerCacheline] += 1;
new_yhat_2048b_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline)] += 1;
new_yhat_64kib_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline * plink2::kInt16PerCacheline)] += 1;
}
recs_block_end = next_block_end;
new_yhat_entry_ct = size - recs_block_end;
prev_key = cur_key;
}
if (!recs[recs_idx].status) {
continue;
}
denom += new_yhat_entry_ct;
const uint32_t yhat_int = recs[recs_idx].yhat_int;
// numer_x2 += 2 * <# of set bits before yhat_int> + <# of ties>
uint32_t widx = yhat_int / plink2::kBitsPerWord;
uint32_t n_set_bits_before = PopcountLookup(new_yhat_present, new_yhat_64b_popcounts, new_yhat_2048b_popcounts, new_yhat_64kib_popcounts, widx);
uint32_t remainder = yhat_int % plink2::kBitsPerWord;
uintptr_t cur_word = new_yhat_present[widx];
uintptr_t remainder_bit = plink2::k1LU << remainder;
n_set_bits_before += plink2::PopcountWord(cur_word & (remainder_bit - 1));
numer_x2 += 2 * n_set_bits_before;
if (!(cur_word & remainder_bit)) {
// common case
continue;
}
// Similar to the insertion tie-handling code above.
numer_x2 -= remainder;
uintptr_t ulii = cur_word + remainder_bit;
if (ulii < remainder_bit) {
uint32_t widx2 = widx;
do {
cur_word = new_yhat_present[++widx2];
} while (cur_word == ~plink2::k0LU);
numer_x2 += (widx2 - widx) * plink2::kBitsPerWord;
ulii = cur_word + 1;
}
numer_x2 += plink2::ctzw(ulii & (~cur_word));
}
plink2::aligned_free(wkspace);
return plink2::u63tod(numer_x2) / plink2::u63tod(denom * 2);
}
uint32_t PopcountLookup(const uintptr_t* new_yhat_present, const uint16_t* new_yhat_64b_popcounts, const uint16_t* new_yhat_2048b_popcounts, const uint32_t* new_yhat_64kib_popcounts, uint32_t widx) {
uint32_t result = 0;
const uint32_t idx_64kib = widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline * plink2::kInt16PerCacheline);
for (uint32_t uii = 0; uii != idx_64kib; ++uii) {
result += new_yhat_64kib_popcounts[uii];
}
const uint32_t idx_2048b = widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline);
for (uint32_t uii = idx_64kib * plink2::kInt16PerCacheline; uii != idx_2048b; ++uii) {
result += new_yhat_2048b_popcounts[uii];
}
// Each entry in the bottom-level-index array is <= 512, and we're adding
// at most 16 of them even when compiling as 32-bit, so there's no overflow
// danger if we add entire words at a time (effectively performing 2-4
// uint16 additions in parallel).
const uintptr_t* walias_64b_popcounts = reinterpret_cast<const uintptr_t*>(new_yhat_64b_popcounts);
const uint32_t widx_64b = widx / (plink2::kWordsPerCacheline * plink2::kInt16PerWord);
uintptr_t acc = 0;
for (uint32_t uii = idx_2048b * plink2::kWordsPerCacheline; uii != widx_64b; ++uii) {
acc += walias_64b_popcounts[uii];
}
const uint32_t idx_64b = widx / plink2::kWordsPerCacheline;
uint32_t remainder = idx_64b % plink2::kInt16PerWord;
if (remainder) {
acc += plink2::bzhi(walias_64b_popcounts[widx_64b], remainder * 16);
}
result += (acc * plink2::kMask0001) >> (plink2::kBitsPerWord - 16);
for (uint32_t uii = idx_64b * plink2::kWordsPerCacheline; uii != widx; ++uii) {
result += plink2::PopcountWord(new_yhat_present[uii]);
}
return result;
}
double CIndexTieheavyMain(const CIndexRec* recs, uintptr_t size, unsigned char* wkspace, uintptr_t* new_yhat_present, uint16_t* new_yhat_64b_popcounts, uint16_t* new_yhat_2048b_popcounts, uint32_t* new_yhat_64kib_popcounts) {
// Alternate tie-handling strategy: maintain an additional array which tracks
// the number of times we've seen each yhat_int value.
uint32_t* yhat_int_multiplicities = static_cast<uint32_t*>(calloc(size, sizeof(int32_t)));
if (!yhat_int_multiplicities) {
plink2::aligned_free(wkspace);
stop("Out of memory");
}
double prev_key = recs[size - 1].key;
uint32_t recs_block_end = size;
uint32_t new_yhat_entry_ct = 0;
uint64_t numer_x2 = 0;
uint64_t denom = 0;
for (uint32_t recs_idx = size; recs_idx; ) {
--recs_idx;
const double cur_key = recs[recs_idx].key;
if (cur_key != prev_key) {
const uint32_t next_block_end = recs_idx + 1;
for (uint32_t uii = next_block_end; uii != recs_block_end; ++uii) {
const uint32_t yhat_int = recs[uii].yhat_int;
const uint32_t multiplicity = yhat_int_multiplicities[yhat_int];
const uint32_t adj_yhat_int = yhat_int + multiplicity;
yhat_int_multiplicities[yhat_int] = multiplicity + 1;
const uintptr_t widx = adj_yhat_int / plink2::kBitsPerWord;
const uintptr_t remainder_bit = plink2::k1LU << (adj_yhat_int % plink2::kBitsPerWord);
new_yhat_present[widx] |= remainder_bit;
new_yhat_64b_popcounts[widx / plink2::kWordsPerCacheline] += 1;
new_yhat_2048b_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline)] += 1;
new_yhat_64kib_popcounts[widx / (plink2::kWordsPerCacheline * plink2::kInt16PerCacheline * plink2::kInt16PerCacheline)] += 1;
}
recs_block_end = next_block_end;
new_yhat_entry_ct = size - recs_block_end;
prev_key = cur_key;
}
if (!recs[recs_idx].status) {
continue;
}
denom += new_yhat_entry_ct;
const uint32_t yhat_int = recs[recs_idx].yhat_int;
uint32_t widx = yhat_int / plink2::kBitsPerWord;
uint32_t n_set_bits_before = PopcountLookup(new_yhat_present, new_yhat_64b_popcounts, new_yhat_2048b_popcounts, new_yhat_64kib_popcounts, widx);
uint32_t remainder = yhat_int % plink2::kBitsPerWord;
uintptr_t cur_word = new_yhat_present[widx];
uintptr_t remainder_bit = plink2::k1LU << remainder;
n_set_bits_before += plink2::PopcountWord(cur_word & (remainder_bit - 1));
numer_x2 += 2 * n_set_bits_before + yhat_int_multiplicities[yhat_int];
}
free(yhat_int_multiplicities);
plink2::aligned_free(wkspace);
return plink2::u63tod(numer_x2) / plink2::u63tod(denom * 2);
}
// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
......
#ifndef __PGENLIB_MISC_H__
#define __PGENLIB_MISC_H__
// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
......@@ -74,12 +74,12 @@
// on the fly, since that tends to be faster than having to access twice as
// much memory.
#include "plink2_base.h"
#include "plink2_bits.h"
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value. Named with all caps as a consequence.
#define PGENLIB_INTERNAL_VERNUM 1401
#define PGENLIB_INTERNAL_VERNUM 1501
#ifdef __cplusplus
namespace plink2 {
......
// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
......
#ifndef __PGENLIB_WRITE_H__
#define __PGENLIB_WRITE_H__
// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
......
// This library is part of PLINK 2, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see <http://www.gnu.org/licenses/>.
#include "plink2_base.h"
#ifdef __cplusplus
namespace plink2 {
#endif
uintptr_t g_failed_alloc_attempt_size = 0;
#if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
BoolErr pgl_malloc(uintptr_t size, void* pp) {
*S_CAST(unsigned char**, pp) = S_CAST(unsigned char*, malloc(size));
if (likely(*S_CAST(unsigned char**, pp))) {
return 0;
}
g_failed_alloc_attempt_size = size;
return 1;
}
#endif
BoolErr fwrite_checked(const void* buf, uintptr_t len, FILE* outfile) {
while (len > kMaxBytesPerIO) {
// OS X fwrite() doesn't support 2GiB+ writes
// typical disk block size is 4kb, so 0x7ffff000 is the largest sensible
// write size
// bugfix (9 Mar 2018): forgot a 'not' here...
if (unlikely(!fwrite_unlocked(buf, kMaxBytesPerIO, 1, outfile))) {
return 1;
}
buf = &(S_CAST(const unsigned char*, buf)[kMaxBytesPerIO]);
len -= kMaxBytesPerIO;
}
uintptr_t written_byte_ct = fwrite_unlocked(buf, 1, len, outfile);
// must do the right thing when len == 0
return (written_byte_ct != len);
}
/*
IntErr fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
uintptr_t bytes_read = 0;
while (len > kMaxBytesPerIO) {
const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
bytes_read += cur_bytes_read;
if (cur_bytes_read != kMaxBytesPerIO) {
*bytes_read_ptr = bytes_read;
return ferror_unlocked(infile);
}
buf = &(((char*)buf)[kMaxBytesPerIO]);
len -= kMaxBytesPerIO;
}
bytes_read += fread_unlocked(buf, 1, len, infile);
*bytes_read_ptr = bytes_read;
// could skip ferror_unlocked call if bytes_read == original len
return ferror_unlocked(infile);
}
*/
BoolErr fread_checked(void* buf, uintptr_t len, FILE* infile) {
while (len > kMaxBytesPerIO) {
const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, kMaxBytesPerIO, infile);
if (unlikely(cur_bytes_read != kMaxBytesPerIO)) {
return 1;
}
buf = &(S_CAST(unsigned char*, buf)[kMaxBytesPerIO]);
len -= kMaxBytesPerIO;
}
const uintptr_t cur_bytes_read = fread_unlocked(buf, 1, len, infile);
return (cur_bytes_read != len);
}
#ifdef __LP64__
static inline BoolErr ScanUintCappedFinish(const char* str_iter, uint64_t cap, uint32_t* valp) {
uint64_t val = *valp;
while (1) {
// a little bit of unrolling seems to help
const uint64_t cur_digit = ctou64(*str_iter++) - 48;
if (cur_digit >= 10) {
break;
}
// val = val * 10 + cur_digit;
const uint64_t cur_digit2 = ctou64(*str_iter++) - 48;
if (cur_digit2 >= 10) {
val = val * 10 + cur_digit;
if (unlikely(val > cap)) {
return 1;
}
break;
}
val = val * 100 + cur_digit * 10 + cur_digit2;
if (unlikely(val > cap)) {
return 1;
}
}
*valp = val;
return 0;
}
BoolErr ScanPosintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
// '0' has ascii code 48
assert(ctou32(str_iter[0]) > 32);
*valp = ctou32(*str_iter++) - 48;
if (*valp >= 10) {
// permit leading '+' (ascii 43), but not '++' or '+-'
// reasonable to use unlikely() here since these functions aren't used for
// numeric vs. non-numeric classification anyway due to erroring out on
// overflow
if (unlikely(*valp != 0xfffffffbU)) {
return 1;
}
*valp = ctou32(*str_iter++) - 48;
if (unlikely(*valp >= 10)) {
return 1;
}
}
while (!(*valp)) {
*valp = ctou32(*str_iter++) - 48;
if (unlikely((*valp) >= 10)) {
return 1;
}
}
return ScanUintCappedFinish(str_iter, cap, valp);
}
// Note that NumericRangeListToBitarr() can call this in an ignore-overflow
// mode. If similar logic ever goes into an inner loop, remove all unlikely()
// annotations in this function and its children.
BoolErr ScanUintCapped(const char* str_iter, uint64_t cap, uint32_t* valp) {
// Reads an integer in [0, cap]. Assumes first character is nonspace.
assert(ctou32(str_iter[0]) > 32);
uint32_t val = ctou32(*str_iter++) - 48;
if (val >= 10) {
if (val != 0xfffffffbU) {
// '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
return 1;
}
// accept "-0", "-00", etc.
while (*(++str_iter) == '0');
*valp = 0;
return (ctou32(*str_iter) - 48) < 10;
}
// accept leading '+'
val = ctou32(*str_iter++) - 48;
if (unlikely(val >= 10)) {
return 1;
}
}
*valp = val;
return ScanUintCappedFinish(str_iter, cap, valp);
}
BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp) {
// Reads an integer in [-bound, bound]. Assumes first character is nonspace.
assert(ctou32(str_iter[0]) > 32);
*valp = ctou32(*str_iter++) - 48;
int32_t sign = 1;
if (ctou32(*valp) >= 10) {
if (*valp == -3) {
sign = -1;
} else if (*valp != -5) {
return 1;
}
*valp = ctou32(*str_iter++) - 48;
if (unlikely(ctou32(*valp) >= 10)) {
return 1;
}
}
if (unlikely(ScanUintCappedFinish(str_iter, bound, R_CAST(uint32_t*, valp)))) {
return 1;
}
*valp *= sign;
return 0;
}
#else // not __LP64__
BoolErr ScanPosintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
// '0' has ascii code 48
assert(ctou32(str_iter[0]) > 32);
uint32_t val = ctou32(*str_iter++) - 48;
if (val >= 10) {
if (unlikely(val != 0xfffffffbU)) {
return 1;
}
val = ctou32(*str_iter++) - 48;
if (unlikely(val >= 10)) {
return 1;
}
}
while (!val) {
val = ctou32(*str_iter++) - 48;
if (unlikely(val >= 10)) {
return 1;
}
}
for (; ; ++str_iter) {
const uint32_t cur_digit = ctou32(*str_iter) - 48;
if (cur_digit >= 10) {
*valp = val;
return 0;
}
// avoid integer overflow in middle of computation
if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
return 1;
}
val = val * 10 + cur_digit;
}
}
BoolErr ScanUintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
// Reads an integer in [0, cap]. Assumes first character is nonspace.
assert(ctou32(str_iter[0]) > 32);
uint32_t val = ctou32(*str_iter++) - 48;
if (val >= 10) {
if (val != 0xfffffffbU) {
if (unlikely((val != 0xfffffffdU) || (*str_iter != '0'))) {
return 1;
}
while (*(++str_iter) == '0');
*valp = 0;
return (ctou32(*str_iter) - 48) < 10;
}
val = ctou32(*str_iter++) - 48;
if (unlikely(val >= 10)) {
return 1;
}
}
for (; ; ++str_iter) {
const uint32_t cur_digit = ctou32(*str_iter) - 48;
if (cur_digit >= 10) {
*valp = val;
return 0;
}
if (unlikely((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10)))) {
return 1;
}
val = val * 10 + cur_digit;
}
}
BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
// Reads an integer in [-bound, bound]. Assumes first character is nonspace.
assert(ctou32(str_iter[0]) > 32);
uint32_t val = ctou32(*str_iter++) - 48;
int32_t sign = 1;
if (val >= 10) {
if (val == 0xfffffffdU) {
sign = -1;
} else if (val != 0xfffffffbU) {
return 1;
}
val = ctou32(*str_iter++) - 48;
if (unlikely(val >= 10)) {
return 1;
}
}
for (; ; ++str_iter) {
const uint32_t cur_digit = ctou32(*str_iter) - 48;
if (cur_digit >= 10) {
*valp = sign * S_CAST(int32_t, val);
return 0;
}
if (unlikely((val >= bound_div_10) && ((val > bound_div_10) || (cur_digit > bound_mod_10)))) {
return 1;
}
val = val * 10 + cur_digit;
}
}
#endif
BoolErr aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp) {
// Assumes malloc returns word-aligned addresses.
assert(alignment);
assert(!(alignment % kBytesPerWord));
uintptr_t malloc_addr;
if (unlikely(pgl_malloc(size + alignment, &malloc_addr))) {
return 1;
}
assert(!(malloc_addr % kBytesPerWord));
uintptr_t** casted_aligned_pp = S_CAST(uintptr_t**, aligned_pp);
*casted_aligned_pp = R_CAST(uintptr_t*, RoundDownPow2(malloc_addr + alignment, alignment));
(*casted_aligned_pp)[-1] = malloc_addr;
return 0;
}
#ifdef __LP64__
int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
if (byte_ct < 16 + (kBytesPerVec / 2)) {
if (byte_ct < kBytesPerWord) {
if (byte_ct < 4) {
if (byte_ct < 2) {
return (!byte_ct) || (m1_uc[0] == m2_uc[0]);
}
if ((*S_CAST(const uint16_t*, m1)) != (*S_CAST(const uint16_t*, m2))) {
return 0;
}
if ((byte_ct == 3) && (m1_uc[2] != m2_uc[2])) {
return 0;
}
return 1;
}
if ((*R_CAST(const uint32_t*, m1_uc)) != (*R_CAST(const uint32_t*, m2_uc))) {
return 0;
}
if (byte_ct > 4) {
const uintptr_t final_offset = byte_ct - 4;
if ((*R_CAST(const uint32_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uint32_t*, &(m2_uc[final_offset])))) {
return 0;
}
}
return 1;
}
const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
if (m1_alias[0] != m2_alias[0]) {
return 0;
}
if (byte_ct >= 16) {
if (m1_alias[1] != m2_alias[1]) {
return 0;
}
# ifdef USE_AVX2
if (byte_ct >= 24) {
if (m1_alias[2] != m2_alias[2]) {
return 0;
}
}
# endif
}
if (byte_ct % kBytesPerWord) {
const uintptr_t final_offset = byte_ct - kBytesPerWord;
if ((*R_CAST(const uintptr_t*, &(m1_uc[final_offset]))) != (*R_CAST(const uintptr_t*, &(m2_uc[final_offset])))) {
return 0;
}
}
return 1;
}
// Don't use VecW since _mm_cmpeq_epi64() not defined until SSE4.1.
const VecUc* m1_alias = S_CAST(const VecUc*, m1);
const VecUc* m2_alias = S_CAST(const VecUc*, m2);
const uintptr_t vec_ct = byte_ct / kBytesPerVec;
for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
// tried unrolling this, doesn't make a difference
const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
return 0;
}
}
if (byte_ct % kBytesPerVec) {
// put this last instead of first, for better behavior when inputs are
// aligned
const uintptr_t final_offset = byte_ct - kBytesPerVec;
const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
return 0;
}
}
return 1;
}
// clang/gcc memcmp is not that well-optimized for the short strings we usually
// compare.
int32_t Memcmp(const void* m1, const void* m2, uintptr_t byte_ct) {
const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
// tried larger crossover threshold, doesn't help
if (byte_ct < kBytesPerVec) {
if (byte_ct < kBytesPerWord) {
if (byte_ct < 4) {
for (uintptr_t pos = 0; pos != byte_ct; ++pos) {
const unsigned char ucc1 = m1_uc[pos];
const unsigned char ucc2 = m2_uc[pos];
if (ucc1 != ucc2) {
return (ucc1 < ucc2)? -1 : 1;
}
}
return 0;
}
uint32_t m1_u32 = *S_CAST(const uint32_t*, m1);
uint32_t m2_u32 = *S_CAST(const uint32_t*, m2);
if (m1_u32 != m2_u32) {
return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
}
if (byte_ct > 4) {
const uintptr_t final_offset = byte_ct - 4;
m1_u32 = *R_CAST(const uint32_t*, &(m1_uc[final_offset]));
m2_u32 = *R_CAST(const uint32_t*, &(m2_uc[final_offset]));
if (m1_u32 != m2_u32) {
return (__builtin_bswap32(m1_u32) < __builtin_bswap32(m2_u32))? -1 : 1;
}
}
return 0;
}
const uintptr_t* m1_alias = R_CAST(const uintptr_t*, m1_uc);
const uintptr_t* m2_alias = R_CAST(const uintptr_t*, m2_uc);
uintptr_t m1_word = m1_alias[0];
uintptr_t m2_word = m2_alias[0];
if (m1_word != m2_word) {
return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
}
# ifdef USE_AVX2
if (byte_ct >= 16) {
m1_word = m1_alias[1];
m2_word = m2_alias[1];
if (m1_word != m2_word) {
return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
}
if (byte_ct >= 24) {
m1_word = m1_alias[2];
m2_word = m2_alias[2];
if (m1_word != m2_word) {
return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
}
}
}
# endif
if (byte_ct % kBytesPerWord) {
const uintptr_t final_offset = byte_ct - kBytesPerWord;
m1_word = *R_CAST(const uintptr_t*, &(m1_uc[final_offset]));
m2_word = *R_CAST(const uintptr_t*, &(m2_uc[final_offset]));
if (m1_word != m2_word) {
return (__builtin_bswap64(m1_word) < __builtin_bswap64(m2_word))? -1 : 1;
}
}
return 0;
}
const VecUc* m1_alias = S_CAST(const VecUc*, m1);
const VecUc* m2_alias = S_CAST(const VecUc*, m2);
const uintptr_t fullvec_ct = byte_ct / kBytesPerVec;
// uh, clang/LLVM -O2 optimizes this better when comparison is != instead of
// <? ugh, time to change all of the for loops...
// (and yes, both -O3 configurations generate worse code here)
// at least for loop is better than do-while loop even when 1 iteration is
// guaranteed...
for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
// is this even worthwhile now in non-AVX2 case?
const uint32_t movemask_result = vecuc_movemask(v1 == v2);
if (movemask_result != kVec8thUintMax) {
const uintptr_t diff_pos = vidx * kBytesPerVec + ctzu32(~movemask_result);
return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
}
}
if (byte_ct % kBytesPerVec) {
const uintptr_t final_offset = byte_ct - kBytesPerVec;
const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
const uint32_t movemask_result = vecuc_movemask(v1 == v2);
if (movemask_result != kVec8thUintMax) {
const uintptr_t diff_pos = final_offset + ctzu32(~movemask_result);
return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
}
}
return 0;
}
#endif
#ifdef __cplusplus
} // namespace plink2
#endif
This diff is collapsed.