Skip to content
Commits on Source (3)
......@@ -56,10 +56,10 @@ plink2: $(OBJ_NO_ZSTD)
$(MKDIR) -p bin
$(CXX) $(ARCH32) $(OBJ_NO_ZSTD) -o bin/plink2 $(BLASFLAGS64) $(LDFLAGS)
# basic pgenlib_internal.h usage example; also needed for tests
pgen_compress: plink2_base.o pgenlib_internal.o pgen_compress.o
# basic pgenlib usage example; also needed for tests
pgen_compress: $(PGCOBJ)
$(MKDIR) -p bin
$(CXX) plink2_base.o pgenlib_internal.o pgen_compress.o \
$(CXX) $(PGCOBJ) \
-o bin/pgen_compress
.PHONY: install-strip install clean
......
......@@ -7,9 +7,9 @@ CXXWARN2 = ${CWARN2} -Wold-style-cast
CSRC = SFMT.c libdeflate/lib/adler32.c libdeflate/lib/aligned_malloc.c libdeflate/lib/crc32.c libdeflate/lib/deflate_compress.c libdeflate/lib/deflate_decompress.c libdeflate/lib/gzip_compress.c libdeflate/lib/gzip_decompress.c libdeflate/lib/zlib_compress.c libdeflate/lib/zlib_decompress.c libdeflate/lib/x86/cpu_features.c
ZCSRC = zstd/lib/common/debug.c zstd/lib/common/entropy_common.c zstd/lib/common/zstd_common.c zstd/lib/common/error_private.c zstd/lib/common/xxhash.c zstd/lib/common/fse_decompress.c zstd/lib/common/pool.c zstd/lib/common/threading.c zstd/lib/compress/fse_compress.c zstd/lib/compress/hist.c zstd/lib/compress/huf_compress.c zstd/lib/compress/zstd_double_fast.c zstd/lib/compress/zstd_fast.c zstd/lib/compress/zstd_lazy.c zstd/lib/compress/zstd_ldm.c zstd/lib/compress/zstd_opt.c zstd/lib/compress/zstd_compress.c zstd/lib/compress/zstdmt_compress.c zstd/lib/decompress/huf_decompress.c zstd/lib/decompress/zstd_decompress.c zstd/lib/decompress/zstd_ddict.c zstd/lib/decompress/zstd_decompress_block.c
ZCSRC = zstd/lib/common/debug.c zstd/lib/common/entropy_common.c zstd/lib/common/zstd_common.c zstd/lib/common/error_private.c zstd/lib/common/xxhash.c zstd/lib/common/fse_decompress.c zstd/lib/common/pool.c zstd/lib/common/threading.c zstd/lib/compress/fse_compress.c zstd/lib/compress/hist.c zstd/lib/compress/huf_compress.c zstd/lib/compress/zstd_double_fast.c zstd/lib/compress/zstd_fast.c zstd/lib/compress/zstd_lazy.c zstd/lib/compress/zstd_ldm.c zstd/lib/compress/zstd_opt.c zstd/lib/compress/zstd_compress.c zstd/lib/compress/zstd_compress_literals.c zstd/lib/compress/zstd_compress_sequences.c zstd/lib/compress/zstdmt_compress.c zstd/lib/decompress/huf_decompress.c zstd/lib/decompress/zstd_decompress.c zstd/lib/decompress/zstd_ddict.c zstd/lib/decompress/zstd_decompress_block.c
CCSRC = plink2_base.cc pgenlib_internal.cc plink2.cc plink2_adjust.cc plink2_bgzf.cc plink2_cmdline.cc plink2_common.cc plink2_compress_stream.cc plink2_data.cc plink2_decompress.cc plink2_export.cc plink2_fasta.cc plink2_filter.cc plink2_glm.cc plink2_help.cc plink2_import.cc plink2_ld.cc plink2_matrix.cc plink2_matrix_calc.cc plink2_misc.cc plink2_psam.cc plink2_pvar.cc plink2_random.cc plink2_set.cc plink2_stats.cc plink2_string.cc plink2_text.cc plink2_thread.cc plink2_zstfile.cc
CCSRC = plink2_base.cc pgenlib_misc.cc pgenlib_read.cc pgenlib_write.cc plink2.cc plink2_adjust.cc plink2_bgzf.cc plink2_cmdline.cc plink2_common.cc plink2_compress_stream.cc plink2_data.cc plink2_decompress.cc plink2_export.cc plink2_fasta.cc plink2_filter.cc plink2_glm.cc plink2_help.cc plink2_import.cc plink2_ld.cc plink2_matrix.cc plink2_matrix_calc.cc plink2_misc.cc plink2_psam.cc plink2_pvar.cc plink2_random.cc plink2_set.cc plink2_stats.cc plink2_string.cc plink2_text.cc plink2_thread.cc plink2_zstfile.cc
OBJ_NO_ZSTD = $(CSRC:.c=.o) $(CCSRC:.cc=.o)
OBJ = $(CSRC:.c=.o) $(ZCSRC:.c=.o) $(CCSRC:.cc=.o)
......@@ -26,5 +26,9 @@ CINCLUDE2 = -I../libdeflate -I../libdeflate/common
ZSTD_INCLUDE = -Izstd/lib -Izstd/lib/common
ZSTD_INCLUDE2 = -I../zstd/lib -I../zstd/lib/common
PGCSRC = plink2_base.cc pgenlib_misc.cc pgenlib_read.cc pgenlib_write.cc pgen_compress.cc
PGCOBJ = $(PGCSRC:.c=.o)
PGCSRC2 = $(foreach fname,$(PGCSRC),../$(fname))
CLEAN = *.o libdeflate/lib/*.o libdeflate/lib/x86/*.o zstd/lib/common/*.o zstd/lib/compress/*.o zstd/lib/decompress/*.o bin/plink2 bin/pgen_compress
CLEAN3 = $(foreach expr,$(CLEAN),../$(expr))
This diff is collapsed.
......@@ -8,7 +8,7 @@ import numpy as np
ext_modules = [
Extension('pgenlib',
sources = ['pgenlib.pyx', '../pgenlib_ffi_support.cc', '../pgenlib_internal.cc', '../plink2_base.cc'],
sources = ['pgenlib.pyx', '../pgenlib_ffi_support.cc', '../pgenlib_misc.cc', '../pgenlib_read.cc', '../pgenlib_write.cc', '../plink2_base.cc'],
language = "c++",
# do not compile as c++11, since cython doesn't yet support
# overload of uint32_t operator
......@@ -21,6 +21,6 @@ ext_modules = [
]
setup(name = 'Pgenlib',
version = '0.7',
version = '0.71',
description = "Wrapper for pgenlib's basic reader and writer.",
ext_modules = cythonize(ext_modules))
......@@ -150,8 +150,8 @@ plink2: $(CSRC2) $(ZCSRC2) $(CCSRC2) ../plink2_cpu.cc
g++ $(CPUCHECK_FLAGS) ../plink2_cpu.cc -c
g++ $(OBJ2) plink2_cpu.o $(ARCH32) -o plink2 $(BLASFLAGS) $(LINKFLAGS)
pgen_compress: ../plink2_base.cc ../pgenlib_internal.cc ../pgen_compress.cc
g++ $(CXXFLAGS) ../plink2_base.cc ../pgenlib_internal.cc ../pgen_compress.cc -o pgen_compress
pgen_compress: $(PGCSRC2)
g++ $(CXXFLAGS) $(PGCSRC2) -o pgen_compress
.PHONY: clean
clean:
......
......@@ -80,8 +80,8 @@ plink2: $(CSRC2) $(ZCSRC2) $(CCSRC2) ../plink2_cpu.cc
g++ $(CPUCHECK_FLAGS) ../plink2_cpu.cc -c
gfortran $(OBJ2) plink2_cpu.o -o plink2 $(BLASFLAGS) $(LINKFLAGS)
pgen_compress: ../plink2_base.cc ../pgenlib_internal.cc ../pgen_compress.cc
g++ $(CXXFLAGS) ../plink2_base.cc ../pgenlib_internal.cc ../pgen_compress.cc -o pgen_compress
pgen_compress: $(PGCSRC2)
g++ $(CXXFLAGS) $(PGCSRC2) -o pgen_compress
.PHONY: clean
clean:
......
plink2 (2.00~a2-191022-1) UNRELEASED; urgency=medium
plink2 (2.00~a2-191115-1) UNRELEASED; urgency=medium
* Initial release. (Closes: #858947)
* TODO:
- add autopkgtest
- remove embedded zstd
-- Dylan Aïssi <daissi@debian.org> Fri, 25 Oct 2019 23:03:56 +0200
-- Dylan Aïssi <daissi@debian.org> Wed, 20 Nov 2019 07:00:08 +0100
This diff is collapsed.
#include "pgenlib_internal.h"
#include "pgenlib_read.h"
#include "pgenlib_write.h"
// #define SUBSET_TEST
......@@ -85,7 +86,7 @@ int32_t main(int32_t argc, char** argv) {
} else {
printf("%u variant%s and %u sample%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s", sample_ct, (sample_ct == 1)? "" : "s");
}
if (cachealigned_malloc(QuaterCtToVecCt(sample_ct) * kBytesPerVec, &genovec)) {
if (cachealigned_malloc(NypCtToVecCt(sample_ct) * kBytesPerVec, &genovec)) {
goto main_ret_NOMEM;
}
if (decompress) {
......@@ -94,7 +95,7 @@ int32_t main(int32_t argc, char** argv) {
goto main_ret_OPEN_FAIL;
}
const uintptr_t final_mask = (k1LU << ((sample_ct % kBitsPerWordD2) * 2)) - k1LU;
const uint32_t final_widx = QuaterCtToWordCt(sample_ct) - 1;
const uint32_t final_widx = NypCtToWordCt(sample_ct) - 1;
const uint32_t variant_byte_ct = (sample_ct + 3) / 4;
fwrite("l\x1b\x01", 3, 1, outfile);
for (uint32_t vidx = 0; vidx < variant_ct; ) {
......@@ -210,7 +211,7 @@ int32_t main(int32_t argc, char** argv) {
#ifndef NO_MMAP
CleanupPgfi(&pgfi, &reterr);
#endif
SpgwCleanup(&spgw);
CleanupSpgw(&spgw, &reterr);
if (pgfi_alloc) {
aligned_free(pgfi_alloc);
}
......
......@@ -187,7 +187,65 @@ void Dosage16ToDoubles(const double* geno_double_pair_table, const uintptr_t* ge
}
}
double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, double weight_sum) {
BoolErr Dosage16ToDoublesMeanimpute(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double) {
STD_ARRAY_DECL(uint32_t, 4, genocounts);
double geno_double_pair_buf[32];
if (!dosage_ct) {
GenoarrCountFreqsUnsafe(genoarr, sample_ct, genocounts);
const double* geno_double_pair_table = kGenoDoublePairs;
if (genocounts[3]) {
const uint32_t denom = sample_ct - genocounts[3];
if (!denom) {
return 1;
}
const uint32_t numer = genocounts[1] + 2 * genocounts[2];
const double missing_val = u63tod(numer) / u31tod(denom);
geno_double_pair_buf[0] = 0.0;
geno_double_pair_buf[2] = 1.0;
geno_double_pair_buf[4] = 2.0;
geno_double_pair_buf[6] = missing_val;
InitLookup16x8bx2(geno_double_pair_buf);
geno_double_pair_table = geno_double_pair_buf;
}
GenoarrLookup16x8bx2(genoarr, geno_double_pair_table, sample_ct, geno_double);
return 0;
}
// In the generic case, it may be faster to check for the existence of a
// missing value before calling GenoarrCountInvsubsetFreqs2() (since if there
// are no missing values, we don't need to count at all). However, we assume
// the caller is using this function over Dosage16ToDoubles() for a reason.
GenoarrCountInvsubsetFreqs2(genoarr, dosage_present, sample_ct, sample_ct - dosage_ct, genocounts);
const double* geno_double_pair_table = kGenoDoublePairs;
if (genocounts[3]) {
uint64_t denom = sample_ct - genocounts[3];
if (!denom) {
return 1;
}
denom *= 16384LLU;
uint64_t numer = 0;
for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
numer += dosage_main[dosage_idx];
}
numer += 16384LLU * (genocounts[1] + 2 * genocounts[2]);
const double missing_val = u63tod(numer) / u63tod(denom);
geno_double_pair_buf[0] = 0.0;
geno_double_pair_buf[2] = 1.0;
geno_double_pair_buf[4] = 2.0;
geno_double_pair_buf[6] = missing_val;
InitLookup16x8bx2(geno_double_pair_buf);
geno_double_pair_table = geno_double_pair_buf;
}
GenoarrLookup16x8bx2(genoarr, geno_double_pair_table, sample_ct, geno_double);
uintptr_t sample_uidx_base = 0;
uintptr_t cur_bits = dosage_present[0];
for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
const uintptr_t sample_uidx = BitIter1(dosage_present, &sample_uidx_base, &cur_bits);
geno_double[sample_uidx] = S_CAST(double, dosage_main[dosage_idx]) * 0.00006103515625;
}
return 0;
}
double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct) {
const uint32_t word_ct = DivUp(sample_ct, kBitsPerWordD2);
double result = 0.0;
double result2 = 0.0;
......@@ -221,8 +279,22 @@ double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoa
}
}
result += 2 * result2;
} else {
if (miss_weight != 0.0) {
// bugfix (29 Oct 2019): previous mean-imputation formula was based on
// *weighted* MAF, which was obviously nonsense when negative weights
// were present.
STD_ARRAY_DECL(uint32_t, 4, genocounts);
GenoarrCountFreqsUnsafe(genoarr, sample_ct, genocounts);
const double numer = u63tod(genocounts[1] + 2 * genocounts[2]);
const double denom = u31tod(sample_ct - genocounts[3]);
result += miss_weight * (numer / denom);
}
return result;
}
const Halfword* dosage_present_hws = R_CAST(const Halfword*, dosage_present);
uint32_t onealt_ct = 0;
uint32_t twoalt_ct = 0;
uint32_t missing_ct = 0;
for (uint32_t widx = 0; widx != word_ct; ++widx) {
const uintptr_t geno_word = genoarr[widx];
if (geno_word) {
......@@ -235,17 +307,21 @@ double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoa
while (geno_word1) {
const uint32_t sample_idx_lowbits = ctzw(geno_word1) / 2;
result += cur_weights[sample_idx_lowbits];
// probably sparse enough that this is faster than popcount?
++onealt_ct;
geno_word1 &= geno_word1 - 1;
}
geno_word2 &= mask_word;
while (geno_word2) {
const uint32_t sample_idx_lowbits = ctzw(geno_word2) / 2;
result2 += cur_weights[sample_idx_lowbits];
++twoalt_ct;
geno_word2 &= geno_word2 - 1;
}
while (geno_missing_word) {
const uint32_t sample_idx_lowbits = ctzw(geno_missing_word) / 2;
miss_weight += cur_weights[sample_idx_lowbits];
++missing_ct;
geno_missing_word &= geno_missing_word - 1;
}
}
......@@ -255,15 +331,26 @@ double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoa
double resultx = 0.0;
uintptr_t sample_uidx_base = 0;
uintptr_t cur_bits = dosage_present[0];
if (miss_weight == 0.0) {
for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
const uintptr_t sample_uidx = BitIter1(dosage_present, &sample_uidx_base, &cur_bits);
resultx += S_CAST(double, *dosage_main_iter++) * weights[sample_uidx];
}
result += 0.00006103515625 * resultx;
return result;
}
if (miss_weight != 0.0) {
result = result * weight_sum / (weight_sum - miss_weight);
// Also need to track dosage-sum for mean-imputation.
uint64_t dosage_sum = 0;
for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
const uintptr_t sample_uidx = BitIter1(dosage_present, &sample_uidx_base, &cur_bits);
const uint32_t cur_dosage = *dosage_main_iter++;
dosage_sum += cur_dosage;
resultx += u31tod(cur_dosage) * weights[sample_uidx];
}
result += 0.00006103515625 * resultx;
const double numer = u63tod(16384 * S_CAST(uint64_t, onealt_ct + 2 * twoalt_ct) + dosage_sum);
const double denom = 16384 * u31tod(sample_ct - missing_ct);
result += miss_weight * (numer / denom);
return result;
}
......
......@@ -17,7 +17,7 @@
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see <http://www.gnu.org/licenses/>.
#include "pgenlib_internal.h"
#include "pgenlib_misc.h"
#ifdef __cplusplus
namespace plink2 {
......@@ -66,7 +66,11 @@ void Dosage16ToFloatsMinus9(const uintptr_t* genoarr, const uintptr_t* dosage_pr
void Dosage16ToDoubles(const double* geno_double_pair_table, const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double);
double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, double weight_sum);
// If all samples are missing, this errors out.
BoolErr Dosage16ToDoublesMeanimpute(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double);
// Currently requires trailing bits of genoarr to be zeroed out.
double LinearCombinationMeanimpute(const double* weights, const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct);
HEADER_INLINE void Dosage16ToDoublesMinus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double) {
Dosage16ToDoubles(kGenoDoublePairs, genoarr, dosage_present, dosage_main, sample_ct, dosage_ct, geno_double);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef __PGENLIB_WRITE_H__
#define __PGENLIB_WRITE_H__
// This library is part of PLINK 2.00, copyright (C) 2005-2019 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see <http://www.gnu.org/licenses/>.
// pgenlib_write contains writer-specific code.
#include "pgenlib_misc.h"
#ifdef __cplusplus
namespace plink2 {
#endif
typedef struct PgenWriterCommonStruct {
// was marked noncopyable, but, well, gcc 9 caught me cheating (memcpying the
// whole struct) in the multithreaded writer implementation. So, copyable
// now.
uint32_t variant_ct;
uint32_t sample_ct;
PgenGlobalFlags phase_dosage_gflags; // subset of gflags
// there should be a single copy of these arrays shared by all threads.
// allele_idx_offsets is read-only.
uint64_t* vblock_fpos;
unsigned char* vrec_len_buf;
uintptr_t* vrtype_buf;
const uintptr_t* allele_idx_offsets;
uintptr_t* explicit_nonref_flags; // usually nullptr
// needed for multiallelic-phased case
uintptr_t* genovec_hets_buf;
STD_ARRAY_DECL(uint32_t, 4, ldbase_genocounts);
// should match ftello() return value in singlethreaded case, but be set to
// zero in multithreaded case
uint64_t vblock_fpos_offset;
// these must hold sample_ct entries
// genovec_invert_buf also used as phaseinfo and dphase_present temporary
// storage
uintptr_t* genovec_invert_buf;
uintptr_t* ldbase_genovec;
// these must hold 2 * (sample_ct / kPglMaxDifflistLenDivisor) entries
uintptr_t* ldbase_raregeno;
uint32_t* ldbase_difflist_sample_ids; // 1 extra entry, == sample_ct
// this must fit 64k variants in multithreaded case
unsigned char* fwrite_buf;
unsigned char* fwrite_bufp;
uint32_t ldbase_common_geno; // UINT32_MAX if ldbase_genovec present
uint32_t ldbase_difflist_len;
// I'll cache this for now
uintptr_t vrec_len_byte_ct;
uint32_t vidx;
} PgenWriterCommon;
// Given packed arrays of unphased biallelic genotypes in uncompressed plink2
// binary format (00 = hom ref, 01 = het ref/alt1, 10 = hom alt1, 11 =
// missing), {Single,Multi}threaded_pgen_writer performs difflist (sparse
// variant), one bit (mostly-two-value), and LD compression before writing to
// disk, and backfills the header at the end. CPRA -> CPR merging is under
// development.
// The major difference between the two interfaces is that
// Multithreaded_pgen_writer forces you to process large blocks of variants at
// a time (64k per thread). So Singlethreaded_pgen_writer is still worth using
// in some cases (memory is very limited, I/O is slow, no programmer time to
// spare for the additional complexity).
typedef struct STPgenWriterStruct {
NONCOPYABLE(STPgenWriterStruct);
#ifdef __cplusplus
PgenWriterCommon& GET_PRIVATE_pwc() { return pwc; }
PgenWriterCommon const& GET_PRIVATE_pwc() const { return pwc; }
FILE*& GET_PRIVATE_pgen_outfile() { return pgen_outfile; }
FILE* const& GET_PRIVATE_pgen_outfile() const { return pgen_outfile; }
private:
#endif
PgenWriterCommon pwc;
FILE* pgen_outfile;
} STPgenWriter;
typedef struct MTPgenWriterStruct {
NONCOPYABLE(MTPgenWriterStruct);
FILE* pgen_outfile;
uint32_t thread_ct;
PgenWriterCommon* pwcs[];
} MTPgenWriter;
HEADER_INLINE const uintptr_t* SpgwGetAlleleIdxOffsets(STPgenWriter* spgwp) {
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
return pwcp->allele_idx_offsets;
}
HEADER_INLINE uint32_t SpgwGetVariantCt(STPgenWriter* spgwp) {
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
return pwcp->variant_ct;
}
HEADER_INLINE uint32_t SpgwGetSampleCt(STPgenWriter* spgwp) {
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
return pwcp->sample_ct;
}
HEADER_INLINE uint32_t SpgwGetVidx(STPgenWriter* spgwp) {
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
return pwcp->vidx;
}
void PreinitSpgw(STPgenWriter* spgwp);
// phase_dosage_gflags zero vs. nonzero is most important: this determines size
// of header. Otherwise, setting more flags than necessary just increases
// memory requirements.
//
// nonref_flags_storage values:
// 0 = no info stored
// 1 = always trusted
// 2 = always untrusted
// 3 = use explicit_nonref_flags
//
// Caller is responsible for printing open-fail error message.
PglErr SpgwInitPhase1(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr);
void SpgwInitPhase2(uint32_t max_vrec_len, STPgenWriter* spgwp, unsigned char* spgw_alloc);
// moderately likely that there isn't enough memory to use the maximum number
// of threads, so this returns per-thread memory requirements before forcing
// the caller to specify thread count
// (eventually should write code which falls back on STPgenWriter
// when there isn't enough memory for even a single 64k variant block, at least
// for the most commonly used plink 2.0 functions)
void MpgwInitPhase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr);
// Caller is responsible for printing open-fail error message.
PglErr MpgwInitPhase2(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp);
// trailing bits of genovec must be zeroed out
void PwcAppendBiallelicGenovec(const uintptr_t* __restrict genovec, PgenWriterCommon* pwcp);
BoolErr SpgwFlush(STPgenWriter* spgwp);
HEADER_INLINE PglErr SpgwAppendBiallelicGenovec(const uintptr_t* __restrict genovec, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
PwcAppendBiallelicGenovec(genovec, pwcp);
return kPglRetSuccess;
}
// trailing bits of raregeno must be zeroed out
// all raregeno entries assumed to be unequal to difflist_common_geno; the
// difflist should be compacted first if this isn't true
// difflist_len must be <= 2 * (sample_ct / kPglMaxDifflistLenDivisor);
// there's an assert checking this
void PwcAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp);
HEADER_INLINE PglErr SpgwAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
PwcAppendBiallelicDifflistLimited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, pwcp);
return kPglRetSuccess;
}
// Two interfaces for appending multiallelic hardcalls:
// 1. sparse: genovec, bitarray+values describing ref/altx hardcalls which
// aren't ref/alt1, bitarray+values describing altx/alty hardcalls which
// aren't alt1/alt1.
// patch_01_vals[] contains one entry per relevant sample (only need altx
// index), patch_10_vals[] contains two.
// Ok if patch_01_ct == patch_10_ct == 0; in this case no aux1 track is
// saved and bit 3 of vrtype is not set. (Note that multiallelic dosage may
// still be present when vrtype bit 3 is unset.)
// 2. generic dense: takes a length-2n array of AlleleCode allele codes.
// Assumes [2k] <= [2k+1] for each k. Instead of providing direct API
// functions for this, we just provide a dense -> sparse helper function.
//
// All arrays should be vector-aligned.
BoolErr PwcAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp);
HEADER_INLINE PglErr SpgwAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t patch_01_ct, uint32_t patch_10_ct, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
if (unlikely(PwcAppendMultiallelicSparse(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, patch_01_ct, patch_10_ct, pwcp))) {
return kPglRetVarRecordTooLarge;
}
return kPglRetSuccess;
}
// This may not zero out trailing halfword of patch_{01,10}_set.
void PglMultiallelicDenseToSparse(const AlleleCode* __restrict wide_codes, uint32_t sample_ct, uintptr_t* __restrict genoarr, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict patch_01_ct_ptr, uint32_t* __restrict patch_10_ct_ptr);
// If remap is not nullptr, this simultaneously performs a rotation operation:
// wide_codes[2n] and [2n+1] are set to remap[geno[n]] rather than geno[n], and
// bit n of flipped (if flipped non-null) is set iff phase orientation is
// flipped (i.e. wide_codes[2n] was larger than wide_codes[2n+1] before the
// final reordering pass; caller needs to know this to properly update
// phaseinfo, dphase_delta, multidphase_delta).
// It currently assumes no alleles are being mapped to 'missing'.
void PglMultiallelicSparseToDense(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const AlleleCode* __restrict remap, uint32_t sample_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uintptr_t* __restrict flipped, AlleleCode* __restrict wide_codes);
// phasepresent == nullptr ok, that indicates that ALL heterozygous calls are
// phased. Caller should use e.g. PwcAppendBiallelicGenovec() if it's known
// in advance that no calls are phased.
// Ok for phaseinfo to have bits set at non-het calls, NOT currently okay for
// phasepresent
void PwcAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, PgenWriterCommon* pwcp);
// phasepresent == nullptr ok
// ok for trailing bits of phaseinfo to not be zeroed out, NOT currently ok for
// phasepresent
HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
PwcAppendBiallelicGenovecHphase(genovec, phasepresent, phaseinfo, pwcp);
return kPglRetSuccess;
}
BoolErr PwcAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp);
HEADER_INLINE PglErr SpgwAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t patch_01_ct, uint32_t patch_10_ct, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
if (unlikely(PwcAppendMultiallelicGenovecHphase(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, phasepresent, phaseinfo, patch_01_ct, patch_10_ct, pwcp))) {
return kPglRetVarRecordTooLarge;
}
return kPglRetSuccess;
}
// dosage_main[] has length dosage_ct, not sample_ct
// ok for traling bits of dosage_present to not be zeroed out
BoolErr PwcAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp);
HEADER_INLINE PglErr SpgwAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
if (unlikely(PwcAppendBiallelicGenovecDosage16(genovec, dosage_present, dosage_main, dosage_ct, pwcp))) {
return kPglRetVarRecordTooLarge;
}
return kPglRetSuccess;
}
BoolErr PwcAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp);
HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
if (unlikely(PwcAppendBiallelicGenovecHphaseDosage16(genovec, phasepresent, phaseinfo, dosage_present, dosage_main, dosage_ct, pwcp))) {
return kPglRetVarRecordTooLarge;
}
return kPglRetSuccess;
}
// dosage_present cannot be null for nonzero dosage_ct
// could make dosage_main[] has length dosage_ct + dphase_ct instead of having
// separate dphase_delta[]?
BoolErr PwcAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp);
HEADER_INLINE PglErr SpgwAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, STPgenWriter* spgwp) {
if (unlikely(SpgwFlush(spgwp))) {
return kPglRetWriteFail;
}
PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
if (unlikely(PwcAppendBiallelicGenovecDphase16(genovec, phasepresent, phaseinfo, dosage_present, dphase_present, dosage_main, dphase_delta, dosage_ct, dphase_ct, pwcp))) {
return kPglRetVarRecordTooLarge;
}
return kPglRetSuccess;
}
// Backfills header info, then closes the file.
PglErr SpgwFinish(STPgenWriter* spgwp);
// Last flush automatically backfills header info and closes the file.
// (caller should set mpgwp = nullptr after that)
PglErr MpgwFlush(MTPgenWriter* mpgwp);
// these close the file if open, but do not free any memory
// MpgwCleanup() handles mpgwp == nullptr, since it shouldn't be allocated on
// the stack
// error-return iff reterr was success and was changed to kPglRetWriteFail
// (i.e. an error message should be printed), though this is not relevant for
// plink2
BoolErr CleanupSpgw(STPgenWriter* spgwp, PglErr* reterrp);
BoolErr CleanupMpgw(MTPgenWriter* mpgwp, PglErr* reterrp);
#ifdef __cplusplus
} // namespace plink2
#endif
#endif // __PGENLIB_WRITE_H__
......@@ -57,6 +57,14 @@ ReadAlleles <- function(pgen, acbuf, variant_num, phasepresent_buf = NULL) {
invisible(.Call(`_pgenlibr_ReadAlleles`, pgen, acbuf, variant_num, phasepresent_buf))
}
ReadIntList <- function(pgen, variant_subset) {
.Call(`_pgenlibr_ReadIntList`, pgen, variant_subset)
}
ReadList <- function(pgen, variant_subset, meanimpute = FALSE) {
.Call(`_pgenlibr_ReadList`, pgen, variant_subset, meanimpute)
}
VariantScores <- function(pgen, weights, variant_subset = NULL) {
.Call(`_pgenlibr_VariantScores`, pgen, weights, variant_subset)
}
......
SOURCES = $(wildcard libdeflate/lib/*.c) $(wildcard libdeflate/lib/x86/*.c)
OBJECTS = plink2_base.o pgenlib_internal.o pvar_ffi_support.o pgenlib_ffi_support.o plink2_bgzf.o plink2_string.o plink2_text.o plink2_thread.o plink2_zstfile.o pvar.o pgenlibr.o RcppExports.o $(SOURCES:.c=.o)
OBJECTS = plink2_base.o pgenlib_misc.o pgenlib_read.o pvar_ffi_support.o pgenlib_ffi_support.o plink2_bgzf.o plink2_string.o plink2_text.o plink2_thread.o plink2_zstfile.o pvar.o pgenlibr.o RcppExports.o $(SOURCES:.c=.o)
PKG_CFLAGS = -Ilibdeflate -Ilibdeflate/common
PKG_CPPFLAGS =
PKG_LIBS = -lpthread -lzstd
......@@ -169,6 +169,31 @@ BEGIN_RCPP
return R_NilValue;
END_RCPP
}
// ReadIntList
IntegerMatrix ReadIntList(List pgen, IntegerVector variant_subset);
RcppExport SEXP _pgenlibr_ReadIntList(SEXP pgenSEXP, SEXP variant_subsetSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type pgen(pgenSEXP);
Rcpp::traits::input_parameter< IntegerVector >::type variant_subset(variant_subsetSEXP);
rcpp_result_gen = Rcpp::wrap(ReadIntList(pgen, variant_subset));
return rcpp_result_gen;
END_RCPP
}
// ReadList
NumericMatrix ReadList(List pgen, IntegerVector variant_subset, bool meanimpute);
RcppExport SEXP _pgenlibr_ReadList(SEXP pgenSEXP, SEXP variant_subsetSEXP, SEXP meanimputeSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< List >::type pgen(pgenSEXP);
Rcpp::traits::input_parameter< IntegerVector >::type variant_subset(variant_subsetSEXP);
Rcpp::traits::input_parameter< bool >::type meanimpute(meanimputeSEXP);
rcpp_result_gen = Rcpp::wrap(ReadList(pgen, variant_subset, meanimpute));
return rcpp_result_gen;
END_RCPP
}
// VariantScores
NumericVector VariantScores(List pgen, NumericVector weights, Nullable<IntegerVector> variant_subset);
RcppExport SEXP _pgenlibr_VariantScores(SEXP pgenSEXP, SEXP weightsSEXP, SEXP variant_subsetSEXP) {
......@@ -254,6 +279,8 @@ static const R_CallMethodDef CallEntries[] = {
{"_pgenlibr_ReadHardcalls", (DL_FUNC) &_pgenlibr_ReadHardcalls, 4},
{"_pgenlibr_Read", (DL_FUNC) &_pgenlibr_Read, 4},
{"_pgenlibr_ReadAlleles", (DL_FUNC) &_pgenlibr_ReadAlleles, 4},
{"_pgenlibr_ReadIntList", (DL_FUNC) &_pgenlibr_ReadIntList, 2},
{"_pgenlibr_ReadList", (DL_FUNC) &_pgenlibr_ReadList, 3},
{"_pgenlibr_VariantScores", (DL_FUNC) &_pgenlibr_VariantScores, 3},
{"_pgenlibr_ClosePgen", (DL_FUNC) &_pgenlibr_ClosePgen, 1},
{"_pgenlibr_NewPvar", (DL_FUNC) &_pgenlibr_NewPvar, 1},
......