Skip to content

Commit

Permalink
pgenlibr ReadSparseHardcalls
Browse files Browse the repository at this point in the history
  • Loading branch information
chrchang committed Feb 4, 2025
1 parent 1edd0ce commit 5e6be2f
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 40 deletions.
2 changes: 1 addition & 1 deletion 2.0/include/pgenlib_misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value. Named with all caps as a consequence.
#define PGENLIB_INTERNAL_VERNUM 2005
#define PGENLIB_INTERNAL_VERNUM 2006

#ifdef __cplusplus
namespace plink2 {
Expand Down
7 changes: 4 additions & 3 deletions 2.0/include/pgenlib_read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2269,8 +2269,9 @@ PglErr ParseAndSaveDifflist(const unsigned char* fread_end, uint32_t raw_sample_
}

PglErr ParseAndSaveDifflistProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uintptr_t* __restrict raregeno_workspace) {
// Requires a PROPER subset. Might want to just merge this with
// ParseAndSaveDifflist() and rename appropriately.
// Requires a PROPER subset, since it assumes sample_include is non-null.
// Might want to just merge this with ParseAndSaveDifflist() and rename
// appropriately.
// Trailing bits of raregeno are zeroed out.
uint32_t raw_difflist_len;
const unsigned char* group_info_iter;
Expand Down Expand Up @@ -3139,13 +3140,13 @@ PglErr LdLoadMinimalSubsetIfNecessary(const uintptr_t* __restrict sample_include
PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
assert(vidx < pgrp->fi.raw_variant_ct);
assert(sample_ct);
assert(max_simple_difflist_len < sample_ct);
// Side effects:
// may use pgr.workspace_raregeno_tmp_loadbuf
// Trailing bits of genovec/main_raregeno may not be zeroed out.
const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
const uint32_t maintrack_vrtype = vrtype & 7;
const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
assert(max_simple_difflist_len < raw_sample_ct);
const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
// const uint32_t multiallelic_hc_present = fread_pp && VrtypeMultiallelic(vrtype);
if (VrtypeLdCompressed(maintrack_vrtype)) {
Expand Down
2 changes: 1 addition & 1 deletion 2.0/include/pgenlib_read.h
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex p
// difflist_common_geno to the common genotype value in that case. Otherwise,
// genovec is populated and difflist_common_geno is set to UINT32_MAX.
//
// max_simple_difflist_len must be smaller than sample_ct.
// max_simple_difflist_len must be smaller than raw_sample_ct.
//
// Note that the returned difflist_len can be much larger than
// max_simple_difflist_len when the variant is LD-encoded; it's bounded by
Expand Down
3 changes: 1 addition & 2 deletions 2.0/pgenlibr/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,7 @@ HasSparseHardcalls <- function(pgen, variant_num, allele_num = 2L) {
#' to 2.
#' @param return_ints Whether to make the "counts" component of the return
#' value an IntegerVector instead of a NumericVector; defaults to false.
#' @return Either an empty list, in which case buf is filled in the same
#' mannerObject where "sample_ids" is an increasing sequence of positive
#' @return An object where "sample_nums" is an increasing sequence of positive
#' integers listing which samples have the allele, and "counts" is a vector
#' listing the allele counts for those samples.
#' @export
Expand Down
3 changes: 1 addition & 2 deletions 2.0/pgenlibr/man/ReadSparseHardcalls.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

149 changes: 122 additions & 27 deletions 2.0/pgenlibr/src/pgenlibr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ class RPgenReader {

void ReadHardcalls(NumericVector buf, int variant_idx, int allele_idx);

void ReadIntMaybeSparseHardcalls(IntegerVector buf, int variant_idx, int allele_idx, int file_difflist_len_limit, IntegerVector* sample_nums_ptr, IntegerVector* allele_counts_ptr);
void ReadIntMaybeSparseHardcalls(IntegerVector buf, int variant_idx, int allele_idx, int max_simple_difflist_len, IntegerVector* sample_nums_ptr, IntegerVector* allele_counts_ptr);

// void ReadMaybeSparseHardcalls(NumericVector buf, int variant_idx, int allele_idx, int file_difflist_len_limit, IntegerVector* sample_nums_ptr, NumericVector* allele_dosages_ptr);
void ReadMaybeSparseHardcalls(NumericVector buf, int variant_idx, int allele_idx, int max_simple_difflist_len, IntegerVector* sample_nums_ptr, NumericVector* allele_dosages_ptr);

void Read(NumericVector buf, int variant_idx, int allele_idx);

Expand Down Expand Up @@ -83,7 +83,7 @@ class RPgenReader {

void SetSampleSubsetInternal(IntegerVector sample_subset_1based);

void ReadMaybeSparseHardcallsInternal(int variant_idx, int file_difflist_len_limit, uint32_t* difflist_common_geno_ptr, uint32_t* difflist_len_ptr);
void ReadMaybeSparseHardcallsInternal(int variant_idx, int max_simple_difflist_len, uint32_t* difflist_common_geno_ptr, uint32_t* difflist_len_ptr);

void ReadAllelesPhasedInternal(int variant_idx);
};
Expand Down Expand Up @@ -179,13 +179,13 @@ void RPgenReader::Load(String filename, Nullable<List> pvar,
const uintptr_t sample_subset_byte_ct = plink2::DivUp(file_sample_ct, plink2::kBitsPerVec) * plink2::kBytesPerVec;
const uintptr_t cumulative_popcounts_byte_ct = plink2::DivUp(file_sample_ct, plink2::kBitsPerWord * plink2::kInt32PerVec) * plink2::kBytesPerVec;
const uintptr_t genovec_byte_ct = plink2::DivUp(file_sample_ct, plink2::kNypsPerVec) * plink2::kBytesPerVec;
const uint32_t is_not_plink1_bed = (plink2::PgrGetVrtypes(_state_ptr) != nullptr);
const uint32_t is_not_plink1_bed = (_info_ptr->vrtypes != nullptr);
uintptr_t raregeno_byte_ct = 0;
uintptr_t difflist_sample_ids_byte_ct = 0;
if (is_not_plink1_bed) {
const uint32_t max_simple_difflist_len = 2 * (file_sample_ct / plink2::kPglMaxDifflistLenDivisor);
raregeno_byte_ct = plink2::DivUp(max_simple_difflist_len, plink2::kNypsPerVec) * plink2::kBytesPerVec;
difflist_sample_ids_byte_ct = plink2::RoundUpPow2(max_simple_difflist_len * sizeof(int32_t), plink2::kBytesPerVec);
const uint32_t max_returned_difflist_len = 2 * (file_sample_ct / plink2::kPglMaxDifflistLenDivisor);
raregeno_byte_ct = plink2::DivUp(max_returned_difflist_len, plink2::kNypsPerVec) * plink2::kBytesPerVec;
difflist_sample_ids_byte_ct = plink2::RoundUpPow2(max_returned_difflist_len * sizeof(int32_t), plink2::kBytesPerVec);
}
const uintptr_t ac_byte_ct = plink2::RoundUpPow2(file_sample_ct * sizeof(plink2::AlleleCode), plink2::kBytesPerVec);
const uintptr_t ac2_byte_ct = plink2::RoundUpPow2(file_sample_ct * 2 * sizeof(plink2::AlleleCode), plink2::kBytesPerVec);
Expand Down Expand Up @@ -346,6 +346,16 @@ void RPgenReader::ReadIntHardcalls(IntegerVector buf, int variant_idx, int allel
snprintf(errstr_buf, 256, "variant_num out of range (%d; must be 1..%u)", variant_idx + 1, _info_ptr->raw_variant_ct);
stop(errstr_buf);
}
if (buf.size() != _subset_size) {
using namespace plink2;
char errstr_buf[256];
char* write_iter = strcpya_k(errstr_buf, "buf has wrong length (");
write_iter = wtoa(buf.size(), write_iter);
write_iter = strcpya_k(write_iter, "; ");
write_iter = u32toa(_subset_size, write_iter);
strcpy_k(write_iter, " expected)");
stop(errstr_buf);
}
plink2::PglErr reterr = plink2::PgrGet1(_subset_include_vec, _subset_index, _subset_size, variant_idx, allele_idx, _state_ptr, _pgv.genovec);
if (reterr != plink2::kPglRetSuccess) {
char errstr_buf[256];
Expand Down Expand Up @@ -385,24 +395,82 @@ void RPgenReader::ReadHardcalls(NumericVector buf, int variant_idx, int allele_i
plink2::GenoarrLookup16x8bx2(_pgv.genovec, kGenoRDoublePairs, _subset_size, &buf[0]);
}

void RPgenReader::ReadIntMaybeSparseHardcalls(IntegerVector buf, int variant_idx, int allele_idx, int file_difflist_len_limit, IntegerVector* sample_nums_ptr, IntegerVector* allele_counts_ptr) {
if (!_info_ptr) {
stop("pgen is closed");
static const int32_t kGenoRInt32QuadsFlipped[1024] ALIGNV16 = QUAD_TABLE256(2, 1, 0, NA_INTEGER);

void RPgenReader::ReadIntMaybeSparseHardcalls(IntegerVector buf, int variant_idx, int allele_idx, int max_simple_difflist_len, IntegerVector* sample_nums_ptr, IntegerVector* allele_counts_ptr) {
uint32_t difflist_common_geno;
uint32_t difflist_len;
ReadMaybeSparseHardcallsInternal(variant_idx, max_simple_difflist_len, &difflist_common_geno, &difflist_len);
const int32_t* quad_table = (allele_idx == 0)? kGenoRInt32QuadsFlipped : kGenoRInt32Quads;
if (((allele_idx == 0) && (difflist_common_geno != 2)) ||
((allele_idx == 1) && (difflist_common_geno != 0)) ||
(static_cast<uint32_t>(allele_idx) > 1)) {
if (buf.size() != _subset_size) {
// Note that buf is *not* required to be the expected size when we return
// the sparse representation.
using namespace plink2;
char errstr_buf[256];
char* write_iter = strcpya_k(errstr_buf, "buf has wrong length (");
write_iter = wtoa(buf.size(), write_iter);
write_iter = strcpya_k(write_iter, "; ");
write_iter = u32toa(_subset_size, write_iter);
strcpy_k(write_iter, " expected)");
stop(errstr_buf);
}
if (difflist_common_geno != UINT32_MAX) {
// Sparse, but not w.r.t. the correct allele. Just return dense
// representation.
plink2::PgrDifflistToGenovecUnsafe(_raregeno_buf, _difflist_sample_ids_buf, difflist_common_geno, _subset_size, difflist_len, _pgv.genovec);
}
plink2::GenoarrLookup256x4bx4(_pgv.genovec, quad_table, _subset_size, &buf[0]);
return;
}
const uint32_t raw_variant_ct = _info_ptr->raw_variant_ct;
if (static_cast<uint32_t>(variant_idx) >= raw_variant_ct) {
char errstr_buf[256];
snprintf(errstr_buf, 256, "variant_num out of range (%d; must be 1..%u)", variant_idx + 1, _info_ptr->raw_variant_ct);
stop(errstr_buf);
*sample_nums_ptr = IntegerVector(difflist_len);
int* sample_nums_data = &((*sample_nums_ptr)[0]);
for (uint32_t uii = 0; uii != difflist_len; ++uii) {
sample_nums_data[uii] = _difflist_sample_ids_buf[uii] + 1;
}
*allele_counts_ptr = IntegerVector(difflist_len);
int* allele_counts_data = &((*allele_counts_ptr)[0]);
plink2::GenoarrLookup256x4bx4(_raregeno_buf, quad_table, difflist_len, allele_counts_data);
}

static const double kGenoRDoublePairsFlipped[32] ALIGNV16 = PAIR_TABLE16(0.0, 1.0, 2.0, NA_REAL);

void RPgenReader::ReadMaybeSparseHardcalls(NumericVector buf, int variant_idx, int allele_idx, int max_simple_difflist_len, IntegerVector* sample_nums_ptr, NumericVector* allele_dosages_ptr) {
uint32_t difflist_common_geno;
uint32_t difflist_len;
ReadMaybeSparseHardcallsInternal(variant_idx, file_difflist_len_limit, &difflist_common_geno, &difflist_len);
if (difflist_common_geno == UINT32_MAX) {
plink2::GenoarrLookup256x4bx4(_pgv.genovec, kGenoRInt32Quads, _subset_size, &buf[0]);
ReadMaybeSparseHardcallsInternal(variant_idx, max_simple_difflist_len, &difflist_common_geno, &difflist_len);
const double* pair_table = (allele_idx == 0)? kGenoRDoublePairsFlipped : kGenoRDoublePairs;
if (((allele_idx == 0) && (difflist_common_geno != 2)) ||
((allele_idx == 1) && (difflist_common_geno != 0)) ||
(static_cast<uint32_t>(allele_idx) > 1)) {
if (buf.size() != _subset_size) {
using namespace plink2;
char errstr_buf[256];
char* write_iter = strcpya_k(errstr_buf, "buf has wrong length (");
write_iter = wtoa(buf.size(), write_iter);
write_iter = strcpya_k(write_iter, "; ");
write_iter = u32toa(_subset_size, write_iter);
strcpy_k(write_iter, " expected)");
stop(errstr_buf);
}
if (difflist_common_geno != UINT32_MAX) {
// Sparse, but not w.r.t. the correct allele. Just return dense
// representation.
plink2::PgrDifflistToGenovecUnsafe(_raregeno_buf, _difflist_sample_ids_buf, difflist_common_geno, _subset_size, difflist_len, _pgv.genovec);
}
plink2::GenoarrLookup16x8bx2(_pgv.genovec, pair_table, _subset_size, &buf[0]);
return;
}
// TODO
*sample_nums_ptr = IntegerVector(difflist_len);
int* sample_nums_data = &((*sample_nums_ptr)[0]);
for (uint32_t uii = 0; uii != difflist_len; ++uii) {
sample_nums_data[uii] = _difflist_sample_ids_buf[uii] + 1;
}
*allele_dosages_ptr = NumericVector(difflist_len);
double* allele_dosages_data = &((*allele_dosages_ptr)[0]);
plink2::GenoarrLookup16x8bx2(_raregeno_buf, pair_table, difflist_len, allele_dosages_data);
}

void RPgenReader::Read(NumericVector buf, int variant_idx, int allele_idx) {
Expand Down Expand Up @@ -761,8 +829,30 @@ void RPgenReader::SetSampleSubsetInternal(IntegerVector sample_subset_1based) {
_subset_size = subset_size;
}

void RPgenReader::ReadMaybeSparseHardcallsInternal(int variant_idx, int file_difflist_len_limit, uint32_t* difflist_common_geno_ptr, uint32_t* difflist_len_ptr) {
// TODO
void RPgenReader::ReadMaybeSparseHardcallsInternal(int variant_idx, int max_simple_difflist_len, uint32_t* difflist_common_geno_ptr, uint32_t* difflist_len_ptr) {
// Fills {_raregeno_buf, _difflist_sample_ids_buf, *difflist_common_geno_ptr,
// *difflist_len_ptr} iff hardcalls are either stored as a simple difflist no
// longer than max_simple_difflist_len, or stored as a list of differences
// from an earlier variant. (Note that when the latter representation is
// involved, the final difflist can be longer than max_simple_difflist_len.)
//
// Otherwise, fills _pgv.genovec and sets *difflist_common_geno_ptr to
// UINT32_MAX.
if (!_info_ptr) {
stop("pgen is closed");
}
const uint32_t raw_variant_ct = _info_ptr->raw_variant_ct;
if (static_cast<uint32_t>(variant_idx) >= raw_variant_ct) {
char errstr_buf[256];
snprintf(errstr_buf, 256, "variant_num out of range (%d; must be 1..%u)", variant_idx + 1, _info_ptr->raw_variant_ct);
stop(errstr_buf);
}
plink2::PglErr reterr = plink2::PgrGetDifflistOrGenovec(_subset_include_vec, _subset_index, _subset_size, max_simple_difflist_len, variant_idx, _state_ptr, _pgv.genovec, difflist_common_geno_ptr, _raregeno_buf, _difflist_sample_ids_buf, difflist_len_ptr);
if (reterr != plink2::kPglRetSuccess) {
char errstr_buf[256];
snprintf(errstr_buf, 256, "PgrGetDifflistOrGenovec() error %d", static_cast<int>(reterr));
stop(errstr_buf);
}
}

void RPgenReader::ReadAllelesPhasedInternal(int variant_idx) {
Expand Down Expand Up @@ -1046,8 +1136,7 @@ bool HasSparseHardcalls(List pgen, int variant_num, int allele_num = 2) {
//' to 2.
//' @param return_ints Whether to make the "counts" component of the return
//' value an IntegerVector instead of a NumericVector; defaults to false.
//' @return Either an empty list, in which case buf is filled in the same
//' mannerObject where "sample_ids" is an increasing sequence of positive
//' @return An object where "sample_nums" is an increasing sequence of positive
//' integers listing which samples have the allele, and "counts" is a vector
//' listing the allele counts for those samples.
//' @export
Expand All @@ -1065,18 +1154,24 @@ List ReadSparseHardcalls(List pgen, int variant_num, int allele_num = 2, bool re
if (!is_supported_sparse) {
stop("(variant, allele) does not have supported sparse representation");
}
stop("ReadSparseHardcalls() is under development");
IntegerVector sample_ids(0);
IntegerVector sample_nums(0);
const int allele_idx = allele_num - 1;
const uint32_t file_sample_ct = rp->GetRawSampleCt();
const uint32_t max_simple_difflist_len = file_sample_ct / plink2::kPglMaxDifflistLenDivisor;
if (return_ints) {
IntegerVector unused_buf(0);
IntegerVector integer_counts(0);
rp->ReadIntMaybeSparseHardcalls(unused_buf, variant_idx, allele_idx, max_simple_difflist_len, &sample_nums, &integer_counts);
return List::create(
_["sample_ids"] = sample_ids,
_["sample_nums"] = sample_nums,
_["counts"] = integer_counts
);
} else {
NumericVector unused_buf(0);
NumericVector numeric_counts(0);
rp->ReadMaybeSparseHardcalls(unused_buf, variant_idx, allele_idx, max_simple_difflist_len, &sample_nums, &numeric_counts);
return List::create(
_["sample_ids"] = sample_ids,
_["sample_nums"] = sample_nums,
_["counts"] = numeric_counts
);
}
Expand Down
8 changes: 4 additions & 4 deletions 2.0/plink2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
namespace plink2 {
#endif

static const char ver_str[] = "PLINK v2.0.0-a.6.9"
static const char ver_str[] = "PLINK v2.0.0-a.6.9.a"
#ifdef NOLAPACK
"NL"
#elif defined(LAPACK_ILP64)
Expand Down Expand Up @@ -72,10 +72,10 @@ static const char ver_str[] = "PLINK v2.0.0-a.6.9"
#elif defined(USE_AOCL)
" AMD"
#endif
" (29 Jan 2025)";
" (4 Feb 2025)";
static const char ver_str2[] =
// include leading space if day < 10, so character length stays the same
""
" "

#ifdef NOLAPACK
#elif defined(LAPACK_ILP64)
Expand All @@ -100,7 +100,7 @@ static const char ver_str2[] =
# endif
#endif

" cog-genomics.org/plink/2.0/\n"
" cog-genomics.org/plink/2.0/\n"
"(C) 2005-2025 Shaun Purcell, Christopher Chang GNU General Public License v3\n";
static const char errstr_append[] = "For more info, try \"" PROG_NAME_STR " --help <flag name>\" or \"" PROG_NAME_STR " --help | more\".\n";

Expand Down

0 comments on commit 5e6be2f

Please sign in to comment.