From 37cf35438d12c1bd7de30b14aa2a00ddf38594be Mon Sep 17 00:00:00 2001 From: Christopher Chang Date: Mon, 2 Oct 2023 16:56:40 -0700 Subject: [PATCH] --sort-vars bugfix and large-INFO performance improvement --- 2.0/include/pgenlib_misc.h | 2 +- 2.0/include/pgenlib_read.h | 2 +- 2.0/include/pgenlib_write.h | 1 + 2.0/include/plink2_bits.h | 2 + 2.0/plink2.cc | 4 +- 2.0/plink2_cmdline.h | 5 + 2.0/plink2_data.cc | 272 +++++++++++++++++++++++++++++++----- 2.0/plink2_glm.cc | 6 +- 2.0/plink2_import.cc | 83 ++++++----- 2.0/plink2_ld.cc | 38 +++-- 2.0/plink2_merge.cc | 2 +- 11 files changed, 314 insertions(+), 103 deletions(-) diff --git a/2.0/include/pgenlib_misc.h b/2.0/include/pgenlib_misc.h index d4f3968b..00bb604e 100644 --- a/2.0/include/pgenlib_misc.h +++ b/2.0/include/pgenlib_misc.h @@ -393,7 +393,7 @@ HEADER_INLINE void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoar const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct); PackWordsToHalfwordsInvmatch(genoarr, 0, sample_ctl2, missingness); if (sample_ctl2 % 2) { - Halfword* missingness_alias = DowncastWToHW(missingness); + Halfword* __attribute__((may_alias)) missingness_alias = DowncastWToHW(missingness); missingness_alias[sample_ctl2] = 0; } } diff --git a/2.0/include/pgenlib_read.h b/2.0/include/pgenlib_read.h index 81e02805..f6740b61 100644 --- a/2.0/include/pgenlib_read.h +++ b/2.0/include/pgenlib_read.h @@ -599,7 +599,7 @@ PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex HEADER_INLINE void PgrDetectGenoarrHetsUnsafe(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, uintptr_t* __restrict all_hets) { PackWordsToHalfwordsInvmatch(genoarr, kMaskAAAA, raw_sample_ctl2, all_hets); if (raw_sample_ctl2 % 2) { - Halfword* all_hets_alias = DowncastWToHW(all_hets); + Halfword* __attribute__((may_alias)) all_hets_alias = DowncastWToHW(all_hets); all_hets_alias[raw_sample_ctl2] = 0; } } diff --git a/2.0/include/pgenlib_write.h b/2.0/include/pgenlib_write.h index 979bb1cc..d7b0d737 100644 --- a/2.0/include/pgenlib_write.h +++ b/2.0/include/pgenlib_write.h @@ -333,6 +333,7 @@ HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphaseDosage16(const uintptr_t* _ } // dosage_present cannot be null for nonzero dosage_ct +// trailing bits of dosage_present MUST be zeroed out // could make dosage_main[] has length dosage_ct + dphase_ct instead of having // separate dphase_delta[]? BoolErr PwcAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp); diff --git a/2.0/include/plink2_bits.h b/2.0/include/plink2_bits.h index dfee8218..03d34750 100644 --- a/2.0/include/plink2_bits.h +++ b/2.0/include/plink2_bits.h @@ -479,6 +479,8 @@ HEADER_INLINE void CopyBitarr(const uintptr_t* __restrict src, uintptr_t bit_ct, } // output_bit_idx_end is practically always subset_size +// if not, it currently must correspond to PopcountWords(subset_mask, word_ct) +// for some word_ct void CopyBitarrSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t output_bit_idx_end, uintptr_t* __restrict output_bitarr); #ifndef NO_UNALIGNED diff --git a/2.0/plink2.cc b/2.0/plink2.cc index 6f218d65..c93d6720 100644 --- a/2.0/plink2.cc +++ b/2.0/plink2.cc @@ -72,10 +72,10 @@ static const char ver_str[] = "PLINK v2.00a6" #elif defined(USE_AOCL) " AMD" #endif - " (28 Sep 2023)"; + " (2 Oct 2023)"; static const char ver_str2[] = // include leading space if day < 10, so character length stays the same - "" + " " #ifdef NOLAPACK #elif defined(LAPACK_ILP64) diff --git a/2.0/plink2_cmdline.h b/2.0/plink2_cmdline.h index 81fb13a5..9ac71ff9 100644 --- a/2.0/plink2_cmdline.h +++ b/2.0/plink2_cmdline.h @@ -618,6 +618,11 @@ HEADER_INLINE BoolErr bigstack_alloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) { return !(*w_arr_ptr); } +HEADER_INLINE BoolErr bigstack_alloc_hw(uintptr_t ct, Halfword** hw_arr_ptr) { + *hw_arr_ptr = S_CAST(Halfword*, bigstack_alloc(ct * sizeof(Halfword))); + return !(*hw_arr_ptr); +} + HEADER_INLINE BoolErr bigstack_alloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) { *i64_arr_ptr = S_CAST(int64_t*, bigstack_alloc(ct * sizeof(int64_t))); return !(*i64_arr_ptr); diff --git a/2.0/plink2_data.cc b/2.0/plink2_data.cc index dbb4c742..e701e8d3 100644 --- a/2.0/plink2_data.cc +++ b/2.0/plink2_data.cc @@ -20,7 +20,8 @@ #include "plink2_data.h" #include "plink2_pvar.h" -#include +#include // time() +#include // unlink() #ifdef __cplusplus namespace plink2 { @@ -8145,36 +8146,29 @@ PglErr WriteBimResorted(const char* outname, const ChrInfo* write_cip, const uin return reterr; } -PglErr PvarInfoReloadInterval(const uint32_t* old_variant_uidx_to_new, uint32_t variant_idx_start, uint32_t variant_idx_end, TextStream* pvar_reload_txsp, char** pvar_info_strs) { - // We assume the batch size was chosen such that there's no risk of - // scribbling past g_bigstack_end (barring pathological cases like another - // process modifying the .pvar file after initial load). - // We also assume no more dynamic allocations are needed after this; - // otherwise, str_store_iter should be returned. - char* line_iter; - // probable todo: avoid rewind when one batch is entirely after the previous - // batch (this is likely when input was already almost-sorted, and just a few - // coordinates changed due to e.g. --normalize) +PglErr PvarInfoLoadAll(const uint32_t* old_variant_uidx_to_new, uint32_t variant_ct, TextStream* pvar_reload_txsp, char** pvar_info_strs) { + // We assume no more dynamic allocations are needed after this; otherwise, + // str_store_iter should be returned. PglErr reterr = TextRewind(pvar_reload_txsp); if (unlikely(reterr)) { return reterr; } - const uint32_t cur_batch_size = variant_idx_end - variant_idx_start; char* str_store_iter = R_CAST(char*, g_bigstack_base); + char* line_iter; uint32_t info_col_idx; reterr = PvarInfoReloadHeader(pvar_reload_txsp, &line_iter, &info_col_idx); if (unlikely(reterr)) { return reterr; } - uint32_t variant_idx = 0; + uint32_t old_variant_idx = 0; for (uint32_t variant_uidx = 0; ; ++variant_uidx) { reterr = TextNextLineLstrip(pvar_reload_txsp, &line_iter); if (unlikely(reterr)) { return reterr; } - const uint32_t new_variant_idx_offset = old_variant_uidx_to_new[variant_uidx] - variant_idx_start; + const uint32_t new_variant_idx_offset = old_variant_uidx_to_new[variant_uidx]; // exploit wraparound, UINT32_MAX null value - if (new_variant_idx_offset >= cur_batch_size) { + if (new_variant_idx_offset >= variant_ct) { continue; } line_iter = NextTokenMultFar(line_iter, info_col_idx); @@ -8186,11 +8180,140 @@ PglErr PvarInfoReloadInterval(const uint32_t* old_variant_uidx_to_new, uint32_t pvar_info_strs[new_variant_idx_offset] = str_store_iter; str_store_iter = memcpyax(str_store_iter, line_iter, info_slen, '\0'); line_iter = info_end; - if (++variant_idx == cur_batch_size) { + if (++old_variant_idx == variant_ct) { break; } } - assert(str_store_iter <= R_CAST(char*, g_bigstack_end)); + assert(str_store_iter <= R_CAST(const char*, old_variant_uidx_to_new)); + return kPglRetSuccess; +} + +typedef struct InfoWriteTmpStruct { + char* cswritep; + CompressStreamState css; +} InfoWriteTmp; + +PglErr PvarInfoSplitTmp(const uint32_t* old_variant_uidx_to_new, uint32_t variant_ct, uint32_t batch_size, uint32_t batch_ct, uint32_t info_reload_blen, TextStream* pvar_reload_txsp, char* outname, char* outname_end) { + InfoWriteTmp* info_write_tmps = nullptr; + PglErr reterr = kPglRetSuccess; + { + char* line_iter; + uint32_t info_col_idx; + reterr = PvarInfoReloadHeader(pvar_reload_txsp, &line_iter, &info_col_idx); + if (unlikely(reterr)) { + return reterr; + } + uint64_t mult; + uint32_t pre_shift; + uint32_t post_shift; + uint32_t incr; + DivisionMagicNums(batch_size, &mult, &pre_shift, &post_shift, &incr); + if (batch_ct > kMaxOpenFiles - 12) { + // todo: recurse + logerrputs("Error: INFO column is too large for current --sort-vars implementation and\nworkspace size.\n"); + reterr = kPglRetNotYetSupported; + goto PvarInfoSplitTmp_ret_1; + } + if (unlikely(BIGSTACK_ALLOC_X(InfoWriteTmp, batch_ct, &info_write_tmps))) { + goto PvarInfoSplitTmp_ret_NOMEM; + } + for (uint32_t batch_idx = 0; batch_idx != batch_ct; ++batch_idx) { + PreinitCstream(&(info_write_tmps[batch_idx].css)); + info_write_tmps[batch_idx].cswritep = nullptr; + } + const uintptr_t overflow_buf_size = kCompressStreamBlock + info_reload_blen; + char* outname_continued = strcpya(outname_end, ".info.tmp."); + for (uint32_t batch_idx = 0; batch_idx != batch_ct; ++batch_idx) { + char* outname_iter = u32toa(batch_idx + 1, outname_continued); + strcpy_k(outname_iter, ".zst"); + reterr = InitCstreamAlloc(outname, 0, 1, 1, overflow_buf_size, &(info_write_tmps[batch_idx].css), &(info_write_tmps[batch_idx].cswritep)); + if (unlikely(reterr)) { + goto PvarInfoSplitTmp_ret_1; + } + } + + uint32_t old_variant_idx = 0; + for (uint32_t variant_uidx = 0; ; ++variant_uidx) { + reterr = TextNextLineLstrip(pvar_reload_txsp, &line_iter); + if (unlikely(reterr)) { + return reterr; + } + const uint32_t new_variant_idx = old_variant_uidx_to_new[variant_uidx]; + // UINT32_MAX serves as null value + if (new_variant_idx >= variant_ct) { + continue; + } + line_iter = NextTokenMultFar(line_iter, info_col_idx); + if (!line_iter) { + return kPglRetRewindFail; + } + char* info_end = CurTokenEnd(line_iter); + const uint32_t info_slen = info_end - line_iter; + const uint32_t batch_idx = (mult * ((new_variant_idx >> pre_shift) + incr)) >> post_shift; + // these are temporary files, so don't bother writing \r\n on Windows + info_write_tmps[batch_idx].cswritep = memcpyax(info_write_tmps[batch_idx].cswritep, line_iter, info_slen, '\n'); + if (unlikely(Cswrite(&(info_write_tmps[batch_idx].css), &(info_write_tmps[batch_idx].cswritep)))) { + goto PvarInfoSplitTmp_ret_WRITE_FAIL; + } + line_iter = info_end; + if (++old_variant_idx == variant_ct) { + break; + } + } + for (uint32_t batch_idx = 0; batch_idx != batch_ct; ++batch_idx) { + if (unlikely(CswriteCloseNull(&(info_write_tmps[batch_idx].css), info_write_tmps[batch_idx].cswritep))) { + goto PvarInfoSplitTmp_ret_WRITE_FAIL; + } + } + } + while (0) { + PvarInfoSplitTmp_ret_NOMEM: + reterr = kPglRetNomem; + break; + PvarInfoSplitTmp_ret_WRITE_FAIL: + reterr = kPglRetWriteFail; + break; + } + PvarInfoSplitTmp_ret_1: + if (info_write_tmps) { + for (uint32_t batch_idx = 0; batch_idx != batch_ct; ++batch_idx) { + CswriteCloseCond(&(info_write_tmps[batch_idx].css), info_write_tmps[batch_idx].cswritep); + } + } + // caller expected to reset wkspace + return reterr; +} + +PglErr PvarInfoReloadInterval(const uint32_t* new_variant_idx_to_old, uint32_t variant_idx_start, uint32_t variant_idx_end, TextStream* pvar_reload_txsp, char** pvar_info_strs, uint32_t* variant_order_in_shard) { + // We assume the batch size was chosen such that there's no risk of + // scribbling past g_bigstack_end (barring pathological cases like another + // process modifying the temporary file). + // We assume no more dynamic allocations are needed after this; otherwise, + // str_store_iter should be returned. + uint64_t* old_variant_idx_map = R_CAST(uint64_t*, g_bigstack_base); + const uintptr_t shard_variant_ct = variant_idx_end - variant_idx_start; + const uint32_t* shard_new_variant_idx_to_old = &(new_variant_idx_to_old[variant_idx_start]); + for (uintptr_t shard_new_variant_idx = 0; shard_new_variant_idx != shard_variant_ct; ++shard_new_variant_idx) { + const uint64_t old_variant_uidx = shard_new_variant_idx_to_old[shard_new_variant_idx]; + old_variant_idx_map[shard_new_variant_idx] = (old_variant_uidx << 32) | shard_new_variant_idx; + } + + STD_SORT(shard_variant_ct, u64cmp, old_variant_idx_map); + for (uintptr_t shard_old_variant_idx = 0; shard_old_variant_idx != shard_variant_ct; ++shard_old_variant_idx) { + variant_order_in_shard[shard_old_variant_idx] = S_CAST(uint32_t, old_variant_idx_map[shard_old_variant_idx]); + } + // now we can scribble over old_variant_idx_map + char* str_store_iter = R_CAST(char*, g_bigstack_base); + char* line_iter; + for (uintptr_t shard_old_variant_idx = 0; shard_old_variant_idx != shard_variant_ct; ++shard_old_variant_idx) { + PglErr reterr = TextNextLine(pvar_reload_txsp, &line_iter); + if (unlikely(reterr)) { + return reterr; + } + char* line_end = TextLineEnd(pvar_reload_txsp); + pvar_info_strs[variant_order_in_shard[shard_old_variant_idx]] = str_store_iter; + str_store_iter = memcpya(str_store_iter, line_iter, line_end - line_iter); + } return kPglRetSuccess; } @@ -8332,7 +8455,7 @@ PglErr WritePvarResortedInterval(const ChrInfo* write_cip, const uint32_t* varia // allocation of buffers, and generating the header line occurs directly in // this function, while loading the next pvar_info_strs batch and writing the // next .pvar line batch are one level down. -PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const uint32_t* new_variant_idx_to_old, const uint32_t* contig_lens, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, PvarPsamFlags pvar_psam_flags, char output_missing_geno_char, uint32_t thread_ct, char* xheader) { +PglErr WritePvarResorted(const uintptr_t* variant_include, const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const uint32_t* new_variant_idx_to_old, const uint32_t* contig_lens, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, PvarPsamFlags pvar_psam_flags, char output_missing_geno_char, uint32_t thread_ct, char* xheader, char* outname, char* outname_end) { unsigned char* bigstack_mark = g_bigstack_base; char* cswritep = nullptr; PglErr reterr = kPglRetSuccess; @@ -8434,19 +8557,31 @@ PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, } AppendBinaryEoln(&cswritep); - uint32_t* old_variant_uidx_to_new = nullptr; char** pvar_info_strs = nullptr; + uint32_t* variant_order_in_shard = nullptr; + const uint32_t info_reload_blen = info_reload_slen + 1; uint32_t batch_size = variant_ct; uint32_t batch_ct = 1; + // Original algorithm for handling an INFO column too large to fit in + // memory was to re-iterate through the .pvar/VCF many times, scraping only + // the INFO entries needed for the current batch each time. + // This was done because it reused existing INFO-reloading code, but + // unfortunately it has a quadratic cost, and this matters since huge INFO + // columns are pretty common (see e.g. gnomAD). So it's time to rework + // this. + // New approach: + // - If, knowing info_reload_slen, we know all relevant INFO strings will + // fit in 3/4 of remaining memory, fill pvar_info_strs upfront. + // - Otherwise, scrape the INFO strings in order and append them to + // compressed temporary files, each sized to fit in remaining memory + // after decompression. (If more than 240 shards are required, then we + // shard recursively to avoid having too many files open at once.) Then + // process the shards in order. if (pvar_info_reload) { - if (unlikely(bigstack_alloc_u32(raw_variant_ct, &old_variant_uidx_to_new))) { - goto WritePvarResorted_ret_NOMEM; - } - SetAllU32Arr(raw_variant_ct, old_variant_uidx_to_new); - for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) { - const uint32_t old_variant_uidx = new_variant_idx_to_old[variant_idx]; - old_variant_uidx_to_new[old_variant_uidx] = variant_idx; - } + uintptr_t bytes_left = bigstack_left(); + // 1. array of uint64s, old_uidx in high bits, new_idx in low + // 2. sort this + // 3. extract new_idx sequence uint32_t decompress_thread_ct = 1; if (!output_zst) { @@ -8455,19 +8590,65 @@ PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, decompress_thread_ct = 1; } } - reterr = SizeAndInitTextStream(pvar_info_reload, bigstack_left() / 4, decompress_thread_ct, &pvar_reload_txs); + unsigned char* bigstack_mark2 = g_bigstack_base; + // Long line-length limit, because with e.g. "plink2 --vcf + // --make-just-pvar", the VCF is simply reinterpreted as a .pvar even if + // it's multisample. + reterr = SizeAndInitTextStream(pvar_info_reload, bytes_left / 4, decompress_thread_ct, &pvar_reload_txs); if (unlikely(reterr)) { goto WritePvarResorted_ret_TSTREAM_FAIL; } - // subtract kCacheline to allow for rounding - uintptr_t bytes_left = bigstack_left() - kCacheline; - uint32_t single_variant_byte_ct = info_reload_slen + 1 + sizeof(intptr_t); - if (variant_ct * single_variant_byte_ct > bytes_left) { - batch_size = bytes_left / single_variant_byte_ct; - batch_ct = 1 + (variant_ct - 1) / batch_size; + uint32_t* old_variant_uidx_to_new = S_CAST(uint32_t*, bigstack_alloc_raw_rd(raw_variant_ct * sizeof(int32_t))); + SetAllU32Arr(raw_variant_ct, old_variant_uidx_to_new); + for (uint32_t new_variant_idx = 0; new_variant_idx != variant_ct; ++new_variant_idx) { + const uint32_t old_variant_uidx = new_variant_idx_to_old[new_variant_idx]; + old_variant_uidx_to_new[old_variant_uidx] = new_variant_idx; + } + + const uintptr_t no_reload_bytes_left = bigstack_left(); + uint32_t single_variant_byte_ct = info_reload_blen + sizeof(intptr_t); + // bugfix (2 Oct 2023): left term needs to be uintptr_t, not uint32_t + if (S_CAST(uintptr_t, variant_ct) * single_variant_byte_ct + kCacheline <= no_reload_bytes_left) { + pvar_info_strs = S_CAST(char**, bigstack_alloc_raw_rd(batch_size * sizeof(intptr_t))); + reterr = PvarInfoLoadAll(old_variant_uidx_to_new, variant_ct, &pvar_reload_txs, pvar_info_strs); + if (unlikely(reterr)) { + goto WritePvarResorted_ret_TSTREAM_FAIL; + } + } else { + if (info_reload_blen < sizeof(int64_t) + sizeof(int32_t)) { + single_variant_byte_ct = sizeof(int64_t) + sizeof(int32_t) + sizeof(intptr_t); + } + const uintptr_t expected_text_stream_alloc = MAXV(RoundUpPow2(info_reload_blen + kDecompressChunkSize, kCacheline), kTextStreamBlenFast + kDecompressChunkSize); + if (unlikely(bytes_left < expected_text_stream_alloc + 2 * kCacheline + single_variant_byte_ct)) { + goto WritePvarResorted_ret_NOMEM; + } + bytes_left -= expected_text_stream_alloc + 2 * kCacheline; + if (S_CAST(uintptr_t, variant_ct) * single_variant_byte_ct > bytes_left) { + batch_size = bytes_left / single_variant_byte_ct; + batch_ct = 1 + (variant_ct - 1) / batch_size; + } + const uint32_t usual_zst_level = g_zst_level; + g_zst_level = 1; + reterr = PvarInfoSplitTmp(old_variant_uidx_to_new, variant_ct, batch_size, batch_ct, info_reload_blen, &pvar_reload_txs, outname, outname_end); + if (unlikely(reterr)) { + goto WritePvarResorted_ret_1; + } + g_zst_level = usual_zst_level; + } + if (unlikely(CleanupTextStream2(pvar_info_reload, &pvar_reload_txs, &reterr))) { + goto WritePvarResorted_ret_1; + } + BigstackReset(bigstack_mark2); + if (!pvar_info_strs) { + snprintf(outname_end, kMaxOutfnameExtBlen, ".info.tmp.1.zst"); + reterr = InitTextStream(outname, MAXV(info_reload_blen, kTextStreamBlenFast), 1, &pvar_reload_txs); + if (unlikely(reterr)) { + goto WritePvarResorted_ret_TSTREAM_FAIL_2; + } + variant_order_in_shard = S_CAST(uint32_t*, bigstack_alloc_raw_rd(batch_size * sizeof(int32_t))); + pvar_info_strs = S_CAST(char**, bigstack_alloc_raw_rd(batch_size * sizeof(intptr_t))); } - pvar_info_strs = S_CAST(char**, bigstack_alloc_raw_rd(batch_size * sizeof(intptr_t))); } uint32_t variant_idx_start = 0; @@ -8489,11 +8670,23 @@ PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100; } uint32_t variant_idx_end = MINV(variant_idx_start + batch_size, variant_ct); - if (pvar_info_reload) { - reterr = PvarInfoReloadInterval(old_variant_uidx_to_new, variant_idx_start, variant_idx_end, &pvar_reload_txs, pvar_info_strs); + if (pvar_info_reload && variant_order_in_shard) { + reterr = PvarInfoReloadInterval(new_variant_idx_to_old, variant_idx_start, variant_idx_end, &pvar_reload_txs, pvar_info_strs, variant_order_in_shard); if (unlikely(reterr)) { goto WritePvarResorted_ret_TSTREAM_FAIL; } + const uint32_t fname_slen = S_CAST(uintptr_t, outname_end - outname) + strlen(outname_end); + memcpy(g_textbuf, outname, fname_slen + 1); + if (batch_idx + 1 < batch_ct) { + char* write_iter = strcpya(outname_end, ".info.tmp."); + write_iter = u32toa(batch_idx + 2, write_iter); + strcpy_k(write_iter, ".zst"); + TextRetarget(outname, &pvar_reload_txs); + } + if (unlikely(unlink(g_textbuf))) { + logerrprintfww("Error: Failed to delete %s .\n", g_textbuf); + goto WritePvarResorted_ret_WRITE_FAIL; + } } reterr = WritePvarResortedInterval(write_cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, qual_present, quals, filter_present, filter_npass, filter_storage, nonref_flags, variant_cms, new_variant_idx_to_old, variant_idx_start, variant_idx_end, info_pr_flag_present, write_qual, write_filter, write_info, all_nonref, write_cm, output_missing_geno_char, pvar_info_strs, &css, &cswritep, &chr_fo_idx, &chr_end, &chr_buf_blen, chr_buf); if (unlikely(reterr)) { @@ -8517,6 +8710,9 @@ PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, WritePvarResorted_ret_TSTREAM_FAIL: TextStreamErrPrint(pvar_info_reload, &pvar_reload_txs); break; + WritePvarResorted_ret_TSTREAM_FAIL_2: + TextStreamErrPrint(outname, &pvar_reload_txs); + break; WritePvarResorted_ret_WRITE_FAIL: reterr = kPglRetWriteFail; break; @@ -8759,7 +8955,7 @@ PglErr MakePlink2Vsort(const uintptr_t* sample_include, const PedigreeIdInfo* pi if (!PgrGetNonrefFlags(simple_pgrp)) { nonref_flags_storage = (PgrGetGflags(simple_pgrp) & kfPgenGlobalAllNonref)? 2 : 1; } - reterr = WritePvarResorted(outname, variant_include, &write_chr_info, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, PgrGetNonrefFlags(simple_pgrp), pvar_info_reload, variant_cms, new_variant_idx_to_old, contig_lens, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_flags, output_missing_geno_char, max_thread_ct, xheader); + reterr = WritePvarResorted(variant_include, &write_chr_info, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, PgrGetNonrefFlags(simple_pgrp), pvar_info_reload, variant_cms, new_variant_idx_to_old, contig_lens, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_flags, output_missing_geno_char, max_thread_ct, xheader, outname, outname_end); if (unlikely(reterr)) { goto MakePlink2Vsort_ret_1; } diff --git a/2.0/plink2_glm.cc b/2.0/plink2_glm.cc index 983a5b18..f790b57f 100644 --- a/2.0/plink2_glm.cc +++ b/2.0/plink2_glm.cc @@ -2894,7 +2894,9 @@ PglErr GlmMain(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, c if (raw_parameter_subset) { memcpy(joint_test_params_buf, common.joint_test_params_x, biallelic_raw_predictor_ctl * sizeof(intptr_t)); ZeroWArr(biallelic_raw_predictor_ctl, common.joint_test_params_x); - CopyBitarrSubset(joint_test_params_buf, common.parameter_subset, biallelic_predictor_ct_x, common.joint_test_params_x); + // bugfix (2 Oct 2023): parameter_subset -> + // parameter_subset_x + CopyBitarrSubset(joint_test_params_buf, common.parameter_subset_x, biallelic_predictor_ct_x, common.joint_test_params_x); } } if (sample_ct_x <= biallelic_predictor_ct_x) { @@ -2984,7 +2986,7 @@ PglErr GlmMain(const uintptr_t* orig_sample_include, const SampleIdInfo* siip, c if (raw_parameter_subset) { memcpy(joint_test_params_buf, common.joint_test_params_y, biallelic_raw_predictor_ctl * sizeof(intptr_t)); ZeroWArr(biallelic_raw_predictor_ctl, common.joint_test_params_y); - CopyBitarrSubset(joint_test_params_buf, common.parameter_subset, biallelic_predictor_ct_y, common.joint_test_params_y); + CopyBitarrSubset(joint_test_params_buf, common.parameter_subset_y, biallelic_predictor_ct_y, common.joint_test_params_y); } } if (sample_ct_y <= biallelic_predictor_ct_y) { diff --git a/2.0/plink2_import.cc b/2.0/plink2_import.cc index 61ef0e2d..7bd149e5 100644 --- a/2.0/plink2_import.cc +++ b/2.0/plink2_import.cc @@ -10515,10 +10515,10 @@ PglErr OxGenToPgen(const char* genname, const char* samplename, const char* cons const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct); const uint32_t sample_ctl = BitCtToWordCt(sample_ct); uintptr_t* genovec; - uintptr_t* dosage_present; + Halfword* dosage_present_hwarr; // if we weren't using bigstack_alloc, this would need to be sample_ctaw2 if (unlikely(bigstack_alloc_w(sample_ctl2, &genovec) || - bigstack_alloc_w(sample_ctl, &dosage_present))) { + bigstack_alloc_hw(2 * sample_ctl, &dosage_present_hwarr))) { goto OxGenToPgen_ret_NOMEM; } Dosage* dosage_main = nullptr; @@ -10639,7 +10639,7 @@ PglErr OxGenToPgen(const char* genname, const char* samplename, const char* cons Bgen11DosageImportUpdate(dosage_int_sum_thresh, import_dosage_certainty_int, hard_call_halfdist, dosage_erase_halfdist, sample_idx_lowbits, dosage_int0, dosage_int1, dosage_int2, &genovec_word, &dosage_present_hw, &dosage_main_iter); } genovec[widx] = genovec_word; - R_CAST(Halfword*, dosage_present)[widx] = dosage_present_hw; + dosage_present_hwarr[widx] = dosage_present_hw; } if (prov_ref_allele_second) { GenovecInvertUnsafe(sample_ct, genovec); @@ -10650,7 +10650,8 @@ PglErr OxGenToPgen(const char* genname, const char* samplename, const char* cons if (prov_ref_allele_second) { BiallelicDosage16Invert(dosage_ct, dosage_main); } - reterr = SpgwAppendBiallelicGenovecDosage16(genovec, dosage_present, dosage_main, dosage_ct, &spgw); + uintptr_t* __attribute__((may_alias)) dosage_present_warr = R_CAST(uintptr_t*, dosage_present_hwarr); + reterr = SpgwAppendBiallelicGenovecDosage16(genovec, dosage_present_warr, dosage_main, dosage_ct, &spgw); if (unlikely(reterr)) { goto OxGenToPgen_ret_1; } @@ -10874,7 +10875,7 @@ THREAD_FUNC_DECL Bgen11GenoToPgenThread(void* raw_arg) { unsigned char* compressed_geno_iter = bicp->compressed_geno_starts[parity][vidx]; uintptr_t* write_genovec_iter = &(ctx->write_genovecs[parity][vidx * sample_ctaw2]); uint32_t* write_dosage_ct_iter = &(ctx->write_dosage_cts[parity][vidx]); - uintptr_t* write_dosage_present_iter = &(ctx->write_dosage_presents[parity][vidx * sample_ctaw]); + Halfword* write_dosage_present_iter = DowncastWToHW(&(ctx->write_dosage_presents[parity][vidx * sample_ctaw])); Dosage* write_dosage_main_iter = &(ctx->write_dosage_mains[parity][vidx * sample_ct]); uint16_t* bgen_probs = bgen_geno_buf; for (; vidx != vidx_end; ++vidx) { @@ -10934,7 +10935,7 @@ THREAD_FUNC_DECL Bgen11GenoToPgenThread(void* raw_arg) { *cur_dosage_main_iter++ = write_dosage_int; } write_genovec_iter[widx] = genovec_word; - R_CAST(Halfword*, write_dosage_present_iter)[widx] = dosage_present_hw; + write_dosage_present_iter[widx] = dosage_present_hw; } const uint32_t dosage_ct = cur_dosage_main_iter - write_dosage_main_iter; if (prov_ref_allele_second) { @@ -10946,7 +10947,7 @@ THREAD_FUNC_DECL Bgen11GenoToPgenThread(void* raw_arg) { } *write_dosage_ct_iter++ = dosage_ct; write_genovec_iter = &(write_genovec_iter[sample_ctaw2]); - write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]); + write_dosage_present_iter = &(write_dosage_present_iter[2 * sample_ctaw]); write_dosage_main_iter = &(write_dosage_main_iter[sample_ct]); } if (unlikely(vidx != vidx_end)) { @@ -11700,11 +11701,11 @@ THREAD_FUNC_DECL Bgen13GenoToPgenThread(void* raw_arg) { AlleleCode* patch_01_vals = nullptr; uintptr_t* patch_10_set = nullptr; AlleleCode* patch_10_vals = nullptr; - uintptr_t* phasepresent = nullptr; - uintptr_t* phaseinfo = nullptr; - uintptr_t* dosage_present = nullptr; + uintptr_t* __attribute__((may_alias)) phasepresent = nullptr; + uintptr_t* __attribute__((may_alias)) phaseinfo = nullptr; + uintptr_t* __attribute__((may_alias)) dosage_present = nullptr; Dosage* dosage_main = nullptr; - uintptr_t* dphase_present = nullptr; + uintptr_t* __attribute__((may_alias)) dphase_present = nullptr; SDosage* dphase_delta = nullptr; uint32_t cur_allele_ct = 2; uint32_t parity = 0; @@ -12122,7 +12123,8 @@ THREAD_FUNC_DECL Bgen13GenoToPgenThread(void* raw_arg) { } } if (!(sample_ctl2_m1 % 2)) { - // do we actually need this? well, play it safe for now + // needed for dosage_present. don't think it's needed for + // dphase_present, but play that safe for now R_CAST(Halfword*, dosage_present)[sample_ctl2_m1 + 1] = 0; R_CAST(Halfword*, dphase_present)[sample_ctl2_m1 + 1] = 0; } @@ -14403,9 +14405,9 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch } } uintptr_t* genovec; - uintptr_t* phaseinfo; + Halfword* phaseinfo_hwarr; if (unlikely(bigstack_alloc_w(sample_ctl2, &genovec) || - bigstack_alloc_w(sample_ctl, &phaseinfo))) { + bigstack_alloc_hw(2 * sample_ctl, &phaseinfo_hwarr))) { goto OxHapslegendToPgen_ret_NOMEM; } goto OxHapslegendToPgen_first_line; @@ -14536,7 +14538,6 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch const VecU16 all0 = vecu16_set1(0x2030); const VecU16 all1 = vecu16_set1(0x2031); const uint32_t fullword_ct = sample_ct / kBitsPerWordD2; - Halfword* phaseinfo_alias = R_CAST(Halfword*, phaseinfo); for (uint32_t widx = 0; widx != fullword_ct; ++widx) { uintptr_t geno_first = 0; for (uint32_t uii = 0; uii != 2; ++uii) { @@ -14575,7 +14576,7 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch phaseinfo_hw = geno_first & (~geno_second); } phaseinfo_hw = PackWordToHalfword(phaseinfo_hw); - phaseinfo_alias[widx] = phaseinfo_hw; + phaseinfo_hwarr[widx] = phaseinfo_hw; } const uint32_t remainder = sample_ct % kBitsPerWordD2; if (remainder) { @@ -14600,7 +14601,7 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch } genovec[fullword_ct] = genovec_word; genovec_word_or |= genovec_word; - phaseinfo_alias[fullword_ct] = phaseinfo_hw; + phaseinfo_hwarr[fullword_ct] = phaseinfo_hw; } #else // !USE_SSE2 const unsigned char* linebuf_iter_uc = R_CAST(const unsigned char*, linebuf_iter); @@ -14637,7 +14638,7 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch } genovec[widx] = genovec_word; genovec_word_or |= genovec_word; - DowncastWToHW(phaseinfo)[widx] = phaseinfo_hw; + phaseinfo_hwarr[widx] = phaseinfo_hw; } #endif // !USE_SSE2 } else { @@ -14693,7 +14694,7 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch } genovec[widx] = genovec_word; genovec_word_or |= genovec_word; - R_CAST(Halfword*, phaseinfo)[widx] = phaseinfo_hw; + phaseinfo_hwarr[widx] = phaseinfo_hw; } haps_line_iter = AdvPastDelim(linebuf_iter, '\n'); } @@ -14702,7 +14703,8 @@ PglErr OxHapslegendToPgen(const char* hapsname, const char* legendname, const ch ZeroTrailingNyps(sample_ct, genovec); } if (genovec_word_or & kMask5555) { - if (unlikely(SpgwAppendBiallelicGenovecHphase(genovec, nullptr, phaseinfo, &spgw))) { + uintptr_t* __attribute__((may_alias)) phaseinfo_warr = R_CAST(uintptr_t*, phaseinfo_hwarr); + if (unlikely(SpgwAppendBiallelicGenovecHphase(genovec, nullptr, phaseinfo_warr, &spgw))) { goto OxHapslegendToPgen_ret_WRITE_FAIL; } } else { @@ -15465,9 +15467,9 @@ PglErr Plink1DosageToPgen(const char* dosagename, const char* famname, const cha const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct); const uint32_t sample_ctl = BitCtToWordCt(sample_ct); uintptr_t* genovec; - uintptr_t* dosage_present; + Halfword* dosage_present_hwarr; if (unlikely(bigstack_alloc_w(sample_ctl2, &genovec) || - bigstack_alloc_w(sample_ctl, &dosage_present))) { + bigstack_alloc_hw(2 * sample_ctl, &dosage_present_hwarr))) { goto Plink1DosageToPgen_ret_NOMEM; } Dosage* dosage_main = nullptr; @@ -15631,7 +15633,7 @@ PglErr Plink1DosageToPgen(const char* dosagename, const char* famname, const cha } } genovec[widx] = genovec_word; - R_CAST(Halfword*, dosage_present)[widx] = dosage_present_hw; + dosage_present_hwarr[widx] = dosage_present_hw; } if (!prov_ref_allele_second) { GenovecInvertUnsafe(sample_ct, genovec); @@ -15642,7 +15644,8 @@ PglErr Plink1DosageToPgen(const char* dosagename, const char* famname, const cha if (!prov_ref_allele_second) { BiallelicDosage16Invert(dosage_ct, dosage_main); } - reterr = SpgwAppendBiallelicGenovecDosage16(genovec, dosage_present, dosage_main, dosage_ct, &spgw); + uintptr_t* __attribute__((may_alias)) dosage_present_warr = R_CAST(uintptr_t*, dosage_present_hwarr); + reterr = SpgwAppendBiallelicGenovecDosage16(genovec, dosage_present_warr, dosage_main, dosage_ct, &spgw); if (unlikely(reterr)) { goto Plink1DosageToPgen_ret_1; } @@ -15818,13 +15821,13 @@ THREAD_FUNC_DECL GenerateDummyThread(void* raw_arg) { uint32_t vidx = (tidx * cur_block_write_ct) / calc_thread_ct; const uint32_t vidx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct; uintptr_t* write_genovec_iter = &(ctx->write_genovecs[parity][vidx * sample_ctaw2]); - uintptr_t* write_phasepresent_iter = &(ctx->write_phasepresents[parity][vidx * sample_ctaw]); - uintptr_t* write_phaseinfo_iter = &(ctx->write_phaseinfos[parity][vidx * sample_ctaw]); + Halfword* write_phasepresent_iter = DowncastWToHW(&(ctx->write_phasepresents[parity][vidx * sample_ctaw])); + Halfword* write_phaseinfo_iter = DowncastWToHW(&(ctx->write_phaseinfos[parity][vidx * sample_ctaw])); uint32_t* write_dosage_ct_iter = &(ctx->write_dosage_cts[parity][vidx]); - uintptr_t* write_dosage_present_iter = &(ctx->write_dosage_presents[parity][vidx * sample_ctaw]); + Halfword* write_dosage_present_iter = DowncastWToHW(&(ctx->write_dosage_presents[parity][vidx * sample_ctaw])); Dosage* write_dosage_main_iter = &(ctx->write_dosage_mains[parity][vidx * sample_ct]); uint32_t* write_dphase_ct_iter = &(ctx->write_dphase_cts[parity][vidx]); - uintptr_t* write_dphase_present_iter = &(ctx->write_dphase_presents[parity][vidx * sample_ctaw]); + Halfword* write_dphase_present_iter = DowncastWToHW(&(ctx->write_dphase_presents[parity][vidx * sample_ctaw])); SDosage* write_dphase_delta_iter = &(ctx->write_dphase_deltas[parity][vidx * sample_ct]); uintptr_t* prev_genovec = nullptr; for (; vidx != vidx_end; ++vidx) { @@ -16068,25 +16071,29 @@ THREAD_FUNC_DECL GenerateDummyThread(void* raw_arg) { write_genovec_iter[widx] = genovec_word; const uint32_t cur_hets = Pack01ToHalfword(genovec_word); const uint32_t phasepresent_hw = phasepresent_possible_hw & cur_hets; - R_CAST(Halfword*, write_phasepresent_iter)[widx] = phasepresent_hw; - R_CAST(Halfword*, write_phaseinfo_iter)[widx] = phaseinfo_hw & phasepresent_hw; - R_CAST(Halfword*, write_dosage_present_iter)[widx] = dosage_present_hw; - R_CAST(Halfword*, write_dphase_present_iter)[widx] = dphase_present_hw; + write_phasepresent_iter[widx] = phasepresent_hw; + write_phaseinfo_iter[widx] = phaseinfo_hw & phasepresent_hw; + write_dosage_present_iter[widx] = dosage_present_hw; + write_dphase_present_iter[widx] = dphase_present_hw; } ZeroTrailingNyps(sample_ct, write_genovec_iter); - ZeroTrailingBits(sample_ct, write_phasepresent_iter); - ZeroTrailingBits(sample_ct, write_phaseinfo_iter); + uintptr_t* __attribute__((may_alias)) write_phasepresent_warr = R_CAST(uintptr_t*, write_phasepresent_iter); + uintptr_t* __attribute__((may_alias)) write_phaseinfo_warr = R_CAST(uintptr_t*, write_phaseinfo_iter); + uintptr_t* __attribute__((may_alias)) write_dosage_present_warr = R_CAST(uintptr_t*, write_dosage_present_iter); + ZeroTrailingBits(sample_ct, write_phasepresent_warr); + ZeroTrailingBits(sample_ct, write_phaseinfo_warr); + ZeroTrailingBits(sample_ct, write_dosage_present_warr); const uint32_t dosage_ct = cur_dosage_main_iter - write_dosage_main_iter; *write_dosage_ct_iter++ = dosage_ct; const uint32_t dphase_ct = cur_dphase_delta_iter - write_dphase_delta_iter; *write_dphase_ct_iter++ = dphase_ct; prev_genovec = write_genovec_iter; write_genovec_iter = &(write_genovec_iter[sample_ctaw2]); - write_phasepresent_iter = &(write_phasepresent_iter[sample_ctaw]); - write_phaseinfo_iter = &(write_phaseinfo_iter[sample_ctaw]); - write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]); + write_phasepresent_iter = &(write_phasepresent_iter[2 * sample_ctaw]); + write_phaseinfo_iter = &(write_phaseinfo_iter[2 * sample_ctaw]); + write_dosage_present_iter = &(write_dosage_present_iter[2 * sample_ctaw]); write_dosage_main_iter = &(write_dosage_main_iter[sample_ct]); - write_dphase_present_iter = &(write_dphase_present_iter[sample_ctaw]); + write_dphase_present_iter = &(write_dphase_present_iter[2 * sample_ctaw]); write_dphase_delta_iter = &(write_dphase_delta_iter[sample_ct]); } parity = 1 - parity; diff --git a/2.0/plink2_ld.cc b/2.0/plink2_ld.cc index 202e3565..04e9abe0 100644 --- a/2.0/plink2_ld.cc +++ b/2.0/plink2_ld.cc @@ -2673,17 +2673,16 @@ void GenoarrSplit12Nm(const uintptr_t* __restrict genoarr, uint32_t sample_ct, u two_bitarr_alias[widx] = high_halfword & (~low_halfword); nm_bitarr_alias[widx] = ~(low_halfword & high_halfword); } - const uint32_t sample_ct_rem = sample_ct % kBitsPerWordD2; + const uint32_t sample_ct_rem = sample_ct % kBitsPerWord; if (sample_ct_rem) { - const Halfword trailing_mask = (1U << sample_ct_rem) - 1; - one_bitarr_alias[sample_ctl2 - 1] &= trailing_mask; - two_bitarr_alias[sample_ctl2 - 1] &= trailing_mask; - nm_bitarr_alias[sample_ctl2 - 1] &= trailing_mask; - } - if (sample_ctl2 % 2) { - one_bitarr_alias[sample_ctl2] = 0; - two_bitarr_alias[sample_ctl2] = 0; - nm_bitarr_alias[sample_ctl2] = 0; + const uint32_t last_widx = sample_ct / kBitsPerWord; + const uintptr_t trailing_mask = (k1LU << sample_ct_rem) - 1; + uintptr_t* __attribute__((may_alias)) one_bitarr_last = &(one_bitarr[last_widx]); + uintptr_t* __attribute__((may_alias)) two_bitarr_last = &(two_bitarr[last_widx]); + uintptr_t* __attribute__((may_alias)) nm_bitarr_last = &(nm_bitarr[last_widx]); + *one_bitarr_last &= trailing_mask; + *two_bitarr_last &= trailing_mask; + *nm_bitarr_last &= trailing_mask; } } #else @@ -2703,17 +2702,16 @@ void GenoarrSplit12Nm(const uintptr_t* __restrict genoarr, uint32_t sample_ct, u nm_bitarr_alias[widx] = ~(low_halfword & high_halfword); } - const uint32_t sample_ct_rem = sample_ct % kBitsPerWordD2; + const uint32_t sample_ct_rem = sample_ct % kBitsPerWord; if (sample_ct_rem) { - const Halfword trailing_mask = (1U << sample_ct_rem) - 1; - one_bitarr_alias[sample_ctl2 - 1] &= trailing_mask; - two_bitarr_alias[sample_ctl2 - 1] &= trailing_mask; - nm_bitarr_alias[sample_ctl2 - 1] &= trailing_mask; - } - if (sample_ctl2 % 2) { - one_bitarr_alias[sample_ctl2] = 0; - two_bitarr_alias[sample_ctl2] = 0; - nm_bitarr_alias[sample_ctl2] = 0; + const uint32_t last_widx = sample_ct / kBitsPerWord; + const uintptr_t trailing_mask = (k1LU << sample_ct_rem) - 1; + uintptr_t* __attribute__((may_alias)) one_bitarr_last = &(one_bitarr[last_widx]); + uintptr_t* __attribute__((may_alias)) two_bitarr_last = &(two_bitarr[last_widx]); + uintptr_t* __attribute__((may_alias)) nm_bitarr_last = &(nm_bitarr[last_widx]); + *one_bitarr_last &= trailing_mask; + *two_bitarr_last &= trailing_mask; + *nm_bitarr_last &= trailing_mask; } } #endif diff --git a/2.0/plink2_merge.cc b/2.0/plink2_merge.cc index 42fd92b5..1b3b7852 100644 --- a/2.0/plink2_merge.cc +++ b/2.0/plink2_merge.cc @@ -5363,7 +5363,7 @@ PglErr MergePgenVariantNoTmpLocked(SamePosPvarRecord** same_id_records, const Al genovec[hwidx] = most_common_geno_word; } } else { - const Halfword* sample_span_hw = R_CAST(const Halfword*, sample_span); + const Halfword* sample_span_hw = DowncastKWToHW(sample_span); for (uint32_t hwidx = 0; hwidx != write_sample_ctl2; ++hwidx) { // Initialize each genotype outside the single source's sample-span // to missing (both bits set).