diff --git a/.github/workflows/miri.yml b/.github/workflows/miri.yml index 0b7548621..01b50015a 100644 --- a/.github/workflows/miri.yml +++ b/.github/workflows/miri.yml @@ -15,6 +15,8 @@ jobs: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4.2.2 + with: + submodules: recursive - uses: actions-rs/toolchain@v1.0.6 with: profile: minimal diff --git a/.github/workflows/semver.yml b/.github/workflows/semver.yml index 8b377f7a4..5d3f5c994 100644 --- a/.github/workflows/semver.yml +++ b/.github/workflows/semver.yml @@ -15,6 +15,8 @@ jobs: - stable steps: - uses: actions/checkout@v4.2.2 + with: + submodules: recursive - uses: dtolnay/rust-toolchain@v1 with: toolchain: ${{ matrix.rust }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9fb803212..d9090fd20 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -68,6 +68,8 @@ jobs: - stable steps: - uses: actions/checkout@v4.2.2 + with: + submodules: recursive - uses: dtolnay/rust-toolchain@v1 with: toolchain: ${{ matrix.rust }} @@ -86,6 +88,8 @@ jobs: - 1.71.0 steps: - uses: actions/checkout@v4.2.2 + with: + submodules: recursive - uses: dtolnay/rust-toolchain@v1 with: toolchain: ${{ matrix.rust }} @@ -102,6 +106,8 @@ jobs: - stable steps: - uses: actions/checkout@v4.2.2 + with: + submodules: recursive - uses: dtolnay/rust-toolchain@v1 with: toolchain: ${{ matrix.rust }} diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..880a98329 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "subprojects/tskit"] + path = subprojects/tskit + url = https://github.com/tskit-dev/tskit.git diff --git a/build.rs b/build.rs index 288535924..0f08670ed 100644 --- a/build.rs +++ b/build.rs @@ -5,17 +5,17 @@ fn main() { pkg_config::Config::new().atleast_version("1.2"); let src = [ - "subprojects/tskit/tskit/convert.c", - "subprojects/tskit/tskit/core.c", - "subprojects/tskit/tskit/genotypes.c", - "subprojects/tskit/tskit/haplotype_matching.c", - "subprojects/tskit/tskit/stats.c", - "subprojects/tskit/tskit/tables.c", - "subprojects/tskit/tskit/trees.c", + "subprojects/tskit/c/tskit/convert.c", + "subprojects/tskit/c/tskit/core.c", + "subprojects/tskit/c/tskit/genotypes.c", + "subprojects/tskit/c/tskit/haplotype_matching.c", + "subprojects/tskit/c/tskit/stats.c", + "subprojects/tskit/c/tskit/tables.c", + "subprojects/tskit/c/tskit/trees.c", "subprojects/kastore/kastore.c", ]; - let tskit_path = Path::new("subprojects/tskit/"); + let tskit_path = Path::new("subprojects/tskit/c"); let kastore_path = Path::new("subprojects/kastore/"); let mut builder = cc::Build::new(); let build = builder @@ -32,7 +32,7 @@ fn main() { // The input header we would like to generate // bindings for. .header("wrapper.h") - .clang_arg("-Isubprojects/tskit") + .clang_arg("-Isubprojects/tskit/c") .clang_arg("-Isubprojects/kastore") .allowlist_type("tsk.*") .allowlist_function("tsk.*") @@ -56,3 +56,4 @@ fn main() { .write_to_file(out_path.join("auto_bindings.rs")) .expect("Couldn't write bindings!"); } + diff --git a/subprojects/tskit b/subprojects/tskit new file mode 160000 index 000000000..4f532bd14 --- /dev/null +++ b/subprojects/tskit @@ -0,0 +1 @@ +Subproject commit 4f532bd14f1a07643d965f8bd98d322464d63269 diff --git a/subprojects/tskit/LICENSE b/subprojects/tskit/LICENSE deleted file mode 100644 index a397394a2..000000000 --- a/subprojects/tskit/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018-2019 Tskit Developers - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/subprojects/tskit/VERSION.txt b/subprojects/tskit/VERSION.txt deleted file mode 100644 index 8428158dc..000000000 --- a/subprojects/tskit/VERSION.txt +++ /dev/null @@ -1 +0,0 @@ -1.1.2 \ No newline at end of file diff --git a/subprojects/tskit/tskit.h b/subprojects/tskit/tskit.h deleted file mode 100644 index e55ffc664..000000000 --- a/subprojects/tskit/tskit.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019 Tskit Developers - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file tskit.h - * @brief Tskit API. - */ -#ifndef __TSKIT_H__ -#define __TSKIT_H__ - -#include -#include -#include -#include -#include -#include - -#endif diff --git a/subprojects/tskit/tskit/convert.c b/subprojects/tskit/tskit/convert.c deleted file mode 100644 index e100b3fea..000000000 --- a/subprojects/tskit/tskit/convert.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2018-2021 Tskit Developers - * Copyright (c) 2015-2017 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include - -/* ======================================================== * - * Newick output. - * ======================================================== */ - -/* This infrastructure is left-over from an earlier more complex version - * of this algorithm that worked over a tree sequence and cached the newick - * subtrees, updating according to diffs. It's unclear whether this complexity - * was of any real-world use, since newick output for large trees is pretty - * pointless. */ - -typedef struct { - unsigned int precision; - tsk_flags_t options; - char *newick; - tsk_id_t *traversal_stack; - const tsk_tree_t *tree; -} tsk_newick_converter_t; - -static int -tsk_newick_converter_run( - tsk_newick_converter_t *self, tsk_id_t root, size_t buffer_size, char *buffer) -{ - int ret = TSK_ERR_GENERIC; - const tsk_tree_t *tree = self->tree; - tsk_id_t *stack = self->traversal_stack; - const double *time = self->tree->tree_sequence->tables->nodes.time; - const tsk_flags_t *flags = self->tree->tree_sequence->tables->nodes.flags; - int stack_top = 0; - int label; - size_t s = 0; - int r; - tsk_id_t u, v, w, root_parent; - double branch_length; - bool ms_labels = self->options & TSK_NEWICK_LEGACY_MS_LABELS; - const char *label_format = ms_labels ? "%d" : "n%d"; - - if (root < 0 || root >= (tsk_id_t) self->tree->num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (buffer == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - root_parent = tree->parent[root]; - stack[0] = root; - u = root_parent; - while (stack_top >= 0) { - v = stack[stack_top]; - if (tree->left_child[v] != TSK_NULL && v != u) { - if (s >= buffer_size) { - ret = TSK_ERR_BUFFER_OVERFLOW; - goto out; - } - buffer[s] = '('; - s++; - for (w = tree->right_child[v]; w != TSK_NULL; w = tree->left_sib[w]) { - stack_top++; - stack[stack_top] = w; - } - } else { - u = tree->parent[v]; - stack_top--; - label = -1; - if (ms_labels) { - if (tree->left_child[v] == TSK_NULL) { - label = (int) v + 1; - } - } else if (flags[v] & TSK_NODE_IS_SAMPLE) { - label = (int) v; - } - if (label != -1) { - if (s >= buffer_size) { - ret = TSK_ERR_BUFFER_OVERFLOW; - goto out; - } - r = snprintf(buffer + s, buffer_size - s, label_format, label); - if (r < 0) { - ret = TSK_ERR_IO; - goto out; - } - s += (size_t) r; - if (s >= buffer_size) { - ret = TSK_ERR_BUFFER_OVERFLOW; - goto out; - } - } - if (u != root_parent) { - branch_length = (time[u] - time[v]); - r = snprintf(buffer + s, buffer_size - s, ":%.*f", (int) self->precision, - branch_length); - if (r < 0) { - ret = TSK_ERR_IO; - goto out; - } - s += (size_t) r; - if (s >= buffer_size) { - ret = TSK_ERR_BUFFER_OVERFLOW; - goto out; - } - if (v == tree->right_child[u]) { - buffer[s] = ')'; - } else { - buffer[s] = ','; - } - s++; - } - } - } - if ((s + 1) >= buffer_size) { - ret = TSK_ERR_BUFFER_OVERFLOW; - goto out; - } - buffer[s] = ';'; - buffer[s + 1] = '\0'; - ret = 0; -out: - return ret; -} - -static int -tsk_newick_converter_init(tsk_newick_converter_t *self, const tsk_tree_t *tree, - unsigned int precision, tsk_flags_t options) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_newick_converter_t)); - self->precision = precision; - self->options = options; - self->tree = tree; - self->traversal_stack - = tsk_malloc(tsk_tree_get_size_bound(tree) * sizeof(*self->traversal_stack)); - if (self->traversal_stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -static int -tsk_newick_converter_free(tsk_newick_converter_t *self) -{ - tsk_safe_free(self->traversal_stack); - return 0; -} - -int -tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, - tsk_flags_t options, size_t buffer_size, char *buffer) -{ - int ret = 0; - tsk_newick_converter_t nc; - - ret = tsk_newick_converter_init(&nc, tree, precision, options); - if (ret != 0) { - goto out; - } - ret = tsk_newick_converter_run(&nc, root, buffer_size, buffer); -out: - tsk_newick_converter_free(&nc); - return ret; -} diff --git a/subprojects/tskit/tskit/convert.h b/subprojects/tskit/tskit/convert.h deleted file mode 100644 index ee3de51ee..000000000 --- a/subprojects/tskit/tskit/convert.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2018-2021 Tskit Developers - * Copyright (c) 2015-2017 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TSK_CONVERT_H -#define TSK_CONVERT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -#define TSK_NEWICK_LEGACY_MS_LABELS (1 << 0) - -int tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, - tsk_flags_t options, size_t buffer_size, char *buffer); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/subprojects/tskit/tskit/core.c b/subprojects/tskit/tskit/core.c deleted file mode 100644 index b1ea25bad..000000000 --- a/subprojects/tskit/tskit/core.c +++ /dev/null @@ -1,1166 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * Copyright (c) 2015-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include - -#include -#include - -#define UUID_NUM_BYTES 16 - -#if defined(_WIN32) - -#include -#include - -static int TSK_WARN_UNUSED -get_random_bytes(uint8_t *buf) -{ - /* Based on CPython's code in bootstrap_hash.c */ - int ret = TSK_ERR_GENERATE_UUID; - HCRYPTPROV hCryptProv = (HCRYPTPROV) NULL; - - if (!CryptAcquireContext( - &hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) { - goto out; - } - if (!CryptGenRandom(hCryptProv, (DWORD) UUID_NUM_BYTES, buf)) { - goto out; - } - if (!CryptReleaseContext(hCryptProv, 0)) { - hCryptProv = (HCRYPTPROV) NULL; - goto out; - } - hCryptProv = (HCRYPTPROV) NULL; - ret = 0; -out: - if (hCryptProv != (HCRYPTPROV) NULL) { - CryptReleaseContext(hCryptProv, 0); - } - return ret; -} - -#else - -/* Assuming the existance of /dev/urandom on Unix platforms */ -static int TSK_WARN_UNUSED -get_random_bytes(uint8_t *buf) -{ - int ret = TSK_ERR_GENERATE_UUID; - FILE *f = fopen("/dev/urandom", "r"); - - if (f == NULL) { - goto out; - } - if (fread(buf, UUID_NUM_BYTES, 1, f) != 1) { - goto out; - } - if (fclose(f) != 0) { - goto out; - } - ret = 0; -out: - return ret; -} - -#endif - -/* Generate a new UUID4 using a system-generated source of randomness. - * Note that this function writes a NULL terminator to the end of this - * string, so that the total length of the buffer must be 37 bytes. - */ -int -tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) -{ - int ret = 0; - uint8_t buf[UUID_NUM_BYTES]; - const char *pattern - = "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x"; - - ret = get_random_bytes(buf); - if (ret != 0) { - goto out; - } - if (snprintf(dest, TSK_UUID_SIZE + 1, pattern, buf[0], buf[1], buf[2], buf[3], - buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12], - buf[13], buf[14], buf[15]) - < 0) { - ret = TSK_ERR_GENERATE_UUID; - goto out; - } -out: - return ret; -} -static const char * -tsk_strerror_internal(int err) -{ - const char *ret = "Unknown error"; - - switch (err) { - case 0: - ret = "Normal exit condition. This is not an error!"; - break; - - /* General errors */ - case TSK_ERR_GENERIC: - ret = "Generic error; please file a bug report. (TSK_ERR_GENERIC)"; - break; - case TSK_ERR_NO_MEMORY: - ret = "Out of memory. (TSK_ERR_NO_MEMORY)"; - break; - case TSK_ERR_IO: - if (errno != 0) { - ret = strerror(errno); - } else { - ret = "Unspecified IO error"; - } - break; - case TSK_ERR_BAD_PARAM_VALUE: - ret = "Bad parameter value provided. (TSK_ERR_BAD_PARAM_VALUE)"; - break; - case TSK_ERR_BUFFER_OVERFLOW: - ret = "Supplied buffer is too small. (TSK_ERR_BUFFER_OVERFLOW)"; - break; - case TSK_ERR_UNSUPPORTED_OPERATION: - ret = "Operation cannot be performed in current configuration. " - "(TSK_ERR_UNSUPPORTED_OPERATION)"; - break; - case TSK_ERR_GENERATE_UUID: - ret = "Error generating UUID. (TSK_ERR_GENERATE_UUID)"; - break; - case TSK_ERR_EOF: - ret = "End of file. (TSK_ERR_EOF)"; - break; - - /* File format errors */ - case TSK_ERR_FILE_FORMAT: - ret = "File format error. (TSK_ERR_FILE_FORMAT)"; - break; - case TSK_ERR_FILE_VERSION_TOO_OLD: - ret = "tskit file version too old. Please upgrade using the " - "'tskit upgrade' command. (TSK_ERR_FILE_VERSION_TOO_OLD)"; - break; - case TSK_ERR_FILE_VERSION_TOO_NEW: - ret = "tskit file version is too new for this instance. " - "Please upgrade tskit to the latest version. " - "(TSK_ERR_FILE_VERSION_TOO_NEW)"; - break; - case TSK_ERR_REQUIRED_COL_NOT_FOUND: - ret = "A required column was not found in the file. " - "(TSK_ERR_REQUIRED_COL_NOT_FOUND)"; - break; - case TSK_ERR_BOTH_COLUMNS_REQUIRED: - ret = "Both columns in a related pair must be provided. " - "(TSK_ERR_BOTH_COLUMNS_REQUIRED)"; - break; - case TSK_ERR_BAD_COLUMN_TYPE: - ret = "An incompatible type for a column was found in the file. " - "(TSK_ERR_BAD_COLUMN_TYPE)"; - break; - - /* Out of bounds errors */ - case TSK_ERR_BAD_OFFSET: - ret = "Bad offset provided in input array. (TSK_ERR_BAD_OFFSET)"; - break; - case TSK_ERR_NODE_OUT_OF_BOUNDS: - ret = "Node out of bounds. (TSK_ERR_NODE_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_EDGE_OUT_OF_BOUNDS: - ret = "Edge out of bounds. (TSK_ERR_EDGE_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_POPULATION_OUT_OF_BOUNDS: - ret = "Population out of bounds. (TSK_ERR_POPULATION_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_SITE_OUT_OF_BOUNDS: - ret = "Site out of bounds. (TSK_ERR_SITE_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_MUTATION_OUT_OF_BOUNDS: - ret = "Mutation out of bounds. (TSK_ERR_MUTATION_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_MIGRATION_OUT_OF_BOUNDS: - ret = "Migration out of bounds. (TSK_ERR_MIGRATION_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS: - ret = "Individual out of bounds. (TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_PROVENANCE_OUT_OF_BOUNDS: - ret = "Provenance out of bounds. (TSK_ERR_PROVENANCE_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_TIME_NONFINITE: - ret = "Times must be finite. (TSK_ERR_TIME_NONFINITE)"; - break; - case TSK_ERR_GENOME_COORDS_NONFINITE: - ret = "Genome coordinates must be finite numbers. " - "(TSK_ERR_GENOME_COORDS_NONFINITE)"; - break; - case TSK_ERR_SEEK_OUT_OF_BOUNDS: - ret = "Tree seek position out of bounds. (TSK_ERR_SEEK_OUT_OF_BOUNDS)"; - break; - case TSK_ERR_KEEP_ROWS_MAP_TO_DELETED: - ret = "One of the kept rows in the table refers to a deleted row. " - "(TSK_ERR_KEEP_ROWS_MAP_TO_DELETED)"; - break; - - /* Edge errors */ - case TSK_ERR_NULL_PARENT: - ret = "Edge in parent is null. (TSK_ERR_NULL_PARENT)"; - break; - case TSK_ERR_NULL_CHILD: - ret = "Edge in parent is null. (TSK_ERR_NULL_CHILD)"; - break; - case TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME: - ret = "Edges must be listed in (time[parent], child, left) order;" - " time[parent] order violated. (TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME)"; - break; - case TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS: - ret = "All edges for a given parent must be contiguous. " - "(TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS)"; - break; - case TSK_ERR_EDGES_NOT_SORTED_CHILD: - ret = "Edges must be listed in (time[parent], child, left) order;" - " child order violated. (TSK_ERR_EDGES_NOT_SORTED_CHILD)"; - break; - case TSK_ERR_EDGES_NOT_SORTED_LEFT: - ret = "Edges must be listed in (time[parent], child, left) order;" - " left order violated. (TSK_ERR_EDGES_NOT_SORTED_LEFT)"; - break; - case TSK_ERR_BAD_NODE_TIME_ORDERING: - ret = "time[parent] must be greater than time[child]. " - "(TSK_ERR_BAD_NODE_TIME_ORDERING)"; - break; - case TSK_ERR_BAD_EDGE_INTERVAL: - ret = "Bad edge interval where right <= left. (TSK_ERR_BAD_EDGE_INTERVAL)"; - break; - case TSK_ERR_DUPLICATE_EDGES: - ret = "Duplicate edges provided. (TSK_ERR_DUPLICATE_EDGES)"; - break; - case TSK_ERR_RIGHT_GREATER_SEQ_LENGTH: - ret = "Right coordinate > sequence length. " - "(TSK_ERR_RIGHT_GREATER_SEQ_LENGTH)"; - break; - case TSK_ERR_LEFT_LESS_ZERO: - ret = "Left coordinate must be >= 0. (TSK_ERR_LEFT_LESS_ZERO)"; - break; - case TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN: - ret = "Bad edges: contradictory children for a given parent over " - "an interval, or indexes need to be rebuilt. " - "(TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN)"; - break; - case TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA: - ret = "Can't squash, flush, simplify or link ancestors with edges that have " - "non-empty metadata. (TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA)"; - break; - - /* Site errors */ - case TSK_ERR_UNSORTED_SITES: - ret = "Sites must be provided in strictly increasing position order. " - "(TSK_ERR_UNSORTED_SITES)"; - break; - case TSK_ERR_DUPLICATE_SITE_POSITION: - ret = "Duplicate site positions. (TSK_ERR_DUPLICATE_SITE_POSITION)"; - break; - case TSK_ERR_BAD_SITE_POSITION: - ret = "Site positions must be between 0 and sequence_length. " - "(TSK_ERR_BAD_SITE_POSITION)"; - break; - - /* Mutation errors */ - case TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE: - ret = "Specified parent mutation is at a different site. " - "(TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE)"; - break; - case TSK_ERR_MUTATION_PARENT_EQUAL: - ret = "Parent mutation refers to itself. (TSK_ERR_MUTATION_PARENT_EQUAL)"; - break; - case TSK_ERR_MUTATION_PARENT_AFTER_CHILD: - ret = "Parent mutation ID must be < current ID. " - "(TSK_ERR_MUTATION_PARENT_AFTER_CHILD)"; - break; - case TSK_ERR_MUTATION_PARENT_INCONSISTENT: - ret = "Mutation parent references form a loop. " - "(TSK_ERR_MUTATION_PARENT_INCONSISTENT)"; - break; - case TSK_ERR_UNSORTED_MUTATIONS: - ret = "Mutations must be provided in non-decreasing site order and " - "non-increasing time order within each site. " - "(TSK_ERR_UNSORTED_MUTATIONS)"; - break; - case TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE: - ret = "A mutation's time must be >= the node time, or be marked as " - "'unknown'. (TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE)"; - break; - case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION: - ret = "A mutation's time must be <= the parent mutation time (if known), or " - "be marked as 'unknown'. " - "(TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION)"; - break; - case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE: - ret = "A mutation's time must be < the parent node of the edge on which it " - "occurs, or be marked as 'unknown'. " - "(TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE)"; - break; - case TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN: - ret = "Mutation times must either be all marked 'unknown', or all be known " - "values for any single site. " - "(TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN)"; - break; - - /* Migration errors */ - case TSK_ERR_UNSORTED_MIGRATIONS: - ret = "Migrations must be sorted by time. (TSK_ERR_UNSORTED_MIGRATIONS)"; - break; - - /* Sample errors */ - case TSK_ERR_DUPLICATE_SAMPLE: - ret = "Duplicate sample value. (TSK_ERR_DUPLICATE_SAMPLE)"; - break; - case TSK_ERR_BAD_SAMPLES: - ret = "The nodes provided are not samples. (TSK_ERR_BAD_SAMPLES)"; - break; - - /* Table errors */ - case TSK_ERR_BAD_TABLE_POSITION: - ret = "Bad table position provided to truncate/reset. " - "(TSK_ERR_BAD_TABLE_POSITION)"; - break; - case TSK_ERR_BAD_SEQUENCE_LENGTH: - ret = "Sequence length must be > 0. (TSK_ERR_BAD_SEQUENCE_LENGTH)"; - break; - case TSK_ERR_TABLES_NOT_INDEXED: - ret = "Table collection must be indexed. (TSK_ERR_TABLES_NOT_INDEXED)"; - break; - case TSK_ERR_TABLES_BAD_INDEXES: - ret = "Table collection indexes inconsistent: do they need to be rebuilt? " - "(TSK_ERR_TABLES_BAD_INDEXES)"; - break; - case TSK_ERR_TABLE_OVERFLOW: - ret = "Table too large; cannot allocate more than 2**31 rows. This error " - "is often caused by a lack of simplification when simulating. " - "(TSK_ERR_TABLE_OVERFLOW)"; - break; - case TSK_ERR_COLUMN_OVERFLOW: - ret = "Table column too large; cannot be more than 2**64 bytes. " - "(TSK_ERR_COLUMN_OVERFLOW)"; - break; - case TSK_ERR_TREE_OVERFLOW: - ret = "Too many trees; cannot be more than 2**31. (TSK_ERR_TREE_OVERFLOW)"; - break; - case TSK_ERR_METADATA_DISABLED: - ret = "Metadata is disabled for this table, so cannot be set. " - "(TSK_ERR_METADATA_DISABLED)"; - break; - - /* Limitations */ - case TSK_ERR_ONLY_INFINITE_SITES: - ret = "Only infinite sites mutations are supported for this operation, " - "i.e. at most a single mutation per site. " - "(TSK_ERR_ONLY_INFINITE_SITES)"; - break; - case TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED: - ret = "Migrations not currently supported by simplify. " - "(TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED)"; - break; - case TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED: - ret = "Migrations not currently supported by sort. " - "(TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED)"; - break; - case TSK_ERR_SORT_OFFSET_NOT_SUPPORTED: - ret = "Sort offsets for sites and mutations must be either 0 " - "or the length of the respective tables. Intermediate values " - "are not supported. (TSK_ERR_SORT_OFFSET_NOT_SUPPORTED)"; - break; - case TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED: - ret = "Only binary mutations are supported for this operation. " - "(TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED)"; - break; - case TSK_ERR_MIGRATIONS_NOT_SUPPORTED: - ret = "Migrations not currently supported by this operation. " - "(TSK_ERR_MIGRATIONS_NOT_SUPPORTED)"; - break; - case TSK_ERR_CANNOT_EXTEND_FROM_SELF: - ret = "Tables can only be extended using rows from a different table. " - "(TSK_ERR_CANNOT_EXTEND_FROM_SELF)"; - break; - case TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED: - ret = "Silent mutations not supported by this operation. " - "(TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED)"; - break; - case TSK_ERR_VARIANT_CANT_DECODE_COPY: - ret = "Can't decode a copy of a variant. (TSK_ERR_VARIANT_CANT_DECODE_COPY)"; - break; - case TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA: - ret = "A tree sequence can't take ownership of tables with " - "TSK_NO_EDGE_METADATA. (TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA)"; - break; - case TSK_ERR_UNDEFINED_NONBINARY: - ret = "Operation undefined for nonbinary trees. " - "(TSK_ERR_UNDEFINED_NONBINARY)"; - break; - case TSK_ERR_UNDEFINED_MULTIROOT: - ret = "Operation undefined for trees that are not singly-rooted. " - "(TSK_ERR_UNDEFINED_MULTIROOT)"; - break; - - /* Stats errors */ - case TSK_ERR_BAD_NUM_WINDOWS: - ret = "Must have at least one window, [0, L]. (TSK_ERR_BAD_NUM_WINDOWS)"; - break; - case TSK_ERR_BAD_WINDOWS: - ret = "Windows must be increasing list [0, ..., L]. (TSK_ERR_BAD_WINDOWS)"; - break; - case TSK_ERR_MULTIPLE_STAT_MODES: - ret = "Cannot specify more than one stats mode. " - "(TSK_ERR_MULTIPLE_STAT_MODES)"; - break; - case TSK_ERR_BAD_STATE_DIMS: - ret = "Must have state dimension >= 1. (TSK_ERR_BAD_STATE_DIMS)"; - break; - case TSK_ERR_BAD_RESULT_DIMS: - ret = "Must have result dimension >= 1. (TSK_ERR_BAD_RESULT_DIMS)"; - break; - case TSK_ERR_INSUFFICIENT_SAMPLE_SETS: - ret = "Insufficient sample sets provided. " - "(TSK_ERR_INSUFFICIENT_SAMPLE_SETS)"; - break; - case TSK_ERR_INSUFFICIENT_INDEX_TUPLES: - ret = "Insufficient sample set index tuples provided. " - "(TSK_ERR_INSUFFICIENT_INDEX_TUPLES)"; - break; - case TSK_ERR_BAD_SAMPLE_SET_INDEX: - ret = "Sample set index out of bounds. (TSK_ERR_BAD_SAMPLE_SET_INDEX)"; - break; - case TSK_ERR_EMPTY_SAMPLE_SET: - ret = "Samples cannot be empty. (TSK_ERR_EMPTY_SAMPLE_SET)"; - break; - case TSK_ERR_UNSUPPORTED_STAT_MODE: - ret = "Requested statistics mode not supported for this method. " - "(TSK_ERR_UNSUPPORTED_STAT_MODE)"; - break; - case TSK_ERR_TIME_UNCALIBRATED: - ret = "Statistics using branch lengths cannot be calculated when time_units " - "is 'uncalibrated'. (TSK_ERR_TIME_UNCALIBRATED)"; - break; - - /* Mutation mapping errors */ - case TSK_ERR_GENOTYPES_ALL_MISSING: - ret = "Must provide at least one non-missing genotype. " - "(TSK_ERR_GENOTYPES_ALL_MISSING)"; - break; - case TSK_ERR_BAD_GENOTYPE: - ret = "Bad genotype value provided. (TSK_ERR_BAD_GENOTYPE)"; - break; - case TSK_ERR_BAD_ANCESTRAL_STATE: - ret = "Bad ancestral state specified. (TSK_ERR_BAD_ANCESTRAL_STATE)"; - break; - - /* Genotype decoding errors */ - case TSK_ERR_MUST_IMPUTE_NON_SAMPLES: - ret = "Cannot generate genotypes for non-samples when isolated nodes are " - "considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)"; - break; - case TSK_ERR_ALLELE_NOT_FOUND: - ret = "An allele was not found in the user-specified allele map. " - "(TSK_ERR_ALLELE_NOT_FOUND)"; - break; - case TSK_ERR_TOO_MANY_ALLELES: - ret = "Cannot have more than 2147483647 alleles (TSK_ERR_TOO_MANY_ALLELES)"; - break; - case TSK_ERR_ZERO_ALLELES: - ret = "Must have at least one allele when specifying an allele map. " - "(TSK_ERR_ZERO_ALLELES)"; - break; - - /* Distance metric errors */ - case TSK_ERR_SAMPLE_SIZE_MISMATCH: - ret = "Cannot compare trees with different numbers of samples. " - "(TSK_ERR_SAMPLE_SIZE_MISMATCH)"; - break; - case TSK_ERR_SAMPLES_NOT_EQUAL: - ret = "Samples must be identical in trees to compare. " - "(TSK_ERR_SAMPLES_NOT_EQUAL)"; - break; - case TSK_ERR_MULTIPLE_ROOTS: - ret = "Trees with multiple roots not supported. (TSK_ERR_MULTIPLE_ROOTS)"; - break; - case TSK_ERR_UNARY_NODES: - ret = "Unsimplified trees with unary nodes are not supported. " - "(TSK_ERR_UNARY_NODES)"; - break; - case TSK_ERR_SEQUENCE_LENGTH_MISMATCH: - ret = "Sequence lengths must be identical to compare. " - "(TSK_ERR_SEQUENCE_LENGTH_MISMATCH)"; - break; - case TSK_ERR_NO_SAMPLE_LISTS: - ret = "The sample_lists option must be enabled on the tree to perform this " - "operation. Pass the option to the constructor or method that created " - "the tree. (TSK_ERR_NO_SAMPLE_LISTS)"; - break; - - /* Haplotype matching errors */ - case TSK_ERR_NULL_VITERBI_MATRIX: - ret = "Viterbi matrix has not filled. (TSK_ERR_NULL_VITERBI_MATRIX)"; - break; - case TSK_ERR_MATCH_IMPOSSIBLE: - ret = "No matching haplotype exists with current parameters. " - "(TSK_ERR_MATCH_IMPOSSIBLE)"; - break; - case TSK_ERR_BAD_COMPRESSED_MATRIX_NODE: - ret = "The compressed matrix contains a node that subtends no samples. " - "(TSK_ERR_BAD_COMPRESSED_MATRIX_NODE)"; - break; - case TSK_ERR_TOO_MANY_VALUES: - ret = "Too many values to compress. (TSK_ERR_TOO_MANY_VALUES)"; - break; - - /* Union errors */ - case TSK_ERR_UNION_BAD_MAP: - ret = "Node map contains an entry of a node not present in this table " - "collection. (TSK_ERR_UNION_BAD_MAP)"; - break; - case TSK_ERR_UNION_DIFF_HISTORIES: - // histories could be equivalent, because subset does not reorder - // edges (if not sorted) or mutations. - ret = "Shared portions of the tree sequences are not equal. " - "(TSK_ERR_UNION_DIFF_HISTORIES)"; - break; - - /* IBD errors */ - case TSK_ERR_SAME_NODES_IN_PAIR: - ret = "Both nodes in the sample pair are the same. " - "(TSK_ERR_SAME_NODES_IN_PAIR)"; - break; - - case TSK_ERR_IBD_PAIRS_NOT_STORED: - ret = "The sample pairs are not stored by default in ibd_segments. Please " - "add the TSK_IBD_STORE_PAIRS option flag if per-pair statistics are " - "required. (TSK_ERR_IBD_PAIRS_NOT_STORED)"; - break; - - case TSK_ERR_IBD_SEGMENTS_NOT_STORED: - ret = "All segments are not stored by default in ibd_segments. Please " - "add the TSK_IBD_STORE_SEGMENTS option flag if they are required. " - "(TSK_ERR_IBD_SEGMENTS_NOT_STORED)"; - break; - - /* Simplify errors */ - case TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE: - ret = "You cannot specify both TSK_SIMPLIFY_KEEP_UNARY and " - "TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVDUALS. " - "(TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE)"; - break; - - /* Individual errors */ - case TSK_ERR_UNSORTED_INDIVIDUALS: - ret = "Individuals must be provided in an order where children are after " - "their parent individuals (TSK_ERR_UNSORTED_INDIVIDUALS)"; - break; - - case TSK_ERR_INDIVIDUAL_SELF_PARENT: - ret = "Individuals cannot be their own parents. " - "(TSK_ERR_INDIVIDUAL_SELF_PARENT)"; - break; - - case TSK_ERR_INDIVIDUAL_PARENT_CYCLE: - ret = "Individuals cannot be their own ancestor. " - "(TSK_ERR_INDIVIDUAL_PARENT_CYCLE)"; - break; - - case TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH: - ret = "Individual populations cannot be returned " - "if an individual has nodes from more than one population. " - "(TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH)"; - break; - - case TSK_ERR_INDIVIDUAL_TIME_MISMATCH: - ret = "Individual times cannot be returned " - "if an individual has nodes from more than one time. " - "(TSK_ERR_INDIVIDUAL_TIME_MISMATCH)"; - break; - } - return ret; -} - -int -tsk_set_kas_error(int err) -{ - if (err == KAS_ERR_IO) { - /* If we've detected an IO error, report it as TSK_ERR_IO so that we have - * a consistent error code covering these situtations */ - return TSK_ERR_IO; - } else { - /* Flip this bit. As the error is negative, this sets the bit to 0 */ - return err ^ (1 << TSK_KAS_ERR_BIT); - } -} - -bool -tsk_is_kas_error(int err) -{ - return !(err & (1 << TSK_KAS_ERR_BIT)); -} - -int -tsk_get_kas_error(int err) -{ - return err ^ (1 << TSK_KAS_ERR_BIT); -} - -const char * -tsk_strerror(int err) -{ - if (err != 0 && tsk_is_kas_error(err)) { - return kas_strerror(tsk_get_kas_error(err)); - } else { - return tsk_strerror_internal(err); - } -} - -void -__tsk_safe_free(void **ptr) -{ - if (ptr != NULL) { - if (*ptr != NULL) { - free(*ptr); - *ptr = NULL; - } - } -} - -/* Block allocator. Simple allocator when we lots of chunks of memory - * and don't need to free them individually. - */ - -void -tsk_blkalloc_print_state(tsk_blkalloc_t *self, FILE *out) -{ - fprintf(out, "Block allocator%p::\n", (void *) self); - fprintf(out, "\ttop = %lld\n", (long long) self->top); - fprintf(out, "\tchunk_size = %lld\n", (long long) self->chunk_size); - fprintf(out, "\tnum_chunks = %lld\n", (long long) self->num_chunks); - fprintf(out, "\ttotal_allocated = %lld\n", (long long) self->total_allocated); - fprintf(out, "\ttotal_size = %lld\n", (long long) self->total_size); -} - -int TSK_WARN_UNUSED -tsk_blkalloc_reset(tsk_blkalloc_t *self) -{ - int ret = 0; - - self->top = 0; - self->current_chunk = 0; - self->total_allocated = 0; - return ret; -} - -int TSK_WARN_UNUSED -tsk_blkalloc_init(tsk_blkalloc_t *self, size_t chunk_size) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_blkalloc_t)); - if (chunk_size < 1) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - self->chunk_size = chunk_size; - self->top = 0; - self->current_chunk = 0; - self->total_allocated = 0; - self->total_size = 0; - self->num_chunks = 0; - self->mem_chunks = malloc(sizeof(char *)); - if (self->mem_chunks == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->mem_chunks[0] = malloc(chunk_size); - if (self->mem_chunks[0] == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->num_chunks = 1; - self->total_size = chunk_size + sizeof(void *); -out: - return ret; -} - -void *TSK_WARN_UNUSED -tsk_blkalloc_get(tsk_blkalloc_t *self, size_t size) -{ - void *ret = NULL; - void *p; - - if (size > self->chunk_size) { - goto out; - } - if ((self->top + size) > self->chunk_size) { - if (self->current_chunk == (self->num_chunks - 1)) { - p = realloc(self->mem_chunks, (self->num_chunks + 1) * sizeof(void *)); - if (p == NULL) { - goto out; - } - self->mem_chunks = p; - p = malloc(self->chunk_size); - if (p == NULL) { - goto out; - } - self->mem_chunks[self->num_chunks] = p; - self->num_chunks++; - self->total_size += self->chunk_size + sizeof(void *); - } - self->current_chunk++; - self->top = 0; - } - ret = self->mem_chunks[self->current_chunk] + self->top; - self->top += size; - self->total_allocated += size; -out: - return ret; -} - -void -tsk_blkalloc_free(tsk_blkalloc_t *self) -{ - size_t j; - - for (j = 0; j < self->num_chunks; j++) { - if (self->mem_chunks[j] != NULL) { - free(self->mem_chunks[j]); - } - } - if (self->mem_chunks != NULL) { - free(self->mem_chunks); - } -} - -/* Mirrors the semantics of numpy's searchsorted function. Uses binary - * search to find the index of the closest value in the array. */ -tsk_size_t -tsk_search_sorted(const double *restrict array, tsk_size_t size, double value) -{ - int64_t upper = (int64_t) size; - int64_t lower = 0; - int64_t offset = 0; - int64_t mid; - - if (upper == 0) { - return 0; - } - - while (upper - lower > 1) { - mid = (upper + lower) / 2; - if (value >= array[mid]) { - lower = mid; - } else { - upper = mid; - } - } - offset = (int64_t)(array[lower] < value); - return (tsk_size_t)(lower + offset); -} - -/* Rounds the specified double to the closest multiple of 10**-num_digits. If - * num_digits > 22, return value without changes. This is intended for use with - * small positive numbers; behaviour with large inputs has not been considered. - * - * Based on double_round from the Python standard library - * https://github.com/python/cpython/blob/master/Objects/floatobject.c#L985 - */ -double -tsk_round(double x, unsigned int ndigits) -{ - double pow1, y, z; - - z = x; - if (ndigits < 22) { - pow1 = pow(10.0, (double) ndigits); - y = x * pow1; - z = round(y); - if (fabs(y - z) == 0.5) { - /* halfway between two integers; use round-half-even */ - z = 2.0 * round(y / 2.0); - } - z = z / pow1; - } - return z; -} - -/* As NANs are not equal, use this function to check for equality to TSK_UNKNOWN_TIME */ -bool -tsk_is_unknown_time(double val) -{ - union { - uint64_t i; - double f; - } nan_union; - nan_union.f = val; - return nan_union.i == TSK_UNKNOWN_TIME_HEX; -} - -/* Work around a bug which seems to show up on various mixtures of - * compiler and libc versions, where isfinite and isnan result in - * spurious warnings about casting down to float. The original issue - * is here: - * https://github.com/tskit-dev/tskit/issues/721 - * - * The simplest approach seems to be to use the builtins where they - * are available (clang and gcc), and to use the library macro - * otherwise. There would be no disadvantage to using the builtin - * version, so there's no real harm in this approach. - */ - -bool -tsk_isnan(double val) -{ -#if defined(__GNUC__) - return __builtin_isnan(val); -#else - return isnan(val); -#endif -} - -bool -tsk_isfinite(double val) -{ -#if defined(__GNUC__) - return __builtin_isfinite(val); -#else - return isfinite(val); -#endif -} - -void * -tsk_malloc(tsk_size_t size) -{ - /* Avoid malloc(0) as it's not portable */ - if (size == 0) { - size = 1; - } -#if TSK_MAX_SIZE > SIZE_MAX - if (size > SIZE_MAX) { - return NULL; - } -#endif - return malloc((size_t) size); -} - -void * -tsk_realloc(void *ptr, tsk_size_t size) -{ - /* We shouldn't ever realloc to a zero size in tskit */ - tsk_bug_assert(size > 0); - return realloc(ptr, (size_t) size); -} - -/* We keep the size argument here as a size_t because we'd have to - * cast the outputs of sizeof() otherwise, which would lead to - * less readable code. We need to be careful to use calloc within - * the library accordingly, so that size can't overflow on 32 bit. - */ -void * -tsk_calloc(tsk_size_t n, size_t size) -{ - /* Avoid calloc(0) as it's not portable */ - if (n == 0) { - n = 1; - } -#if TSK_MAX_SIZE > SIZE_MAX - if (n > SIZE_MAX) { - return NULL; - } -#endif - return calloc((size_t) n, size); -} - -void * -tsk_memset(void *ptr, int fill, tsk_size_t size) -{ - return memset(ptr, fill, (size_t) size); -} - -void * -tsk_memcpy(void *dest, const void *src, tsk_size_t size) -{ - return memcpy(dest, src, (size_t) size); -} - -void * -tsk_memmove(void *dest, const void *src, tsk_size_t size) -{ - return memmove(dest, src, (size_t) size); -} - -int -tsk_memcmp(const void *s1, const void *s2, tsk_size_t size) -{ - return memcmp(s1, s2, (size_t) size); -} - -/* We can't initialise the stream to its real default value because - * of limitations on static initialisers. To work around this, we initialise - * it to NULL and then set the value to the required standard stream - * when called. */ - -FILE *_tsk_debug_stream = NULL; - -void -tsk_set_debug_stream(FILE *f) -{ - _tsk_debug_stream = f; -} - -FILE * -tsk_get_debug_stream(void) -{ - if (_tsk_debug_stream == NULL) { - _tsk_debug_stream = stdout; - } - return _tsk_debug_stream; -} - -/* AVL Tree implementation. This is based directly on Knuth's implementation - * in TAOCP. See the python/tests/test_avl_tree.py for more information, - * and equivalent code annotated with the original algorithm listing. - */ - -static void -tsk_avl_tree_int_print_node(tsk_avl_node_int_t *node, int depth, FILE *out) -{ - int d; - - if (node == NULL) { - return; - } - for (d = 0; d < depth; d++) { - fprintf(out, " "); - } - fprintf(out, "key=%d balance=%d\n", (int) node->key, node->balance); - tsk_avl_tree_int_print_node(node->llink, depth + 1, out); - tsk_avl_tree_int_print_node(node->rlink, depth + 1, out); -} -void -tsk_avl_tree_int_print_state(tsk_avl_tree_int_t *self, FILE *out) -{ - fprintf(out, "AVL tree: size=%d height=%d\n", (int) self->size, (int) self->height); - tsk_avl_tree_int_print_node(self->head.rlink, 0, out); -} - -int -tsk_avl_tree_int_init(tsk_avl_tree_int_t *self) -{ - memset(self, 0, sizeof(*self)); - return 0; -} - -int -tsk_avl_tree_int_free(tsk_avl_tree_int_t *TSK_UNUSED(self)) -{ - return 0; -} - -tsk_avl_node_int_t * -tsk_avl_tree_int_get_root(const tsk_avl_tree_int_t *self) -{ - return self->head.rlink; -} - -tsk_avl_node_int_t * -tsk_avl_tree_int_search(const tsk_avl_tree_int_t *self, int64_t key) -{ - tsk_avl_node_int_t *P = self->head.rlink; - - while (P != NULL) { - if (key == P->key) { - break; - } else if (key < P->key) { - P = P->llink; - } else { - P = P->rlink; - } - } - return P; -} - -static int -tsk_avl_tree_int_insert_empty(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) -{ - self->head.rlink = node; - self->size = 1; - self->height = 1; - node->llink = NULL; - node->rlink = NULL; - node->balance = 0; - return 0; -} - -#define get_link(a, P) ((a) == -1 ? (P)->llink : (P)->rlink) -#define set_link(a, P, val) \ - do { \ - if ((a) == -1) { \ - (P)->llink = val; \ - } else { \ - (P)->rlink = val; \ - } \ - } while (0); - -static int -tsk_avl_tree_int_insert_non_empty(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) -{ - const int64_t K = node->key; - tsk_avl_node_int_t *T = &self->head; - tsk_avl_node_int_t *S = T->rlink; - tsk_avl_node_int_t *P = T->rlink; - tsk_avl_node_int_t *Q, *R; - int a; - - while (true) { - if (K == P->key) { - /* TODO figure out what the most useful semantics are here. Just - * returning 1 as a non-zero value for now. */ - return 1; - } else if (K < P->key) { - Q = P->llink; - if (Q == NULL) { - Q = node; - P->llink = Q; - break; - } - } else { - Q = P->rlink; - if (Q == NULL) { - Q = node; - P->rlink = Q; - break; - } - } - if (Q->balance != 0) { - T = P; - S = Q; - } - P = Q; - } - - self->size++; - Q->llink = NULL; - Q->rlink = NULL; - Q->balance = 0; - - if (K < S->key) { - a = -1; - } else { - a = 1; - } - P = get_link(a, S); - R = P; - while (P != Q) { - if (K < P->key) { - P->balance = -1; - P = P->llink; - } else if (K > P->key) { - P->balance = 1; - P = P->rlink; - } - } - - if (S->balance == 0) { - S->balance = a; - self->height++; - } else if (S->balance == -a) { - S->balance = 0; - } else { - if (R->balance == a) { - P = R; - set_link(a, S, get_link(-a, R)); - set_link(-a, R, S); - S->balance = 0; - R->balance = 0; - } else if (R->balance == -a) { - P = get_link(-a, R); - set_link(-a, R, get_link(a, P)); - set_link(a, P, R); - set_link(a, S, get_link(-a, P)); - set_link(-a, P, S); - if (P->balance == a) { - S->balance = -a; - R->balance = 0; - } else if (P->balance == 0) { - S->balance = 0; - R->balance = 0; - } else { - S->balance = 0; - R->balance = a; - } - P->balance = 0; - } - if (S == T->rlink) { - T->rlink = P; - } else { - T->llink = P; - } - } - return 0; -} - -int -tsk_avl_tree_int_insert(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) -{ - int ret = 0; - - if (self->size == 0) { - ret = tsk_avl_tree_int_insert_empty(self, node); - } else { - ret = tsk_avl_tree_int_insert_non_empty(self, node); - } - return ret; -} - -/* An inorder traversal of the nodes in an AVL tree (or any binary search tree) - * yields the keys in sorted order. The recursive implementation is safe here - * because this is an AVL tree and it is strictly balanced, the depth is very - * limited. Using GCC's __builtin_frame_address it looks like the size of a stack - * frame for this function is 48 bytes. Assuming a stack size of 1MiB, this - * would give us a maximum tree depth of 21845 - so, we're pretty safe. - */ -static int -ordered_nodes_traverse(tsk_avl_node_int_t *node, int index, tsk_avl_node_int_t **out) -{ - if (node == NULL) { - return index; - } - index = ordered_nodes_traverse(node->llink, index, out); - out[index] = node; - return ordered_nodes_traverse(node->rlink, index + 1, out); -} - -int -tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_t **out) -{ - ordered_nodes_traverse(self->head.rlink, 0, out); - return 0; -} diff --git a/subprojects/tskit/tskit/core.h b/subprojects/tskit/tskit/core.h deleted file mode 100644 index b8b9f354b..000000000 --- a/subprojects/tskit/tskit/core.h +++ /dev/null @@ -1,1002 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * Copyright (c) 2015-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file core.h - * @brief Core utilities used in all of tskit. - */ -#ifndef __TSK_CORE_H__ -#define __TSK_CORE_H__ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include -#include - -#ifdef __GNUC__ -#define TSK_WARN_UNUSED __attribute__((warn_unused_result)) -#define TSK_UNUSED(x) TSK_UNUSED_##x __attribute__((__unused__)) -#else -#define TSK_WARN_UNUSED -#define TSK_UNUSED(x) TSK_UNUSED_##x -/* Don't bother with restrict for MSVC */ -#define restrict -#endif - -/* We assume CHAR_BIT == 8 when loading strings from 8-bit byte arrays */ -#if CHAR_BIT != 8 -#error CHAR_BIT MUST EQUAL 8 -#endif - -/* This sets up TSK_DBL_DECIMAL_DIG, which can then be used as a - * precision specifier when writing out doubles, if you want sufficient - * decimal digits to be written to guarantee a lossless round-trip - * after being read back in. Usage: - * - * printf("%.*g", TSK_DBL_DECIMAL_DIG, foo); - * - * See https://stackoverflow.com/a/19897395/2752221 - */ -#ifdef DBL_DECIMAL_DIG -#define TSK_DBL_DECIMAL_DIG (DBL_DECIMAL_DIG) -#else -#define TSK_DBL_DECIMAL_DIG (DBL_DIG + 3) -#endif - -/** -@brief Tskit Object IDs. - -@rst -All objects in tskit are referred to by integer IDs corresponding to the -row they occupy in the relevant table. The ``tsk_id_t`` type should be used -when manipulating these ID values. The reserved value :c:macro:`TSK_NULL` (-1) defines -missing data. -@endrst -*/ -#ifdef _TSK_BIG_TABLES -/* Allow tables to have more than 2^31 rows. This is an EXPERIMENTAL feature - * and is not supported in any way. This typedef is only included for - * future-proofing purposes, so that we can be sure that we don't make any - * design decisions that are incompatible with big tables by building the - * library in 64 bit mode in CI. See the discussion here for more background: - - * https://github.com/tskit-dev/tskit/issues/343 - * - * If you need big tables, please open an issue on GitHub to discuss, or comment - * on the thread above. - */ -typedef int64_t tsk_id_t; -#define TSK_MAX_ID INT64_MAX - 1 -#define TSK_ID_STORAGE_TYPE KAS_INT64 -#else -typedef int32_t tsk_id_t; -#define TSK_MAX_ID INT32_MAX - 1 -#define TSK_ID_STORAGE_TYPE KAS_INT32 -#endif - -/** -@brief Tskit sizes. - -@rst -The ``tsk_size_t`` type is an unsigned integer used for any size or count value. -@endrst -*/ -typedef uint64_t tsk_size_t; -#define TSK_MAX_SIZE UINT64_MAX -#define TSK_SIZE_STORAGE_TYPE KAS_UINT64 - -/** -@brief Container for bitwise flags. - -@rst -Bitwise flags are used in tskit as a column type and also as a way to -specify options to API functions. -@endrst -*/ -typedef uint32_t tsk_flags_t; -#define TSK_FLAGS_STORAGE_TYPE KAS_UINT32 - -/** -@brief Boolean type. - -@rst -Fixed-size (1 byte) boolean values. -@endrst -*/ -typedef uint8_t tsk_bool_t; - -// clang-format off -/** -@defgroup API_VERSION_GROUP API version macros. -@{ -*/ -/** -The library major version. Incremented when breaking changes to the API or ABI are -introduced. This includes any changes to the signatures of functions and the -sizes and types of externally visible structs. -*/ -#define TSK_VERSION_MAJOR 1 -/** -The library minor version. Incremented when non-breaking backward-compatible changes -to the API or ABI are introduced, i.e., the addition of a new function. -*/ -#define TSK_VERSION_MINOR 1 -/** -The library patch version. Incremented when any changes not relevant to the -to the API or ABI are introduced, i.e., internal refactors of bugfixes. -*/ -#define TSK_VERSION_PATCH 2 -/** @} */ - -/* -We define a specific NAN value for default mutation time which indicates -the time is unknown. We use a specific value so that if mutation time is set to -a NAN from a computation we can reject it. This specific value is a non-signalling -NAN with the last six fraction bytes set to the ascii of "tskit!" -*/ -#define TSK_UNKNOWN_TIME_HEX 0x7FF874736B697421ULL -static inline double -__tsk_nan_f(void) -{ - const union { - uint64_t i; - double f; - } nan_union = { .i = TSK_UNKNOWN_TIME_HEX }; - return nan_union.f; -} - -/** -@defgroup GENERIC_CONSTANTS General options flags used in some functions. -@{ -*/ -/** -Used in node flags to indicate that a node is a sample node. -*/ -#define TSK_NODE_IS_SAMPLE 1u - -/** -Null value used for cases such as absent id references. -*/ -#define TSK_NULL ((tsk_id_t) -1) - -/** -Value used for missing data in genotype arrays. -*/ -#define TSK_MISSING_DATA (-1) - -/** -Value to indicate that a time is unknown. Note that this value is a non-signalling NAN -whose representation differs from the NAN generated by computations such as divide by zeros. -*/ -#define TSK_UNKNOWN_TIME __tsk_nan_f() - -/** @} */ - -#define TSK_TIME_UNITS_UNKNOWN "unknown" -#define TSK_TIME_UNITS_UNCALIBRATED "uncalibrated" - - -#define TSK_FILE_FORMAT_NAME "tskit.trees" -#define TSK_FILE_FORMAT_NAME_LENGTH 11 -#define TSK_FILE_FORMAT_VERSION_MAJOR 12 -#define TSK_FILE_FORMAT_VERSION_MINOR 7 - -/** -@defgroup GENERIC_FUNCTION_OPTIONS General options flags used in some functions. -@{ -*/ - -/* Place the common options at the top of the space; this way we can start -options for individual functions at the bottom without worrying about -clashing with the common options -*/ - -/** Turn on debugging output. Not supported by all functions. */ -#define TSK_DEBUG (1u << 31) - -/** Do not initialise the parameter object. */ -#define TSK_NO_INIT (1u << 30) - -/** -Do not run integrity checks before performing an operation. -This performance optimisation should not be used unless the calling code can -guarantee reference integrity within the table collection. References -to rows not in the table or bad offsets will result in undefined -behaviour. -*/ -#define TSK_NO_CHECK_INTEGRITY (1u << 29) - -/** -Instead of taking a copy of input objects, the function should take ownership -of them and manage their lifecycle. The caller specifying this flag should no -longer modify or free the object or objects passed. See individual functions -using this flag for what object it applies to. -*/ -#define TSK_TAKE_OWNERSHIP (1u << 28) - -/** @} */ - - -/** -@defgroup GENERAL_ERROR_GROUP General errors. -@{ -*/ - -/** -Generic error thrown when no other message can be generated. -*/ -#define TSK_ERR_GENERIC -1 -/** -Memory could not be allocated. -*/ -#define TSK_ERR_NO_MEMORY -2 -/** -An IO error occurred. -*/ -#define TSK_ERR_IO -3 -#define TSK_ERR_BAD_PARAM_VALUE -4 -#define TSK_ERR_BUFFER_OVERFLOW -5 -#define TSK_ERR_UNSUPPORTED_OPERATION -6 -#define TSK_ERR_GENERATE_UUID -7 -/** -The file stream ended after reading zero bytes. -*/ -#define TSK_ERR_EOF -8 -/** @} */ - -/** -@defgroup FILE_FORMAT_ERROR_GROUP File format errors. -@{ -*/ - -/** -A file could not be read because it is in the wrong format -*/ -#define TSK_ERR_FILE_FORMAT -100 -/** -The file is in tskit format, but the version is too old for the -library to read. The file should be upgraded to the latest version -using the ``tskit upgrade`` command line utility. -*/ -#define TSK_ERR_FILE_VERSION_TOO_OLD -101 -/** -The file is in tskit format, but the version is too new for the -library to read. To read the file you must upgrade the version -of tskit. -*/ -#define TSK_ERR_FILE_VERSION_TOO_NEW -102 - -/** -A column that is a required member of a table was not found in -the file. -*/ -#define TSK_ERR_REQUIRED_COL_NOT_FOUND -103 - -/** -One of a pair of columns that must be specified together was -not found in the file. -*/ -#define TSK_ERR_BOTH_COLUMNS_REQUIRED -104 - -/** -An unsupported type was provided for a column in the file. -*/ -#define TSK_ERR_BAD_COLUMN_TYPE -105 -/** @} */ - -/** -@defgroup OOB_ERROR_GROUP Out of bounds errors. -@{ -*/ -/** -A bad value was provided for a ragged column offset, values should -start at zero and be monotonically increasing. -*/ -#define TSK_ERR_BAD_OFFSET -200 -/** -A position to seek to was less than zero or greater than the length -of the genome -*/ -#define TSK_ERR_SEEK_OUT_OF_BOUNDS -201 -/** -A node id was less than zero or greater than the final index -*/ -#define TSK_ERR_NODE_OUT_OF_BOUNDS -202 -/** -A edge id was less than zero or greater than the final index -*/ -#define TSK_ERR_EDGE_OUT_OF_BOUNDS -203 -/** -A population id was less than zero or greater than the final index -*/ -#define TSK_ERR_POPULATION_OUT_OF_BOUNDS -204 -/** -A site id was less than zero or greater than the final index -*/ -#define TSK_ERR_SITE_OUT_OF_BOUNDS -205 -/** -A mutation id was less than zero or greater than the final index -*/ -#define TSK_ERR_MUTATION_OUT_OF_BOUNDS -206 -/** -An individual id was less than zero or greater than the final index -*/ -#define TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS -207 -/** -A migration id was less than zero or greater than the final index -*/ -#define TSK_ERR_MIGRATION_OUT_OF_BOUNDS -208 -/** -A provenance id was less than zero or greater than the final index -*/ -#define TSK_ERR_PROVENANCE_OUT_OF_BOUNDS -209 -/** -A time value was non-finite (NaN counts as finite) -*/ -#define TSK_ERR_TIME_NONFINITE -210 -/** -A genomic position was non-finite -*/ -#define TSK_ERR_GENOME_COORDS_NONFINITE -211 -/** -One of the rows in the retained table refers to a row that has been -deleted. -*/ -#define TSK_ERR_KEEP_ROWS_MAP_TO_DELETED -212 - -/** @} */ - -/** -@defgroup EDGE_ERROR_GROUP Edge errors. -@{ -*/ -/** -A parent node of an edge was TSK_NULL. -*/ -#define TSK_ERR_NULL_PARENT -300 -/** -A child node of an edge was TSK_NULL. -*/ -#define TSK_ERR_NULL_CHILD -301 -/** -The edge table was not sorted by the time of each edge's parent -nodes. Sort order is (time[parent], child, left). -*/ -#define TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME -302 -/** -A parent node had edges that were non-contigious. -*/ -#define TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS -303 -/** -The edge table was not sorted by the id of the child node of each edge. -Sort order is (time[parent], child, left). -*/ -#define TSK_ERR_EDGES_NOT_SORTED_CHILD -304 -/** -The edge table was not sorted by the left coordinate each edge. -Sort order is (time[parent], child, left). -*/ -#define TSK_ERR_EDGES_NOT_SORTED_LEFT -305 -/** -An edge had child node that was older than the parent. Parent times must -be greater than the child time. -*/ -#define TSK_ERR_BAD_NODE_TIME_ORDERING -306 -/** -An edge had a genomic interval where right was greater or equal to left. -*/ -#define TSK_ERR_BAD_EDGE_INTERVAL -307 -/** -An edge was duplicated. -*/ -#define TSK_ERR_DUPLICATE_EDGES -308 -/** -An edge had a right coord greater than the genomic length. -*/ -#define TSK_ERR_RIGHT_GREATER_SEQ_LENGTH -309 -/** -An edge had a left coord less than zero. -*/ -#define TSK_ERR_LEFT_LESS_ZERO -310 -/** -A parent node had edges that were contradictory over an interval. -*/ -#define TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN -311 -/** -A method that doesn't support edge metadata was attempted on an edge -table containing metadata. -*/ -#define TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA -312 -/** @} */ - -/** -@defgroup SITE_ERROR_GROUP Site errors. -@{ -*/ -/** -The site table was not in order of increasing genomic position. -*/ -#define TSK_ERR_UNSORTED_SITES -400 -/** -The site table had more than one site at a single genomic position. -*/ -#define TSK_ERR_DUPLICATE_SITE_POSITION -401 -/** -A site had a position that was less than zero or greater than the sequence -length. -*/ -#define TSK_ERR_BAD_SITE_POSITION -402 -/** @} */ - -/** -@defgroup MUTATION_ERROR_GROUP Mutation errors. -@{ -*/ -/** -A mutation had a parent mutation that was at a different site. -*/ -#define TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE -500 -/** -A mutation had a parent mutation that was itself. -*/ -#define TSK_ERR_MUTATION_PARENT_EQUAL -501 -/** -A mutation had a parent mutation that had a greater id. -*/ -#define TSK_ERR_MUTATION_PARENT_AFTER_CHILD -502 -/** -Two or more mutation parent references formed a loop -*/ -#define TSK_ERR_MUTATION_PARENT_INCONSISTENT -503 -/** -The mutation table was not in the order of non-decreasing site id and -non-increasing time within each site. -*/ -#define TSK_ERR_UNSORTED_MUTATIONS -504 -/* 505 was the now unused TSK_ERR_NON_SINGLE_CHAR_MUTATION */ -/** -A mutation's time was younger (not >=) the time of its node -and wasn't TSK_UNKNOWN_TIME. -*/ -#define TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE -506 -/** -A mutation's time was older (not <=) than the time of its parent -mutation, and wasn't TSK_UNKNOWN_TIME. -*/ -#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION -507 -/** -A mutation's time was older (not <) than the time of the parent node of -the edge on which it occurs, and wasn't TSK_UNKNOWN_TIME. -*/ -#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE -508 -/** -A single site had a mixture of known mutation times and TSK_UNKNOWN_TIME -*/ -#define TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN -509 -/** @} */ - -/** -@defgroup MIGRATION_ERROR_GROUP Migration errors. -@{ -*/ -/** -The migration table was not sorted by time. -*/ -#define TSK_ERR_UNSORTED_MIGRATIONS -550 -/** @} */ - -/** -@defgroup SAMPLE_ERROR_GROUP Sample errors. -@{ -*/ -/** -A duplicate sample was specified. -*/ -#define TSK_ERR_DUPLICATE_SAMPLE -600 -/** -A sample id that was not valid was specified. -*/ -#define TSK_ERR_BAD_SAMPLES -601 -/** @} */ - -/** -@defgroup TABLE_ERROR_GROUP Table errors. -@{ -*/ -/** -An invalid table position was specifed. -*/ -#define TSK_ERR_BAD_TABLE_POSITION -700 -/** -A sequence length equal to or less than zero was specified. -*/ -#define TSK_ERR_BAD_SEQUENCE_LENGTH -701 -/** -The table collection was not indexed. -*/ -#define TSK_ERR_TABLES_NOT_INDEXED -702 -/** -Tables cannot be larger than 2**31 rows. -*/ -#define TSK_ERR_TABLE_OVERFLOW -703 -/** -Ragged array columns cannot be larger than 2**64 bytes. -*/ -#define TSK_ERR_COLUMN_OVERFLOW -704 -/** -The table collection contains more than 2**31 trees. -*/ -#define TSK_ERR_TREE_OVERFLOW -705 -/** -Metadata was attempted to be set on a table where it is disabled. -*/ -#define TSK_ERR_METADATA_DISABLED -706 -/** -There was an error with the table's indexes. -*/ -#define TSK_ERR_TABLES_BAD_INDEXES -707 -/** @} */ - -/** -@defgroup LIMITATION_ERROR_GROUP Limitation errors. -@{ -*/ -/** -An operation was attempted that only supports infinite sites, i.e. -at most a single mutation per site. -*/ -#define TSK_ERR_ONLY_INFINITE_SITES -800 -/** -Simplification was attempted with migrations present, which are not -supported. -*/ -#define TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED -801 -/** -Sorting was attempted on migrations, which is not supported. -*/ -#define TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED -802 -/** -An invalid sort offset was specified, for sites and mutations this must -be either 0 or the table length. -*/ -#define TSK_ERR_SORT_OFFSET_NOT_SUPPORTED -803 -/** -An operation was attempted that only supports binary mutations. -*/ -#define TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED -804 -/** -An operation was attempted that doesn't support migrations, with a -non-empty migration table. -*/ -#define TSK_ERR_MIGRATIONS_NOT_SUPPORTED -805 -/** -A table attempted to extend from itself. -*/ -#define TSK_ERR_CANNOT_EXTEND_FROM_SELF -806 -/** -An operation was attempted that doesn't support silent mutations, i.e. -a mutation that doesn't change the allelic state. -*/ -#define TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED -807 -/** -A copy of a variant cannot be decoded. -*/ -#define TSK_ERR_VARIANT_CANT_DECODE_COPY -808 -/** -A tree sequence cannot take ownership of a table collection where -TSK_NO_EDGE_METADATA. -*/ -#define TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA -809 -/** -Operation is undefined for nonbinary trees -*/ -#define TSK_ERR_UNDEFINED_NONBINARY -810 -/** -Operation is undefined for trees with multiple roots. -*/ -#define TSK_ERR_UNDEFINED_MULTIROOT -811 - -/** @} */ - -/** -@defgroup STATS_ERROR_GROUP Stats errors. -@{ -*/ -/** -Zero windows were specified, at least one window must be specified. -*/ -#define TSK_ERR_BAD_NUM_WINDOWS -900 -/** -The window specification was not an increasing list of positions between -0 and the sequence length. -*/ -#define TSK_ERR_BAD_WINDOWS -901 -/** -More than one stat mode was specified. -*/ -#define TSK_ERR_MULTIPLE_STAT_MODES -902 -/** -The state dimension was not >=1. -*/ -#define TSK_ERR_BAD_STATE_DIMS -903 -/** -The result dimension was not >=1. -*/ -#define TSK_ERR_BAD_RESULT_DIMS -904 -/** -Insufficient sample sets were provided. -*/ -#define TSK_ERR_INSUFFICIENT_SAMPLE_SETS -905 -/** -Insufficient sample set index tuples were provided. -*/ -#define TSK_ERR_INSUFFICIENT_INDEX_TUPLES -906 -/** -The sample set index was out of bounds. -*/ -#define TSK_ERR_BAD_SAMPLE_SET_INDEX -907 -/** -The sample set index was empty. -*/ -#define TSK_ERR_EMPTY_SAMPLE_SET -908 -/** -A stat mode was attempted that is not supported by the operation. -*/ -#define TSK_ERR_UNSUPPORTED_STAT_MODE -909 -/** -Statistics based on branch lengths were attempted when the ``time_units`` -were ``uncalibrated``. -*/ -#define TSK_ERR_TIME_UNCALIBRATED -910 -/** @} */ - -/** -@defgroup MAPPING_ERROR_GROUP Mutation mapping errors. -@{ -*/ -/** -Only missing genotypes were specified, at least one non-missing is -required. -*/ -#define TSK_ERR_GENOTYPES_ALL_MISSING -1000 -/** -A genotype value was greater than the maximum allowed (64) or less -than TSK_MISSING_DATA (-1). -*/ -#define TSK_ERR_BAD_GENOTYPE -1001 -/** -A ancestral genotype value was greater than the maximum allowed (64) or less -than 0. -*/ -#define TSK_ERR_BAD_ANCESTRAL_STATE -1002 -/** @} */ - -/** -@defgroup GENOTYPE_ERROR_GROUP Genotype decoding errors. -@{ -*/ -/** -Genotypes were requested for non-samples at the same time -as asking that isolated nodes be marked as missing. This is not -supported. -*/ -#define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100 -/** -A user-specified allele map was used, but didn't contain an allele -found in the tree sequence. -*/ -#define TSK_ERR_ALLELE_NOT_FOUND -1101 -/** -More than 2147483647 alleles were specified. -*/ -#define TSK_ERR_TOO_MANY_ALLELES -1102 -/** -A user-specified allele map was used, but it contained zero alleles. -*/ -#define TSK_ERR_ZERO_ALLELES -1103 -/** @} */ - -/** -@defgroup DISTANCE_ERROR_GROUP Distance metric errors. -@{ -*/ -/** -Trees with different numbers of samples were specified. -*/ -#define TSK_ERR_SAMPLE_SIZE_MISMATCH -1200 -/** -Trees with nonidentical samples were specified. -*/ -#define TSK_ERR_SAMPLES_NOT_EQUAL -1201 -/** -A tree with multiple roots was specified. -*/ -#define TSK_ERR_MULTIPLE_ROOTS -1202 -/** -A tree with unary nodes was specified. -*/ -#define TSK_ERR_UNARY_NODES -1203 -/** -Trees were specifed that had unequal sequence lengths. -*/ -#define TSK_ERR_SEQUENCE_LENGTH_MISMATCH -1204 -/** -A tree was specifed that did not have the sample lists option -enabled (TSK_SAMPLE_LISTS). -*/ -#define TSK_ERR_NO_SAMPLE_LISTS -1205 -/** @} */ - -/** -@defgroup HAPLOTYPE_ERROR_GROUP Haplotype matching errors. -@{ -*/ -/** -The Viterbi matrix has not filled (it has zero transitions). -*/ -#define TSK_ERR_NULL_VITERBI_MATRIX -1300 -/** -There was no matching haplotype. -*/ -#define TSK_ERR_MATCH_IMPOSSIBLE -1301 -/** -The compressed matrix has a node that has no samples in it's descendants. -*/ -#define TSK_ERR_BAD_COMPRESSED_MATRIX_NODE -1302 -/** -There are too many values to compress. -*/ -#define TSK_ERR_TOO_MANY_VALUES -1303 -/** @} */ - -/** -@defgroup UNION_ERROR_GROUP Union errors. -@{ -*/ -/** -A node map was specified that contained a node not present in the -specified table collection. -*/ -#define TSK_ERR_UNION_BAD_MAP -1400 -/** -The shared portions of the specified tree sequences are not equal. -Note that this may be the case if the table collections were not -fully sorted before union was called. -*/ -#define TSK_ERR_UNION_DIFF_HISTORIES -1401 -/** @} */ - -/** -@defgroup IBD_ERROR_GROUP IBD errors. -@{ -*/ -/** -Both nodes in a sample pair are the same node. -*/ -#define TSK_ERR_SAME_NODES_IN_PAIR -1500 -/** -Per-pair statistics were requested without TSK_IBD_STORE_PAIRS being -specified. -*/ -#define TSK_ERR_IBD_PAIRS_NOT_STORED -1501 -/** -Segments were requested without TSK_IBD_STORE_SEGMENTS being specified. -*/ -#define TSK_ERR_IBD_SEGMENTS_NOT_STORED -1502 -/** @} */ - -/** -@defgroup SIMPLIFY_ERROR_GROUP Simplify errors. -@{ -*/ -/** -Both TSK_SIMPLIFY_KEEP_UNARY and TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS -were specified. Only one can be used. -*/ -#define TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE -1600 -/** @} */ - -/** -@defgroup INDIVIDUAL_ERROR_GROUP Individual errors. -@{ -*/ -/** -Individuals were provided in an order where parents were after their -children. -*/ -#define TSK_ERR_UNSORTED_INDIVIDUALS -1700 -/** -An individual was its own parent. -*/ -#define TSK_ERR_INDIVIDUAL_SELF_PARENT -1701 -/** -An individual was its own ancestor in a cycle of references. -*/ -#define TSK_ERR_INDIVIDUAL_PARENT_CYCLE -1702 -/** -An individual had nodes from more than one population -(and only one was requested). -*/ -#define TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH -1703 -/** -An individual had nodes from more than one time -(and only one was requested). -*/ -#define TSK_ERR_INDIVIDUAL_TIME_MISMATCH -1704 -/** @} */ -// clang-format on - -/* This bit is 0 for any errors originating from kastore */ -#define TSK_KAS_ERR_BIT 14 - -int tsk_set_kas_error(int err); -bool tsk_is_kas_error(int err); -int tsk_get_kas_error(int err); - -/** -@brief Return a description of the specified error. - -The memory for the returned string is handled by the library and should -not be freed by client code. - -@param err A tskit error code. -@return A description of the error. -*/ -const char *tsk_strerror(int err); - -#ifndef TSK_BUG_ASSERT_MESSAGE -#define TSK_BUG_ASSERT_MESSAGE \ - "If you are using tskit directly please open an issue on" \ - " GitHub, ideally with a reproducible example." \ - " (https://github.com/tskit-dev/tskit/issues) If you are" \ - " using software that uses tskit, please report an issue" \ - " to that software's issue tracker, at least initially." -#endif - -/** -We often wish to assert a condition that is unexpected, but using the normal `assert` -means compiling without NDEBUG. This macro still asserts when NDEBUG is defined. -If you are using this macro in your own software then please set TSK_BUG_ASSERT_MESSAGE -to point users to your issue tracker. -*/ -#define tsk_bug_assert(condition) \ - do { \ - if (!(condition)) { \ - fprintf(stderr, "Bug detected in %s at line %d. %s\n", __FILE__, __LINE__, \ - TSK_BUG_ASSERT_MESSAGE); \ - abort(); \ - } \ - } while (0) - -void __tsk_safe_free(void **ptr); -#define tsk_safe_free(pointer) __tsk_safe_free((void **) &(pointer)) - -#define TSK_MAX(a, b) ((a) > (b) ? (a) : (b)) -#define TSK_MIN(a, b) ((a) < (b) ? (a) : (b)) - -/* This is a simple allocator that is optimised to efficiently allocate a - * large number of small objects without large numbers of calls to malloc. - * The allocator mallocs memory in chunks of a configurable size. When - * responding to calls to get(), it will return a chunk of this memory. - * This memory cannot be subsequently handed back to the allocator. However, - * all memory allocated by the allocator can be returned at once by calling - * reset. - */ - -typedef struct { - size_t chunk_size; /* number of bytes per chunk */ - size_t top; /* the offset of the next available byte in the current chunk */ - size_t current_chunk; /* the index of the chunk currently being used */ - size_t total_size; /* the total number of bytes allocated + overhead. */ - size_t total_allocated; /* the total number of bytes allocated. */ - size_t num_chunks; /* the number of memory chunks. */ - char **mem_chunks; /* the memory chunks */ -} tsk_blkalloc_t; - -extern void tsk_blkalloc_print_state(tsk_blkalloc_t *self, FILE *out); -extern int tsk_blkalloc_reset(tsk_blkalloc_t *self); -extern int tsk_blkalloc_init(tsk_blkalloc_t *self, size_t chunk_size); -extern void *tsk_blkalloc_get(tsk_blkalloc_t *self, size_t size); -extern void tsk_blkalloc_free(tsk_blkalloc_t *self); - -typedef struct _tsk_avl_node_int_t { - int64_t key; - void *value; - struct _tsk_avl_node_int_t *llink; - struct _tsk_avl_node_int_t *rlink; - /* This can only contain -1, 0, 1. We could set it to a smaller type, - * but there's no point because of struct padding and alignment so - * it's simplest to keep it as a plain int. */ - int balance; -} tsk_avl_node_int_t; - -typedef struct { - tsk_avl_node_int_t head; - tsk_size_t size; - tsk_size_t height; -} tsk_avl_tree_int_t; - -int tsk_avl_tree_int_init(tsk_avl_tree_int_t *self); -int tsk_avl_tree_int_free(tsk_avl_tree_int_t *self); -void tsk_avl_tree_int_print_state(tsk_avl_tree_int_t *self, FILE *out); -int tsk_avl_tree_int_insert(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node); -tsk_avl_node_int_t *tsk_avl_tree_int_search(const tsk_avl_tree_int_t *self, int64_t key); -int tsk_avl_tree_int_ordered_nodes( - const tsk_avl_tree_int_t *self, tsk_avl_node_int_t **out); -tsk_avl_node_int_t *tsk_avl_tree_int_get_root(const tsk_avl_tree_int_t *self); - -tsk_size_t tsk_search_sorted(const double *array, tsk_size_t size, double value); - -double tsk_round(double x, unsigned int ndigits); - -/** -@brief Check if a number is ``TSK_UNKNOWN_TIME`` - -@rst -Unknown time values in tskit are represented by a particular NaN value. Since NaN values -are not equal to each other by definition, a simple comparison like -``mutation.time == TSK_UNKNOWN_TIME`` will fail even if the mutation's time is -TSK_UNKNOWN_TIME. This function compares the underlying bit representation of a double -value and returns true iff it is equal to the specific NaN value -:c:macro:`TSK_UNKNOWN_TIME`. -@endrst - -@param val The number to check -@return true if the number is ``TSK_UNKNOWN_TIME`` else false -*/ -bool tsk_is_unknown_time(double val); - -/* We define local versions of isnan and isfinite to workaround some portability - * issues. */ -bool tsk_isnan(double val); -bool tsk_isfinite(double val); - -#define TSK_UUID_SIZE 36 -int tsk_generate_uuid(char *dest, int flags); - -/* TODO most of these can probably be macros so they compile out as no-ops. - * Lets do the 64 bit tsk_size_t switch first though. */ -void *tsk_malloc(tsk_size_t size); -void *tsk_realloc(void *ptr, tsk_size_t size); -void *tsk_calloc(tsk_size_t n, size_t size); -void *tsk_memset(void *ptr, int fill, tsk_size_t size); -void *tsk_memcpy(void *dest, const void *src, tsk_size_t size); -void *tsk_memmove(void *dest, const void *src, tsk_size_t size); -int tsk_memcmp(const void *s1, const void *s2, tsk_size_t size); - -/* Developer debug utilities. These are **not** threadsafe */ -void tsk_set_debug_stream(FILE *f); -FILE *tsk_get_debug_stream(void); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/subprojects/tskit/tskit/genotypes.c b/subprojects/tskit/tskit/genotypes.c deleted file mode 100644 index d4d1ecb08..000000000 --- a/subprojects/tskit/tskit/genotypes.c +++ /dev/null @@ -1,646 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2022 Tskit Developers - * Copyright (c) 2016-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include - -#include - -/* ======================================================== * - * Variant generator - * ======================================================== */ - -void -tsk_variant_print_state(const tsk_variant_t *self, FILE *out) -{ - tsk_size_t j; - - fprintf(out, "tsk_variant state\n"); - fprintf(out, "user_alleles = %lld\n", (long long) self->user_alleles); - fprintf(out, "num_alleles = %lld\n", (long long) self->num_alleles); - for (j = 0; j < self->num_alleles; j++) { - fprintf(out, "\tlen = %lld, '%.*s'\n", (long long) self->allele_lengths[j], - (int) self->allele_lengths[j], self->alleles[j]); - } - fprintf(out, "num_samples = %lld\n", (long long) self->num_samples); -} - -void -tsk_vargen_print_state(const tsk_vargen_t *self, FILE *out) -{ - tsk_variant_print_state(&self->variant, out); -} - -/* Copy the fixed allele mapping specified by the user into local - * memory. */ -static int -tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles) -{ - int ret = 0; - tsk_size_t j; - size_t total_len, allele_len, offset; - - self->num_alleles = self->max_alleles; - - total_len = 0; - for (j = 0; j < self->num_alleles; j++) { - allele_len = strlen(alleles[j]); - self->allele_lengths[j] = (tsk_size_t) allele_len; - total_len += allele_len; - } - self->user_alleles_mem = tsk_malloc(total_len * sizeof(char *)); - if (self->user_alleles_mem == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - offset = 0; - for (j = 0; j < self->num_alleles; j++) { - strcpy(self->user_alleles_mem + offset, alleles[j]); - self->alleles[j] = self->user_alleles_mem + offset; - offset += (size_t) self->allele_lengths[j]; - } -out: - return ret; -} - -static int -variant_init_samples_and_index_map(tsk_variant_t *self, - const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, - size_t num_samples_alloc, tsk_flags_t options) -{ - int ret = 0; - const tsk_flags_t *flags = tree_sequence->tables->nodes.flags; - tsk_size_t j, num_nodes; - bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING); - tsk_id_t u; - - num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); - self->alt_samples = tsk_malloc(num_samples_alloc * sizeof(*samples)); - self->alt_sample_index_map - = tsk_malloc(num_nodes * sizeof(*self->alt_sample_index_map)); - if (self->alt_samples == NULL || self->alt_sample_index_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(self->alt_samples, samples, num_samples * sizeof(*samples)); - tsk_memset(self->alt_sample_index_map, 0xff, - num_nodes * sizeof(*self->alt_sample_index_map)); - /* Create the reverse mapping */ - for (j = 0; j < num_samples; j++) { - u = samples[j]; - if (u < 0 || u >= (tsk_id_t) num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (self->alt_sample_index_map[u] != TSK_NULL) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - /* We can only detect missing data for samples */ - if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) { - ret = TSK_ERR_MUST_IMPUTE_NON_SAMPLES; - goto out; - } - self->alt_sample_index_map[samples[j]] = (tsk_id_t) j; - } -out: - return ret; -} - -int -tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, - const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, - tsk_flags_t options) -{ - int ret = 0; - tsk_size_t max_alleles_limit, max_alleles; - tsk_size_t num_samples_alloc; - - tsk_memset(self, 0, sizeof(tsk_variant_t)); - - /* Set site id to NULL to indicate the variant is not decoded */ - self->site.id = TSK_NULL; - - self->tree_sequence = tree_sequence; - ret = tsk_tree_init( - &self->tree, tree_sequence, samples == NULL ? TSK_SAMPLE_LISTS : 0); - if (ret != 0) { - goto out; - } - - if (samples != NULL) { - /* Take a copy of the samples so we don't have to manage the lifecycle*/ - self->samples = tsk_malloc(num_samples * sizeof(*samples)); - if (self->samples == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(self->samples, samples, num_samples * sizeof(*samples)); - self->num_samples = num_samples; - } - - self->options = options; - - max_alleles_limit = INT32_MAX; - - if (alleles == NULL) { - self->user_alleles = false; - max_alleles = 4; /* Arbitrary --- we'll rarely have more than this */ - } else { - self->user_alleles = true; - /* Count the input alleles. The end is designated by the NULL sentinel. */ - for (max_alleles = 0; alleles[max_alleles] != NULL; max_alleles++) - ; - if (max_alleles > max_alleles_limit) { - ret = TSK_ERR_TOO_MANY_ALLELES; - goto out; - } - if (max_alleles == 0) { - ret = TSK_ERR_ZERO_ALLELES; - goto out; - } - } - self->max_alleles = max_alleles; - self->alleles = tsk_calloc(max_alleles, sizeof(*self->alleles)); - self->allele_lengths = tsk_malloc(max_alleles * sizeof(*self->allele_lengths)); - if (self->alleles == NULL || self->allele_lengths == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (self->user_alleles) { - ret = tsk_variant_copy_alleles(self, alleles); - if (ret != 0) { - goto out; - } - } - if (self->samples == NULL) { - self->num_samples = tsk_treeseq_get_num_samples(tree_sequence); - self->samples = tsk_malloc(self->num_samples * sizeof(*self->samples)); - if (self->samples == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(self->samples, tsk_treeseq_get_samples(tree_sequence), - self->num_samples * sizeof(*self->samples)); - - self->sample_index_map = tsk_treeseq_get_sample_index_map(tree_sequence); - num_samples_alloc = self->num_samples; - } else { - num_samples_alloc = self->num_samples; - ret = variant_init_samples_and_index_map(self, tree_sequence, self->samples, - self->num_samples, (size_t) num_samples_alloc, self->options); - if (ret != 0) { - goto out; - } - self->sample_index_map = self->alt_sample_index_map; - } - /* When a list of samples is given, we use the traversal based algorithm - * which doesn't use sample list tracking in the tree */ - if (self->alt_samples != NULL) { - self->traversal_stack = tsk_malloc( - tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack)); - if (self->traversal_stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } - - self->genotypes = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes)); - if (self->genotypes == NULL || self->alleles == NULL - || self->allele_lengths == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - -out: - return ret; -} - -int -tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, - const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, - tsk_flags_t options) -{ - int ret = 0; - - tsk_bug_assert(tree_sequence != NULL); - tsk_memset(self, 0, sizeof(tsk_vargen_t)); - - self->tree_sequence = tree_sequence; - ret = tsk_variant_init( - &self->variant, tree_sequence, samples, num_samples, alleles, options); - if (ret != 0) { - goto out; - } - ret = 0; -out: - return ret; -} - -int -tsk_variant_free(tsk_variant_t *self) -{ - if (self->tree_sequence != NULL) { - tsk_tree_free(&self->tree); - } - tsk_safe_free(self->genotypes); - tsk_safe_free(self->alleles); - tsk_safe_free(self->allele_lengths); - tsk_safe_free(self->user_alleles_mem); - tsk_safe_free(self->samples); - tsk_safe_free(self->alt_samples); - tsk_safe_free(self->alt_sample_index_map); - tsk_safe_free(self->traversal_stack); - return 0; -} - -int -tsk_vargen_free(tsk_vargen_t *self) -{ - tsk_variant_free(&self->variant); - return 0; -} - -static int -tsk_variant_expand_alleles(tsk_variant_t *self) -{ - int ret = 0; - void *p; - tsk_size_t hard_limit = INT32_MAX; - - if (self->max_alleles == hard_limit) { - ret = TSK_ERR_TOO_MANY_ALLELES; - goto out; - } - self->max_alleles = TSK_MIN(hard_limit, self->max_alleles * 2); - p = tsk_realloc(self->alleles, self->max_alleles * sizeof(*self->alleles)); - if (p == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->alleles = p; - p = tsk_realloc( - self->allele_lengths, self->max_alleles * sizeof(*self->allele_lengths)); - if (p == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->allele_lengths = p; -out: - return ret; -} - -/* The following pair of functions are identical except one handles 8 bit - * genotypes and the other handles 16 bit genotypes. This is done for performance - * reasons as this is a key function and for common alleles can entail - * iterating over millions of samples. The compiler hints are included for the - * same reason. - */ -static int TSK_WARN_UNUSED -tsk_variant_update_genotypes_sample_list( - tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) -{ - int32_t *restrict genotypes = self->genotypes; - const tsk_id_t *restrict list_left = self->tree.left_sample; - const tsk_id_t *restrict list_right = self->tree.right_sample; - const tsk_id_t *restrict list_next = self->tree.next_sample; - tsk_id_t index, stop; - int ret = 0; - - tsk_bug_assert(derived < INT32_MAX); - - index = list_left[node]; - if (index != TSK_NULL) { - stop = list_right[node]; - while (true) { - - ret += genotypes[index] == TSK_MISSING_DATA; - genotypes[index] = (int32_t) derived; - if (index == stop) { - break; - } - index = list_next[index]; - } - } - - return ret; -} - -/* The following functions implement the genotype setting by traversing - * down the tree to the samples. We're not so worried about performance here - * because this should only be used when we have a very small number of samples, - * and so we use a visit function to avoid duplicating code. - */ - -typedef int (*visit_func_t)(tsk_variant_t *, tsk_id_t, tsk_id_t); - -static int TSK_WARN_UNUSED -tsk_variant_traverse( - tsk_variant_t *self, tsk_id_t node, tsk_id_t derived, visit_func_t visit) -{ - int ret = 0; - tsk_id_t *restrict stack = self->traversal_stack; - const tsk_id_t *restrict left_child = self->tree.left_child; - const tsk_id_t *restrict right_sib = self->tree.right_sib; - const tsk_id_t *restrict sample_index_map = self->sample_index_map; - tsk_id_t u, v, sample_index; - int stack_top; - int no_longer_missing = 0; - - stack_top = 0; - stack[0] = node; - while (stack_top >= 0) { - u = stack[stack_top]; - sample_index = sample_index_map[u]; - if (sample_index != TSK_NULL) { - ret = visit(self, sample_index, derived); - if (ret < 0) { - goto out; - } - no_longer_missing += ret; - } - stack_top--; - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - stack_top++; - stack[stack_top] = v; - } - } - ret = no_longer_missing; -out: - return ret; -} - -static int -tsk_variant_visit(tsk_variant_t *self, tsk_id_t sample_index, tsk_id_t derived) -{ - int ret = 0; - int32_t *restrict genotypes = self->genotypes; - - tsk_bug_assert(derived < INT32_MAX); - tsk_bug_assert(sample_index != -1); - - ret = genotypes[sample_index] == TSK_MISSING_DATA; - genotypes[sample_index] = (int32_t) derived; - - return ret; -} - -static int TSK_WARN_UNUSED -tsk_variant_update_genotypes_traversal( - tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) -{ - return tsk_variant_traverse(self, node, derived, tsk_variant_visit); -} - -static tsk_size_t -tsk_variant_mark_missing(tsk_variant_t *self) -{ - tsk_size_t num_missing = 0; - const tsk_id_t *restrict left_child = self->tree.left_child; - const tsk_id_t *restrict right_sib = self->tree.right_sib; - const tsk_id_t *restrict sample_index_map = self->sample_index_map; - const tsk_id_t N = self->tree.virtual_root; - int32_t *restrict genotypes = self->genotypes; - tsk_id_t root, sample_index; - - for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { - if (left_child[root] == TSK_NULL) { - sample_index = sample_index_map[root]; - if (sample_index != TSK_NULL) { - genotypes[sample_index] = TSK_MISSING_DATA; - num_missing++; - } - } - } - return num_missing; -} - -static tsk_id_t -tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length) -{ - tsk_id_t ret = -1; - tsk_size_t j; - - for (j = 0; j < self->num_alleles; j++) { - if (length == self->allele_lengths[j] - && tsk_memcmp(allele, self->alleles[j], length) == 0) { - ret = (tsk_id_t) j; - break; - } - } - return ret; -} - -int -tsk_variant_decode( - tsk_variant_t *self, tsk_id_t site_id, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t allele_index; - tsk_size_t j, num_missing; - int no_longer_missing; - tsk_mutation_t mutation; - bool impute_missing = !!(self->options & TSK_ISOLATED_NOT_MISSING); - bool by_traversal = self->alt_samples != NULL; - int (*update_genotypes)(tsk_variant_t *, tsk_id_t, tsk_id_t); - tsk_size_t (*mark_missing)(tsk_variant_t *); - - if (self->tree_sequence == NULL) { - ret = TSK_ERR_VARIANT_CANT_DECODE_COPY; - goto out; - } - - ret = tsk_treeseq_get_site(self->tree_sequence, site_id, &self->site); - if (ret != 0) { - goto out; - } - - ret = tsk_tree_seek(&self->tree, self->site.position, 0); - if (ret != 0) { - goto out; - } - - /* When we have no specified samples we need sample lists to be active - * on the tree, as indicated by the presence of left_sample */ - if (!by_traversal && self->tree.left_sample == NULL) { - ret = TSK_ERR_NO_SAMPLE_LISTS; - goto out; - } - - /* For now we use a traversal method to find genotypes when we have a - * specified set of samples, but we should provide the option to do it - * via tracked_samples in the tree also. There will be a tradeoff: if - * we only have a small number of samples, it's probably better to - * do it by traversal. For large sets of samples though, it may be - * better to use the sample list infrastructure. */ - - mark_missing = tsk_variant_mark_missing; - update_genotypes = tsk_variant_update_genotypes_sample_list; - if (by_traversal) { - update_genotypes = tsk_variant_update_genotypes_traversal; - } - - if (self->user_alleles) { - allele_index = tsk_variant_get_allele_index( - self, self->site.ancestral_state, self->site.ancestral_state_length); - if (allele_index == -1) { - ret = TSK_ERR_ALLELE_NOT_FOUND; - goto out; - } - } else { - /* Ancestral state is always allele 0 */ - self->alleles[0] = self->site.ancestral_state; - self->allele_lengths[0] = self->site.ancestral_state_length; - self->num_alleles = 1; - allele_index = 0; - } - - /* The algorithm for generating the allelic state of every sample works by - * examining each mutation in order, and setting the state for all the - * samples under the mutation's node. For complex sites where there is - * more than one mutation, we depend on the ordering of mutations being - * correct. Specifically, any mutation that is above another mutation in - * the tree must be visited first. This is enforced using the mutation.parent - * field, where we require that a mutation's parent must appear before it - * in the list of mutations. This guarantees the correctness of this algorithm. - */ - for (j = 0; j < self->num_samples; j++) { - self->genotypes[j] = (int32_t) allele_index; - } - - /* We mark missing data *before* updating the genotypes because - * mutations directly over samples should not be missing */ - num_missing = 0; - if (!impute_missing) { - num_missing = mark_missing(self); - } - for (j = 0; j < self->site.mutations_length; j++) { - mutation = self->site.mutations[j]; - /* Compute the allele index for this derived state value. */ - allele_index = tsk_variant_get_allele_index( - self, mutation.derived_state, mutation.derived_state_length); - if (allele_index == -1) { - if (self->user_alleles) { - ret = TSK_ERR_ALLELE_NOT_FOUND; - goto out; - } - if (self->num_alleles == self->max_alleles) { - ret = tsk_variant_expand_alleles(self); - if (ret != 0) { - goto out; - } - } - allele_index = (tsk_id_t) self->num_alleles; - self->alleles[allele_index] = mutation.derived_state; - self->allele_lengths[allele_index] = mutation.derived_state_length; - self->num_alleles++; - } - - no_longer_missing = update_genotypes(self, mutation.node, allele_index); - if (no_longer_missing < 0) { - ret = no_longer_missing; - goto out; - } - /* Update genotypes returns the number of missing values marked - * not-missing */ - num_missing -= (tsk_size_t) no_longer_missing; - } - self->has_missing_data = num_missing > 0; -out: - return ret; -} - -int -tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) -{ - int ret = 0; - tsk_size_t total_len, offset, j; - - /* Copy everything */ - tsk_memcpy(other, self, sizeof(*other)); - /* Tree sequence left as NULL and zero'd tree is a way of indicating this variant is - * fixed and cannot be further decoded. */ - other->tree_sequence = NULL; - tsk_memset(&other->tree, sizeof(other->tree), 0); - other->traversal_stack = NULL; - other->samples = NULL; - other->sample_index_map = NULL; - other->alt_samples = NULL; - other->alt_sample_index_map = NULL; - other->user_alleles_mem = NULL; - - total_len = 0; - for (j = 0; j < self->num_alleles; j++) { - total_len += self->allele_lengths[j]; - } - other->samples = tsk_malloc(other->num_samples * sizeof(*other->samples)); - other->genotypes = tsk_malloc(other->num_samples * sizeof(*other->genotypes)); - other->user_alleles_mem = tsk_malloc(total_len * sizeof(*other->user_alleles_mem)); - other->allele_lengths - = tsk_malloc(other->num_alleles * sizeof(*other->allele_lengths)); - other->alleles = tsk_malloc(other->num_alleles * sizeof(*other->alleles)); - if (other->samples == NULL || other->genotypes == NULL - || other->user_alleles_mem == NULL || other->allele_lengths == NULL - || other->alleles == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy( - other->samples, self->samples, other->num_samples * sizeof(*other->samples)); - tsk_memcpy(other->genotypes, self->genotypes, - other->num_samples * sizeof(*other->genotypes)); - tsk_memcpy(other->allele_lengths, self->allele_lengths, - other->num_alleles * sizeof(*other->allele_lengths)); - offset = 0; - for (j = 0; j < other->num_alleles; j++) { - tsk_memcpy(other->user_alleles_mem + offset, self->alleles[j], - other->allele_lengths[j] * sizeof(*other->user_alleles_mem)); - other->alleles[j] = other->user_alleles_mem + offset; - offset += other->allele_lengths[j]; - } - -out: - return ret; -} - -int -tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant) -{ - int ret = 0; - - if ((tsk_size_t) self->site_index < tsk_treeseq_get_num_sites(self->tree_sequence)) { - ret = tsk_variant_decode(&self->variant, self->site_index, 0); - if (ret != 0) { - goto out; - } - self->site_index++; - *variant = &self->variant; - ret = 1; - } -out: - return ret; -} diff --git a/subprojects/tskit/tskit/genotypes.h b/subprojects/tskit/tskit/genotypes.h deleted file mode 100644 index 8c3b769e5..000000000 --- a/subprojects/tskit/tskit/genotypes.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2022 Tskit Developers - * Copyright (c) 2016-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TSK_GENOTYPES_H -#define TSK_GENOTYPES_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -#define TSK_ISOLATED_NOT_MISSING (1 << 1) - -/** -@brief A variant at a specific site. - -@rst -Used to generate the genotypes for a given set of samples at a given -site. -@endrst -*/ -typedef struct { - /** @brief Unowned reference to the tree sequence of the variant */ - const tsk_treeseq_t *tree_sequence; - /** @brief The site this variant is currently decoded at*/ - tsk_site_t site; - tsk_tree_t tree; - /** @brief Array of allele strings that the genotypes of the variant refer to - * These are not NULL terminated - use `allele_lengths` for example:. - * `printf("%.*s", (int) var->allele_lengths[j], var->alleles[j]);` - */ - const char **alleles; - /** @brief Lengths of the allele strings */ - tsk_size_t *allele_lengths; - /** @brief Length of the allele array */ - tsk_size_t num_alleles; - tsk_size_t max_alleles; - /** @brief If True the genotypes of isolated nodes have been decoded to the "missing" - * genotype. If False they are set to the ancestral state (in the absence of - * mutations above them)*/ - bool has_missing_data; - /** @brief Array of genotypes for the current site */ - int32_t *genotypes; - /** @brief Number of samples */ - tsk_size_t num_samples; - /** @brief Array of sample ids used*/ - tsk_id_t *samples; - - const tsk_id_t *sample_index_map; - bool user_alleles; - char *user_alleles_mem; - tsk_id_t *traversal_stack; - tsk_flags_t options; - tsk_id_t *alt_samples; - tsk_id_t *alt_sample_index_map; - -} tsk_variant_t; - -/* All vargen related structs and methods were deprecated in C API v1.0 */ -typedef struct { - const tsk_treeseq_t *tree_sequence; - tsk_id_t site_index; - tsk_variant_t variant; -} tsk_vargen_t; - -/** -@defgroup VARIANT_API_GROUP Variant API for obtaining genotypes. -@{ -*/ - -/** -@brief Initialises the variant by allocating the internal memory - -@rst -This must be called before any operations are performed on the variant. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_variant_t object. -@param tree_sequence A pointer to the tree sequence from which this variant -will decode genotypes. No copy is taken, so this tree sequence must persist -for the lifetime of the variant. -@param samples Optional. Either `NULL` or an array of node ids of the samples that are to -have their genotypes decoded. A copy of this array will be taken by the variant. If -`NULL` then the samples from the tree sequence will be used. -@param num_samples The number of ids in the samples array, ignored if `samples` is `NULL` -@param alleles Optional. Either ``NULL`` or an array of string alleles with a terminal -``NULL`` sentinel value. -If specified, the genotypes will be decoded to match the index in this allele array. -If ``NULL`` then alleles will be automatically determined from the mutations encountered. -@param options Variant options. Either ``0`` or ``TSK_ISOLATED_NOT_MISSING`` which -if specified indicates that isolated sample nodes should not be decoded as the "missing" -state but as the ancestral state (or the state of any mutation above them). -@return Return 0 on success or a negative value on failure. -*/ -int tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, - const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, - tsk_flags_t options); - -/** -@brief Copies the state of this variant to another variant - -@rst -Copies the site, genotypes and alleles from this variant to another. Note that -the other variant should be uninitialised as this method does not free any -memory that the other variant owns. After copying `other` is frozen and -this restricts it from being further decoded at any site. `self` remains unchanged. -@endrst - -@param self A pointer to an initialised and decoded tsk_variant_t object. -@param other A pointer to an uninitialised tsk_variant_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other); - -/** -@brief Decode the genotypes at the given site, storing them in this variant. - -@rst -Decodes the genotypes for this variant's samples, indexed to this variant's alleles, -at the specified site. -This method is most efficient at decoding sites in-order, either forwards or backwards -along the tree sequence. Resulting genotypes are stored in the ``genotypes`` member of -this variant. -@endrst - -@param self A pointer to an initialised tsk_variant_t object. -@param site_id A valid site id for the tree sequence of this variant. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of `tskit`. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_variant_decode(tsk_variant_t *self, tsk_id_t site_id, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified variant. - -@param self A pointer to an initialised tsk_variant_t object. -@return Always returns 0. -*/ -int tsk_variant_free(tsk_variant_t *self); - -/** -@brief Print out the state of this variant to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_variant_t object. -@param out The stream to write the summary to. -*/ -void tsk_variant_print_state(const tsk_variant_t *self, FILE *out); - -/** @} */ - -/* Deprecated vargen methods (since C API v1.0) */ -int tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, - const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, - tsk_flags_t options); -int tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant); -int tsk_vargen_free(tsk_vargen_t *self); -void tsk_vargen_print_state(const tsk_vargen_t *self, FILE *out); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/subprojects/tskit/tskit/haplotype_matching.c b/subprojects/tskit/tskit/haplotype_matching.c deleted file mode 100644 index b942da18d..000000000 --- a/subprojects/tskit/tskit/haplotype_matching.c +++ /dev/null @@ -1,1621 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -#include - -#define MAX_PARSIMONY_WORDS 256 - -const char *_zero_one_alleles[] = { "0", "1", NULL }; -const char *_acgt_alleles[] = { "A", "C", "G", "T", NULL }; - -static int -cmp_double(const void *a, const void *b) -{ - const double *ia = (const double *) a; - const double *ib = (const double *) b; - return (*ia > *ib) - (*ia < *ib); -} - -static int -cmp_argsort(const void *a, const void *b) -{ - const tsk_argsort_t *ia = (const tsk_argsort_t *) a; - const tsk_argsort_t *ib = (const tsk_argsort_t *) b; - int ret = (ia->value > ib->value) - (ia->value < ib->value); - /* Break any ties using the index to ensure consistency */ - if (ret == 0) { - ret = (ia->index > ib->index) - (ia->index < ib->index); - } - return ret; -} - -static void -tsk_ls_hmm_check_state(tsk_ls_hmm_t *self) -{ - tsk_id_t *T_index = self->transition_index; - tsk_value_transition_t *T = self->transitions; - tsk_id_t j; - - for (j = 0; j < (tsk_id_t) self->num_transitions; j++) { - if (T[j].tree_node != TSK_NULL) { - tsk_bug_assert(T_index[T[j].tree_node] == j); - } - } - /* tsk_bug_assert(self->num_transitions <= self->num_samples); */ - - if (self->num_transitions > 0) { - for (j = 0; j < (tsk_id_t) self->num_nodes; j++) { - if (T_index[j] != TSK_NULL) { - tsk_bug_assert(T[T_index[j]].tree_node == j); - } - tsk_bug_assert(self->tree.parent[j] == self->parent[j]); - } - } -} - -void -tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out) -{ - tsk_size_t j, l; - - fprintf(out, "tree_sequence = %p\n", (void *) self->tree_sequence); - fprintf(out, "num_sites = %lld\n", (long long) self->num_sites); - fprintf(out, "num_samples = %lld\n", (long long) self->num_samples); - fprintf(out, "num_values = %lld\n", (long long) self->num_values); - fprintf(out, "max_values = %lld\n", (long long) self->max_values); - fprintf(out, "num_optimal_value_set_words = %lld\n", - (long long) self->num_optimal_value_set_words); - - fprintf(out, "sites::\n"); - for (l = 0; l < self->num_sites; l++) { - fprintf(out, "%lld\t%lld\t[", (long long) l, (long long) self->num_alleles[l]); - for (j = 0; j < self->num_alleles[l]; j++) { - fprintf(out, "%s,", self->alleles[l][j]); - } - fprintf(out, "]\n"); - } - fprintf(out, "transitions::%lld\n", (long long) self->num_transitions); - for (j = 0; j < self->num_transitions; j++) { - fprintf(out, "tree_node=%lld\tvalue=%.14f\tvalue_index=%lld\n", - (long long) self->transitions[j].tree_node, self->transitions[j].value, - (long long) self->transitions[j].value_index); - } - if (self->num_transitions > 0) { - fprintf(out, "tree::%lld\n", (long long) self->num_nodes); - for (j = 0; j < self->num_nodes; j++) { - fprintf(out, "%lld\tparent=%lld\ttransition=%lld\n", (long long) j, - (long long) self->parent[j], (long long) self->transition_index[j]); - } - } - tsk_ls_hmm_check_state(self); -} - -int TSK_WARN_UNUSED -tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, - double *recombination_rate, double *mutation_rate, tsk_flags_t options) -{ - int ret = TSK_ERR_GENERIC; - tsk_size_t l; - - tsk_memset(self, 0, sizeof(tsk_ls_hmm_t)); - self->tree_sequence = tree_sequence; - self->precision = 6; /* Seems like a safe value, but probably not ideal for perf */ - self->num_sites = tsk_treeseq_get_num_sites(tree_sequence); - self->num_samples = tsk_treeseq_get_num_samples(tree_sequence); - self->num_alleles = tsk_malloc(self->num_sites * sizeof(*self->num_alleles)); - self->num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); - self->parent = tsk_malloc(self->num_nodes * sizeof(*self->parent)); - self->allelic_state = tsk_malloc(self->num_nodes * sizeof(*self->allelic_state)); - self->transition_index - = tsk_malloc(self->num_nodes * sizeof(*self->transition_index)); - self->transition_stack - = tsk_malloc(self->num_nodes * sizeof(*self->transition_stack)); - /* We can't have more than 2 * num_samples transitions, so we use this as the - * upper bound. Because of the implementation, we'll also have to worry about - * the extra mutations at the first site, which in worst case involves all - * mutations. We can definitely save some memory here if we want to.*/ - self->max_transitions - = 2 * self->num_samples + tsk_treeseq_get_num_mutations(tree_sequence); - /* FIXME Arbitrarily doubling this after hitting problems */ - self->max_transitions *= 2; - self->transitions = tsk_malloc(self->max_transitions * sizeof(*self->transitions)); - self->transitions_copy - = tsk_malloc(self->max_transitions * sizeof(*self->transitions)); - self->num_transition_samples - = tsk_malloc(self->max_transitions * sizeof(*self->num_transition_samples)); - self->transition_parent - = tsk_malloc(self->max_transitions * sizeof(*self->transition_parent)); - self->transition_time_order - = tsk_malloc(self->max_transitions * sizeof(*self->transition_time_order)); - self->values = tsk_malloc(self->max_transitions * sizeof(*self->values)); - self->recombination_rate - = tsk_malloc(self->num_sites * sizeof(*self->recombination_rate)); - self->mutation_rate = tsk_malloc(self->num_sites * sizeof(*self->mutation_rate)); - self->alleles = tsk_calloc(self->num_sites, sizeof(*self->alleles)); - if (self->num_alleles == NULL || self->parent == NULL || self->allelic_state == NULL - || self->transition_index == NULL || self->transition_stack == NULL - || self->transitions == NULL || self->transitions_copy == NULL - || self->num_transition_samples == NULL || self->transition_parent == NULL - || self->transition_time_order == NULL || self->values == NULL - || self->recombination_rate == NULL || self->mutation_rate == NULL - || self->alleles == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (l = 0; l < self->num_sites; l++) { - /* TODO check these inputs */ - self->recombination_rate[l] = recombination_rate[l]; - self->mutation_rate[l] = mutation_rate[l]; - if (options & TSK_ALLELES_ACGT) { - self->num_alleles[l] = 4; - self->alleles[l] = _acgt_alleles; - } else { - /* Default to the 0/1 alleles */ - self->num_alleles[l] = 2; - self->alleles[l] = _zero_one_alleles; - } - } - ret = tsk_tree_init(&self->tree, self->tree_sequence, 0); - if (ret != 0) { - goto out; - } - self->num_values = 0; - self->max_values = 0; - /* Keep this as a struct variable so that we can test overflow, but this - * should never be set to more than MAX_PARSIMONY_WORDS as we're doing - * a bunch of stack allocations based on this. */ - self->max_parsimony_words = MAX_PARSIMONY_WORDS; - ret = 0; -out: - return ret; -} - -int -tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision) -{ - self->precision = precision; - return 0; -} - -int -tsk_ls_hmm_free(tsk_ls_hmm_t *self) -{ - tsk_tree_free(&self->tree); - tsk_diff_iter_free(&self->diffs); - tsk_safe_free(self->recombination_rate); - tsk_safe_free(self->mutation_rate); - tsk_safe_free(self->recombination_rate); - tsk_safe_free(self->alleles); - tsk_safe_free(self->num_alleles); - tsk_safe_free(self->parent); - tsk_safe_free(self->allelic_state); - tsk_safe_free(self->transition_index); - tsk_safe_free(self->transition_stack); - tsk_safe_free(self->transitions); - tsk_safe_free(self->transitions_copy); - tsk_safe_free(self->transition_time_order); - tsk_safe_free(self->values); - tsk_safe_free(self->num_transition_samples); - tsk_safe_free(self->transition_parent); - tsk_safe_free(self->optimal_value_sets); - return 0; -} - -static int -tsk_ls_hmm_reset(tsk_ls_hmm_t *self) -{ - int ret = 0; - double n = (double) self->num_samples; - tsk_size_t j; - tsk_id_t u; - const tsk_id_t *samples; - tsk_size_t N = self->num_nodes; - - tsk_memset(self->parent, 0xff, N * sizeof(*self->parent)); - tsk_memset(self->transition_index, 0xff, N * sizeof(*self->transition_index)); - tsk_memset(self->allelic_state, 0xff, N * sizeof(*self->allelic_state)); - tsk_memset(self->transitions, 0, self->max_transitions * sizeof(*self->transitions)); - tsk_memset(self->num_transition_samples, 0, - self->max_transitions * sizeof(*self->num_transition_samples)); - tsk_memset(self->transition_parent, 0xff, - self->max_transitions * sizeof(*self->transition_parent)); - - /* This is safe because we've already zero'd out the memory. */ - tsk_diff_iter_free(&self->diffs); - ret = tsk_diff_iter_init_from_ts(&self->diffs, self->tree_sequence, false); - if (ret != 0) { - goto out; - } - samples = tsk_treeseq_get_samples(self->tree_sequence); - for (j = 0; j < self->num_samples; j++) { - u = samples[j]; - self->transitions[j].tree_node = u; - self->transitions[j].value = 1.0 / n; - self->transition_index[u] = (tsk_id_t) j; - } - self->num_transitions = self->num_samples; -out: - return ret; -} - -/* After we have moved on to a new tree we can have transitions still associated - * with the old roots, which are now disconnected. Remove. */ -static int -tsk_ls_hmm_remove_dead_roots(tsk_ls_hmm_t *self) -{ - tsk_id_t *restrict T_index = self->transition_index; - tsk_value_transition_t *restrict T = self->transitions; - const tsk_id_t *restrict right_sib = self->tree.right_sib; - const tsk_id_t left_root = tsk_tree_get_left_root(&self->tree); - const tsk_id_t *restrict parent = self->parent; - tsk_id_t root, u; - tsk_size_t j; - const tsk_id_t root_marker = -2; - - for (root = left_root; root != TSK_NULL; root = right_sib[root]) { - if (T_index[root] != TSK_NULL) { - /* Use the value_index slot as a marker. We don't use this between - * iterations, so it's safe to appropriate here */ - T[T_index[root]].value_index = root_marker; - } - } - for (j = 0; j < self->num_transitions; j++) { - u = T[j].tree_node; - if (u != TSK_NULL) { - if (parent[u] == TSK_NULL && T[j].value_index != root_marker) { - T_index[u] = TSK_NULL; - T[j].tree_node = TSK_NULL; - } - T[j].value_index = -1; - } - } - return 0; -} - -static int -tsk_ls_hmm_update_tree(tsk_ls_hmm_t *self) -{ - int ret = 0; - tsk_id_t *restrict parent = self->parent; - tsk_id_t *restrict T_index = self->transition_index; - tsk_value_transition_t *restrict T = self->transitions; - tsk_edge_list_node_t *record; - tsk_edge_list_t records_out, records_in; - tsk_edge_t edge; - double left, right; - tsk_id_t u; - tsk_value_transition_t *vt; - - ret = tsk_diff_iter_next(&self->diffs, &left, &right, &records_out, &records_in); - if (ret < 0) { - goto out; - } - - for (record = records_out.head; record != NULL; record = record->next) { - u = record->edge.child; - if (T_index[u] == TSK_NULL) { - /* Ensure the subtree we're detaching has a transition at the root */ - while (T_index[u] == TSK_NULL) { - u = parent[u]; - tsk_bug_assert(u != TSK_NULL); - } - tsk_bug_assert(self->num_transitions < self->max_transitions); - T_index[record->edge.child] = (tsk_id_t) self->num_transitions; - T[self->num_transitions].tree_node = record->edge.child; - T[self->num_transitions].value = T[T_index[u]].value; - self->num_transitions++; - } - parent[record->edge.child] = TSK_NULL; - } - - for (record = records_in.head; record != NULL; record = record->next) { - edge = record->edge; - parent[edge.child] = edge.parent; - u = edge.parent; - if (parent[edge.parent] == TSK_NULL) { - /* Grafting onto a new root. */ - if (T_index[record->edge.parent] == TSK_NULL) { - T_index[edge.parent] = (tsk_id_t) self->num_transitions; - tsk_bug_assert(self->num_transitions < self->max_transitions); - T[self->num_transitions].tree_node = edge.parent; - T[self->num_transitions].value = T[T_index[edge.child]].value; - self->num_transitions++; - } - } else { - /* Grafting into an existing subtree. */ - while (T_index[u] == TSK_NULL) { - u = parent[u]; - } - tsk_bug_assert(u != TSK_NULL); - } - tsk_bug_assert(T_index[u] != -1 && T_index[edge.child] != -1); - if (T[T_index[u]].value == T[T_index[edge.child]].value) { - vt = &T[T_index[edge.child]]; - /* Mark the value transition as unusued */ - vt->value = -1; - vt->tree_node = TSK_NULL; - T_index[edge.child] = TSK_NULL; - } - } - - ret = tsk_ls_hmm_remove_dead_roots(self); -out: - return ret; -} - -static int -tsk_ls_hmm_get_allele_index(tsk_ls_hmm_t *self, tsk_id_t site, const char *allele_state, - const tsk_size_t allele_length) -{ - int ret = TSK_ERR_ALLELE_NOT_FOUND; - const char **alleles = self->alleles[site]; - const tsk_id_t num_alleles = (tsk_id_t) self->num_alleles[site]; - - tsk_id_t j; - - for (j = 0; j < num_alleles; j++) { - if (strlen(alleles[j]) != allele_length) { - break; - } - if (strncmp(alleles[j], allele_state, (size_t) allele_length) == 0) { - ret = (int) j; - break; - } - } - return ret; -} - -static int -tsk_ls_hmm_update_probabilities( - tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state) -{ - int ret = 0; - tsk_id_t root; - tsk_tree_t *tree = &self->tree; - tsk_id_t *restrict parent = self->parent; - tsk_id_t *restrict T_index = self->transition_index; - tsk_value_transition_t *restrict T = self->transitions; - int32_t *restrict allelic_state = self->allelic_state; - const tsk_id_t left_root = tsk_tree_get_left_root(tree); - tsk_mutation_t mut; - tsk_id_t j, u, v; - double x; - bool match; - - /* Set the allelic states */ - ret = tsk_ls_hmm_get_allele_index( - self, site->id, site->ancestral_state, site->ancestral_state_length); - if (ret < 0) { - goto out; - } - for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) { - allelic_state[root] = (int32_t) ret; - } - - for (j = 0; j < (tsk_id_t) site->mutations_length; j++) { - mut = site->mutations[j]; - ret = tsk_ls_hmm_get_allele_index( - self, site->id, mut.derived_state, mut.derived_state_length); - if (ret < 0) { - goto out; - } - u = mut.node; - allelic_state[u] = (int32_t) ret; - if (T_index[u] == TSK_NULL) { - while (T_index[u] == TSK_NULL) { - u = parent[u]; - } - tsk_bug_assert(self->num_transitions < self->max_transitions); - T_index[mut.node] = (tsk_id_t) self->num_transitions; - T[self->num_transitions].tree_node = mut.node; - T[self->num_transitions].value = T[T_index[u]].value; - self->num_transitions++; - } - } - - for (j = 0; j < (tsk_id_t) self->num_transitions; j++) { - u = T[j].tree_node; - if (u != TSK_NULL) { - /* Get the allelic_state at u. */ - v = u; - while (allelic_state[v] == TSK_MISSING_DATA) { - v = parent[v]; - tsk_bug_assert(v != -1); - } - match = haplotype_state == TSK_MISSING_DATA - || haplotype_state == allelic_state[v]; - ret = self->next_probability(self, site->id, T[j].value, match, u, &x); - if (ret != 0) { - goto out; - } - T[j].value = x; - } - } - - /* Unset the allelic states */ - for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) { - allelic_state[root] = TSK_MISSING_DATA; - } - for (j = 0; j < (tsk_id_t) site->mutations_length; j++) { - mut = site->mutations[j]; - allelic_state[mut.node] = TSK_MISSING_DATA; - } - ret = 0; -out: - return ret; -} - -static int -tsk_ls_hmm_discretise_values(tsk_ls_hmm_t *self) -{ - int ret = 0; - tsk_value_transition_t *T = self->transitions; - double *values = self->values; - tsk_size_t j, k, num_values; - - num_values = 0; - for (j = 0; j < self->num_transitions; j++) { - if (T[j].tree_node != TSK_NULL) { - values[num_values] = T[j].value; - num_values++; - } - } - tsk_bug_assert(num_values > 0); - - qsort(values, (size_t) num_values, sizeof(double), cmp_double); - - k = 0; - for (j = 1; j < num_values; j++) { - if (values[j] != values[k]) { - k++; - values[k] = values[j]; - } - } - num_values = k + 1; - self->num_values = num_values; - for (j = 0; j < self->num_transitions; j++) { - if (T[j].tree_node != TSK_NULL) { - T[j].value_index - = (tsk_id_t) tsk_search_sorted(values, num_values, T[j].value); - tsk_bug_assert(T[j].value == self->values[T[j].value_index]); - } - } - return ret; -} - -/* - * TODO We also have these function in tree.c where they're used in the - * parsimony calculations (which are slightly different). It would be good to bring - * these together, or at least avoid having the same function in two - * files. Keeping it as it is for now so that it can be inlined, since - * it's perf-sensitive. */ - -static inline tsk_id_t -get_smallest_set_bit(uint64_t v) -{ - /* This is an inefficient implementation, there are several better - * approaches. On GCC we can use - * return (uint8_t) (__builtin_ffsll((long long) v) - 1); - */ - uint64_t t = 1; - tsk_id_t r = 0; - assert(v != 0); - - while ((v & t) == 0) { - t <<= 1; - r++; - } - return r; -} - -static inline uint64_t -set_bit(uint64_t value, uint8_t bit) -{ - return value | (1ULL << bit); -} - -static inline bool -bit_is_set(uint64_t value, uint8_t bit) -{ - return (value & (1ULL << bit)) != 0; -} - -static inline tsk_id_t -get_smallest_element(const uint64_t *restrict A, tsk_size_t u, tsk_size_t num_words) -{ - tsk_size_t base = u * num_words; - const uint64_t *restrict a = A + base; - tsk_id_t j = 0; - - while (a[j] == 0) { - j++; - tsk_bug_assert(j < (tsk_id_t) num_words); - } - return j * 64 + get_smallest_set_bit(a[j]); -} - -/* static variables are zero-initialised by default. */ -static const uint64_t zero_block[MAX_PARSIMONY_WORDS]; - -static inline bool -all_zero(const uint64_t *restrict A, tsk_id_t u, tsk_size_t num_words) -{ - if (num_words == 1) { - return A[u] == 0; - } else { - return tsk_memcmp( - zero_block, A + (tsk_size_t) u * num_words, num_words * sizeof(*A)) - == 0; - } -} - -static inline bool -element_in( - const uint64_t *restrict A, tsk_id_t u, const tsk_id_t state, tsk_size_t num_words) -{ - tsk_size_t index = ((tsk_size_t) u) * num_words + (tsk_size_t)(state / 64); - return (A[index] & (1ULL << (state % 64))) != 0; -} - -static inline void -set_optimal_value( - uint64_t *restrict A, tsk_id_t u, const tsk_size_t num_words, tsk_id_t state) -{ - tsk_size_t index = ((tsk_size_t) u) * num_words + (tsk_size_t)(state / 64); - tsk_bug_assert(((tsk_size_t) state) / 64 < num_words); - A[index] |= 1ULL << (state % 64); -} - -/* TODO the implementation here isn't particularly optimal and the way things - * were organised was really driven by the old Fitch parsimony algorithm - * (which only worked on binary trees. In particular, we should be working - * word-by-word where possible rather than iterating by values like we do here. - * Needs to be reworked when we're documenting/writing up this algorithm. - */ - -static void -compute_optimal_value_1(uint64_t *restrict A, const tsk_id_t *restrict left_child, - const tsk_id_t *restrict right_sib, const tsk_id_t u, const tsk_id_t parent_state, - const tsk_size_t num_values) -{ - tsk_id_t v; - uint64_t child; - tsk_size_t value_count[64], max_value_count; - uint8_t j; - - assert(num_values < 64); - - tsk_memset(value_count, 0, num_values * sizeof(*value_count)); - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - child = A[v]; - /* If the set for a given child is empty, then we know it inherits - * directly from the parent state and must be a singleton set. */ - if (child == 0) { - child = 1ULL << parent_state; - } - for (j = 0; j < num_values; j++) { - value_count[j] += bit_is_set(child, j); - } - } - max_value_count = 0; - for (j = 0; j < num_values; j++) { - max_value_count = TSK_MAX(max_value_count, value_count[j]); - } - A[u] = 0; - for (j = 0; j < num_values; j++) { - if (value_count[j] == max_value_count) { - A[u] = set_bit(A[u], j); - } - } -} - -static void -compute_optimal_value_general(uint64_t *restrict A, const tsk_id_t *restrict left_child, - const tsk_id_t *restrict right_sib, const tsk_id_t u, const tsk_id_t parent_state, - const tsk_size_t num_values, const tsk_size_t num_words) -{ - tsk_id_t v; - uint64_t child[MAX_PARSIMONY_WORDS]; - uint64_t *Au; - tsk_size_t base, word, bit; - bool child_all_zero; - const tsk_id_t state_index = parent_state / 64; - const uint64_t state_word = 1ULL << (parent_state % 64); - tsk_size_t value_count[64 * MAX_PARSIMONY_WORDS], max_value_count; - tsk_size_t j; - - tsk_bug_assert(num_values < 64 * MAX_PARSIMONY_WORDS); - tsk_bug_assert(num_words <= MAX_PARSIMONY_WORDS); - for (j = 0; j < num_values; j++) { - value_count[j] = 0; - } - - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - child_all_zero = true; - base = ((tsk_size_t) v) * num_words; - for (word = 0; word < num_words; word++) { - child[word] = A[base + word]; - child_all_zero = child_all_zero && (child[word] == 0); - } - /* If the set for a given child is empty, then we know it inherits - * directly from the parent state and must be a singleton set. */ - if (child_all_zero) { - child[state_index] = state_word; - } - for (j = 0; j < num_values; j++) { - word = j / 64; - bit = j % 64; - assert(word < num_words); - value_count[j] += bit_is_set(child[word], (uint8_t) bit); - } - } - max_value_count = 0; - for (j = 0; j < num_values; j++) { - max_value_count = TSK_MAX(max_value_count, value_count[j]); - } - - Au = A + ((size_t) u * num_words); - for (word = 0; word < num_words; word++) { - Au[word] = 0; - } - for (j = 0; j < num_values; j++) { - if (value_count[j] == max_value_count) { - word = j / 64; - bit = j % 64; - Au[word] = set_bit(Au[word], (uint8_t) bit); - } - } -} - -static void -compute_optimal_value(uint64_t *restrict A, const tsk_id_t *restrict left_child, - const tsk_id_t *restrict right_sib, const tsk_id_t u, const tsk_id_t parent_state, - const tsk_size_t num_values, const tsk_size_t num_words) -{ - if (num_words == 1) { - compute_optimal_value_1(A, left_child, right_sib, u, parent_state, num_values); - } else { - compute_optimal_value_general( - A, left_child, right_sib, u, parent_state, num_values, num_words); - } -} - -static int -tsk_ls_hmm_setup_optimal_value_sets(tsk_ls_hmm_t *self) -{ - int ret = 0; - - /* We expect that most of the time there will be one word per optimal_value set, - * but there will be times when we need more than one word. This approach - * lets us expand the memory if we need to, but when the number of - * values goes back below 64 we revert to using one word per set. We - * could in principle release back the memory as well, but it doesn't seem - * worth the bother. */ - self->num_optimal_value_set_words = (self->num_values / 64) + 1; - if (self->num_optimal_value_set_words > self->max_parsimony_words) { - ret = TSK_ERR_TOO_MANY_VALUES; - goto out; - } - if (self->num_values >= self->max_values) { - self->max_values = self->num_optimal_value_set_words * 64; - tsk_safe_free(self->optimal_value_sets); - self->optimal_value_sets - = tsk_calloc(self->num_nodes * self->num_optimal_value_set_words, - sizeof(*self->optimal_value_sets)); - if (self->optimal_value_sets == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } -out: - return ret; -} - -static int -tsk_ls_hmm_build_optimal_value_sets(tsk_ls_hmm_t *self) -{ - int ret = 0; - const double *restrict node_time = self->tree_sequence->tables->nodes.time; - const tsk_id_t *restrict left_child = self->tree.left_child; - const tsk_id_t *restrict right_sib = self->tree.right_sib; - const tsk_id_t *restrict parent = self->parent; - const tsk_value_transition_t *restrict T = self->transitions; - const tsk_id_t *restrict T_index = self->transition_index; - tsk_argsort_t *restrict order = self->transition_time_order; - const tsk_size_t num_optimal_value_set_words = self->num_optimal_value_set_words; - uint64_t *restrict A = self->optimal_value_sets; - tsk_size_t j; - tsk_id_t u, v, state, parent_state; - - /* argsort the transitions by node time so we can visit them in the - * correct order */ - for (j = 0; j < self->num_transitions; j++) { - order[j].index = j; - order[j].value = DBL_MAX; - if (T[j].tree_node != TSK_NULL) { - order[j].value = node_time[T[j].tree_node]; - } - } - qsort(order, (size_t) self->num_transitions, sizeof(*order), cmp_argsort); - - for (j = 0; j < self->num_transitions; j++) { - u = T[order[j].index].tree_node; - if (u != TSK_NULL) { - state = T[order[j].index].value_index; - if (left_child[u] == TSK_NULL) { - /* leaf node */ - set_optimal_value(A, u, num_optimal_value_set_words, state); - } else { - compute_optimal_value(A, left_child, right_sib, u, state, - self->num_values, num_optimal_value_set_words); - } - v = parent[u]; - if (v != TSK_NULL) { - while (T_index[v] == TSK_NULL) { - v = parent[v]; - tsk_bug_assert(v != TSK_NULL); - } - parent_state = T[T_index[v]].value_index; - v = parent[u]; - while (T_index[v] == TSK_NULL) { - compute_optimal_value(A, left_child, right_sib, v, parent_state, - self->num_values, num_optimal_value_set_words); - v = parent[v]; - tsk_bug_assert(v != TSK_NULL); - } - } - } - } - return ret; -} - -static int -tsk_ls_hmm_redistribute_transitions(tsk_ls_hmm_t *self) -{ - int ret = 0; - const tsk_id_t *restrict left_child = self->tree.left_child; - const tsk_id_t *restrict right_sib = self->tree.right_sib; - const tsk_id_t *restrict parent = self->parent; - tsk_id_t *restrict T_index = self->transition_index; - tsk_id_t *restrict T_parent = self->transition_parent; - tsk_value_transition_t *restrict T = self->transitions; - tsk_value_transition_t *restrict T_old = self->transitions_copy; - tsk_transition_stack_t *stack = self->transition_stack; - uint64_t *restrict A = self->optimal_value_sets; - const tsk_size_t num_optimal_value_set_words = self->num_optimal_value_set_words; - tsk_transition_stack_t s, child_s; - tsk_id_t root, u, v; - int stack_top = 0; - tsk_size_t j, old_num_transitions; - - tsk_memcpy(T_old, T, self->num_transitions * sizeof(*T)); - old_num_transitions = self->num_transitions; - self->num_transitions = 0; - - /* TODO refactor this to push the virtual root onto the stack rather then - * iterating over the roots. See the existing parsimony implementations - * for an example. */ - for (root = tsk_tree_get_left_root(&self->tree); root != TSK_NULL; - root = right_sib[root]) { - stack[0].tree_node = root; - stack[0].old_state = T_old[T_index[root]].value_index; - stack[0].new_state - = get_smallest_element(A, (tsk_size_t) root, num_optimal_value_set_words); - stack[0].transition_parent = 0; - stack_top = 0; - - tsk_bug_assert(self->num_transitions < self->max_transitions); - T_parent[self->num_transitions] = TSK_NULL; - T[self->num_transitions].tree_node = stack[0].tree_node; - T[self->num_transitions].value_index = stack[0].new_state; - self->num_transitions++; - - while (stack_top >= 0) { - s = stack[stack_top]; - stack_top--; - for (v = left_child[s.tree_node]; v != TSK_NULL; v = right_sib[v]) { - child_s = s; - child_s.tree_node = v; - if (T_index[v] != TSK_NULL) { - child_s.old_state = T_old[T_index[v]].value_index; - } - if (!all_zero(A, v, num_optimal_value_set_words)) { - if (!element_in(A, v, s.new_state, num_optimal_value_set_words)) { - child_s.new_state = get_smallest_element( - A, (tsk_size_t) v, num_optimal_value_set_words); - child_s.transition_parent = (tsk_id_t) self->num_transitions; - /* Add a new transition */ - tsk_bug_assert(self->num_transitions < self->max_transitions); - T_parent[self->num_transitions] = s.transition_parent; - T[self->num_transitions].tree_node = v; - T[self->num_transitions].value_index = child_s.new_state; - self->num_transitions++; - } - stack_top++; - stack[stack_top] = child_s; - } else { - /* Node that we didn't visit when moving up the tree */ - if (s.old_state != s.new_state) { - tsk_bug_assert(self->num_transitions < self->max_transitions); - T_parent[self->num_transitions] = s.transition_parent; - T[self->num_transitions].tree_node = v; - T[self->num_transitions].value_index = s.old_state; - self->num_transitions++; - } - } - } - } - } - - /* Unset the old T_index pointers and optimal_value sets. */ - for (j = 0; j < old_num_transitions; j++) { - u = T_old[j].tree_node; - if (u != TSK_NULL) { - T_index[u] = TSK_NULL; - while (u != TSK_NULL && !all_zero(A, u, num_optimal_value_set_words)) { - tsk_memset(A + ((tsk_size_t) u) * num_optimal_value_set_words, 0, - num_optimal_value_set_words * sizeof(uint64_t)); - u = parent[u]; - } - } - } - /* Set the new pointers for transition nodes and the values.*/ - for (j = 0; j < self->num_transitions; j++) { - T_index[T[j].tree_node] = (tsk_id_t) j; - T[j].value = self->values[T[j].value_index]; - } - return ret; -} - -static int -tsk_ls_hmm_compress(tsk_ls_hmm_t *self) -{ - int ret = 0; - - ret = tsk_ls_hmm_discretise_values(self); - if (ret != 0) { - goto out; - } - ret = tsk_ls_hmm_setup_optimal_value_sets(self); - if (ret != 0) { - goto out; - } - ret = tsk_ls_hmm_build_optimal_value_sets(self); - if (ret != 0) { - goto out; - } - ret = tsk_ls_hmm_redistribute_transitions(self); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static int -tsk_ls_hmm_process_site( - tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state) -{ - int ret = 0; - double x, normalisation_factor; - tsk_compressed_matrix_t *output = (tsk_compressed_matrix_t *) self->output; - tsk_value_transition_t *restrict T = self->transitions; - const unsigned int precision = (unsigned int) self->precision; - tsk_size_t j; - - ret = tsk_ls_hmm_update_probabilities(self, site, haplotype_state); - if (ret != 0) { - goto out; - } - /* See notes in the Python implementation on why we don't want to compress - * here, but rather should be doing it after rounding. */ - ret = tsk_ls_hmm_compress(self); - if (ret != 0) { - goto out; - } - tsk_bug_assert(self->num_transitions <= self->num_samples); - normalisation_factor = self->compute_normalisation_factor(self); - - if (normalisation_factor == 0) { - ret = TSK_ERR_MATCH_IMPOSSIBLE; - goto out; - } - for (j = 0; j < self->num_transitions; j++) { - tsk_bug_assert(T[j].tree_node != TSK_NULL); - x = T[j].value / normalisation_factor; - T[j].value = tsk_round(x, precision); - } - - ret = tsk_compressed_matrix_store_site( - output, site->id, normalisation_factor, (tsk_size_t) self->num_transitions, T); -out: - return ret; -} - -int -tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, - int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), - double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output) -{ - int ret = 0; - int t_ret; - const tsk_site_t *sites; - tsk_size_t j, num_sites; - - self->next_probability = next_probability; - self->compute_normalisation_factor = compute_normalisation_factor; - self->output = output; - - ret = tsk_ls_hmm_reset(self); - if (ret != 0) { - goto out; - } - - for (t_ret = tsk_tree_first(&self->tree); t_ret == TSK_TREE_OK; - t_ret = tsk_tree_next(&self->tree)) { - ret = tsk_ls_hmm_update_tree(self); - if (ret != 0) { - goto out; - } - /* tsk_ls_hmm_check_state(self); */ - ret = tsk_tree_get_sites(&self->tree, &sites, &num_sites); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_sites; j++) { - ret = tsk_ls_hmm_process_site(self, &sites[j], haplotype[sites[j].id]); - if (ret != 0) { - goto out; - } - } - } - /* Set to zero so we can print and check the state OK. */ - self->num_transitions = 0; - if (t_ret != 0) { - ret = t_ret; - goto out; - } -out: - return ret; -} - -/**************************************************************** - * Forward Algorithm - ****************************************************************/ - -static double -tsk_ls_hmm_compute_normalisation_factor_forward(tsk_ls_hmm_t *self) -{ - tsk_size_t *restrict N = self->num_transition_samples; - tsk_value_transition_t *restrict T = self->transitions; - const tsk_id_t *restrict T_parent = self->transition_parent; - const tsk_size_t *restrict num_samples = self->tree.num_samples; - const tsk_id_t num_transitions = (tsk_id_t) self->num_transitions; - double normalisation_factor; - tsk_id_t j; - - /* Compute the number of samples directly inheriting from each transition */ - for (j = 0; j < num_transitions; j++) { - tsk_bug_assert(T[j].tree_node != TSK_NULL); - N[j] = num_samples[T[j].tree_node]; - } - for (j = 0; j < num_transitions; j++) { - if (T_parent[j] != TSK_NULL) { - N[T_parent[j]] -= N[j]; - } - } - - /* Compute the normalising constant used to avoid underflow */ - normalisation_factor = 0; - for (j = 0; j < num_transitions; j++) { - normalisation_factor += (double) N[j] * T[j].value; - } - return normalisation_factor; -} - -static int -tsk_ls_hmm_next_probability_forward(tsk_ls_hmm_t *self, tsk_id_t site_id, double p_last, - bool is_match, tsk_id_t TSK_UNUSED(node), double *result) -{ - const double rho = self->recombination_rate[site_id]; - const double mu = self->mutation_rate[site_id]; - const double n = (double) self->num_samples; - const double num_alleles = self->num_alleles[site_id]; - double p_t, p_e; - - p_t = p_last * (1 - rho) + rho / n; - p_e = mu; - if (is_match) { - p_e = 1 - (num_alleles - 1) * mu; - } - *result = p_t * p_e; - return 0; -} - -int -tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, - tsk_compressed_matrix_t *output, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_compressed_matrix_init(output, self->tree_sequence, 0, 0); - if (ret != 0) { - goto out; - } - } else { - if (output->tree_sequence != self->tree_sequence) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = tsk_compressed_matrix_clear(output); - if (ret != 0) { - goto out; - } - } - ret = tsk_ls_hmm_run(self, haplotype, tsk_ls_hmm_next_probability_forward, - tsk_ls_hmm_compute_normalisation_factor_forward, output); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -/**************************************************************** - * Viterbi Algorithm - ****************************************************************/ - -static double -tsk_ls_hmm_compute_normalisation_factor_viterbi(tsk_ls_hmm_t *self) -{ - tsk_value_transition_t *restrict T = self->transitions; - const tsk_id_t num_transitions = (tsk_id_t) self->num_transitions; - tsk_value_transition_t max_vt; - tsk_id_t j; - - max_vt.value = -1; - max_vt.tree_node = 0; /* keep compiler happy */ - tsk_bug_assert(num_transitions > 0); - for (j = 0; j < num_transitions; j++) { - tsk_bug_assert(T[j].tree_node != TSK_NULL); - if (T[j].value > max_vt.value) { - max_vt = T[j]; - } - } - return max_vt.value; -} - -static int -tsk_ls_hmm_next_probability_viterbi(tsk_ls_hmm_t *self, tsk_id_t site, double p_last, - bool is_match, tsk_id_t node, double *result) -{ - const double rho = self->recombination_rate[site]; - const double mu = self->mutation_rate[site]; - const double num_alleles = self->num_alleles[site]; - const double n = (double) self->num_samples; - double p_recomb, p_no_recomb, p_t, p_e; - bool recombination_required = false; - - p_no_recomb = p_last * (1 - rho + rho / n); - p_recomb = rho / n; - if (p_no_recomb > p_recomb) { - p_t = p_no_recomb; - } else { - p_t = p_recomb; - recombination_required = true; - } - p_e = mu; - if (is_match) { - p_e = 1 - (num_alleles - 1) * mu; - } - *result = p_t * p_e; - return tsk_viterbi_matrix_add_recombination_required( - self->output, site, node, recombination_required); -} - -int -tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t *output, - tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_viterbi_matrix_init(output, self->tree_sequence, 0, 0); - if (ret != 0) { - goto out; - } - } else { - if (output->matrix.tree_sequence != self->tree_sequence) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = tsk_viterbi_matrix_clear(output); - if (ret != 0) { - goto out; - } - } - ret = tsk_ls_hmm_run(self, haplotype, tsk_ls_hmm_next_probability_viterbi, - tsk_ls_hmm_compute_normalisation_factor_viterbi, output); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -/**************************************************************** - * Compressed matrix - ****************************************************************/ - -int -tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, tsk_treeseq_t *tree_sequence, - tsk_size_t block_size, tsk_flags_t options) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(*self)); - self->tree_sequence = tree_sequence; - self->options = options; - self->num_sites = tsk_treeseq_get_num_sites(tree_sequence); - self->num_samples = tsk_treeseq_get_num_samples(tree_sequence); - self->num_transitions = tsk_malloc(self->num_sites * sizeof(*self->num_transitions)); - self->normalisation_factor - = tsk_malloc(self->num_sites * sizeof(*self->normalisation_factor)); - self->values = tsk_malloc(self->num_sites * sizeof(*self->values)); - self->nodes = tsk_malloc(self->num_sites * sizeof(*self->nodes)); - if (self->num_transitions == NULL || self->values == NULL || self->nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (block_size == 0) { - block_size = 1 << 20; - } - ret = tsk_blkalloc_init(&self->memory, (size_t) block_size); - if (ret != 0) { - goto out; - } - ret = tsk_compressed_matrix_clear(self); -out: - return ret; -} - -int -tsk_compressed_matrix_free(tsk_compressed_matrix_t *self) -{ - tsk_blkalloc_free(&self->memory); - tsk_safe_free(self->num_transitions); - tsk_safe_free(self->normalisation_factor); - tsk_safe_free(self->values); - tsk_safe_free(self->nodes); - return 0; -} - -int -tsk_compressed_matrix_clear(tsk_compressed_matrix_t *self) -{ - tsk_blkalloc_reset(&self->memory); - tsk_memset( - self->num_transitions, 0, self->num_sites * sizeof(*self->num_transitions)); - tsk_memset(self->normalisation_factor, 0, - self->num_sites * sizeof(*self->normalisation_factor)); - return 0; -} - -void -tsk_compressed_matrix_print_state(tsk_compressed_matrix_t *self, FILE *out) -{ - tsk_size_t l, j; - - fprintf(out, "Compressed matrix for %p\n", (void *) self->tree_sequence); - fprintf(out, "num_sites = %lld\n", (long long) self->num_sites); - fprintf(out, "num_samples = %lld\n", (long long) self->num_samples); - for (l = 0; l < self->num_sites; l++) { - fprintf(out, "%lld\ts=%f\tv=%lld [", (long long) l, - self->normalisation_factor[l], (long long) self->num_transitions[l]); - for (j = 0; j < self->num_transitions[l]; j++) { - fprintf( - out, "(%lld, %f)", (long long) self->nodes[l][j], self->values[l][j]); - if (j < self->num_transitions[l] - 1) { - fprintf(out, ","); - } else { - fprintf(out, "]\n"); - } - } - } - fprintf(out, "Memory:\n"); - tsk_blkalloc_print_state(&self->memory, out); -} - -int -tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, - double normalisation_factor, tsk_size_t num_transitions, - const tsk_value_transition_t *transitions) -{ - int ret = 0; - tsk_size_t j; - - if (site < 0 || site >= (tsk_id_t) self->num_sites) { - ret = TSK_ERR_SITE_OUT_OF_BOUNDS; - goto out; - } - - self->num_transitions[site] = num_transitions; - self->normalisation_factor[site] = normalisation_factor; - self->nodes[site] - = tsk_blkalloc_get(&self->memory, (size_t) num_transitions * sizeof(tsk_id_t)); - self->values[site] - = tsk_blkalloc_get(&self->memory, (size_t) num_transitions * sizeof(double)); - if (self->nodes[site] == NULL || self->values[site] == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (j = 0; j < num_transitions; j++) { - self->values[site][j] = transitions[j].value; - self->nodes[site][j] = transitions[j].tree_node; - } -out: - return ret; -} - -static int -tsk_compressed_matrix_decode_site(tsk_compressed_matrix_t *self, const tsk_tree_t *tree, - const tsk_id_t site, double *values) -{ - int ret = 0; - const tsk_id_t *restrict list_left = tree->left_sample; - const tsk_id_t *restrict list_right = tree->right_sample; - const tsk_id_t *restrict list_next = tree->next_sample; - const tsk_id_t num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(self->tree_sequence); - tsk_size_t j; - tsk_id_t node, index, stop; - double value; - - for (j = 0; j < self->num_transitions[site]; j++) { - node = self->nodes[site][j]; - if (node < 0 || node >= num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - value = self->values[site][j]; - index = list_left[node]; - if (index == TSK_NULL) { - /* It's an error if there are nodes that don't subtend any samples */ - ret = TSK_ERR_BAD_COMPRESSED_MATRIX_NODE; - goto out; - } - stop = list_right[node]; - while (true) { - values[index] = value; - if (index == stop) { - break; - } - index = list_next[index]; - } - } -out: - return ret; -} - -int -tsk_compressed_matrix_decode(tsk_compressed_matrix_t *self, double *values) -{ - int ret = 0; - int t_ret; - tsk_tree_t tree; - tsk_size_t j, num_tree_sites; - const tsk_site_t *sites = NULL; - tsk_id_t site_id; - double *site_array; - - ret = tsk_tree_init(&tree, self->tree_sequence, TSK_SAMPLE_LISTS); - if (ret != 0) { - goto out; - } - - for (t_ret = tsk_tree_first(&tree); t_ret == TSK_TREE_OK; - t_ret = tsk_tree_next(&tree)) { - ret = tsk_tree_get_sites(&tree, &sites, &num_tree_sites); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_tree_sites; j++) { - site_id = sites[j].id; - site_array = values + ((tsk_size_t) site_id) * self->num_samples; - if (self->num_transitions[site_id] == 0) { - tsk_memset(site_array, 0, self->num_samples * sizeof(*site_array)); - } else { - ret = tsk_compressed_matrix_decode_site( - self, &tree, site_id, site_array); - if (ret != 0) { - goto out; - } - } - } - } - if (t_ret < 0) { - ret = t_ret; - goto out; - } -out: - tsk_tree_free(&tree); - return ret; -} - -/**************************************************************** - * Viterbi matrix - ****************************************************************/ - -static int -tsk_viterbi_matrix_expand_recomb_records(tsk_viterbi_matrix_t *self) -{ - int ret = 0; - tsk_recomb_required_record *tmp = tsk_realloc( - self->recombination_required, self->max_recomb_records * sizeof(*tmp)); - - if (tmp == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->recombination_required = tmp; -out: - return ret; -} - -int -tsk_viterbi_matrix_init(tsk_viterbi_matrix_t *self, tsk_treeseq_t *tree_sequence, - tsk_size_t block_size, tsk_flags_t options) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(*self)); - if (block_size == 0) { - block_size = 1 << 20; /* 1MiB */ - } - ret = tsk_compressed_matrix_init(&self->matrix, tree_sequence, block_size, options); - if (ret != 0) { - goto out; - } - - self->max_recomb_records - = TSK_MAX(1, block_size / sizeof(tsk_recomb_required_record)); - ret = tsk_viterbi_matrix_expand_recomb_records(self); - if (ret != 0) { - goto out; - } - /* Add the sentinel at the start to simplify traceback */ - self->recombination_required[0].site = -1; - - ret = tsk_viterbi_matrix_clear(self); -out: - return ret; -} - -int -tsk_viterbi_matrix_free(tsk_viterbi_matrix_t *self) -{ - tsk_compressed_matrix_free(&self->matrix); - tsk_safe_free(self->recombination_required); - return 0; -} - -int -tsk_viterbi_matrix_clear(tsk_viterbi_matrix_t *self) -{ - self->num_recomb_records = 1; - tsk_compressed_matrix_clear(&self->matrix); - return 0; -} - -void -tsk_viterbi_matrix_print_state(tsk_viterbi_matrix_t *self, FILE *out) -{ - tsk_id_t l, j; - - fprintf(out, "viterbi_matrix\n"); - fprintf(out, "num_recomb_records = %lld\n", (long long) self->num_recomb_records); - fprintf(out, "max_recomb_records = %lld\n", (long long) self->max_recomb_records); - - j = 1; - for (l = 0; l < (tsk_id_t) self->matrix.num_sites; l++) { - fprintf(out, "%lld\t[", (long long) l); - while (j < (tsk_id_t) self->num_recomb_records - && self->recombination_required[j].site == l) { - fprintf(out, "(%lld, %d) ", (long long) self->recombination_required[j].node, - self->recombination_required[j].required); - j++; - } - fprintf(out, "]\n"); - } - tsk_compressed_matrix_print_state(&self->matrix, out); -} - -TSK_WARN_UNUSED int -tsk_viterbi_matrix_add_recombination_required( - tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_id_t node, bool required) -{ - int ret = 0; - tsk_recomb_required_record *record; - - if (self->num_recomb_records == self->max_recomb_records) { - self->max_recomb_records *= 2; - ret = tsk_viterbi_matrix_expand_recomb_records(self); - if (ret != 0) { - goto out; - } - } - record = self->recombination_required + self->num_recomb_records; - record->site = site; - record->node = node; - record->required = required; - self->num_recomb_records++; -out: - return ret; -} - -static tsk_id_t -tsk_viterbi_matrix_choose_sample( - tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_tree_t *tree) -{ - tsk_id_t ret; - tsk_id_t u = TSK_NULL; - const tsk_flags_t *node_flags = self->matrix.tree_sequence->tables->nodes.flags; - const tsk_size_t num_transitions = self->matrix.num_transitions[site]; - const tsk_id_t *transition_nodes = self->matrix.nodes[site]; - const double *transition_values = self->matrix.values[site]; - double max_value = -1; - tsk_size_t j; - tsk_id_t v; - bool found; - - if (num_transitions == 0) { - ret = TSK_ERR_NULL_VITERBI_MATRIX; - goto out; - } - for (j = 0; j < num_transitions; j++) { - if (max_value < transition_values[j]) { - u = transition_nodes[j]; - max_value = transition_values[j]; - } - } - tsk_bug_assert(u != TSK_NULL); - - while (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) { - found = false; - for (v = tree->left_child[u]; v != TSK_NULL; v = tree->right_sib[v]) { - /* Choose the first child that is not in the list of transition nodes */ - for (j = 0; j < num_transitions; j++) { - if (transition_nodes[j] == v) { - break; - } - } - if (j == num_transitions) { - u = v; - found = true; - break; - } - } - /* TODO: should remove this once we're sure this is robust */ - tsk_bug_assert(found); - } - ret = u; -out: - return ret; -} - -int -tsk_viterbi_matrix_traceback( - tsk_viterbi_matrix_t *self, tsk_id_t *path, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_site_t site; - tsk_id_t u, site_id, current_node; - tsk_recomb_required_record *rr_record, *rr_record_tmp; - const tsk_id_t num_sites = (tsk_id_t) self->matrix.num_sites; - const tsk_id_t num_nodes - = (tsk_id_t) tsk_treeseq_get_num_nodes(self->matrix.tree_sequence); - tsk_tree_t tree; - tsk_id_t *recombination_tree - = tsk_malloc((size_t) num_nodes * sizeof(*recombination_tree)); - - ret = tsk_tree_init(&tree, self->matrix.tree_sequence, 0); - if (ret != 0) { - goto out; - } - if (recombination_tree == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - /* Initialise the path an recombination_tree to contain TSK_NULL */ - tsk_memset(path, 0xff, ((size_t) num_sites) * sizeof(*path)); - tsk_memset(recombination_tree, 0xff, ((size_t) num_nodes) * sizeof(*path)); - - current_node = TSK_NULL; - rr_record = &self->recombination_required[self->num_recomb_records - 1]; - ret = tsk_tree_last(&tree); - if (ret < 0) { - goto out; - } - - for (site_id = num_sites - 1; site_id >= 0; site_id--) { - ret = tsk_treeseq_get_site(self->matrix.tree_sequence, site_id, &site); - if (ret != 0) { - goto out; - } - while (tree.interval.left > site.position) { - ret = tsk_tree_prev(&tree); - if (ret < 0) { - goto out; - } - } - tsk_bug_assert(tree.interval.left <= site.position); - tsk_bug_assert(site.position < tree.interval.right); - - /* Fill in the recombination tree */ - rr_record_tmp = rr_record; - while (rr_record->site == site.id) { - recombination_tree[rr_record->node] = rr_record->required; - rr_record--; - } - if (current_node == TSK_NULL) { - current_node = tsk_viterbi_matrix_choose_sample(self, site.id, &tree); - if (current_node < 0) { - ret = (int) current_node; - goto out; - } - } - path[site.id] = current_node; - /* Now traverse up the tree from the current node. The - * first marked node tells us whether we need to recombine */ - u = current_node; - while (u != TSK_NULL && recombination_tree[u] == TSK_NULL) { - u = tree.parent[u]; - } - tsk_bug_assert(u != TSK_NULL); - if (recombination_tree[u] == 1) { - /* Switch at the next site */ - current_node = TSK_NULL; - } - - /* Reset in the recombination tree */ - rr_record = rr_record_tmp; - while (rr_record->site == site.id) { - recombination_tree[rr_record->node] = TSK_NULL; - rr_record--; - } - } - ret = 0; -out: - tsk_tree_free(&tree); - tsk_safe_free(recombination_tree); - return ret; -} diff --git a/subprojects/tskit/tskit/haplotype_matching.h b/subprojects/tskit/tskit/haplotype_matching.h deleted file mode 100644 index 46631fb08..000000000 --- a/subprojects/tskit/tskit/haplotype_matching.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2022 Tskit Developers - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TSK_HAPLOTYPE_MATCHING_H -#define TSK_HAPLOTYPE_MATCHING_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -/* Seems like we might use this somewhere else as well, so putting it into the middle - * of the flags space */ -#define TSK_ALLELES_ACGT (1 << 16) - -typedef struct { - tsk_id_t tree_node; - tsk_id_t value_index; - double value; -} tsk_value_transition_t; - -typedef struct { - tsk_size_t index; - double value; -} tsk_argsort_t; - -typedef struct { - tsk_id_t tree_node; - tsk_id_t old_state; - tsk_id_t new_state; - tsk_id_t transition_parent; -} tsk_transition_stack_t; - -typedef struct { - double normalisation_factor; - double *value; - tsk_id_t *node; - tsk_size_t num_values; -} tsk_site_probability_t; - -typedef struct { - tsk_treeseq_t *tree_sequence; - tsk_flags_t options; - tsk_size_t num_sites; - tsk_size_t num_samples; - double *normalisation_factor; - tsk_size_t *num_transitions; - double **values; - tsk_id_t **nodes; - tsk_blkalloc_t memory; -} tsk_compressed_matrix_t; - -typedef struct { - tsk_id_t site; - tsk_id_t node; - bool required; -} tsk_recomb_required_record; - -typedef struct { - tsk_compressed_matrix_t matrix; - tsk_recomb_required_record *recombination_required; - tsk_size_t num_recomb_records; - tsk_size_t max_recomb_records; -} tsk_viterbi_matrix_t; - -typedef struct _tsk_ls_hmm_t { - /* input */ - tsk_treeseq_t *tree_sequence; - double *recombination_rate; - double *mutation_rate; - const char ***alleles; - unsigned int precision; - uint32_t *num_alleles; - tsk_size_t num_samples; - tsk_size_t num_sites; - tsk_size_t num_nodes; - /* state */ - tsk_tree_t tree; - tsk_diff_iter_t diffs; - tsk_id_t *parent; - /* The probability value transitions on the tree */ - tsk_value_transition_t *transitions; - tsk_value_transition_t *transitions_copy; - /* Stack used when distributing transitions on the tree */ - tsk_transition_stack_t *transition_stack; - /* Map of node_id to index in the transitions list */ - tsk_id_t *transition_index; - /* Buffer used to argsort the transitions by node time */ - tsk_argsort_t *transition_time_order; - tsk_size_t num_transitions; - tsk_size_t max_transitions; - /* The distinct values in the transitions */ - double *values; - tsk_size_t num_values; - tsk_size_t max_values; - tsk_size_t max_parsimony_words; - /* Number of machine words per node optimal value set. */ - tsk_size_t num_optimal_value_set_words; - uint64_t *optimal_value_sets; - /* The parent transition; used during compression */ - tsk_id_t *transition_parent; - /* The number of samples directly subtended by a transition */ - tsk_size_t *num_transition_samples; - int32_t *allelic_state; - /* Algorithms set these values before they are run */ - int (*next_probability)( - struct _tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *); - double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *); - void *output; -} tsk_ls_hmm_t; - -int tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, - double *recombination_rate, double *mutation_rate, tsk_flags_t options); -int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision); -int tsk_ls_hmm_free(tsk_ls_hmm_t *self); -void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out); -int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, - tsk_compressed_matrix_t *output, tsk_flags_t options); -int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, - tsk_viterbi_matrix_t *output, tsk_flags_t options); -int tsk_ls_hmm_run(tsk_ls_hmm_t *self, int32_t *haplotype, - int (*next_probability)(tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *), - double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *), void *output); - -int tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, - tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options); -int tsk_compressed_matrix_free(tsk_compressed_matrix_t *self); -int tsk_compressed_matrix_clear(tsk_compressed_matrix_t *self); -void tsk_compressed_matrix_print_state(tsk_compressed_matrix_t *self, FILE *out); -int tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, - double normalisation_factor, tsk_size_t num_transitions, - const tsk_value_transition_t *transitions); -int tsk_compressed_matrix_decode(tsk_compressed_matrix_t *self, double *values); - -int tsk_viterbi_matrix_init(tsk_viterbi_matrix_t *self, tsk_treeseq_t *tree_sequence, - tsk_size_t block_size, tsk_flags_t options); -int tsk_viterbi_matrix_free(tsk_viterbi_matrix_t *self); -int tsk_viterbi_matrix_clear(tsk_viterbi_matrix_t *self); -void tsk_viterbi_matrix_print_state(tsk_viterbi_matrix_t *self, FILE *out); -int tsk_viterbi_matrix_add_recombination_required( - tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_id_t node, bool required); -int tsk_viterbi_matrix_traceback( - tsk_viterbi_matrix_t *self, tsk_id_t *path, tsk_flags_t options); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/subprojects/tskit/tskit/stats.c b/subprojects/tskit/tskit/stats.c deleted file mode 100644 index 1c1aeea68..000000000 --- a/subprojects/tskit/tskit/stats.c +++ /dev/null @@ -1,305 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2018-2022 Tskit Developers - * Copyright (c) 2016-2017 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include - -#include - -void -tsk_ld_calc_print_state(const tsk_ld_calc_t *self, FILE *out) -{ - fprintf(out, "tree = %p\n", (const void *) &self->tree); - fprintf(out, "max_sites = %d\n", (int) self->max_sites); - fprintf(out, "max_distance = %f\n", self->max_distance); -} - -int TSK_WARN_UNUSED -tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence) -{ - int ret = 0; - tsk_memset(self, 0, sizeof(*self)); - - ret = tsk_tree_init(&self->tree, tree_sequence, 0); - if (ret != 0) { - goto out; - } - self->tree_sequence = tree_sequence; - self->total_samples = tsk_treeseq_get_num_samples(self->tree_sequence); - - self->sample_buffer = tsk_malloc(self->total_samples * sizeof(*self->sample_buffer)); - if (self->sample_buffer == NULL) { - goto out; - } -out: - return ret; -} - -int -tsk_ld_calc_free(tsk_ld_calc_t *self) -{ - tsk_tree_free(&self->tree); - tsk_safe_free(self->sample_buffer); - return 0; -} - -static int -tsk_ld_calc_check_site(tsk_ld_calc_t *TSK_UNUSED(self), const tsk_site_t *site) -{ - int ret = 0; - - /* These are both limitations in the current implementation, there's no - * fundamental reason why we can't support them */ - if (site->mutations_length != 1) { - ret = TSK_ERR_ONLY_INFINITE_SITES; - goto out; - } - if (site->ancestral_state_length == site->mutations[0].derived_state_length - && tsk_memcmp(site->ancestral_state, site->mutations[0].derived_state, - site->ancestral_state_length) - == 0) { - ret = TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED; - goto out; - } -out: - return ret; -} - -static int -tsk_ld_calc_set_focal_samples(tsk_ld_calc_t *self) -{ - int ret = 0; - tsk_id_t focal_node = self->focal_site.mutations[0].node; - - ret = tsk_tree_track_descendant_samples(&self->tree, focal_node); - if (ret != 0) { - goto out; - } - self->focal_samples = self->tree.num_tracked_samples[focal_node]; -out: - return ret; -} - -static int -tsk_ld_calc_initialise(tsk_ld_calc_t *self, tsk_id_t a) -{ - int ret = 0; - - ret = tsk_treeseq_get_site(self->tree_sequence, a, &self->focal_site); - if (ret != 0) { - goto out; - } - ret = tsk_ld_calc_check_site(self, &self->focal_site); - if (ret != 0) { - goto out; - } - ret = tsk_tree_seek(&self->tree, self->focal_site.position, 0); - if (ret != 0) { - goto out; - } - ret = tsk_ld_calc_set_focal_samples(self); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static int -tsk_ld_calc_compute_r2(tsk_ld_calc_t *self, const tsk_site_t *target_site, double *r2) -{ - const double n = (double) self->total_samples; - double f_a, f_b, f_ab, D, denom; - tsk_id_t node; - int ret = tsk_ld_calc_check_site(self, target_site); - - if (ret != 0) { - goto out; - } - node = target_site->mutations[0].node; - f_a = ((double) self->focal_samples) / n; - f_b = ((double) self->tree.num_samples[node]) / n; - f_ab = ((double) self->tree.num_tracked_samples[node]) / n; - D = f_ab - f_a * f_b; - denom = f_a * f_b * (1 - f_a) * (1 - f_b); - *r2 = (D * D) / denom; -out: - return ret; -} - -static int -tsk_ld_calc_compute_and_append( - tsk_ld_calc_t *self, const tsk_site_t *target_site, bool *ret_done) -{ - int ret = 0; - double r2; - double distance = fabs(self->focal_site.position - target_site->position); - bool done = true; - - if (distance <= self->max_distance && self->result_length < self->max_sites) { - ret = tsk_ld_calc_compute_r2(self, target_site, &r2); - if (ret != 0) { - goto out; - } - self->result[self->result_length] = r2; - self->result_length++; - done = false; - } - *ret_done = done; -out: - return ret; -} - -static int -tsk_ld_calc_run_forward(tsk_ld_calc_t *self) -{ - int ret = 0; - tsk_size_t j; - bool done = false; - - for (j = 0; j < self->tree.sites_length; j++) { - if (self->tree.sites[j].id > self->focal_site.id) { - ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); - if (ret != 0) { - goto out; - } - if (done) { - break; - } - } - } - while (((ret = tsk_tree_next(&self->tree)) == TSK_TREE_OK) && !done) { - for (j = 0; j < self->tree.sites_length; j++) { - ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); - if (ret != 0) { - goto out; - } - if (done) { - break; - } - } - } - if (ret < 0) { - goto out; - } - ret = 0; -out: - return ret; -} - -static int -tsk_ld_calc_run_reverse(tsk_ld_calc_t *self) -{ - int ret = 0; - tsk_id_t j; - bool done = false; - - for (j = (tsk_id_t) self->tree.sites_length - 1; j >= 0; j--) { - if (self->tree.sites[j].id < self->focal_site.id) { - ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); - if (ret != 0) { - goto out; - } - if (done) { - break; - } - } - } - while (((ret = tsk_tree_prev(&self->tree)) == TSK_TREE_OK) && !done) { - for (j = (tsk_id_t) self->tree.sites_length - 1; j >= 0; j--) { - ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); - if (ret != 0) { - goto out; - } - if (done) { - break; - } - } - } - if (ret < 0) { - goto out; - } - ret = 0; -out: - return ret; -} - -int -tsk_ld_calc_get_r2(tsk_ld_calc_t *self, tsk_id_t a, tsk_id_t b, double *r2) -{ - int ret = 0; - tsk_site_t target_site; - - ret = tsk_ld_calc_initialise(self, a); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_get_site(self->tree_sequence, b, &target_site); - if (ret != 0) { - goto out; - } - ret = tsk_tree_seek(&self->tree, target_site.position, 0); - if (ret != 0) { - goto out; - } - ret = tsk_ld_calc_compute_r2(self, &target_site, r2); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int -tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, - tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values) -{ - int ret = tsk_ld_calc_initialise(self, a); - - if (ret != 0) { - goto out; - } - - self->max_sites = max_sites; - self->max_distance = max_distance; - self->result_length = 0; - self->result = r2; - - if (direction == TSK_DIR_FORWARD) { - ret = tsk_ld_calc_run_forward(self); - } else if (direction == TSK_DIR_REVERSE) { - ret = tsk_ld_calc_run_reverse(self); - } else { - ret = TSK_ERR_BAD_PARAM_VALUE; - } - if (ret != 0) { - goto out; - } - *num_r2_values = self->result_length; -out: - return ret; -} diff --git a/subprojects/tskit/tskit/stats.h b/subprojects/tskit/tskit/stats.h deleted file mode 100644 index 0632b6db1..000000000 --- a/subprojects/tskit/tskit/stats.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2021 Tskit Developers - * Copyright (c) 2016-2017 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef TSK_STATS_H -#define TSK_STATS_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -typedef struct { - const tsk_treeseq_t *tree_sequence; - tsk_site_t focal_site; - tsk_size_t total_samples; - tsk_size_t focal_samples; - double max_distance; - tsk_size_t max_sites; - tsk_tree_t tree; - tsk_id_t *sample_buffer; - double *result; - tsk_size_t result_length; -} tsk_ld_calc_t; - -int tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence); -int tsk_ld_calc_free(tsk_ld_calc_t *self); -void tsk_ld_calc_print_state(const tsk_ld_calc_t *self, FILE *out); -int tsk_ld_calc_get_r2(tsk_ld_calc_t *self, tsk_id_t a, tsk_id_t b, double *r2); -int tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, - tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c deleted file mode 100644 index 8eea85f5a..000000000 --- a/subprojects/tskit/tskit/tables.c +++ /dev/null @@ -1,13609 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * Copyright (c) 2017-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define TABLE_SEP "-----------------------------------------\n" - -#define TSK_COL_OPTIONAL (1 << 0) - -typedef struct { - const char *name; - void **array_dest; - int type; - tsk_flags_t options; -} read_table_col_t; - -typedef struct { - const char *name; - void **data_array_dest; - tsk_size_t *data_len_dest; - int data_type; - tsk_size_t **offset_array_dest; - tsk_flags_t options; -} read_table_ragged_col_t; - -typedef struct { - const char *name; - void **array_dest; - tsk_size_t *len_dest; - int type; - tsk_flags_t options; -} read_table_property_t; - -typedef struct { - const char *name; - const void *array; - tsk_size_t len; - int type; -} write_table_col_t; - -typedef struct { - const char *name; - const void *data_array; - tsk_size_t data_len; - int data_type; - const tsk_size_t *offset_array; - tsk_size_t num_rows; -} write_table_ragged_col_t; - -/* Returns true if adding the specified number of rows would result in overflow. - * Tables can support indexes from 0 to TSK_MAX_ID, and therefore could have at most - * TSK_MAX_ID + 1 rows. However we limit to TSK_MAX_ID rows so that counts of rows - * can fit in a tsk_id_t. */ -static bool -check_table_overflow(tsk_size_t current_size, tsk_size_t additional_rows) -{ - tsk_size_t max_val = TSK_MAX_ID; - return additional_rows > max_val || current_size > (max_val - additional_rows); -} - -/* Returns true if adding the specified number of elements would result in overflow - * of an offset column. - */ -static bool -check_offset_overflow(tsk_size_t current_size, tsk_size_t additional_elements) -{ - tsk_size_t max_val = TSK_MAX_SIZE; - return additional_elements > max_val - || current_size > (max_val - additional_elements); -} - -#define TSK_NUM_ROWS_UNSET ((tsk_size_t) -1) -#define TSK_MAX_COL_NAME_LEN 64 - -static int -read_table_cols(kastore_t *store, tsk_size_t *num_rows, read_table_col_t *cols, - tsk_flags_t TSK_UNUSED(flags)) -{ - int ret = 0; - size_t len; - int type; - read_table_col_t *col; - - for (col = cols; col->name != NULL; col++) { - ret = kastore_containss(store, col->name); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (ret == 1) { - ret = kastore_gets(store, col->name, col->array_dest, &len, &type); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (*num_rows == TSK_NUM_ROWS_UNSET) { - *num_rows = (tsk_size_t) len; - } else { - if (*num_rows != (tsk_size_t) len) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - } - if (type != col->type) { - ret = TSK_ERR_BAD_COLUMN_TYPE; - goto out; - } - } else if (!(col->options & TSK_COL_OPTIONAL)) { - ret = TSK_ERR_REQUIRED_COL_NOT_FOUND; - goto out; - } - } -out: - return ret; -} - -static int -cast_offset_array(read_table_ragged_col_t *col, uint32_t *source, tsk_size_t num_rows) -{ - int ret = 0; - tsk_size_t len = num_rows + 1; - tsk_size_t j; - uint64_t *dest = tsk_malloc(len * sizeof(*dest)); - - if (dest == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - *col->offset_array_dest = dest; - for (j = 0; j < len; j++) { - dest[j] = source[j]; - } -out: - return ret; -} - -static int -read_table_ragged_cols(kastore_t *store, tsk_size_t *num_rows, - read_table_ragged_col_t *cols, tsk_flags_t TSK_UNUSED(flags)) -{ - int ret = 0; - size_t data_len = 0; // initial value unused, just to keep the compiler happy. - size_t offset_len; - int type; - read_table_ragged_col_t *col; - char offset_col_name[TSK_MAX_COL_NAME_LEN]; - bool data_col_present, offset_col_present; - void *store_offset_array = NULL; - tsk_size_t *offset_array; - - for (col = cols; col->name != NULL; col++) { - ret = kastore_containss(store, col->name); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - data_col_present = false; - if (ret == 1) { - ret = kastore_gets(store, col->name, col->data_array_dest, &data_len, &type); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (type != col->data_type) { - ret = TSK_ERR_BAD_COLUMN_TYPE; - goto out; - } - *col->data_len_dest = (tsk_size_t) data_len; - data_col_present = true; - } else if (!(col->options & TSK_COL_OPTIONAL)) { - ret = TSK_ERR_REQUIRED_COL_NOT_FOUND; - goto out; - } - - assert(strlen(col->name) + strlen("_offset") + 2 < sizeof(offset_col_name)); - strcpy(offset_col_name, col->name); - strcat(offset_col_name, "_offset"); - - ret = kastore_containss(store, offset_col_name); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - offset_col_present = ret == 1; - if (offset_col_present != data_col_present) { - ret = TSK_ERR_BOTH_COLUMNS_REQUIRED; - goto out; - } - if (offset_col_present) { - ret = kastore_gets( - store, offset_col_name, &store_offset_array, &offset_len, &type); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - /* A table with zero rows will still have an offset length of 1; - * catching this here prevents underflows in the logic below */ - if (offset_len == 0) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - /* Some tables have only ragged columns */ - if (*num_rows == TSK_NUM_ROWS_UNSET) { - *num_rows = (tsk_size_t) offset_len - 1; - } else { - if (*num_rows != (tsk_size_t) offset_len - 1) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - } - if (type == KAS_UINT64) { - *col->offset_array_dest = (uint64_t *) store_offset_array; - store_offset_array = NULL; - } else if (type == KAS_UINT32) { - ret = cast_offset_array(col, (uint32_t *) store_offset_array, *num_rows); - if (ret != 0) { - goto out; - } - tsk_safe_free(store_offset_array); - store_offset_array = NULL; - } else { - ret = TSK_ERR_BAD_COLUMN_TYPE; - goto out; - } - offset_array = *col->offset_array_dest; - if (offset_array[*num_rows] != (tsk_size_t) data_len) { - ret = TSK_ERR_BAD_OFFSET; - goto out; - } - } - } -out: - tsk_safe_free(store_offset_array); - return ret; -} - -static int -read_table_properties( - kastore_t *store, read_table_property_t *properties, tsk_flags_t TSK_UNUSED(flags)) -{ - int ret = 0; - size_t len; - int type; - read_table_property_t *property; - - for (property = properties; property->name != NULL; property++) { - ret = kastore_containss(store, property->name); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (ret == 1) { - ret = kastore_gets(store, property->name, property->array_dest, &len, &type); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - assert(ret != 0); /* Tell static analysers that we're handling errors */ - goto out; - } - if (type != property->type) { - ret = TSK_ERR_BAD_COLUMN_TYPE; - goto out; - } - *property->len_dest = (tsk_size_t) len; - } - assert(property->options & TSK_COL_OPTIONAL); - } -out: - return ret; -} - -static int -read_table(kastore_t *store, tsk_size_t *num_rows, read_table_col_t *cols, - read_table_ragged_col_t *ragged_cols, read_table_property_t *properties, - tsk_flags_t options) -{ - int ret = 0; - - *num_rows = TSK_NUM_ROWS_UNSET; - if (cols != NULL) { - ret = read_table_cols(store, num_rows, cols, options); - if (ret != 0) { - goto out; - } - } - if (ragged_cols != NULL) { - ret = read_table_ragged_cols(store, num_rows, ragged_cols, options); - if (ret != 0) { - goto out; - } - } - if (*num_rows == TSK_NUM_ROWS_UNSET) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - if (properties != NULL) { - ret = read_table_properties(store, properties, options); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static void -free_read_table_mem(read_table_col_t *cols, read_table_ragged_col_t *ragged_cols, - read_table_property_t *properties) -{ - read_table_col_t *col; - read_table_ragged_col_t *ragged_col; - read_table_property_t *property; - - if (cols != NULL) { - for (col = cols; col->name != NULL; col++) { - tsk_safe_free(*(col->array_dest)); - } - } - if (ragged_cols != NULL) { - for (ragged_col = ragged_cols; ragged_col->name != NULL; ragged_col++) { - tsk_safe_free(*(ragged_col->data_array_dest)); - tsk_safe_free(*(ragged_col->offset_array_dest)); - } - } - if (properties != NULL) { - for (property = properties; property->name != NULL; property++) { - tsk_safe_free(*(property->array_dest)); - } - } -} - -static int -write_offset_col( - kastore_t *store, const write_table_ragged_col_t *col, tsk_flags_t options) -{ - int ret = 0; - char offset_col_name[TSK_MAX_COL_NAME_LEN]; - uint32_t *offset32 = NULL; - tsk_size_t len = col->num_rows + 1; - tsk_size_t j; - int32_t put_flags = 0; - int type; - const void *data; - bool needs_64 = col->offset_array[col->num_rows] > UINT32_MAX; - - assert(strlen(col->name) + strlen("_offset") + 2 < sizeof(offset_col_name)); - strcpy(offset_col_name, col->name); - strcat(offset_col_name, "_offset"); - - if (options & TSK_DUMP_FORCE_OFFSET_64 || needs_64) { - type = KAS_UINT64; - data = col->offset_array; - put_flags = KAS_BORROWS_ARRAY; - } else { - offset32 = tsk_malloc(len * sizeof(*offset32)); - if (offset32 == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (j = 0; j < len; j++) { - offset32[j] = (uint32_t) col->offset_array[j]; - } - type = KAS_UINT32; - data = offset32; - /* We've just allocated a temp buffer, so kas can't borrow so leave put_flags=0*/ - } - ret = kastore_puts(store, offset_col_name, data, (size_t) len, type, put_flags); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } -out: - tsk_safe_free(offset32); - return ret; -} - -static int -write_table_ragged_cols( - kastore_t *store, const write_table_ragged_col_t *write_cols, tsk_flags_t options) -{ - int ret = 0; - const write_table_ragged_col_t *col; - - for (col = write_cols; col->name != NULL; col++) { - ret = kastore_puts(store, col->name, col->data_array, (size_t) col->data_len, - col->data_type, KAS_BORROWS_ARRAY); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - ret = write_offset_col(store, col, options); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int -write_table_cols(kastore_t *store, const write_table_col_t *write_cols, - tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - const write_table_col_t *col; - - for (col = write_cols; col->name != NULL; col++) { - ret = kastore_puts(store, col->name, col->array, (size_t) col->len, col->type, - KAS_BORROWS_ARRAY); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - } -out: - return ret; -} - -static int -write_table(kastore_t *store, const write_table_col_t *cols, - const write_table_ragged_col_t *ragged_cols, tsk_flags_t options) -{ - int ret = write_table_cols(store, cols, options); - - if (ret != 0) { - goto out; - } - ret = write_table_ragged_cols(store, ragged_cols, options); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -/* Checks that the specified list of offsets is well-formed. */ -static int -check_offsets( - tsk_size_t num_rows, const tsk_size_t *offsets, tsk_size_t length, bool check_length) -{ - int ret = TSK_ERR_BAD_OFFSET; - tsk_size_t j; - - if (offsets[0] != 0) { - goto out; - } - if (check_length && offsets[num_rows] != length) { - goto out; - } - for (j = 0; j < num_rows; j++) { - if (offsets[j] > offsets[j + 1]) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -static int -calculate_max_rows(tsk_size_t num_rows, tsk_size_t max_rows, - tsk_size_t max_rows_increment, tsk_size_t additional_rows, - tsk_size_t *ret_new_max_rows) -{ - tsk_size_t new_max_rows; - int ret = 0; - - if (check_table_overflow(num_rows, additional_rows)) { - ret = TSK_ERR_TABLE_OVERFLOW; - goto out; - } - - if (num_rows + additional_rows <= max_rows) { - new_max_rows = max_rows; - } else { - if (max_rows_increment == 0) { - /* Doubling by default */ - new_max_rows = TSK_MIN(max_rows * 2, TSK_MAX_ID + (tsk_size_t) 1); - /* Add some constraints to prevent very small allocations */ - if (new_max_rows < 1024) { - new_max_rows = 1024; - } - /* Prevent allocating more than ~2 million additional rows unless needed*/ - if (new_max_rows - max_rows > 2097152) { - new_max_rows = max_rows + 2097152; - } - } else { - /* Use user increment value */ - if (check_table_overflow(max_rows, max_rows_increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; - goto out; - } - new_max_rows = max_rows + max_rows_increment; - } - new_max_rows = TSK_MAX(new_max_rows, num_rows + additional_rows); - } - *ret_new_max_rows = new_max_rows; -out: - return ret; -} - -static int -calculate_max_length(tsk_size_t current_length, tsk_size_t max_length, - tsk_size_t max_length_increment, tsk_size_t additional_length, - tsk_size_t *ret_new_max_length) -{ - tsk_size_t new_max_length; - int ret = 0; - - if (check_offset_overflow(current_length, additional_length)) { - ret = TSK_ERR_COLUMN_OVERFLOW; - goto out; - } - - if (current_length + additional_length <= max_length) { - new_max_length = max_length; - } else { - if (max_length_increment == 0) { - /* Doubling by default */ - new_max_length = TSK_MIN(max_length * 2, TSK_MAX_SIZE); - /* Add some constraints to prevent very small allocations */ - if (new_max_length < 65536) { - new_max_length = 65536; - } - /* Prevent allocating more than 100MB additional unless needed*/ - if (new_max_length - max_length > 104857600) { - new_max_length = max_length + 104857600; - } - new_max_length = TSK_MAX(new_max_length, current_length + additional_length); - } else { - /* Use user increment value */ - if (check_offset_overflow(max_length, max_length_increment)) { - /* Here we could allocate to the maximum size. - * Instead we are erroring out as this is much easier to test. - * The cost is that (at most) the last "max_length_increment"-1 - * bytes of the possible array space can't be used. */ - ret = TSK_ERR_COLUMN_OVERFLOW; - goto out; - } - new_max_length = max_length + max_length_increment; - } - new_max_length = TSK_MAX(new_max_length, current_length + additional_length); - } - *ret_new_max_length = new_max_length; -out: - return ret; -} - -static int -expand_column(void **column, tsk_size_t new_max_rows, size_t element_size) -{ - int ret = 0; - void *tmp; - - tmp = tsk_realloc((void **) *column, new_max_rows * element_size); - if (tmp == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - *column = tmp; -out: - return ret; -} - -static int -expand_ragged_column(tsk_size_t current_length, tsk_size_t additional_length, - tsk_size_t max_length_increment, tsk_size_t *max_length, void **column, - size_t element_size) -{ - int ret = 0; - tsk_size_t new_max_length; - - ret = calculate_max_length(current_length, *max_length, max_length_increment, - additional_length, &new_max_length); - if (ret != 0) { - goto out; - } - - if (new_max_length > *max_length) { - ret = expand_column(column, new_max_length, element_size); - if (ret != 0) { - goto out; - } - *max_length = new_max_length; - } -out: - return ret; -} - -/* TODO rename to copy_string or replace_and_copy_string */ -static int -replace_string( - char **str, tsk_size_t *len, const char *new_str, const tsk_size_t new_len) -{ - int ret = 0; - tsk_safe_free(*str); - *str = NULL; - *len = new_len; - if (new_len > 0) { - *str = tsk_malloc(new_len * sizeof(char)); - if (*str == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(*str, new_str, new_len * sizeof(char)); - } -out: - return ret; -} - -static int -takeset_string(char **str, tsk_size_t *len, char *new_str, const tsk_size_t new_len) -{ - tsk_safe_free(*str); - *str = new_str; - *len = new_len; - return 0; -} - -static int -alloc_empty_ragged_column(tsk_size_t num_rows, void **data_col, tsk_size_t **offset_col) -{ - int ret = 0; - - *data_col = tsk_malloc(1); - *offset_col = tsk_calloc(num_rows + 1, sizeof(tsk_size_t)); - if (*data_col == NULL || *offset_col == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -static int -check_ragged_column(tsk_size_t num_rows, void *data, tsk_size_t *offset) -{ - int ret = 0; - if ((data == NULL) != (offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if (data != NULL) { - ret = check_offsets(num_rows, offset, 0, false); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int -takeset_ragged_column(tsk_size_t num_rows, void *data, tsk_size_t *offset, - void **data_dest, tsk_size_t **offset_dest, tsk_size_t *length_dest) -{ - int ret = 0; - if (data == NULL) { - ret = alloc_empty_ragged_column(num_rows, (void *) data_dest, offset_dest); - if (ret != 0) { - goto out; - } - } else { - *data_dest = data; - *offset_dest = offset; - } - *length_dest = (*offset_dest)[num_rows]; -out: - return ret; -} - -static int -takeset_optional_id_column(tsk_size_t num_rows, tsk_id_t *input, tsk_id_t **dest) -{ - int ret = 0; - tsk_size_t buffsize; - tsk_id_t *buff; - - if (input == NULL) { - buffsize = num_rows * sizeof(*buff); - buff = tsk_malloc(buffsize); - if (buff == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - *dest = buff; - tsk_memset(buff, 0xff, buffsize); - } else { - *dest = input; - } -out: - return ret; -} - -static int -write_metadata_schema_header( - FILE *out, const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - const char *fmt = "#metadata_schema#\n" - "%.*s\n" - "#end#metadata_schema\n" TABLE_SEP; - return fprintf(out, fmt, (int) metadata_schema_length, metadata_schema); -} - -/* Utilities for in-place subsetting columns */ - -static tsk_size_t -count_true(tsk_size_t num_rows, const tsk_bool_t *restrict keep) -{ - tsk_size_t j; - tsk_size_t count = 0; - - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - count++; - } - } - return count; -} - -static void -keep_mask_to_id_map( - tsk_size_t num_rows, const tsk_bool_t *restrict keep, tsk_id_t *restrict id_map) -{ - tsk_size_t j; - tsk_id_t next_id = 0; - - for (j = 0; j < num_rows; j++) { - id_map[j] = TSK_NULL; - if (keep[j]) { - id_map[j] = next_id; - next_id++; - } - } -} - -static tsk_size_t -subset_remap_id_column(tsk_id_t *restrict column, tsk_size_t num_rows, - const tsk_bool_t *restrict keep, const tsk_id_t *restrict id_map) -{ - tsk_size_t j, k; - tsk_id_t value; - - k = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - value = column[j]; - if (value != TSK_NULL) { - value = id_map[value]; - } - column[k] = value; - k++; - } - } - return k; -} - -/* Trigger warning: C++ programmers should look away... This may be one of the - * few cases where some macro funkiness is warranted, as these are exact - * duplicates of the same function with just the type of the column - * parameter changed. */ - -static tsk_size_t -subset_id_column( - tsk_id_t *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) -{ - tsk_size_t j, k; - - k = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - column[k] = column[j]; - k++; - } - } - return k; -} - -static tsk_size_t -subset_flags_column( - tsk_flags_t *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) -{ - tsk_size_t j, k; - - k = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - column[k] = column[j]; - k++; - } - } - return k; -} - -static tsk_size_t -subset_double_column( - double *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) -{ - tsk_size_t j, k; - - k = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - column[k] = column[j]; - k++; - } - } - return k; -} - -static tsk_size_t -subset_ragged_char_column(char *restrict data, tsk_size_t *restrict offset_col, - tsk_size_t num_rows, const tsk_bool_t *restrict keep) -{ - tsk_size_t j, k, i, offset; - - k = 0; - offset = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - offset_col[k] = offset; - /* Note: Unclear whether it's worth calling memcpy instead here? - * Need to be careful since the regions are overlapping */ - for (i = offset_col[j]; i < offset_col[j + 1]; i++) { - data[offset] = data[i]; - offset++; - } - k++; - } - } - offset_col[k] = offset; - return offset; -} - -static tsk_size_t -subset_ragged_double_column(double *restrict data, tsk_size_t *restrict offset_col, - tsk_size_t num_rows, const tsk_bool_t *restrict keep) -{ - tsk_size_t j, k, i, offset; - - k = 0; - offset = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - offset_col[k] = offset; - /* Note: Unclear whether it's worth calling memcpy instead here? - * Need to be careful since the regions are overlapping */ - for (i = offset_col[j]; i < offset_col[j + 1]; i++) { - data[offset] = data[i]; - offset++; - } - k++; - } - } - offset_col[k] = offset; - return offset; -} - -static tsk_size_t -subset_remap_ragged_id_column(tsk_id_t *restrict data, tsk_size_t *restrict offset_col, - tsk_size_t num_rows, const tsk_bool_t *restrict keep, - const tsk_id_t *restrict id_map) -{ - tsk_size_t j, k, i, offset; - tsk_id_t di; - - k = 0; - offset = 0; - for (j = 0; j < num_rows; j++) { - if (keep[j]) { - offset_col[k] = offset; - for (i = offset_col[j]; i < offset_col[j + 1]; i++) { - di = data[i]; - if (di != TSK_NULL) { - di = id_map[di]; - } - data[offset] = di; - offset++; - } - k++; - } - } - offset_col[k] = offset; - return offset; -} - -/************************* - * reference sequence - *************************/ - -int -tsk_reference_sequence_init( - tsk_reference_sequence_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - tsk_memset(self, 0, sizeof(*self)); - return 0; -} - -int -tsk_reference_sequence_free(tsk_reference_sequence_t *self) -{ - tsk_safe_free(self->data); - tsk_safe_free(self->url); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_schema); - return 0; -} - -bool -tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self) -{ - return self->data_length == 0 && self->url_length == 0 && self->metadata_length == 0 - && self->metadata_schema_length == 0; -} - -bool -tsk_reference_sequence_equals(const tsk_reference_sequence_t *self, - const tsk_reference_sequence_t *other, tsk_flags_t options) -{ - int ret - = self->data_length == other->data_length - && self->url_length == other->url_length - && tsk_memcmp(self->data, other->data, self->data_length * sizeof(char)) == 0 - && tsk_memcmp(self->url, other->url, self->url_length * sizeof(char)) == 0; - - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_reference_sequence_copy(const tsk_reference_sequence_t *self, - tsk_reference_sequence_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_reference_sequence_init(dest, 0); - if (ret != 0) { - goto out; - } - } - - if (tsk_reference_sequence_is_null(self)) { - /* This is a simple way to get any input into the NULL state */ - tsk_reference_sequence_free(dest); - } else { - ret = tsk_reference_sequence_set_data(dest, self->data, self->data_length); - if (ret != 0) { - goto out; - } - ret = tsk_reference_sequence_set_url(dest, self->url, self->url_length); - if (ret != 0) { - goto out; - } - ret = tsk_reference_sequence_set_metadata( - dest, self->metadata, self->metadata_length); - if (ret != 0) { - goto out; - } - ret = tsk_reference_sequence_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_reference_sequence_set_data( - tsk_reference_sequence_t *self, const char *data, tsk_size_t data_length) -{ - return replace_string(&self->data, &self->data_length, data, data_length); -} - -int -tsk_reference_sequence_set_url( - tsk_reference_sequence_t *self, const char *url, tsk_size_t url_length) -{ - return replace_string(&self->url, &self->url_length, url, url_length); -} - -int -tsk_reference_sequence_set_metadata( - tsk_reference_sequence_t *self, const char *metadata, tsk_size_t metadata_length) -{ - return replace_string( - &self->metadata, &self->metadata_length, metadata, metadata_length); -} - -int -tsk_reference_sequence_set_metadata_schema(tsk_reference_sequence_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_reference_sequence_takeset_data( - tsk_reference_sequence_t *self, char *data, tsk_size_t data_length) -{ - return takeset_string(&self->data, &self->data_length, data, data_length); -} - -int -tsk_reference_sequence_takeset_metadata( - tsk_reference_sequence_t *self, char *metadata, tsk_size_t metadata_length) -{ - return takeset_string( - &self->metadata, &self->metadata_length, metadata, metadata_length); -} - -/************************* - * individual table - *************************/ - -static void -tsk_individual_table_free_columns(tsk_individual_table_t *self) -{ - tsk_safe_free(self->flags); - tsk_safe_free(self->location); - tsk_safe_free(self->location_offset); - tsk_safe_free(self->parents); - tsk_safe_free(self->parents_offset); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_individual_table_free(tsk_individual_table_t *self) -{ - tsk_individual_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_individual_table_expand_main_columns( - tsk_individual_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->flags, new_max_rows, sizeof(tsk_flags_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->location_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->parents_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_individual_table_expand_location( - tsk_individual_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->location_length, additional_length, - self->max_location_length_increment, &self->max_location_length, - (void **) &self->location, sizeof(*self->location)); -} - -static int -tsk_individual_table_expand_parents( - tsk_individual_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->parents_length, additional_length, - self->max_parents_length_increment, &self->max_parents_length, - (void **) &self->parents, sizeof(*self->parents)); -} - -static int -tsk_individual_table_expand_metadata( - tsk_individual_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_individual_table_set_max_rows_increment( - tsk_individual_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_individual_table_set_max_metadata_length_increment( - tsk_individual_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = (tsk_size_t) max_metadata_length_increment; - return 0; -} - -int -tsk_individual_table_set_max_location_length_increment( - tsk_individual_table_t *self, tsk_size_t max_location_length_increment) -{ - self->max_location_length_increment = (tsk_size_t) max_location_length_increment; - return 0; -} - -int -tsk_individual_table_set_max_parents_length_increment( - tsk_individual_table_t *self, tsk_size_t max_parents_length_increment) -{ - self->max_parents_length_increment = (tsk_size_t) max_parents_length_increment; - return 0; -} - -int -tsk_individual_table_init(tsk_individual_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_individual_table_t)); - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_location_length_increment = 1; - self->max_parents_length_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_individual_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_expand_location(self, 1); - if (ret != 0) { - goto out; - } - self->location_offset[0] = 0; - ret = tsk_individual_table_expand_parents(self, 1); - if (ret != 0) { - goto out; - } - self->parents_offset[0] = 0; - ret = tsk_individual_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->metadata_offset[0] = 0; - self->max_rows_increment = 0; - self->max_location_length_increment = 0; - self->max_parents_length_increment = 0; - self->max_metadata_length_increment = 0; - tsk_individual_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_individual_table_copy(const tsk_individual_table_t *self, - tsk_individual_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_individual_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_individual_table_set_columns(dest, self->num_rows, self->flags, - self->location, self->location_offset, self->parents, self->parents_offset, - self->metadata, self->metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_individual_table_set_columns(tsk_individual_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset, - const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata, - const tsk_size_t *metadata_offset) -{ - int ret; - - ret = tsk_individual_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_append_columns(self, num_rows, flags, location, - location_offset, parents, parents_offset, metadata, metadata_offset); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_individual_table_takeset_columns(tsk_individual_table_t *self, tsk_size_t num_rows, - tsk_flags_t *flags, double *location, tsk_size_t *location_offset, tsk_id_t *parents, - tsk_size_t *parents_offset, char *metadata, tsk_size_t *metadata_offset) -{ - int ret = 0; - - /* We need to check all the inputs before we start freeing or taking memory */ - ret = check_ragged_column(num_rows, location, location_offset); - if (ret != 0) { - goto out; - } - ret = check_ragged_column(num_rows, parents, parents_offset); - if (ret != 0) { - goto out; - } - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_individual_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - - if (flags == NULL) { - /* Flags defaults to all zeros if not specified. The column is often - * unused so this is a worthwhile optimisation. */ - self->flags = tsk_calloc(num_rows, sizeof(*self->flags)); - if (self->flags == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } else { - self->flags = flags; - } - - ret = takeset_ragged_column(num_rows, location, location_offset, - (void *) &self->location, &self->location_offset, &self->location_length); - if (ret != 0) { - goto out; - } - ret = takeset_ragged_column(num_rows, parents, parents_offset, - (void *) &self->parents, &self->parents_offset, &self->parents_length); - if (ret != 0) { - goto out; - } - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int -tsk_individual_table_append_columns(tsk_individual_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset, - const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata, - const tsk_size_t *metadata_offset) -{ - int ret; - tsk_size_t j, metadata_length, location_length, parents_length; - - if (flags == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((location == NULL) != (location_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((parents == NULL) != (parents_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((metadata == NULL) != (metadata_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = tsk_individual_table_expand_main_columns(self, (tsk_size_t) num_rows); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->flags + self->num_rows, flags, num_rows * sizeof(tsk_flags_t)); - if (location == NULL) { - for (j = 0; j < num_rows; j++) { - self->location_offset[self->num_rows + j + 1] - = (tsk_size_t) self->location_length; - } - } else { - ret = check_offsets(num_rows, location_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->location_offset[self->num_rows + j] - = (tsk_size_t) self->location_length + location_offset[j]; - } - location_length = location_offset[num_rows]; - ret = tsk_individual_table_expand_location(self, location_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->location + self->location_length, location, - location_length * sizeof(double)); - self->location_length += location_length; - } - if (parents == NULL) { - for (j = 0; j < num_rows; j++) { - self->parents_offset[self->num_rows + j + 1] - = (tsk_size_t) self->parents_length; - } - } else { - ret = check_offsets(num_rows, parents_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->parents_offset[self->num_rows + j] - = (tsk_size_t) self->parents_length + parents_offset[j]; - } - parents_length = parents_offset[num_rows]; - ret = tsk_individual_table_expand_parents(self, parents_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->parents + self->parents_length, parents, - parents_length * sizeof(tsk_id_t)); - self->parents_length += parents_length; - } - if (metadata == NULL) { - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j + 1] - = (tsk_size_t) self->metadata_length; - } - } else { - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = (tsk_size_t) self->metadata_length + metadata_offset[j]; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_individual_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - self->metadata_length += metadata_length; - } - self->num_rows += (tsk_size_t) num_rows; - self->location_offset[self->num_rows] = self->location_length; - self->parents_offset[self->num_rows] = self->parents_length; - self->metadata_offset[self->num_rows] = self->metadata_length; -out: - return ret; -} - -static tsk_id_t -tsk_individual_table_add_row_internal(tsk_individual_table_t *self, tsk_flags_t flags, - const double *location, tsk_size_t location_length, const tsk_id_t *parents, - const tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length) -{ - tsk_bug_assert(self->num_rows < self->max_rows); - tsk_bug_assert(self->parents_length + parents_length <= self->max_parents_length); - tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); - tsk_bug_assert(self->location_length + location_length <= self->max_location_length); - self->flags[self->num_rows] = flags; - tsk_memmove(self->location + self->location_length, location, - location_length * sizeof(*self->location)); - self->location_offset[self->num_rows + 1] = self->location_length + location_length; - self->location_length += location_length; - tsk_memmove(self->parents + self->parents_length, parents, - parents_length * sizeof(*self->parents)); - self->parents_offset[self->num_rows + 1] = self->parents_length + parents_length; - self->parents_length += parents_length; - tsk_memmove(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(*self->metadata)); - self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; - self->metadata_length += metadata_length; - self->num_rows++; - return (tsk_id_t) self->num_rows - 1; -} - -tsk_id_t -tsk_individual_table_add_row(tsk_individual_table_t *self, tsk_flags_t flags, - const double *location, tsk_size_t location_length, const tsk_id_t *parents, - tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - - ret = tsk_individual_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_expand_location(self, location_length); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_expand_parents(self, parents_length); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_add_row_internal(self, flags, location, location_length, - parents, parents_length, metadata, metadata_length); -out: - return ret; -} - -static int -tsk_individual_table_update_row_rewrite(tsk_individual_table_t *self, tsk_id_t index, - tsk_flags_t flags, const double *location, tsk_size_t location_length, - const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_individual_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_individual_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_individual_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_individual_table_add_row(self, flags, location, location_length, - parents, parents_length, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_individual_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_individual_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_individual_table_update_row(tsk_individual_table_t *self, tsk_id_t index, - tsk_flags_t flags, const double *location, tsk_size_t location_length, - const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_individual_t current_row; - - ret = tsk_individual_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.location_length == location_length - && current_row.parents_length == parents_length - && current_row.metadata_length == metadata_length) { - self->flags[index] = flags; - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->location[self->location_offset[index]], location, - location_length * sizeof(*location)); - tsk_memmove(&self->parents[self->parents_offset[index]], parents, - parents_length * sizeof(*parents)); - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } else { - ret = tsk_individual_table_update_row_rewrite(self, index, flags, location, - location_length, parents, parents_length, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_individual_table_clear(tsk_individual_table_t *self) -{ - return tsk_individual_table_truncate(self, 0); -} - -int -tsk_individual_table_truncate(tsk_individual_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - self->location_length = self->location_offset[num_rows]; - self->parents_length = self->parents_offset[num_rows]; - self->metadata_length = self->metadata_offset[num_rows]; -out: - return ret; -} - -int -tsk_individual_table_extend(tsk_individual_table_t *self, - const tsk_individual_table_t *other, tsk_size_t num_rows, - const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_individual_t individual; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_individual_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_individual_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &individual); - if (ret != 0) { - goto out; - } - ret_id = tsk_individual_table_add_row(self, individual.flags, - individual.location, individual.location_length, individual.parents, - individual.parents_length, individual.metadata, individual.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_individual_table_print_state(const tsk_individual_table_t *self, FILE *out) -{ - tsk_size_t j, k; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "tsk_individual_tbl: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - /* We duplicate the dump_text code here because we want to output - * the offset columns. */ - write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - fprintf(out, "id\tflags\tlocation_offset\tlocation\t"); - fprintf(out, "parents_offset\tparents\t"); - fprintf(out, "metadata_offset\tmetadata\n"); - for (j = 0; j < self->num_rows; j++) { - fprintf(out, "%lld\t%lld\t", (long long) j, (long long) self->flags[j]); - fprintf(out, "%lld\t", (long long) self->location_offset[j]); - for (k = self->location_offset[j]; k < self->location_offset[j + 1]; k++) { - fprintf(out, "%f", self->location[k]); - if (k + 1 < self->location_offset[j + 1]) { - fprintf(out, ","); - } - } - fprintf(out, "\t"); - fprintf(out, "%lld\t", (long long) self->parents_offset[j]); - for (k = self->parents_offset[j]; k < self->parents_offset[j + 1]; k++) { - fprintf(out, "%lld", (long long) self->parents[k]); - if (k + 1 < self->parents_offset[j + 1]) { - fprintf(out, ","); - } - } - fprintf(out, "\t"); - fprintf(out, "%lld\t", (long long) self->metadata_offset[j]); - for (k = self->metadata_offset[j]; k < self->metadata_offset[j + 1]; k++) { - fprintf(out, "%c", self->metadata[k]); - } - fprintf(out, "\n"); - } -} - -static inline void -tsk_individual_table_get_row_unsafe( - const tsk_individual_table_t *self, tsk_id_t index, tsk_individual_t *row) -{ - row->id = (tsk_id_t) index; - row->flags = self->flags[index]; - row->location_length - = self->location_offset[index + 1] - self->location_offset[index]; - row->location = self->location + self->location_offset[index]; - row->parents_length = self->parents_offset[index + 1] - self->parents_offset[index]; - row->parents = self->parents + self->parents_offset[index]; - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; - /* Also have referencing individuals here. Should this be a different struct? - * See also site. */ - row->nodes_length = 0; - row->nodes = NULL; -} - -int -tsk_individual_table_get_row( - const tsk_individual_table_t *self, tsk_id_t index, tsk_individual_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS; - goto out; - } - tsk_individual_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_individual_table_set_metadata_schema(tsk_individual_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out) -{ - int ret = TSK_ERR_IO; - tsk_size_t j, k; - tsk_size_t metadata_len; - int err; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "id\tflags\tlocation\tparents\tmetadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%lld\t%lld\t", (long long) j, (long long) self->flags[j]); - if (err < 0) { - goto out; - } - for (k = self->location_offset[j]; k < self->location_offset[j + 1]; k++) { - err = fprintf(out, "%.*g", TSK_DBL_DECIMAL_DIG, self->location[k]); - if (err < 0) { - goto out; - } - if (k + 1 < self->location_offset[j + 1]) { - err = fprintf(out, ","); - if (err < 0) { - goto out; - } - } - } - err = fprintf(out, "\t"); - if (err < 0) { - goto out; - } - for (k = self->parents_offset[j]; k < self->parents_offset[j + 1]; k++) { - err = fprintf(out, "%lld", (long long) self->parents[k]); - if (err < 0) { - goto out; - } - if (k + 1 < self->parents_offset[j + 1]) { - err = fprintf(out, ","); - if (err < 0) { - goto out; - } - } - } - err = fprintf(out, "\t%.*s\n", (int) metadata_len, - self->metadata + self->metadata_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -bool -tsk_individual_table_equals(const tsk_individual_table_t *self, - const tsk_individual_table_t *other, tsk_flags_t options) -{ - bool ret - = self->num_rows == other->num_rows - && tsk_memcmp(self->flags, other->flags, self->num_rows * sizeof(tsk_flags_t)) - == 0 - && tsk_memcmp(self->location_offset, other->location_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp( - self->location, other->location, self->location_length * sizeof(double)) - == 0 - && tsk_memcmp(self->parents_offset, other->parents_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp( - self->parents, other->parents, self->parents_length * sizeof(tsk_id_t)) - == 0; - - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_individual_table_keep_rows(tsk_individual_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *ret_id_map) -{ - int ret = 0; - const tsk_size_t current_num_rows = self->num_rows; - tsk_size_t j, k, remaining_rows; - tsk_id_t pk; - tsk_id_t *id_map = ret_id_map; - tsk_id_t *restrict parents = self->parents; - tsk_size_t *restrict parents_offset = self->parents_offset; - - if (ret_id_map == NULL) { - id_map = tsk_malloc(current_num_rows * sizeof(*id_map)); - if (id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } - - keep_mask_to_id_map(current_num_rows, keep, id_map); - - /* See notes in tsk_mutation_table_keep_rows for possibilities - * on making this more flexible */ - for (j = 0; j < current_num_rows; j++) { - if (keep[j]) { - for (k = parents_offset[j]; k < parents_offset[j + 1]; k++) { - pk = parents[k]; - if (pk != TSK_NULL) { - if (pk < 0 || pk >= (tsk_id_t) current_num_rows) { - ret = TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS; - ; - goto out; - } - if (id_map[pk] == TSK_NULL) { - ret = TSK_ERR_KEEP_ROWS_MAP_TO_DELETED; - goto out; - } - } - } - } - } - - remaining_rows = subset_flags_column(self->flags, current_num_rows, keep); - self->parents_length = subset_remap_ragged_id_column( - self->parents, self->parents_offset, current_num_rows, keep, id_map); - self->location_length = subset_ragged_double_column( - self->location, self->location_offset, current_num_rows, keep); - if (self->metadata_length > 0) { - /* Implementation note: we special case metadata here because - * it'll make the common-case of no metadata a bit faster, and - * to also potentially support more general use of the - * TSK_TABLE_NO_METADATA option. This is done for all the tables - * but only commented on here. */ - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, current_num_rows, keep); - } - self->num_rows = remaining_rows; -out: - if (ret_id_map == NULL) { - tsk_safe_free(id_map); - } - return ret; -} - -static int -tsk_individual_table_dump( - const tsk_individual_table_t *self, kastore_t *store, tsk_flags_t options) -{ - const write_table_col_t write_cols[] = { - { "individuals/flags", (void *) self->flags, self->num_rows, - TSK_FLAGS_STORAGE_TYPE }, - { "individuals/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "individuals/location", (void *) self->location, self->location_length, - KAS_FLOAT64, self->location_offset, self->num_rows }, - { "individuals/parents", (void *) self->parents, self->parents_length, - TSK_ID_STORAGE_TYPE, self->parents_offset, self->num_rows }, - { "individuals/metadata", (void *) self->metadata, self->metadata_length, - KAS_UINT8, self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table(store, write_cols, ragged_cols, options); -} - -static int -tsk_individual_table_load(tsk_individual_table_t *self, kastore_t *store) -{ - int ret = 0; - tsk_flags_t *flags = NULL; - double *location = NULL; - tsk_size_t *location_offset = NULL; - tsk_id_t *parents = NULL; - tsk_size_t *parents_offset = NULL; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - char *metadata_schema = NULL; - tsk_size_t num_rows, location_length, parents_length, metadata_length, - metadata_schema_length; - - read_table_col_t cols[] = { - { "individuals/flags", (void **) &flags, TSK_FLAGS_STORAGE_TYPE, 0 }, - { .name = NULL }, - }; - read_table_ragged_col_t ragged_cols[] = { - { "individuals/location", (void **) &location, &location_length, KAS_FLOAT64, - &location_offset, 0 }, - { "individuals/parents", (void **) &parents, &parents_length, - TSK_ID_STORAGE_TYPE, &parents_offset, TSK_COL_OPTIONAL }, - { "individuals/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, 0 }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "individuals/metadata_schema", (void **) &metadata_schema, - &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_individual_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_individual_table_takeset_columns(self, num_rows, flags, location, - location_offset, parents, parents_offset, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - flags = NULL; - location = NULL; - location_offset = NULL; - parents = NULL; - parents_offset = NULL; - metadata = NULL; - metadata_offset = NULL; - -out: - free_read_table_mem(cols, ragged_cols, properties); - return ret; -} - -/************************* - * node table - *************************/ - -static void -tsk_node_table_free_columns(tsk_node_table_t *self) -{ - tsk_safe_free(self->flags); - tsk_safe_free(self->time); - tsk_safe_free(self->population); - tsk_safe_free(self->individual); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_node_table_free(tsk_node_table_t *self) -{ - tsk_node_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_node_table_expand_main_columns(tsk_node_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - - if (new_max_rows > self->max_rows) { - ret = expand_column((void **) &self->flags, new_max_rows, sizeof(tsk_flags_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->population, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->individual, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_node_table_expand_metadata(tsk_node_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_node_table_set_max_rows_increment( - tsk_node_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_node_table_set_max_metadata_length_increment( - tsk_node_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = max_metadata_length_increment; - return 0; -} - -int -tsk_node_table_init(tsk_node_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_node_table_t)); - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_node_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->metadata_offset[0] = 0; - self->max_rows_increment = 0; - self->max_metadata_length_increment = 0; - tsk_node_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_node_table_copy( - const tsk_node_table_t *self, tsk_node_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_node_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_node_table_set_columns(dest, self->num_rows, self->flags, self->time, - self->population, self->individual, self->metadata, self->metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_node_table_set_columns(tsk_node_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *time, const tsk_id_t *population, - const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - - ret = tsk_node_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_append_columns( - self, num_rows, flags, time, population, individual, metadata, metadata_offset); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_node_table_takeset_columns(tsk_node_table_t *self, tsk_size_t num_rows, - tsk_flags_t *flags, double *time, tsk_id_t *population, tsk_id_t *individual, - char *metadata, tsk_size_t *metadata_offset) -{ - int ret = 0; - - /* We need to check all the inputs before we start freeing or taking memory */ - if (flags == NULL || time == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_node_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - self->flags = flags; - self->time = time; - - ret = takeset_optional_id_column(num_rows, population, &self->population); - if (ret != 0) { - goto out; - } - ret = takeset_optional_id_column(num_rows, individual, &self->individual); - if (ret != 0) { - goto out; - } - - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int -tsk_node_table_append_columns(tsk_node_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *time, const tsk_id_t *population, - const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - tsk_size_t j, metadata_length; - - if (flags == NULL || time == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((metadata == NULL) != (metadata_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = tsk_node_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); - tsk_memcpy(self->flags + self->num_rows, flags, num_rows * sizeof(tsk_flags_t)); - if (metadata == NULL) { - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; - } - } else { - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = (tsk_size_t) self->metadata_length + metadata_offset[j]; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_node_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - self->metadata_length += metadata_length; - } - if (population == NULL) { - /* Set population to NULL_POPULATION (-1) if not specified */ - tsk_memset(self->population + self->num_rows, 0xff, num_rows * sizeof(tsk_id_t)); - } else { - tsk_memcpy( - self->population + self->num_rows, population, num_rows * sizeof(tsk_id_t)); - } - if (individual == NULL) { - /* Set individual to NULL_INDIVIDUAL (-1) if not specified */ - tsk_memset(self->individual + self->num_rows, 0xff, num_rows * sizeof(tsk_id_t)); - } else { - tsk_memcpy( - self->individual + self->num_rows, individual, num_rows * sizeof(tsk_id_t)); - } - self->num_rows += (tsk_size_t) num_rows; - self->metadata_offset[self->num_rows] = self->metadata_length; -out: - return ret; -} - -static tsk_id_t -tsk_node_table_add_row_internal(tsk_node_table_t *self, tsk_flags_t flags, double time, - tsk_id_t population, tsk_id_t individual, const char *metadata, - tsk_size_t metadata_length) -{ - tsk_bug_assert(self->num_rows < self->max_rows); - tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); - tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); - self->flags[self->num_rows] = flags; - self->time[self->num_rows] = time; - self->population[self->num_rows] = population; - self->individual[self->num_rows] = individual; - self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; - self->metadata_length += metadata_length; - self->num_rows++; - return (tsk_id_t) self->num_rows - 1; -} - -tsk_id_t -tsk_node_table_add_row(tsk_node_table_t *self, tsk_flags_t flags, double time, - tsk_id_t population, tsk_id_t individual, const char *metadata, - tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - - ret = tsk_node_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_add_row_internal( - self, flags, time, population, individual, metadata, metadata_length); -out: - return ret; -} - -static int -tsk_node_table_update_row_rewrite(tsk_node_table_t *self, tsk_id_t index, - tsk_flags_t flags, double time, tsk_id_t population, tsk_id_t individual, - const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_node_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_node_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_node_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_node_table_add_row( - self, flags, time, population, individual, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_node_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_node_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_node_table_update_row(tsk_node_table_t *self, tsk_id_t index, tsk_flags_t flags, - double time, tsk_id_t population, tsk_id_t individual, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_node_t current_row; - - ret = tsk_node_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.metadata_length == metadata_length) { - self->flags[index] = flags; - self->time[index] = time; - self->population[index] = population; - self->individual[index] = individual; - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } else { - ret = tsk_node_table_update_row_rewrite( - self, index, flags, time, population, individual, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_node_table_clear(tsk_node_table_t *self) -{ - return tsk_node_table_truncate(self, 0); -} - -int -tsk_node_table_truncate(tsk_node_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - self->metadata_length = self->metadata_offset[num_rows]; -out: - return ret; -} - -int -tsk_node_table_extend(tsk_node_table_t *self, const tsk_node_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_node_t node; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_node_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_node_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &node); - if (ret != 0) { - goto out; - } - ret_id = tsk_node_table_add_row(self, node.flags, node.time, node.population, - node.individual, node.metadata, node.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_node_table_print_state(const tsk_node_table_t *self, FILE *out) -{ - tsk_size_t j, k; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "tsk_node_tbl: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - /* We duplicate the dump_text code here for simplicity because we want to output - * the flags column directly. */ - write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - fprintf(out, "id\tflags\ttime\tpopulation\tindividual\tmetadata_offset\tmetadata\n"); - for (j = 0; j < self->num_rows; j++) { - fprintf(out, "%lld\t%lld\t%f\t%lld\t%lld\t%lld\t", (long long) j, - (long long) self->flags[j], self->time[j], (long long) self->population[j], - (long long) self->individual[j], (long long) self->metadata_offset[j]); - for (k = self->metadata_offset[j]; k < self->metadata_offset[j + 1]; k++) { - fprintf(out, "%c", self->metadata[k]); - } - fprintf(out, "\n"); - } - tsk_bug_assert(self->metadata_offset[0] == 0); - tsk_bug_assert(self->metadata_offset[self->num_rows] == self->metadata_length); -} - -int -tsk_node_table_set_metadata_schema(tsk_node_table_t *self, const char *metadata_schema, - tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_node_table_dump_text(const tsk_node_table_t *self, FILE *out) -{ - int ret = TSK_ERR_IO; - tsk_size_t j; - tsk_size_t metadata_len; - int err; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "id\tis_sample\ttime\tpopulation\tindividual\tmetadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%lld\t%lld\t%f\t%lld\t%lld\t%.*s\n", (long long) j, - (long long) (self->flags[j] & TSK_NODE_IS_SAMPLE), self->time[j], - (long long) self->population[j], (long long) self->individual[j], - (int) metadata_len, self->metadata + self->metadata_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -bool -tsk_node_table_equals( - const tsk_node_table_t *self, const tsk_node_table_t *other, tsk_flags_t options) -{ - bool ret - = self->num_rows == other->num_rows - && tsk_memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0 - && tsk_memcmp(self->flags, other->flags, self->num_rows * sizeof(tsk_flags_t)) - == 0 - && tsk_memcmp( - self->population, other->population, self->num_rows * sizeof(tsk_id_t)) - == 0 - && tsk_memcmp( - self->individual, other->individual, self->num_rows * sizeof(tsk_id_t)) - == 0; - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -static inline void -tsk_node_table_get_row_unsafe( - const tsk_node_table_t *self, tsk_id_t index, tsk_node_t *row) -{ - row->id = (tsk_id_t) index; - row->flags = self->flags[index]; - row->time = self->time[index]; - row->population = self->population[index]; - row->individual = self->individual[index]; - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; -} - -int -tsk_node_table_get_row(const tsk_node_table_t *self, tsk_id_t index, tsk_node_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - tsk_node_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_node_table_keep_rows(tsk_node_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) -{ - int ret = 0; - tsk_size_t remaining_rows; - - if (id_map != NULL) { - keep_mask_to_id_map(self->num_rows, keep, id_map); - } - - remaining_rows = subset_flags_column(self->flags, self->num_rows, keep); - subset_double_column(self->time, self->num_rows, keep); - subset_id_column(self->population, self->num_rows, keep); - subset_id_column(self->individual, self->num_rows, keep); - if (self->metadata_length > 0) { - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, self->num_rows, keep); - } - self->num_rows = remaining_rows; - return ret; -} - -static int -tsk_node_table_dump(const tsk_node_table_t *self, kastore_t *store, tsk_flags_t options) -{ - const write_table_col_t cols[] = { - { "nodes/time", (void *) self->time, self->num_rows, KAS_FLOAT64 }, - { "nodes/flags", (void *) self->flags, self->num_rows, TSK_FLAGS_STORAGE_TYPE }, - { "nodes/population", (void *) self->population, self->num_rows, - TSK_ID_STORAGE_TYPE }, - { "nodes/individual", (void *) self->individual, self->num_rows, - TSK_ID_STORAGE_TYPE }, - { "nodes/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "nodes/metadata", (void *) self->metadata, self->metadata_length, KAS_UINT8, - self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table(store, cols, ragged_cols, options); -} - -static int -tsk_node_table_load(tsk_node_table_t *self, kastore_t *store) -{ - int ret = 0; - char *metadata_schema = NULL; - double *time = NULL; - tsk_flags_t *flags = NULL; - tsk_id_t *population = NULL; - tsk_id_t *individual = NULL; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - tsk_size_t num_rows, metadata_length, metadata_schema_length; - read_table_col_t cols[] = { - { "nodes/time", (void **) &time, KAS_FLOAT64, 0 }, - { "nodes/flags", (void **) &flags, TSK_FLAGS_STORAGE_TYPE, 0 }, - { "nodes/population", (void **) &population, TSK_ID_STORAGE_TYPE, 0 }, - { "nodes/individual", (void **) &individual, TSK_ID_STORAGE_TYPE, 0 }, - { .name = NULL }, - }; - read_table_ragged_col_t ragged_cols[] = { - { "nodes/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, 0 }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "nodes/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, - KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_node_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_node_table_takeset_columns( - self, num_rows, flags, time, population, individual, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - flags = NULL; - time = NULL; - population = NULL; - individual = NULL; - metadata = NULL; - metadata_offset = NULL; -out: - free_read_table_mem(cols, ragged_cols, properties); - return ret; -} - -/************************* - * edge table - *************************/ - -static void -tsk_edge_table_free_columns(tsk_edge_table_t *self) -{ - tsk_safe_free(self->left); - tsk_safe_free(self->right); - tsk_safe_free(self->parent); - tsk_safe_free(self->child); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_edge_table_free(tsk_edge_table_t *self) -{ - tsk_edge_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_edge_table_has_metadata(const tsk_edge_table_t *self) -{ - return !(self->options & TSK_TABLE_NO_METADATA); -} - -static int -tsk_edge_table_expand_main_columns(tsk_edge_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->left, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->right, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->parent, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->child, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - if (tsk_edge_table_has_metadata(self)) { - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_edge_table_expand_metadata(tsk_edge_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_edge_table_set_max_rows_increment( - tsk_edge_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_edge_table_set_max_metadata_length_increment( - tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = max_metadata_length_increment; - return 0; -} - -int -tsk_edge_table_init(tsk_edge_table_t *self, tsk_flags_t options) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(*self)); - self->options = options; - - /* Allocate space for one row initially, ensuring we always have valid - * pointers even if the table is empty */ - self->max_rows_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_edge_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - if (tsk_edge_table_has_metadata(self)) { - ret = tsk_edge_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->metadata_offset[0] = 0; - } - self->max_rows_increment = 0; - self->max_metadata_length_increment = 0; - tsk_edge_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -tsk_id_t -tsk_edge_table_add_row(tsk_edge_table_t *self, double left, double right, - tsk_id_t parent, tsk_id_t child, const char *metadata, tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - - if (metadata_length > 0 && !tsk_edge_table_has_metadata(self)) { - ret = TSK_ERR_METADATA_DISABLED; - goto out; - } - - ret = tsk_edge_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - - tsk_bug_assert(self->num_rows < self->max_rows); - self->left[self->num_rows] = left; - self->right[self->num_rows] = right; - self->parent[self->num_rows] = parent; - self->child[self->num_rows] = child; - - if (tsk_edge_table_has_metadata(self)) { - ret = tsk_edge_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_bug_assert( - self->metadata_length + metadata_length <= self->max_metadata_length); - tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); - self->metadata_offset[self->num_rows + 1] - = self->metadata_length + metadata_length; - self->metadata_length += metadata_length; - } - ret = (tsk_id_t) self->num_rows; - self->num_rows++; -out: - return ret; -} - -static int -tsk_edge_table_update_row_rewrite(tsk_edge_table_t *self, tsk_id_t index, double left, - double right, tsk_id_t parent, tsk_id_t child, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_edge_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_edge_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_edge_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_edge_table_add_row( - self, left, right, parent, child, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_edge_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_edge_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_edge_table_update_row(tsk_edge_table_t *self, tsk_id_t index, double left, - double right, tsk_id_t parent, tsk_id_t child, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_edge_t current_row; - - ret = tsk_edge_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.metadata_length == metadata_length) { - self->left[index] = left; - self->right[index] = right; - self->parent[index] = parent; - self->child[index] = child; - if (tsk_edge_table_has_metadata(self)) { - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } - } else { - ret = tsk_edge_table_update_row_rewrite( - self, index, left, right, parent, child, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_edge_table_copy( - const tsk_edge_table_t *self, tsk_edge_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_edge_table_init(dest, options); - if (ret != 0) { - goto out; - } - } - - /* We can't use TSK_TABLE_NO_METADATA in dest if metadata_length is non-zero. - * This also captures the case where TSK_TABLE_NO_METADATA is set on this table. - */ - if (self->metadata_length > 0 && !tsk_edge_table_has_metadata(dest)) { - ret = TSK_ERR_METADATA_DISABLED; - goto out; - } - if (tsk_edge_table_has_metadata(dest)) { - metadata = self->metadata; - metadata_offset = self->metadata_offset; - } - ret = tsk_edge_table_set_columns(dest, self->num_rows, self->left, self->right, - self->parent, self->child, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int -tsk_edge_table_set_columns(tsk_edge_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *parent, - const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret = 0; - - ret = tsk_edge_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_append_columns( - self, num_rows, left, right, parent, child, metadata, metadata_offset); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_edge_table_takeset_columns(tsk_edge_table_t *self, tsk_size_t num_rows, double *left, - double *right, tsk_id_t *parent, tsk_id_t *child, char *metadata, - tsk_size_t *metadata_offset) -{ - int ret = 0; - - /* We need to check all the inputs before we start freeing or taking memory */ - if (left == NULL || right == NULL || parent == NULL || child == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if (metadata != NULL && !tsk_edge_table_has_metadata(self)) { - ret = TSK_ERR_METADATA_DISABLED; - goto out; - } - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_edge_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - self->left = left; - self->right = right; - self->parent = parent; - self->child = child; - - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int -tsk_edge_table_append_columns(tsk_edge_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *parent, - const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - tsk_size_t j, metadata_length; - - if (left == NULL || right == NULL || parent == NULL || child == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((metadata == NULL) != (metadata_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if (metadata != NULL && !tsk_edge_table_has_metadata(self)) { - ret = TSK_ERR_METADATA_DISABLED; - goto out; - } - - ret = tsk_edge_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->left + self->num_rows, left, num_rows * sizeof(double)); - tsk_memcpy(self->right + self->num_rows, right, num_rows * sizeof(double)); - tsk_memcpy(self->parent + self->num_rows, parent, num_rows * sizeof(tsk_id_t)); - tsk_memcpy(self->child + self->num_rows, child, num_rows * sizeof(tsk_id_t)); - if (tsk_edge_table_has_metadata(self)) { - if (metadata == NULL) { - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; - } - } else { - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = (tsk_size_t) self->metadata_length + metadata_offset[j]; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_edge_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - self->metadata_length += metadata_length; - } - self->num_rows += num_rows; - self->metadata_offset[self->num_rows] = self->metadata_length; - } else { - self->num_rows += num_rows; - } -out: - return ret; -} - -int -tsk_edge_table_clear(tsk_edge_table_t *self) -{ - return tsk_edge_table_truncate(self, 0); -} - -int -tsk_edge_table_truncate(tsk_edge_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - if (tsk_edge_table_has_metadata(self)) { - self->metadata_length = self->metadata_offset[num_rows]; - } -out: - return ret; -} - -int -tsk_edge_table_extend(tsk_edge_table_t *self, const tsk_edge_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_edge_t edge; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_edge_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_edge_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &edge); - if (ret != 0) { - goto out; - } - ret_id = tsk_edge_table_add_row(self, edge.left, edge.right, edge.parent, - edge.child, edge.metadata, edge.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -static inline void -tsk_edge_table_get_row_unsafe( - const tsk_edge_table_t *self, tsk_id_t index, tsk_edge_t *row) -{ - row->id = (tsk_id_t) index; - row->left = self->left[index]; - row->right = self->right[index]; - row->parent = self->parent[index]; - row->child = self->child[index]; - if (tsk_edge_table_has_metadata(self)) { - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; - } else { - row->metadata_length = 0; - row->metadata = NULL; - } -} - -int -tsk_edge_table_get_row(const tsk_edge_table_t *self, tsk_id_t index, tsk_edge_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } - tsk_edge_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -void -tsk_edge_table_print_state(const tsk_edge_table_t *self, FILE *out) -{ - int ret; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "edge_table: %p:\n", (const void *) self); - fprintf(out, "options = 0x%X\n", self->options); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - ret = tsk_edge_table_dump_text(self, out); - tsk_bug_assert(ret == 0); -} - -int -tsk_edge_table_set_metadata_schema(tsk_edge_table_t *self, const char *metadata_schema, - tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_edge_table_dump_text(const tsk_edge_table_t *self, FILE *out) -{ - tsk_id_t j; - int ret = TSK_ERR_IO; - tsk_edge_t row; - int err; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "id\tleft\tright\tparent\tchild\tmetadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < (tsk_id_t) self->num_rows; j++) { - tsk_edge_table_get_row_unsafe(self, j, &row); - err = fprintf(out, "%lld\t%.3f\t%.3f\t%lld\t%lld\t%.*s\n", (long long) j, - row.left, row.right, (long long) row.parent, (long long) row.child, - (int) row.metadata_length, row.metadata); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -bool -tsk_edge_table_equals( - const tsk_edge_table_t *self, const tsk_edge_table_t *other, tsk_flags_t options) -{ - bool metadata_equal; - bool ret - = self->num_rows == other->num_rows - && tsk_memcmp(self->left, other->left, self->num_rows * sizeof(double)) == 0 - && tsk_memcmp(self->right, other->right, self->num_rows * sizeof(double)) == 0 - && tsk_memcmp(self->parent, other->parent, self->num_rows * sizeof(tsk_id_t)) - == 0 - && tsk_memcmp(self->child, other->child, self->num_rows * sizeof(tsk_id_t)) - == 0; - - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - metadata_equal = false; - if (self->metadata_length == other->metadata_length) { - if (tsk_edge_table_has_metadata(self) - && tsk_edge_table_has_metadata(other)) { - metadata_equal - = tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0; - } else { - /* The only way that the metadata lengths can be equal (which - * we've already tested) and either one or the other of the tables - * hasn't got metadata is if they are both zero length. */ - tsk_bug_assert(self->metadata_length == 0); - metadata_equal = true; - } - } - ret = ret && metadata_equal; - } - return ret; -} - -int -tsk_edge_table_keep_rows(tsk_edge_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) -{ - int ret = 0; - tsk_size_t remaining_rows; - - if (id_map != NULL) { - keep_mask_to_id_map(self->num_rows, keep, id_map); - } - remaining_rows = subset_double_column(self->left, self->num_rows, keep); - subset_double_column(self->right, self->num_rows, keep); - subset_id_column(self->parent, self->num_rows, keep); - subset_id_column(self->child, self->num_rows, keep); - if (self->metadata_length > 0) { - tsk_bug_assert(!(self->options & TSK_TABLE_NO_METADATA)); - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, self->num_rows, keep); - } - self->num_rows = remaining_rows; - return ret; -} - -static int -tsk_edge_table_dump(const tsk_edge_table_t *self, kastore_t *store, tsk_flags_t options) -{ - int ret = 0; - const write_table_col_t write_cols[] = { - { "edges/left", (void *) self->left, self->num_rows, KAS_FLOAT64 }, - { "edges/right", (void *) self->right, self->num_rows, KAS_FLOAT64 }, - { "edges/parent", (void *) self->parent, self->num_rows, TSK_ID_STORAGE_TYPE }, - { "edges/child", (void *) self->child, self->num_rows, TSK_ID_STORAGE_TYPE }, - { "edges/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "edges/metadata", (void *) self->metadata, self->metadata_length, KAS_UINT8, - self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - /* TODO when the general code has been updated to only write out the - * column when the lenght of ragged columns is > 0 we can get rid of - * this special case here and use write_table. */ - ret = write_table_cols(store, write_cols, options); - if (ret != 0) { - goto out; - } - if (tsk_edge_table_has_metadata(self)) { - ret = write_table_ragged_cols(store, ragged_cols, options); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int -tsk_edge_table_load(tsk_edge_table_t *self, kastore_t *store) -{ - int ret = 0; - char *metadata_schema = NULL; - double *left = NULL; - double *right = NULL; - tsk_id_t *parent = NULL; - tsk_id_t *child = NULL; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - tsk_size_t num_rows, metadata_length, metadata_schema_length; - - read_table_col_t cols[] = { - { "edges/left", (void **) &left, KAS_FLOAT64, 0 }, - { "edges/right", (void **) &right, KAS_FLOAT64, 0 }, - { "edges/parent", (void **) &parent, TSK_ID_STORAGE_TYPE, 0 }, - { "edges/child", (void **) &child, TSK_ID_STORAGE_TYPE, 0 }, - { .name = NULL }, - }; - read_table_ragged_col_t ragged_cols[] = { - { "edges/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "edges/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, - KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_edge_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_edge_table_takeset_columns( - self, num_rows, left, right, parent, child, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - left = NULL; - right = NULL; - parent = NULL; - child = NULL; - metadata = NULL; - metadata_offset = NULL; -out: - free_read_table_mem(cols, ragged_cols, properties); - return ret; -} - -int -tsk_edge_table_squash(tsk_edge_table_t *self) -{ - int k; - int ret = 0; - tsk_edge_t *edges = NULL; - tsk_size_t num_output_edges; - - if (self->metadata_length > 0) { - ret = TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA; - goto out; - } - - edges = tsk_malloc(self->num_rows * sizeof(tsk_edge_t)); - if (edges == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (k = 0; k < (int) self->num_rows; k++) { - edges[k].left = self->left[k]; - edges[k].right = self->right[k]; - edges[k].parent = self->parent[k]; - edges[k].child = self->child[k]; - edges[k].metadata_length = 0; - } - - ret = tsk_squash_edges(edges, self->num_rows, &num_output_edges); - if (ret != 0) { - goto out; - } - tsk_edge_table_clear(self); - tsk_bug_assert(num_output_edges <= self->max_rows); - self->num_rows = num_output_edges; - for (k = 0; k < (int) num_output_edges; k++) { - self->left[k] = edges[k].left; - self->right[k] = edges[k].right; - self->parent[k] = edges[k].parent; - self->child[k] = edges[k].child; - } -out: - tsk_safe_free(edges); - return ret; -} - -/************************* - * site table - *************************/ - -static void -tsk_site_table_free_columns(tsk_site_table_t *self) -{ - tsk_safe_free(self->position); - tsk_safe_free(self->ancestral_state); - tsk_safe_free(self->ancestral_state_offset); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_site_table_free(tsk_site_table_t *self) -{ - tsk_site_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_site_table_expand_main_columns(tsk_site_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->position, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->ancestral_state_offset, new_max_rows + 1, - sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_site_table_expand_ancestral_state( - tsk_site_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->ancestral_state_length, additional_length, - self->max_ancestral_state_length_increment, &self->max_ancestral_state_length, - (void **) &self->ancestral_state, sizeof(*self->ancestral_state)); -} - -static int -tsk_site_table_expand_metadata(tsk_site_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_site_table_set_max_rows_increment( - tsk_site_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_site_table_set_max_metadata_length_increment( - tsk_site_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = max_metadata_length_increment; - return 0; -} - -int -tsk_site_table_set_max_ancestral_state_length_increment( - tsk_site_table_t *self, tsk_size_t max_ancestral_state_length_increment) -{ - self->max_ancestral_state_length_increment = max_ancestral_state_length_increment; - return 0; -} - -int -tsk_site_table_init(tsk_site_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_site_table_t)); - - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_ancestral_state_length_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_site_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_expand_ancestral_state(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->ancestral_state_offset[0] = 0; - self->metadata_offset[0] = 0; - self->max_rows_increment = 0; - self->max_ancestral_state_length_increment = 0; - self->max_metadata_length_increment = 0; - tsk_site_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -tsk_id_t -tsk_site_table_add_row(tsk_site_table_t *self, double position, - const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, - tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - tsk_size_t ancestral_state_offset, metadata_offset; - - ret = tsk_site_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - self->position[self->num_rows] = position; - - ancestral_state_offset = (tsk_size_t) self->ancestral_state_length; - tsk_bug_assert( - self->ancestral_state_offset[self->num_rows] == ancestral_state_offset); - ret = tsk_site_table_expand_ancestral_state(self, ancestral_state_length); - if (ret != 0) { - goto out; - } - self->ancestral_state_length += ancestral_state_length; - tsk_memmove(self->ancestral_state + ancestral_state_offset, ancestral_state, - ancestral_state_length); - self->ancestral_state_offset[self->num_rows + 1] = self->ancestral_state_length; - - metadata_offset = (tsk_size_t) self->metadata_length; - tsk_bug_assert(self->metadata_offset[self->num_rows] == metadata_offset); - ret = tsk_site_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - self->metadata_length += metadata_length; - tsk_memmove(self->metadata + metadata_offset, metadata, metadata_length); - self->metadata_offset[self->num_rows + 1] = self->metadata_length; - - ret = (tsk_id_t) self->num_rows; - self->num_rows++; -out: - return ret; -} - -static int -tsk_site_table_update_row_rewrite(tsk_site_table_t *self, tsk_id_t index, - double position, const char *ancestral_state, tsk_size_t ancestral_state_length, - const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_site_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_site_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_site_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_site_table_add_row(self, position, ancestral_state, - ancestral_state_length, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_site_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_site_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_site_table_update_row(tsk_site_table_t *self, tsk_id_t index, double position, - const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_site_t current_row; - - ret = tsk_site_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.metadata_length == metadata_length - && current_row.ancestral_state_length == ancestral_state_length) { - self->position[index] = position; - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->ancestral_state[self->ancestral_state_offset[index]], - ancestral_state, ancestral_state_length * sizeof(*ancestral_state)); - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } else { - ret = tsk_site_table_update_row_rewrite(self, index, position, ancestral_state, - ancestral_state_length, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_site_table_append_columns(tsk_site_table_t *self, tsk_size_t num_rows, - const double *position, const char *ancestral_state, - const tsk_size_t *ancestral_state_offset, const char *metadata, - const tsk_size_t *metadata_offset) -{ - int ret = 0; - tsk_size_t j, ancestral_state_length, metadata_length; - - if (position == NULL || ancestral_state == NULL || ancestral_state_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((metadata == NULL) != (metadata_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - ret = tsk_site_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->position + self->num_rows, position, num_rows * sizeof(double)); - - /* Metadata column */ - if (metadata == NULL) { - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; - } - } else { - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_site_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = self->metadata_length + metadata_offset[j]; - } - self->metadata_length += metadata_length; - } - self->metadata_offset[self->num_rows + num_rows] = self->metadata_length; - - /* Ancestral state column */ - ret = check_offsets(num_rows, ancestral_state_offset, 0, false); - if (ret != 0) { - goto out; - } - ancestral_state_length = ancestral_state_offset[num_rows]; - ret = tsk_site_table_expand_ancestral_state(self, ancestral_state_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->ancestral_state + self->ancestral_state_length, ancestral_state, - ancestral_state_length * sizeof(char)); - for (j = 0; j < num_rows; j++) { - self->ancestral_state_offset[self->num_rows + j] - = self->ancestral_state_length + ancestral_state_offset[j]; - } - self->ancestral_state_length += ancestral_state_length; - self->ancestral_state_offset[self->num_rows + num_rows] - = self->ancestral_state_length; - - self->num_rows += num_rows; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_site_table_copy( - const tsk_site_table_t *self, tsk_site_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_site_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_site_table_set_columns(dest, self->num_rows, self->position, - self->ancestral_state, self->ancestral_state_offset, self->metadata, - self->metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int -tsk_site_table_set_columns(tsk_site_table_t *self, tsk_size_t num_rows, - const double *position, const char *ancestral_state, - const tsk_size_t *ancestral_state_offset, const char *metadata, - const tsk_size_t *metadata_offset) -{ - int ret = 0; - - ret = tsk_site_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_append_columns(self, num_rows, position, ancestral_state, - ancestral_state_offset, metadata, metadata_offset); -out: - return ret; -} - -int -tsk_site_table_takeset_columns(tsk_site_table_t *self, tsk_size_t num_rows, - double *position, char *ancestral_state, tsk_size_t *ancestral_state_offset, - char *metadata, tsk_size_t *metadata_offset) -{ - int ret = 0; - - /* We need to check all the inputs before we start freeing or taking memory */ - if (position == NULL || ancestral_state == NULL || ancestral_state_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = check_ragged_column(num_rows, ancestral_state, ancestral_state_offset); - if (ret != 0) { - goto out; - } - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_site_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - self->position = position; - - ret = takeset_ragged_column(num_rows, ancestral_state, ancestral_state_offset, - (void *) &self->ancestral_state, &self->ancestral_state_offset, - &self->ancestral_state_length); - if (ret != 0) { - goto out; - } - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -bool -tsk_site_table_equals( - const tsk_site_table_t *self, const tsk_site_table_t *other, tsk_flags_t options) -{ - bool ret - = self->num_rows == other->num_rows - && self->ancestral_state_length == other->ancestral_state_length - && tsk_memcmp(self->position, other->position, self->num_rows * sizeof(double)) - == 0 - && tsk_memcmp(self->ancestral_state_offset, other->ancestral_state_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->ancestral_state, other->ancestral_state, - self->ancestral_state_length * sizeof(char)) - == 0; - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_site_table_clear(tsk_site_table_t *self) -{ - return tsk_site_table_truncate(self, 0); -} - -int -tsk_site_table_truncate(tsk_site_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - self->ancestral_state_length = self->ancestral_state_offset[num_rows]; - self->metadata_length = self->metadata_offset[num_rows]; -out: - return ret; -} - -int -tsk_site_table_extend(tsk_site_table_t *self, const tsk_site_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_site_t site; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_site_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_site_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &site); - if (ret != 0) { - goto out; - } - ret_id = tsk_site_table_add_row(self, site.position, site.ancestral_state, - site.ancestral_state_length, site.metadata, site.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_site_table_print_state(const tsk_site_table_t *self, FILE *out) -{ - int ret; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "site_table: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\t(max= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "ancestral_state_length = %lld\t(max= %lld\tincrement = %lld)\n", - (long long) self->ancestral_state_length, - (long long) self->max_ancestral_state_length, - (long long) self->max_ancestral_state_length_increment); - fprintf(out, "metadata_length = %lld(\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - ret = tsk_site_table_dump_text(self, out); - tsk_bug_assert(ret == 0); - - tsk_bug_assert(self->ancestral_state_offset[0] == 0); - tsk_bug_assert( - self->ancestral_state_length == self->ancestral_state_offset[self->num_rows]); - tsk_bug_assert(self->metadata_offset[0] == 0); - tsk_bug_assert(self->metadata_length == self->metadata_offset[self->num_rows]); -} - -static inline void -tsk_site_table_get_row_unsafe( - const tsk_site_table_t *self, tsk_id_t index, tsk_site_t *row) -{ - row->id = (tsk_id_t) index; - row->position = self->position[index]; - row->ancestral_state_length - = self->ancestral_state_offset[index + 1] - self->ancestral_state_offset[index]; - row->ancestral_state = self->ancestral_state + self->ancestral_state_offset[index]; - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; - /* This struct has a placeholder for mutations. Probably should be separate - * structs for this (tsk_site_table_row_t?) */ - row->mutations_length = 0; - row->mutations = NULL; -} - -int -tsk_site_table_get_row(const tsk_site_table_t *self, tsk_id_t index, tsk_site_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_SITE_OUT_OF_BOUNDS; - goto out; - } - tsk_site_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_site_table_set_metadata_schema(tsk_site_table_t *self, const char *metadata_schema, - tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_site_table_dump_text(const tsk_site_table_t *self, FILE *out) -{ - tsk_size_t j; - int ret = TSK_ERR_IO; - int err; - tsk_size_t ancestral_state_len, metadata_len; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "id\tposition\tancestral_state\tmetadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - ancestral_state_len - = self->ancestral_state_offset[j + 1] - self->ancestral_state_offset[j]; - metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%lld\t%f\t%.*s\t%.*s\n", (long long) j, self->position[j], - (int) ancestral_state_len, - self->ancestral_state + self->ancestral_state_offset[j], (int) metadata_len, - self->metadata + self->metadata_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -int -tsk_site_table_keep_rows(tsk_site_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) -{ - int ret = 0; - tsk_size_t remaining_rows; - - if (id_map != NULL) { - keep_mask_to_id_map(self->num_rows, keep, id_map); - } - - remaining_rows = subset_double_column(self->position, self->num_rows, keep); - self->ancestral_state_length = subset_ragged_char_column( - self->ancestral_state, self->ancestral_state_offset, self->num_rows, keep); - if (self->metadata_length > 0) { - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, self->num_rows, keep); - } - self->num_rows = remaining_rows; - return ret; -} - -static int -tsk_site_table_dump(const tsk_site_table_t *self, kastore_t *store, tsk_flags_t options) -{ - const write_table_col_t cols[] = { - { "sites/position", (void *) self->position, self->num_rows, KAS_FLOAT64 }, - { "sites/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "sites/ancestral_state", (void *) self->ancestral_state, - self->ancestral_state_length, KAS_UINT8, self->ancestral_state_offset, - self->num_rows }, - { "sites/metadata", (void *) self->metadata, self->metadata_length, KAS_UINT8, - self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table(store, cols, ragged_cols, options); -} - -static int -tsk_site_table_load(tsk_site_table_t *self, kastore_t *store) -{ - int ret = 0; - char *metadata_schema = NULL; - double *position = NULL; - char *ancestral_state = NULL; - tsk_size_t *ancestral_state_offset = NULL; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - tsk_size_t num_rows, ancestral_state_length, metadata_length, metadata_schema_length; - - read_table_col_t cols[] = { - { "sites/position", (void **) &position, KAS_FLOAT64, 0 }, - { .name = NULL }, - }; - read_table_ragged_col_t ragged_cols[] = { - { "sites/ancestral_state", (void **) &ancestral_state, &ancestral_state_length, - KAS_UINT8, &ancestral_state_offset, 0 }, - { "sites/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, 0 }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "sites/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, - KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_site_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_site_table_takeset_columns(self, num_rows, position, ancestral_state, - ancestral_state_offset, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - position = NULL; - ancestral_state = NULL; - ancestral_state_offset = NULL; - metadata = NULL; - metadata_offset = NULL; - -out: - free_read_table_mem(cols, ragged_cols, properties); - return ret; -} - -/************************* - * mutation table - *************************/ - -static void -tsk_mutation_table_free_columns(tsk_mutation_table_t *self) -{ - tsk_safe_free(self->node); - tsk_safe_free(self->site); - tsk_safe_free(self->parent); - tsk_safe_free(self->time); - tsk_safe_free(self->derived_state); - tsk_safe_free(self->derived_state_offset); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_mutation_table_free(tsk_mutation_table_t *self) -{ - tsk_mutation_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_mutation_table_expand_main_columns( - tsk_mutation_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->site, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->node, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->parent, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->derived_state_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_mutation_table_expand_derived_state( - tsk_mutation_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->derived_state_length, additional_length, - self->max_derived_state_length_increment, &self->max_derived_state_length, - (void **) &self->derived_state, sizeof(*self->derived_state)); -} - -static int -tsk_mutation_table_expand_metadata( - tsk_mutation_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_mutation_table_set_max_rows_increment( - tsk_mutation_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_mutation_table_set_max_metadata_length_increment( - tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = max_metadata_length_increment; - return 0; -} - -int -tsk_mutation_table_set_max_derived_state_length_increment( - tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment) -{ - self->max_derived_state_length_increment = max_derived_state_length_increment; - return 0; -} - -int -tsk_mutation_table_init(tsk_mutation_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_mutation_table_t)); - - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_derived_state_length_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_mutation_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_expand_derived_state(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->derived_state_offset[0] = 0; - self->metadata_offset[0] = 0; - self->max_rows_increment = 0; - self->max_derived_state_length_increment = 0; - self->max_metadata_length_increment = 0; - tsk_mutation_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -tsk_id_t -tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, tsk_id_t node, - tsk_id_t parent, double time, const char *derived_state, - tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length) -{ - tsk_id_t ret; - tsk_size_t derived_state_offset, metadata_offset; - - ret = tsk_mutation_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - self->site[self->num_rows] = site; - self->node[self->num_rows] = node; - self->parent[self->num_rows] = parent; - self->time[self->num_rows] = time; - - derived_state_offset = self->derived_state_length; - tsk_bug_assert(self->derived_state_offset[self->num_rows] == derived_state_offset); - ret = tsk_mutation_table_expand_derived_state(self, derived_state_length); - if (ret != 0) { - goto out; - } - self->derived_state_length += derived_state_length; - tsk_memmove( - self->derived_state + derived_state_offset, derived_state, derived_state_length); - self->derived_state_offset[self->num_rows + 1] = self->derived_state_length; - - metadata_offset = self->metadata_length; - tsk_bug_assert(self->metadata_offset[self->num_rows] == metadata_offset); - ret = tsk_mutation_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - self->metadata_length += metadata_length; - tsk_memmove(self->metadata + metadata_offset, metadata, metadata_length); - self->metadata_offset[self->num_rows + 1] = self->metadata_length; - - ret = (tsk_id_t) self->num_rows; - self->num_rows++; -out: - return ret; -} - -static int -tsk_mutation_table_update_row_rewrite(tsk_mutation_table_t *self, tsk_id_t index, - tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, - const char *derived_state, tsk_size_t derived_state_length, const char *metadata, - tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_mutation_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_mutation_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_mutation_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_mutation_table_add_row(self, site, node, parent, time, derived_state, - derived_state_length, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_mutation_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_mutation_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_mutation_table_update_row(tsk_mutation_table_t *self, tsk_id_t index, tsk_id_t site, - tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, - tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_mutation_t current_row; - - ret = tsk_mutation_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.metadata_length == metadata_length - && current_row.derived_state_length == derived_state_length) { - self->site[index] = site; - self->node[index] = node; - self->parent[index] = parent; - self->time[index] = time; - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->derived_state[self->derived_state_offset[index]], - derived_state, derived_state_length * sizeof(*derived_state)); - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } else { - ret = tsk_mutation_table_update_row_rewrite(self, index, site, node, parent, - time, derived_state, derived_state_length, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, - const double *time, const char *derived_state, - const tsk_size_t *derived_state_offset, const char *metadata, - const tsk_size_t *metadata_offset) -{ - int ret = 0; - tsk_size_t j, derived_state_length, metadata_length; - - if (site == NULL || node == NULL || derived_state == NULL - || derived_state_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((metadata == NULL) != (metadata_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - ret = tsk_mutation_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->site + self->num_rows, site, num_rows * sizeof(tsk_id_t)); - tsk_memcpy(self->node + self->num_rows, node, num_rows * sizeof(tsk_id_t)); - if (parent == NULL) { - /* If parent is NULL, set all parents to the null mutation */ - tsk_memset(self->parent + self->num_rows, 0xff, num_rows * sizeof(tsk_id_t)); - } else { - tsk_memcpy(self->parent + self->num_rows, parent, num_rows * sizeof(tsk_id_t)); - } - if (time == NULL) { - /* If time is NULL, set all times to TSK_UNKNOWN_TIME which is the - * default */ - for (j = 0; j < num_rows; j++) { - self->time[self->num_rows + j] = TSK_UNKNOWN_TIME; - } - } else { - tsk_memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); - } - - /* Metadata column */ - if (metadata == NULL) { - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; - } - } else { - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_mutation_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = self->metadata_length + metadata_offset[j]; - } - self->metadata_length += metadata_length; - } - self->metadata_offset[self->num_rows + num_rows] = self->metadata_length; - - /* Derived state column */ - ret = check_offsets(num_rows, derived_state_offset, 0, false); - if (ret != 0) { - goto out; - } - derived_state_length = derived_state_offset[num_rows]; - ret = tsk_mutation_table_expand_derived_state(self, derived_state_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->derived_state + self->derived_state_length, derived_state, - derived_state_length * sizeof(char)); - for (j = 0; j < num_rows; j++) { - self->derived_state_offset[self->num_rows + j] - = self->derived_state_length + derived_state_offset[j]; - } - self->derived_state_length += derived_state_length; - self->derived_state_offset[self->num_rows + num_rows] = self->derived_state_length; - - self->num_rows += num_rows; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_mutation_table_takeset_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, char *derived_state, - tsk_size_t *derived_state_offset, char *metadata, tsk_size_t *metadata_offset) -{ - tsk_size_t j; - int ret = 0; - - if (site == NULL || node == NULL || derived_state == NULL - || derived_state_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - /* We need to check all the inputs before we start freeing or taking memory */ - ret = check_ragged_column(num_rows, derived_state, derived_state_offset); - if (ret != 0) { - goto out; - } - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_mutation_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - self->site = site; - self->node = node; - - ret = takeset_optional_id_column(num_rows, parent, &self->parent); - if (ret != 0) { - goto out; - } - if (time == NULL) { - /* Time defaults to unknown time if not specified. */ - self->time = tsk_malloc(num_rows * sizeof(*self->time)); - if (self->time == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (j = 0; j < num_rows; j++) { - self->time[j] = TSK_UNKNOWN_TIME; - } - - } else { - self->time = time; - } - - ret = takeset_ragged_column(num_rows, derived_state, derived_state_offset, - (void *) &self->derived_state, &self->derived_state_offset, - &self->derived_state_length); - if (ret != 0) { - goto out; - } - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_mutation_table_copy( - const tsk_mutation_table_t *self, tsk_mutation_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_mutation_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_mutation_table_set_columns(dest, self->num_rows, self->site, self->node, - self->parent, self->time, self->derived_state, self->derived_state_offset, - self->metadata, self->metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int -tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, - const double *time, const char *derived_state, - const tsk_size_t *derived_state_offset, const char *metadata, - const tsk_size_t *metadata_offset) -{ - int ret = 0; - - ret = tsk_mutation_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_append_columns(self, num_rows, site, node, parent, time, - derived_state, derived_state_offset, metadata, metadata_offset); -out: - return ret; -} - -bool -tsk_mutation_table_equals(const tsk_mutation_table_t *self, - const tsk_mutation_table_t *other, tsk_flags_t options) -{ - bool ret - = self->num_rows == other->num_rows - && self->derived_state_length == other->derived_state_length - && tsk_memcmp(self->site, other->site, self->num_rows * sizeof(tsk_id_t)) == 0 - && tsk_memcmp(self->node, other->node, self->num_rows * sizeof(tsk_id_t)) == 0 - && tsk_memcmp(self->parent, other->parent, self->num_rows * sizeof(tsk_id_t)) - == 0 - && tsk_memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0 - && tsk_memcmp(self->derived_state_offset, other->derived_state_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->derived_state, other->derived_state, - self->derived_state_length * sizeof(char)) - == 0; - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_mutation_table_clear(tsk_mutation_table_t *self) -{ - return tsk_mutation_table_truncate(self, 0); -} - -int -tsk_mutation_table_truncate(tsk_mutation_table_t *mutations, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > mutations->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - mutations->num_rows = num_rows; - mutations->derived_state_length = mutations->derived_state_offset[num_rows]; - mutations->metadata_length = mutations->metadata_offset[num_rows]; -out: - return ret; -} - -int -tsk_mutation_table_extend(tsk_mutation_table_t *self, const tsk_mutation_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_mutation_t mutation; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_mutation_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_mutation_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &mutation); - if (ret != 0) { - goto out; - } - ret_id = tsk_mutation_table_add_row(self, mutation.site, mutation.node, - mutation.parent, mutation.time, mutation.derived_state, - mutation.derived_state_length, mutation.metadata, mutation.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_mutation_table_print_state(const tsk_mutation_table_t *self, FILE *out) -{ - int ret; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "mutation_table: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "derived_state_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->derived_state_length, - (long long) self->max_derived_state_length, - (long long) self->max_derived_state_length_increment); - fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - ret = tsk_mutation_table_dump_text(self, out); - tsk_bug_assert(ret == 0); - tsk_bug_assert(self->derived_state_offset[0] == 0); - tsk_bug_assert( - self->derived_state_length == self->derived_state_offset[self->num_rows]); - tsk_bug_assert(self->metadata_offset[0] == 0); - tsk_bug_assert(self->metadata_length == self->metadata_offset[self->num_rows]); -} - -static inline void -tsk_mutation_table_get_row_unsafe( - const tsk_mutation_table_t *self, tsk_id_t index, tsk_mutation_t *row) -{ - row->id = (tsk_id_t) index; - row->site = self->site[index]; - row->node = self->node[index]; - row->parent = self->parent[index]; - row->time = self->time[index]; - row->derived_state_length - = self->derived_state_offset[index + 1] - self->derived_state_offset[index]; - row->derived_state = self->derived_state + self->derived_state_offset[index]; - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; - row->edge = TSK_NULL; -} - -int -tsk_mutation_table_get_row( - const tsk_mutation_table_t *self, tsk_id_t index, tsk_mutation_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_MUTATION_OUT_OF_BOUNDS; - goto out; - } - tsk_mutation_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_mutation_table_set_metadata_schema(tsk_mutation_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_mutation_table_dump_text(const tsk_mutation_table_t *self, FILE *out) -{ - int ret = TSK_ERR_IO; - int err; - tsk_size_t j, derived_state_len, metadata_len; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "id\tsite\tnode\tparent\ttime\tderived_state\tmetadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - derived_state_len - = self->derived_state_offset[j + 1] - self->derived_state_offset[j]; - metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%lld\t%lld\t%lld\t%lld\t%f\t%.*s\t%.*s\n", (long long) j, - (long long) self->site[j], (long long) self->node[j], - (long long) self->parent[j], self->time[j], (int) derived_state_len, - self->derived_state + self->derived_state_offset[j], (int) metadata_len, - self->metadata + self->metadata_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -int -tsk_mutation_table_keep_rows(tsk_mutation_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *ret_id_map) -{ - int ret = 0; - const tsk_size_t current_num_rows = self->num_rows; - tsk_size_t j, remaining_rows; - tsk_id_t pj; - tsk_id_t *id_map = ret_id_map; - tsk_id_t *restrict parent = self->parent; - - if (ret_id_map == NULL) { - id_map = tsk_malloc(current_num_rows * sizeof(*id_map)); - if (id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } - - keep_mask_to_id_map(current_num_rows, keep, id_map); - - /* Note: we could add some options to avoid these checks if we wanted. - * MAP_DELETED_TO_NULL is an obvious one, and I guess it might be - * helpful to also provide NO_REMAP to prevent reference remapping - * entirely. */ - for (j = 0; j < current_num_rows; j++) { - if (keep[j]) { - pj = parent[j]; - if (pj != TSK_NULL) { - if (pj < 0 || pj >= (tsk_id_t) current_num_rows) { - ret = TSK_ERR_MUTATION_OUT_OF_BOUNDS; - goto out; - } - if (id_map[pj] == TSK_NULL) { - ret = TSK_ERR_KEEP_ROWS_MAP_TO_DELETED; - goto out; - } - } - } - } - - remaining_rows = subset_id_column(self->site, current_num_rows, keep); - subset_id_column(self->node, current_num_rows, keep); - subset_remap_id_column(parent, current_num_rows, keep, id_map); - subset_double_column(self->time, current_num_rows, keep); - self->derived_state_length = subset_ragged_char_column( - self->derived_state, self->derived_state_offset, current_num_rows, keep); - if (self->metadata_length > 0) { - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, current_num_rows, keep); - } - self->num_rows = remaining_rows; -out: - if (ret_id_map == NULL) { - tsk_safe_free(id_map); - } - return ret; -} - -static int -tsk_mutation_table_dump( - const tsk_mutation_table_t *self, kastore_t *store, tsk_flags_t options) -{ - const write_table_col_t cols[] = { - { "mutations/site", (void *) self->site, self->num_rows, TSK_ID_STORAGE_TYPE }, - { "mutations/node", (void *) self->node, self->num_rows, TSK_ID_STORAGE_TYPE }, - { "mutations/parent", (void *) self->parent, self->num_rows, - TSK_ID_STORAGE_TYPE }, - { "mutations/time", (void *) self->time, self->num_rows, KAS_FLOAT64 }, - { "mutations/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "mutations/derived_state", (void *) self->derived_state, - self->derived_state_length, KAS_UINT8, self->derived_state_offset, - self->num_rows }, - { "mutations/metadata", (void *) self->metadata, self->metadata_length, - KAS_UINT8, self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table(store, cols, ragged_cols, options); -} - -static int -tsk_mutation_table_load(tsk_mutation_table_t *self, kastore_t *store) -{ - int ret = 0; - tsk_id_t *node = NULL; - tsk_id_t *site = NULL; - tsk_id_t *parent = NULL; - double *time = NULL; - char *derived_state = NULL; - tsk_size_t *derived_state_offset = NULL; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - char *metadata_schema = NULL; - tsk_size_t num_rows, derived_state_length, metadata_length, metadata_schema_length; - - read_table_col_t cols[] = { - { "mutations/site", (void **) &site, TSK_ID_STORAGE_TYPE, 0 }, - { "mutations/node", (void **) &node, TSK_ID_STORAGE_TYPE, 0 }, - { "mutations/parent", (void **) &parent, TSK_ID_STORAGE_TYPE, 0 }, - { "mutations/time", (void **) &time, KAS_FLOAT64, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - read_table_ragged_col_t ragged_cols[] = { - { "mutations/derived_state", (void **) &derived_state, &derived_state_length, - KAS_UINT8, &derived_state_offset, 0 }, - { "mutations/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, 0 }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "mutations/metadata_schema", (void **) &metadata_schema, - &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_mutation_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_mutation_table_takeset_columns(self, num_rows, site, node, parent, time, - derived_state, derived_state_offset, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - site = NULL; - node = NULL; - parent = NULL; - time = NULL; - derived_state = NULL; - derived_state_offset = NULL; - metadata = NULL; - metadata_offset = NULL; - -out: - free_read_table_mem(cols, ragged_cols, properties); - return ret; -} - -/************************* - * migration table - *************************/ - -static void -tsk_migration_table_free_columns(tsk_migration_table_t *self) -{ - tsk_safe_free(self->left); - tsk_safe_free(self->right); - tsk_safe_free(self->node); - tsk_safe_free(self->source); - tsk_safe_free(self->dest); - tsk_safe_free(self->time); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_migration_table_free(tsk_migration_table_t *self) -{ - tsk_migration_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_migration_table_expand_main_columns( - tsk_migration_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->left, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->right, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->node, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->source, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->dest, new_max_rows, sizeof(tsk_id_t)); - if (ret != 0) { - goto out; - } - ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_migration_table_expand_metadata( - tsk_migration_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_migration_table_set_max_rows_increment( - tsk_migration_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_migration_table_set_max_metadata_length_increment( - tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = max_metadata_length_increment; - return 0; -} - -int -tsk_migration_table_init(tsk_migration_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_migration_table_t)); - - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_migration_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->metadata_offset[0] = 0; - self->max_rows_increment = 0; - self->max_metadata_length_increment = 0; - tsk_migration_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -int -tsk_migration_table_append_columns(tsk_migration_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *node, - const tsk_id_t *source, const tsk_id_t *dest, const double *time, - const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - tsk_size_t j, metadata_length; - - if (left == NULL || right == NULL || node == NULL || source == NULL || dest == NULL - || time == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if ((metadata == NULL) != (metadata_offset == NULL)) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - ret = tsk_migration_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->left + self->num_rows, left, num_rows * sizeof(double)); - tsk_memcpy(self->right + self->num_rows, right, num_rows * sizeof(double)); - tsk_memcpy(self->node + self->num_rows, node, num_rows * sizeof(tsk_id_t)); - tsk_memcpy(self->source + self->num_rows, source, num_rows * sizeof(tsk_id_t)); - tsk_memcpy(self->dest + self->num_rows, dest, num_rows * sizeof(tsk_id_t)); - tsk_memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); - if (metadata == NULL) { - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; - } - } else { - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = (tsk_size_t) self->metadata_length + metadata_offset[j]; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_migration_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - self->metadata_length += metadata_length; - } - - self->num_rows += num_rows; - self->metadata_offset[self->num_rows] = self->metadata_length; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_migration_table_takeset_columns(tsk_migration_table_t *self, tsk_size_t num_rows, - double *left, double *right, tsk_id_t *node, tsk_id_t *source, tsk_id_t *dest, - double *time, char *metadata, tsk_size_t *metadata_offset) -{ - int ret = 0; - - if (left == NULL || right == NULL || node == NULL || source == NULL || dest == NULL - || time == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - /* We need to check all the inputs before we start freeing or taking memory */ - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_migration_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - self->left = left; - self->right = right; - self->node = node; - self->source = source; - self->dest = dest; - self->time = time; - - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_migration_table_copy( - const tsk_migration_table_t *self, tsk_migration_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_migration_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_migration_table_set_columns(dest, self->num_rows, self->left, self->right, - self->node, self->source, self->dest, self->time, self->metadata, - self->metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int -tsk_migration_table_set_columns(tsk_migration_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *node, - const tsk_id_t *source, const tsk_id_t *dest, const double *time, - const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - - ret = tsk_migration_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_append_columns(self, num_rows, left, right, node, source, - dest, time, metadata, metadata_offset); -out: - return ret; -} - -tsk_id_t -tsk_migration_table_add_row(tsk_migration_table_t *self, double left, double right, - tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time, const char *metadata, - tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - - ret = tsk_migration_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - - tsk_bug_assert(self->num_rows < self->max_rows); - tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); - tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); - self->left[self->num_rows] = left; - self->right[self->num_rows] = right; - self->node[self->num_rows] = node; - self->source[self->num_rows] = source; - self->dest[self->num_rows] = dest; - self->time[self->num_rows] = time; - self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; - self->metadata_length += metadata_length; - - ret = (tsk_id_t) self->num_rows; - self->num_rows++; -out: - return ret; -} - -static int -tsk_migration_table_update_row_rewrite(tsk_migration_table_t *self, tsk_id_t index, - double left, double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, - double time, const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_migration_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_migration_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_migration_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_migration_table_add_row( - self, left, right, node, source, dest, time, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_migration_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_migration_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_migration_table_update_row(tsk_migration_table_t *self, tsk_id_t index, double left, - double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time, - const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_migration_t current_row; - - ret = tsk_migration_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.metadata_length == metadata_length) { - self->left[index] = left; - self->right[index] = right; - self->node[index] = node; - self->source[index] = source; - self->dest[index] = dest; - self->time[index] = time; - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } else { - ret = tsk_migration_table_update_row_rewrite(self, index, left, right, node, - source, dest, time, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_migration_table_clear(tsk_migration_table_t *self) -{ - return tsk_migration_table_truncate(self, 0); -} - -int -tsk_migration_table_truncate(tsk_migration_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - self->metadata_length = self->metadata_offset[num_rows]; -out: - return ret; -} - -int -tsk_migration_table_extend(tsk_migration_table_t *self, - const tsk_migration_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, - tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_migration_t migration; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_migration_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_migration_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &migration); - if (ret != 0) { - goto out; - } - ret_id = tsk_migration_table_add_row(self, migration.left, migration.right, - migration.node, migration.source, migration.dest, migration.time, - migration.metadata, migration.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_migration_table_print_state(const tsk_migration_table_t *self, FILE *out) -{ - int ret; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "migration_table: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - ret = tsk_migration_table_dump_text(self, out); - tsk_bug_assert(ret == 0); -} - -static inline void -tsk_migration_table_get_row_unsafe( - const tsk_migration_table_t *self, tsk_id_t index, tsk_migration_t *row) -{ - row->id = (tsk_id_t) index; - row->left = self->left[index]; - row->right = self->right[index]; - row->node = self->node[index]; - row->source = self->source[index]; - row->dest = self->dest[index]; - row->time = self->time[index]; - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; -} - -int -tsk_migration_table_get_row( - const tsk_migration_table_t *self, tsk_id_t index, tsk_migration_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_MIGRATION_OUT_OF_BOUNDS; - goto out; - } - tsk_migration_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_migration_table_set_metadata_schema(tsk_migration_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_migration_table_dump_text(const tsk_migration_table_t *self, FILE *out) -{ - tsk_size_t j; - int ret = TSK_ERR_IO; - tsk_size_t metadata_len; - int err; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "left\tright\tnode\tsource\tdest\ttime\tmetadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%.3f\t%.3f\t%lld\t%lld\t%lld\t%f\t%.*s\n", self->left[j], - self->right[j], (long long) self->node[j], (long long) self->source[j], - (long long) self->dest[j], self->time[j], (int) metadata_len, - self->metadata + self->metadata_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -bool -tsk_migration_table_equals(const tsk_migration_table_t *self, - const tsk_migration_table_t *other, tsk_flags_t options) -{ - bool ret - = self->num_rows == other->num_rows - && tsk_memcmp(self->left, other->left, self->num_rows * sizeof(double)) == 0 - && tsk_memcmp(self->right, other->right, self->num_rows * sizeof(double)) == 0 - && tsk_memcmp(self->node, other->node, self->num_rows * sizeof(tsk_id_t)) == 0 - && tsk_memcmp(self->source, other->source, self->num_rows * sizeof(tsk_id_t)) - == 0 - && tsk_memcmp(self->dest, other->dest, self->num_rows * sizeof(tsk_id_t)) == 0 - && tsk_memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0; - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_migration_table_keep_rows(tsk_migration_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) -{ - int ret = 0; - tsk_size_t remaining_rows; - - if (id_map != NULL) { - keep_mask_to_id_map(self->num_rows, keep, id_map); - } - - remaining_rows = subset_double_column(self->left, self->num_rows, keep); - subset_double_column(self->right, self->num_rows, keep); - subset_id_column(self->node, self->num_rows, keep); - subset_id_column(self->source, self->num_rows, keep); - subset_id_column(self->dest, self->num_rows, keep); - subset_double_column(self->time, self->num_rows, keep); - if (self->metadata_length > 0) { - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, self->num_rows, keep); - } - self->num_rows = remaining_rows; - return ret; -} - -static int -tsk_migration_table_dump( - const tsk_migration_table_t *self, kastore_t *store, tsk_flags_t options) -{ - const write_table_col_t cols[] = { - { "migrations/left", (void *) self->left, self->num_rows, KAS_FLOAT64 }, - { "migrations/right", (void *) self->right, self->num_rows, KAS_FLOAT64 }, - { "migrations/node", (void *) self->node, self->num_rows, TSK_ID_STORAGE_TYPE }, - { "migrations/source", (void *) self->source, self->num_rows, - TSK_ID_STORAGE_TYPE }, - { "migrations/dest", (void *) self->dest, self->num_rows, TSK_ID_STORAGE_TYPE }, - { "migrations/time", (void *) self->time, self->num_rows, KAS_FLOAT64 }, - { "migrations/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "migrations/metadata", (void *) self->metadata, self->metadata_length, - KAS_UINT8, self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table(store, cols, ragged_cols, options); -} - -static int -tsk_migration_table_load(tsk_migration_table_t *self, kastore_t *store) -{ - int ret = 0; - tsk_id_t *source = NULL; - tsk_id_t *dest = NULL; - tsk_id_t *node = NULL; - double *left = NULL; - double *right = NULL; - double *time = NULL; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - char *metadata_schema = NULL; - tsk_size_t num_rows, metadata_length, metadata_schema_length; - - read_table_col_t cols[] = { - { "migrations/left", (void **) &left, KAS_FLOAT64, 0 }, - { "migrations/right", (void **) &right, KAS_FLOAT64, 0 }, - { "migrations/node", (void **) &node, TSK_ID_STORAGE_TYPE, 0 }, - { "migrations/source", (void **) &source, TSK_ID_STORAGE_TYPE, 0 }, - { "migrations/dest", (void **) &dest, TSK_ID_STORAGE_TYPE, 0 }, - { "migrations/time", (void **) &time, KAS_FLOAT64, 0 }, - { .name = NULL }, - }; - read_table_ragged_col_t ragged_cols[] = { - { "migrations/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "migrations/metadata_schema", (void **) &metadata_schema, - &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_migration_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_migration_table_takeset_columns(self, num_rows, left, right, node, source, - dest, time, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - left = NULL; - right = NULL; - node = NULL; - source = NULL; - dest = NULL; - time = NULL; - metadata = NULL; - metadata_offset = NULL; - -out: - free_read_table_mem(cols, ragged_cols, properties); - return ret; -} - -/************************* - * population table - *************************/ - -static void -tsk_population_table_free_columns(tsk_population_table_t *self) -{ - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_offset); -} - -int -tsk_population_table_free(tsk_population_table_t *self) -{ - tsk_population_table_free_columns(self); - tsk_safe_free(self->metadata_schema); - return 0; -} - -static int -tsk_population_table_expand_main_columns( - tsk_population_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column( - (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_population_table_expand_metadata( - tsk_population_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->metadata_length, additional_length, - self->max_metadata_length_increment, &self->max_metadata_length, - (void **) &self->metadata, sizeof(*self->metadata)); -} - -int -tsk_population_table_set_max_rows_increment( - tsk_population_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_population_table_set_max_metadata_length_increment( - tsk_population_table_t *self, tsk_size_t max_metadata_length_increment) -{ - self->max_metadata_length_increment = max_metadata_length_increment; - return 0; -} - -int -tsk_population_table_init(tsk_population_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_population_table_t)); - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_metadata_length_increment = 1; - ret = tsk_population_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_expand_metadata(self, 1); - if (ret != 0) { - goto out; - } - self->metadata_offset[0] = 0; - self->max_rows_increment = 0; - self->max_metadata_length_increment = 0; - tsk_population_table_set_metadata_schema(self, NULL, 0); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_population_table_copy(const tsk_population_table_t *self, - tsk_population_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_population_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_population_table_set_columns( - dest, self->num_rows, self->metadata, self->metadata_offset); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); -out: - return ret; -} - -int -tsk_population_table_set_columns(tsk_population_table_t *self, tsk_size_t num_rows, - const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - - ret = tsk_population_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_append_columns(self, num_rows, metadata, metadata_offset); -out: - return ret; -} - -int -tsk_population_table_append_columns(tsk_population_table_t *self, tsk_size_t num_rows, - const char *metadata, const tsk_size_t *metadata_offset) -{ - int ret; - tsk_size_t j, metadata_length; - - if (metadata == NULL || metadata_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = tsk_population_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - - ret = check_offsets(num_rows, metadata_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->metadata_offset[self->num_rows + j] - = self->metadata_length + metadata_offset[j]; - } - metadata_length = metadata_offset[num_rows]; - ret = tsk_population_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->metadata + self->metadata_length, metadata, - metadata_length * sizeof(char)); - self->metadata_length += metadata_length; - - self->num_rows += num_rows; - self->metadata_offset[self->num_rows] = self->metadata_length; -out: - return ret; -} - -int -tsk_population_table_takeset_columns(tsk_population_table_t *self, tsk_size_t num_rows, - char *metadata, tsk_size_t *metadata_offset) -{ - int ret = 0; - - /* We need to check all the inputs before we start freeing or taking memory */ - if (metadata == NULL || metadata_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = check_ragged_column(num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - - tsk_population_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - - ret = takeset_ragged_column(num_rows, metadata, metadata_offset, - (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static tsk_id_t -tsk_population_table_add_row_internal( - tsk_population_table_t *self, const char *metadata, tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - - tsk_bug_assert(self->num_rows < self->max_rows); - tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); - tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); - self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; - self->metadata_length += metadata_length; - ret = (tsk_id_t) self->num_rows; - self->num_rows++; - return ret; -} - -tsk_id_t -tsk_population_table_add_row( - tsk_population_table_t *self, const char *metadata, tsk_size_t metadata_length) -{ - tsk_id_t ret = 0; - - ret = tsk_population_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_expand_metadata(self, metadata_length); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_add_row_internal(self, metadata, metadata_length); -out: - return ret; -} - -static int -tsk_population_table_update_row_rewrite(tsk_population_table_t *self, tsk_id_t index, - const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_population_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_population_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_population_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_population_table_add_row(self, metadata, metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_population_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_population_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_population_table_update_row(tsk_population_table_t *self, tsk_id_t index, - const char *metadata, tsk_size_t metadata_length) -{ - int ret = 0; - tsk_population_t current_row; - - ret = tsk_population_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.metadata_length == metadata_length) { - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, - metadata_length * sizeof(*metadata)); - } else { - ret = tsk_population_table_update_row_rewrite( - self, index, metadata, metadata_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_population_table_clear(tsk_population_table_t *self) -{ - return tsk_population_table_truncate(self, 0); -} - -int -tsk_population_table_truncate(tsk_population_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - self->metadata_length = self->metadata_offset[num_rows]; -out: - return ret; -} - -int -tsk_population_table_extend(tsk_population_table_t *self, - const tsk_population_table_t *other, tsk_size_t num_rows, - const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_population_t population; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_population_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_population_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &population); - if (ret != 0) { - goto out; - } - ret_id = tsk_population_table_add_row( - self, population.metadata, population.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_population_table_print_state(const tsk_population_table_t *self, FILE *out) -{ - tsk_size_t j, k; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "population_table: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->metadata_length, (long long) self->max_metadata_length, - (long long) self->max_metadata_length_increment); - fprintf(out, TABLE_SEP); - write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - fprintf(out, "index\tmetadata_offset\tmetadata\n"); - for (j = 0; j < self->num_rows; j++) { - fprintf( - out, "%lld\t%lld\t", (long long) j, (long long) self->metadata_offset[j]); - for (k = self->metadata_offset[j]; k < self->metadata_offset[j + 1]; k++) { - fprintf(out, "%c", self->metadata[k]); - } - fprintf(out, "\n"); - } - tsk_bug_assert(self->metadata_offset[0] == 0); - tsk_bug_assert(self->metadata_offset[self->num_rows] == self->metadata_length); -} - -static inline void -tsk_population_table_get_row_unsafe( - const tsk_population_table_t *self, tsk_id_t index, tsk_population_t *row) -{ - row->id = (tsk_id_t) index; - row->metadata_length - = self->metadata_offset[index + 1] - self->metadata_offset[index]; - row->metadata = self->metadata + self->metadata_offset[index]; -} - -int -tsk_population_table_get_row( - const tsk_population_table_t *self, tsk_id_t index, tsk_population_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; - } - tsk_population_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_population_table_set_metadata_schema(tsk_population_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_population_table_dump_text(const tsk_population_table_t *self, FILE *out) -{ - int ret = TSK_ERR_IO; - int err; - tsk_size_t j; - tsk_size_t metadata_len; - - err = write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - if (err < 0) { - goto out; - } - err = fprintf(out, "metadata\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%.*s\n", (int) metadata_len, - self->metadata + self->metadata_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -bool -tsk_population_table_equals(const tsk_population_table_t *self, - const tsk_population_table_t *other, tsk_flags_t options) -{ - /* Since we only have the metadata column in the table currently, equality - * reduces to comparing the number of rows if we disable metadata comparison. - */ - bool ret = self->num_rows == other->num_rows; - if (!(options & TSK_CMP_IGNORE_METADATA)) { - ret = ret && self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata_offset, other->metadata_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_population_table_keep_rows(tsk_population_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) -{ - int ret = 0; - - if (id_map != NULL) { - keep_mask_to_id_map(self->num_rows, keep, id_map); - } - - if (self->metadata_length > 0) { - self->metadata_length = subset_ragged_char_column( - self->metadata, self->metadata_offset, self->num_rows, keep); - } - self->num_rows = count_true(self->num_rows, keep); - return ret; -} - -static int -tsk_population_table_dump( - const tsk_population_table_t *self, kastore_t *store, tsk_flags_t options) -{ - const write_table_col_t cols[] = { - { "populations/metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - const write_table_ragged_col_t ragged_cols[] = { - { "populations/metadata", (void *) self->metadata, self->metadata_length, - KAS_UINT8, self->metadata_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table(store, cols, ragged_cols, options); -} - -static int -tsk_population_table_load(tsk_population_table_t *self, kastore_t *store) -{ - int ret = 0; - char *metadata = NULL; - tsk_size_t *metadata_offset = NULL; - char *metadata_schema = NULL; - tsk_size_t num_rows, metadata_length, metadata_schema_length; - - read_table_ragged_col_t ragged_cols[] = { - { "populations/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, - &metadata_offset, 0 }, - { .name = NULL }, - }; - read_table_property_t properties[] = { - { "populations/metadata_schema", (void **) &metadata_schema, - &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, NULL, ragged_cols, properties, 0); - if (ret != 0) { - goto out; - } - if (metadata_schema != NULL) { - ret = tsk_population_table_set_metadata_schema( - self, metadata_schema, metadata_schema_length); - if (ret != 0) { - goto out; - } - } - ret = tsk_population_table_takeset_columns( - self, num_rows, metadata, metadata_offset); - if (ret != 0) { - goto out; - } - metadata = NULL; - metadata_offset = NULL; - -out: - free_read_table_mem(NULL, ragged_cols, properties); - return ret; -} - -/************************* - * provenance table - *************************/ - -static void -tsk_provenance_table_free_columns(tsk_provenance_table_t *self) -{ - tsk_safe_free(self->timestamp); - tsk_safe_free(self->timestamp_offset); - tsk_safe_free(self->record); - tsk_safe_free(self->record_offset); -} - -int -tsk_provenance_table_free(tsk_provenance_table_t *self) -{ - tsk_provenance_table_free_columns(self); - return 0; -} - -static int -tsk_provenance_table_expand_main_columns( - tsk_provenance_table_t *self, tsk_size_t additional_rows) -{ - int ret = 0; - tsk_size_t new_max_rows; - - ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, - additional_rows, &new_max_rows); - if (ret != 0) { - goto out; - } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column( - (void **) &self->timestamp_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - ret = expand_column( - (void **) &self->record_offset, new_max_rows + 1, sizeof(tsk_size_t)); - if (ret != 0) { - goto out; - } - self->max_rows = new_max_rows; - } -out: - return ret; -} - -static int -tsk_provenance_table_expand_timestamp( - tsk_provenance_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->timestamp_length, additional_length, - self->max_timestamp_length_increment, &self->max_timestamp_length, - (void **) &self->timestamp, sizeof(*self->timestamp)); -} - -static int -tsk_provenance_table_expand_record( - tsk_provenance_table_t *self, tsk_size_t additional_length) -{ - return expand_ragged_column(self->record_length, additional_length, - self->max_record_length_increment, &self->max_record_length, - (void **) &self->record, sizeof(*self->record)); -} - -int -tsk_provenance_table_set_max_rows_increment( - tsk_provenance_table_t *self, tsk_size_t max_rows_increment) -{ - self->max_rows_increment = max_rows_increment; - return 0; -} - -int -tsk_provenance_table_set_max_timestamp_length_increment( - tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment) -{ - self->max_timestamp_length_increment = max_timestamp_length_increment; - return 0; -} - -int -tsk_provenance_table_set_max_record_length_increment( - tsk_provenance_table_t *self, tsk_size_t max_record_length_increment) -{ - self->max_record_length_increment = max_record_length_increment; - return 0; -} - -int -tsk_provenance_table_init(tsk_provenance_table_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(tsk_provenance_table_t)); - /* Allocate space for one row initially, ensuring we always have valid pointers - * even if the table is empty */ - self->max_rows_increment = 1; - self->max_timestamp_length_increment = 1; - self->max_record_length_increment = 1; - ret = tsk_provenance_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_expand_timestamp(self, 1); - if (ret != 0) { - goto out; - } - self->timestamp_offset[0] = 0; - ret = tsk_provenance_table_expand_record(self, 1); - if (ret != 0) { - goto out; - } - self->record_offset[0] = 0; - self->max_rows_increment = 0; - self->max_timestamp_length_increment = 0; - self->max_record_length_increment = 0; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_provenance_table_copy(const tsk_provenance_table_t *self, - tsk_provenance_table_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_provenance_table_init(dest, 0); - if (ret != 0) { - goto out; - } - } - ret = tsk_provenance_table_set_columns(dest, self->num_rows, self->timestamp, - self->timestamp_offset, self->record, self->record_offset); -out: - return ret; -} - -int -tsk_provenance_table_set_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, - const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, - const tsk_size_t *record_offset) -{ - int ret; - - ret = tsk_provenance_table_clear(self); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_append_columns( - self, num_rows, timestamp, timestamp_offset, record, record_offset); -out: - return ret; -} - -int -tsk_provenance_table_append_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, - const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, - const tsk_size_t *record_offset) -{ - int ret; - tsk_size_t j, timestamp_length, record_length; - - if (timestamp == NULL || timestamp_offset == NULL || record == NULL - || record_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = tsk_provenance_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - - ret = check_offsets(num_rows, timestamp_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->timestamp_offset[self->num_rows + j] - = self->timestamp_length + timestamp_offset[j]; - } - timestamp_length = timestamp_offset[num_rows]; - ret = tsk_provenance_table_expand_timestamp(self, timestamp_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->timestamp + self->timestamp_length, timestamp, - timestamp_length * sizeof(char)); - self->timestamp_length += timestamp_length; - - ret = check_offsets(num_rows, record_offset, 0, false); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - self->record_offset[self->num_rows + j] = self->record_length + record_offset[j]; - } - record_length = record_offset[num_rows]; - ret = tsk_provenance_table_expand_record(self, record_length); - if (ret != 0) { - goto out; - } - tsk_memcpy(self->record + self->record_length, record, record_length * sizeof(char)); - self->record_length += record_length; - - self->num_rows += num_rows; - self->timestamp_offset[self->num_rows] = self->timestamp_length; - self->record_offset[self->num_rows] = self->record_length; -out: - return ret; -} - -int -tsk_provenance_table_takeset_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, - char *timestamp, tsk_size_t *timestamp_offset, char *record, - tsk_size_t *record_offset) -{ - int ret = 0; - - /* We need to check all the inputs before we start freeing or taking memory */ - if (timestamp == NULL || timestamp_offset == NULL || record == NULL - || record_offset == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - ret = check_ragged_column(num_rows, timestamp, timestamp_offset); - if (ret != 0) { - goto out; - } - ret = check_ragged_column(num_rows, record, record_offset); - if (ret != 0) { - goto out; - } - - tsk_provenance_table_free_columns(self); - self->num_rows = num_rows; - self->max_rows = num_rows; - - ret = takeset_ragged_column(num_rows, timestamp, timestamp_offset, - (void *) &self->timestamp, &self->timestamp_offset, &self->timestamp_length); - if (ret != 0) { - goto out; - } - ret = takeset_ragged_column(num_rows, record, record_offset, (void *) &self->record, - &self->record_offset, &self->record_length); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static tsk_id_t -tsk_provenance_table_add_row_internal(tsk_provenance_table_t *self, - const char *timestamp, tsk_size_t timestamp_length, const char *record, - tsk_size_t record_length) -{ - tsk_id_t ret = 0; - - tsk_bug_assert(self->num_rows < self->max_rows); - tsk_bug_assert( - self->timestamp_length + timestamp_length <= self->max_timestamp_length); - tsk_memmove(self->timestamp + self->timestamp_length, timestamp, timestamp_length); - self->timestamp_offset[self->num_rows + 1] - = self->timestamp_length + timestamp_length; - self->timestamp_length += timestamp_length; - tsk_bug_assert(self->record_length + record_length <= self->max_record_length); - tsk_memmove(self->record + self->record_length, record, record_length); - self->record_offset[self->num_rows + 1] = self->record_length + record_length; - self->record_length += record_length; - ret = (tsk_id_t) self->num_rows; - self->num_rows++; - return ret; -} - -tsk_id_t -tsk_provenance_table_add_row(tsk_provenance_table_t *self, const char *timestamp, - tsk_size_t timestamp_length, const char *record, tsk_size_t record_length) -{ - tsk_id_t ret = 0; - - ret = tsk_provenance_table_expand_main_columns(self, 1); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_expand_timestamp(self, timestamp_length); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_expand_record(self, record_length); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_add_row_internal( - self, timestamp, timestamp_length, record, record_length); -out: - return ret; -} - -static int -tsk_provenance_table_update_row_rewrite(tsk_provenance_table_t *self, tsk_id_t index, - const char *timestamp, tsk_size_t timestamp_length, const char *record, - tsk_size_t record_length) -{ - int ret = 0; - tsk_id_t j, ret_id; - tsk_provenance_table_t copy; - tsk_size_t num_rows; - tsk_id_t *rows = NULL; - - ret = tsk_provenance_table_copy(self, ©, 0); - if (ret != 0) { - goto out; - } - rows = tsk_malloc(self->num_rows * sizeof(*rows)); - if (rows == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_provenance_table_truncate(self, (tsk_size_t) index); - tsk_bug_assert(ret == 0); - ret_id = tsk_provenance_table_add_row( - self, timestamp, timestamp_length, record, record_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_rows = 0; - for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { - rows[num_rows] = j; - num_rows++; - } - ret = tsk_provenance_table_extend(self, ©, num_rows, rows, 0); - if (ret != 0) { - goto out; - } -out: - tsk_provenance_table_free(©); - tsk_safe_free(rows); - return ret; -} - -int -tsk_provenance_table_update_row(tsk_provenance_table_t *self, tsk_id_t index, - const char *timestamp, tsk_size_t timestamp_length, const char *record, - tsk_size_t record_length) -{ - int ret = 0; - tsk_provenance_t current_row; - - ret = tsk_provenance_table_get_row(self, index, ¤t_row); - if (ret != 0) { - goto out; - } - if (current_row.timestamp_length == timestamp_length - && current_row.record_length == record_length) { - /* Note: important to use tsk_memmove here as we may be provided pointers - * to the column memory as input via get_row */ - tsk_memmove(&self->timestamp[self->timestamp_offset[index]], timestamp, - timestamp_length * sizeof(*timestamp)); - tsk_memmove(&self->record[self->record_offset[index]], record, - record_length * sizeof(*record)); - } else { - ret = tsk_provenance_table_update_row_rewrite( - self, index, timestamp, timestamp_length, record, record_length); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_provenance_table_clear(tsk_provenance_table_t *self) -{ - return tsk_provenance_table_truncate(self, 0); -} - -int -tsk_provenance_table_truncate(tsk_provenance_table_t *self, tsk_size_t num_rows) -{ - int ret = 0; - - if (num_rows > self->num_rows) { - ret = TSK_ERR_BAD_TABLE_POSITION; - goto out; - } - self->num_rows = num_rows; - self->timestamp_length = self->timestamp_offset[num_rows]; - self->record_length = self->record_offset[num_rows]; -out: - return ret; -} - -int -tsk_provenance_table_extend(tsk_provenance_table_t *self, - const tsk_provenance_table_t *other, tsk_size_t num_rows, - const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_provenance_t provenance; - - if (self == other) { - ret = TSK_ERR_CANNOT_EXTEND_FROM_SELF; - goto out; - } - - /* We know how much to expand the non-ragged columns, so do it ahead of time */ - ret = tsk_provenance_table_expand_main_columns(self, num_rows); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_rows; j++) { - ret = tsk_provenance_table_get_row( - other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &provenance); - if (ret != 0) { - goto out; - } - ret_id = tsk_provenance_table_add_row(self, provenance.timestamp, - provenance.timestamp_length, provenance.record, provenance.record_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - return ret; -} - -void -tsk_provenance_table_print_state(const tsk_provenance_table_t *self, FILE *out) -{ - tsk_size_t j, k; - - fprintf(out, "\n" TABLE_SEP); - fprintf(out, "provenance_table: %p:\n", (const void *) self); - fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->num_rows, (long long) self->max_rows, - (long long) self->max_rows_increment); - fprintf(out, "timestamp_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->timestamp_length, (long long) self->max_timestamp_length, - (long long) self->max_timestamp_length_increment); - fprintf(out, "record_length = %lld\tmax= %lld\tincrement = %lld)\n", - (long long) self->record_length, (long long) self->max_record_length, - (long long) self->max_record_length_increment); - fprintf(out, TABLE_SEP); - fprintf(out, "index\ttimestamp_offset\ttimestamp\trecord_offset\tprovenance\n"); - for (j = 0; j < self->num_rows; j++) { - fprintf( - out, "%lld\t%lld\t", (long long) j, (long long) self->timestamp_offset[j]); - for (k = self->timestamp_offset[j]; k < self->timestamp_offset[j + 1]; k++) { - fprintf(out, "%c", self->timestamp[k]); - } - fprintf(out, "\t%lld\t", (long long) self->record_offset[j]); - for (k = self->record_offset[j]; k < self->record_offset[j + 1]; k++) { - fprintf(out, "%c", self->record[k]); - } - fprintf(out, "\n"); - } - tsk_bug_assert(self->timestamp_offset[0] == 0); - tsk_bug_assert(self->timestamp_offset[self->num_rows] == self->timestamp_length); - tsk_bug_assert(self->record_offset[0] == 0); - tsk_bug_assert(self->record_offset[self->num_rows] == self->record_length); -} - -static inline void -tsk_provenance_table_get_row_unsafe( - const tsk_provenance_table_t *self, tsk_id_t index, tsk_provenance_t *row) -{ - row->id = (tsk_id_t) index; - row->timestamp_length - = self->timestamp_offset[index + 1] - self->timestamp_offset[index]; - row->timestamp = self->timestamp + self->timestamp_offset[index]; - row->record_length = self->record_offset[index + 1] - self->record_offset[index]; - row->record = self->record + self->record_offset[index]; -} - -int -tsk_provenance_table_get_row( - const tsk_provenance_table_t *self, tsk_id_t index, tsk_provenance_t *row) -{ - int ret = 0; - - if (index < 0 || index >= (tsk_id_t) self->num_rows) { - ret = TSK_ERR_PROVENANCE_OUT_OF_BOUNDS; - goto out; - } - tsk_provenance_table_get_row_unsafe(self, index, row); -out: - return ret; -} - -int -tsk_provenance_table_dump_text(const tsk_provenance_table_t *self, FILE *out) -{ - int ret = TSK_ERR_IO; - int err; - tsk_size_t j, timestamp_len, record_len; - - err = fprintf(out, "record\ttimestamp\n"); - if (err < 0) { - goto out; - } - for (j = 0; j < self->num_rows; j++) { - record_len = self->record_offset[j + 1] - self->record_offset[j]; - timestamp_len = self->timestamp_offset[j + 1] - self->timestamp_offset[j]; - err = fprintf(out, "%.*s\t%.*s\n", (int) record_len, - self->record + self->record_offset[j], (int) timestamp_len, - self->timestamp + self->timestamp_offset[j]); - if (err < 0) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -bool -tsk_provenance_table_equals(const tsk_provenance_table_t *self, - const tsk_provenance_table_t *other, tsk_flags_t options) -{ - bool ret - = self->num_rows == other->num_rows - && self->record_length == other->record_length - && tsk_memcmp(self->record_offset, other->record_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->record, other->record, self->record_length * sizeof(char)) - == 0; - if (!(options & TSK_CMP_IGNORE_TIMESTAMPS)) { - ret = ret && self->timestamp_length == other->timestamp_length - && tsk_memcmp(self->timestamp_offset, other->timestamp_offset, - (self->num_rows + 1) * sizeof(tsk_size_t)) - == 0 - && tsk_memcmp(self->timestamp, other->timestamp, - self->timestamp_length * sizeof(char)) - == 0; - } - return ret; -} - -int -tsk_provenance_table_keep_rows(tsk_provenance_table_t *self, const tsk_bool_t *keep, - tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) -{ - int ret = 0; - - if (id_map != NULL) { - keep_mask_to_id_map(self->num_rows, keep, id_map); - } - self->timestamp_length = subset_ragged_char_column( - self->timestamp, self->timestamp_offset, self->num_rows, keep); - self->record_length = subset_ragged_char_column( - self->record, self->record_offset, self->num_rows, keep); - self->num_rows = count_true(self->num_rows, keep); - - return ret; -} - -static int -tsk_provenance_table_dump( - const tsk_provenance_table_t *self, kastore_t *store, tsk_flags_t options) -{ - write_table_ragged_col_t ragged_cols[] = { - { "provenances/timestamp", (void *) self->timestamp, self->timestamp_length, - KAS_UINT8, self->timestamp_offset, self->num_rows }, - { "provenances/record", (void *) self->record, self->record_length, KAS_UINT8, - self->record_offset, self->num_rows }, - { .name = NULL }, - }; - - return write_table_ragged_cols(store, ragged_cols, options); -} - -static int -tsk_provenance_table_load(tsk_provenance_table_t *self, kastore_t *store) -{ - int ret; - char *timestamp = NULL; - tsk_size_t *timestamp_offset = NULL; - char *record = NULL; - tsk_size_t *record_offset = NULL; - tsk_size_t num_rows, timestamp_length, record_length; - - read_table_ragged_col_t ragged_cols[] = { - { "provenances/timestamp", (void **) ×tamp, ×tamp_length, KAS_UINT8, - ×tamp_offset, 0 }, - { "provenances/record", (void **) &record, &record_length, KAS_UINT8, - &record_offset, 0 }, - { .name = NULL }, - }; - - ret = read_table(store, &num_rows, NULL, ragged_cols, NULL, 0); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_takeset_columns( - self, num_rows, timestamp, timestamp_offset, record, record_offset); - if (ret != 0) { - goto out; - } - timestamp = NULL; - timestamp_offset = NULL; - record = NULL; - record_offset = NULL; - -out: - free_read_table_mem(NULL, ragged_cols, NULL); - return ret; -} - -/************************* - * sort_tables - *************************/ - -typedef struct { - double left; - double right; - tsk_id_t parent; - tsk_id_t child; - double time; - /* It would be a little bit more convenient to store a pointer to the - * metadata here in the struct rather than an offset back into the - * original array. However, this would increase the size of the struct - * from 40 bytes to 48 and we will allocate very large numbers of these. - */ - tsk_size_t metadata_offset; - tsk_size_t metadata_length; -} edge_sort_t; - -typedef struct { - tsk_mutation_t mut; - int num_descendants; -} mutation_canonical_sort_t; - -typedef struct { - tsk_individual_t ind; - tsk_id_t first_node; - tsk_size_t num_descendants; -} individual_canonical_sort_t; - -typedef struct { - double left; - double right; - tsk_id_t node; - tsk_id_t source; - tsk_id_t dest; - double time; - tsk_size_t metadata_offset; - tsk_size_t metadata_length; -} migration_sort_t; - -static int -cmp_site(const void *a, const void *b) -{ - const tsk_site_t *ia = (const tsk_site_t *) a; - const tsk_site_t *ib = (const tsk_site_t *) b; - /* Compare sites by position */ - int ret = (ia->position > ib->position) - (ia->position < ib->position); - if (ret == 0) { - /* Within a particular position sort by ID. This ensures that relative - * ordering of multiple sites at the same position is maintained; the - * redundant sites will get compacted down by clean_tables(), but in the - * meantime if the order of the redundant sites changes it will cause the - * sort order of mutations to be corrupted, as the mutations will follow - * their sites. */ - ret = (ia->id > ib->id) - (ia->id < ib->id); - } - return ret; -} - -static int -cmp_mutation(const void *a, const void *b) -{ - const tsk_mutation_t *ia = (const tsk_mutation_t *) a; - const tsk_mutation_t *ib = (const tsk_mutation_t *) b; - /* Compare mutations by site */ - int ret = (ia->site > ib->site) - (ia->site < ib->site); - /* Within a particular site sort by time if known, then ID. This ensures that - * relative ordering within a site is maintained */ - if (ret == 0 && !tsk_is_unknown_time(ia->time) && !tsk_is_unknown_time(ib->time)) { - ret = (ia->time < ib->time) - (ia->time > ib->time); - } - if (ret == 0) { - ret = (ia->id > ib->id) - (ia->id < ib->id); - } - return ret; -} - -static int -cmp_mutation_canonical(const void *a, const void *b) -{ - const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a; - const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b; - /* Compare mutations by site */ - int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); - if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) - && !tsk_is_unknown_time(ib->mut.time)) { - ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); - } - if (ret == 0) { - ret = (ia->num_descendants < ib->num_descendants) - - (ia->num_descendants > ib->num_descendants); - } - if (ret == 0) { - ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); - } - if (ret == 0) { - ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); - } - return ret; -} - -static int -cmp_individual_canonical(const void *a, const void *b) -{ - const individual_canonical_sort_t *ia = (const individual_canonical_sort_t *) a; - const individual_canonical_sort_t *ib = (const individual_canonical_sort_t *) b; - int ret = (ia->num_descendants < ib->num_descendants) - - (ia->num_descendants > ib->num_descendants); - if (ret == 0) { - ret = (ia->first_node > ib->first_node) - (ia->first_node < ib->first_node); - } - if (ret == 0) { - ret = (ia->ind.id > ib->ind.id) - (ia->ind.id < ib->ind.id); - } - return ret; -} - -static int -cmp_edge(const void *a, const void *b) -{ - const edge_sort_t *ca = (const edge_sort_t *) a; - const edge_sort_t *cb = (const edge_sort_t *) b; - - int ret = (ca->time > cb->time) - (ca->time < cb->time); - /* If time values are equal, sort by the parent node */ - if (ret == 0) { - ret = (ca->parent > cb->parent) - (ca->parent < cb->parent); - /* If the parent nodes are equal, sort by the child ID. */ - if (ret == 0) { - ret = (ca->child > cb->child) - (ca->child < cb->child); - /* If the child nodes are equal, sort by the left coordinate. */ - if (ret == 0) { - ret = (ca->left > cb->left) - (ca->left < cb->left); - } - } - } - return ret; -} - -static int -cmp_migration(const void *a, const void *b) -{ - const migration_sort_t *ca = (const migration_sort_t *) a; - const migration_sort_t *cb = (const migration_sort_t *) b; - - int ret = (ca->time > cb->time) - (ca->time < cb->time); - /* If time values are equal, sort by the source population */ - if (ret == 0) { - ret = (ca->source > cb->source) - (ca->source < cb->source); - /* If the source populations are equal, sort by the dest */ - if (ret == 0) { - ret = (ca->dest > cb->dest) - (ca->dest < cb->dest); - /* If the dest populations are equal, sort by the left coordinate. */ - if (ret == 0) { - ret = (ca->left > cb->left) - (ca->left < cb->left); - /* If everything else is equal, compare by node */ - if (ret == 0) { - ret = (ca->node > cb->node) - (ca->node < cb->node); - } - } - } - } - return ret; -} - -static int -tsk_table_sorter_sort_edges(tsk_table_sorter_t *self, tsk_size_t start) -{ - int ret = 0; - const tsk_edge_table_t *edges = &self->tables->edges; - const double *restrict node_time = self->tables->nodes.time; - edge_sort_t *e; - tsk_size_t j, k, metadata_offset; - tsk_size_t n = edges->num_rows - start; - edge_sort_t *sorted_edges = tsk_malloc(n * sizeof(*sorted_edges)); - char *old_metadata = tsk_malloc(edges->metadata_length); - bool has_metadata = tsk_edge_table_has_metadata(edges); - - if (sorted_edges == NULL || old_metadata == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(old_metadata, edges->metadata, edges->metadata_length); - for (j = 0; j < n; j++) { - e = sorted_edges + j; - k = start + j; - e->left = edges->left[k]; - e->right = edges->right[k]; - e->parent = edges->parent[k]; - e->child = edges->child[k]; - e->time = node_time[e->parent]; - if (has_metadata) { - e->metadata_offset = edges->metadata_offset[k]; - e->metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - } - } - qsort(sorted_edges, (size_t) n, sizeof(edge_sort_t), cmp_edge); - /* Copy the edges back into the table. */ - metadata_offset = 0; - for (j = 0; j < n; j++) { - e = sorted_edges + j; - k = start + j; - edges->left[k] = e->left; - edges->right[k] = e->right; - edges->parent[k] = e->parent; - edges->child[k] = e->child; - if (has_metadata) { - tsk_memcpy(edges->metadata + metadata_offset, - old_metadata + e->metadata_offset, e->metadata_length); - edges->metadata_offset[k] = metadata_offset; - metadata_offset += e->metadata_length; - } - } -out: - tsk_safe_free(sorted_edges); - tsk_safe_free(old_metadata); - return ret; -} - -static int -tsk_table_sorter_sort_migrations(tsk_table_sorter_t *self, tsk_size_t start) -{ - int ret = 0; - const tsk_migration_table_t *migrations = &self->tables->migrations; - migration_sort_t *m; - tsk_size_t j, k, metadata_offset; - tsk_size_t n = migrations->num_rows - start; - migration_sort_t *sorted_migrations = tsk_malloc(n * sizeof(*sorted_migrations)); - char *old_metadata = tsk_malloc(migrations->metadata_length); - - if (sorted_migrations == NULL || old_metadata == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(old_metadata, migrations->metadata, migrations->metadata_length); - for (j = 0; j < n; j++) { - m = sorted_migrations + j; - k = start + j; - m->left = migrations->left[k]; - m->right = migrations->right[k]; - m->node = migrations->node[k]; - m->source = migrations->source[k]; - m->dest = migrations->dest[k]; - m->time = migrations->time[k]; - m->metadata_offset = migrations->metadata_offset[k]; - m->metadata_length - = migrations->metadata_offset[k + 1] - migrations->metadata_offset[k]; - } - qsort(sorted_migrations, (size_t) n, sizeof(migration_sort_t), cmp_migration); - /* Copy the migrations back into the table. */ - metadata_offset = 0; - for (j = 0; j < n; j++) { - m = sorted_migrations + j; - k = start + j; - migrations->left[k] = m->left; - migrations->right[k] = m->right; - migrations->node[k] = m->node; - migrations->source[k] = m->source; - migrations->dest[k] = m->dest; - migrations->time[k] = m->time; - tsk_memcpy(migrations->metadata + metadata_offset, - old_metadata + m->metadata_offset, m->metadata_length); - migrations->metadata_offset[k] = metadata_offset; - metadata_offset += m->metadata_length; - } -out: - tsk_safe_free(sorted_migrations); - tsk_safe_free(old_metadata); - return ret; -} - -static int -tsk_table_sorter_sort_sites(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_site_table_t *sites = &self->tables->sites; - tsk_site_table_t copy; - tsk_size_t j; - tsk_size_t num_sites = sites->num_rows; - tsk_site_t *sorted_sites = tsk_malloc(num_sites * sizeof(*sorted_sites)); - - ret = tsk_site_table_copy(sites, ©, 0); - if (ret != 0) { - goto out; - } - if (sorted_sites == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (j = 0; j < num_sites; j++) { - tsk_site_table_get_row_unsafe(©, (tsk_id_t) j, sorted_sites + j); - } - - /* Sort the sites by position */ - qsort(sorted_sites, (size_t) num_sites, sizeof(*sorted_sites), cmp_site); - - /* Build the mapping from old site IDs to new site IDs and copy back into the - * table - */ - tsk_site_table_clear(sites); - for (j = 0; j < num_sites; j++) { - self->site_id_map[sorted_sites[j].id] = (tsk_id_t) j; - ret_id = tsk_site_table_add_row(sites, sorted_sites[j].position, - sorted_sites[j].ancestral_state, sorted_sites[j].ancestral_state_length, - sorted_sites[j].metadata, sorted_sites[j].metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; -out: - tsk_safe_free(sorted_sites); - tsk_site_table_free(©); - return ret; -} - -static int -tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t ret_id, parent, mapped_parent; - tsk_mutation_table_t *mutations = &self->tables->mutations; - tsk_size_t num_mutations = mutations->num_rows; - tsk_mutation_table_t copy; - tsk_mutation_t *sorted_mutations - = tsk_malloc(num_mutations * sizeof(*sorted_mutations)); - tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); - - ret = tsk_mutation_table_copy(mutations, ©, 0); - if (ret != 0) { - goto out; - } - if (mutation_id_map == NULL || sorted_mutations == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (j = 0; j < num_mutations; j++) { - tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, sorted_mutations + j); - sorted_mutations[j].site = self->site_id_map[sorted_mutations[j].site]; - } - ret = tsk_mutation_table_clear(mutations); - if (ret != 0) { - goto out; - } - - qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), - cmp_mutation); - - /* Make a first pass through the sorted mutations to build the ID map. */ - for (j = 0; j < num_mutations; j++) { - mutation_id_map[sorted_mutations[j].id] = (tsk_id_t) j; - } - - for (j = 0; j < num_mutations; j++) { - mapped_parent = TSK_NULL; - parent = sorted_mutations[j].parent; - if (parent != TSK_NULL) { - mapped_parent = mutation_id_map[parent]; - } - ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].site, - sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time, - sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length, - sorted_mutations[j].metadata, sorted_mutations[j].metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; - -out: - tsk_safe_free(mutation_id_map); - tsk_safe_free(sorted_mutations); - tsk_mutation_table_free(©); - return ret; -} - -static int -tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t ret_id, parent, mapped_parent, p; - tsk_mutation_table_t *mutations = &self->tables->mutations; - tsk_size_t num_mutations = mutations->num_rows; - tsk_mutation_table_t copy; - mutation_canonical_sort_t *sorted_mutations - = tsk_malloc(num_mutations * sizeof(*sorted_mutations)); - tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); - - ret = tsk_mutation_table_copy(mutations, ©, 0); - if (ret != 0) { - goto out; - } - if (mutation_id_map == NULL || sorted_mutations == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* compute numbers of descendants for each mutation */ - for (j = 0; j < num_mutations; j++) { - sorted_mutations[j].num_descendants = 0; - } - for (j = 0; j < num_mutations; j++) { - p = mutations->parent[j]; - while (p != TSK_NULL) { - sorted_mutations[p].num_descendants += 1; - if (sorted_mutations[p].num_descendants > (int) num_mutations) { - ret = TSK_ERR_MUTATION_PARENT_INCONSISTENT; - goto out; - } - p = mutations->parent[p]; - } - } - - for (j = 0; j < num_mutations; j++) { - tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); - sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; - } - ret = tsk_mutation_table_clear(mutations); - if (ret != 0) { - goto out; - } - - qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), - cmp_mutation_canonical); - - /* Make a first pass through the sorted mutations to build the ID map. */ - for (j = 0; j < num_mutations; j++) { - mutation_id_map[sorted_mutations[j].mut.id] = (tsk_id_t) j; - } - - for (j = 0; j < num_mutations; j++) { - mapped_parent = TSK_NULL; - parent = sorted_mutations[j].mut.parent; - if (parent != TSK_NULL) { - mapped_parent = mutation_id_map[parent]; - } - ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].mut.site, - sorted_mutations[j].mut.node, mapped_parent, sorted_mutations[j].mut.time, - sorted_mutations[j].mut.derived_state, - sorted_mutations[j].mut.derived_state_length, - sorted_mutations[j].mut.metadata, sorted_mutations[j].mut.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; - -out: - tsk_safe_free(mutation_id_map); - tsk_safe_free(sorted_mutations); - tsk_mutation_table_free(©); - return ret; -} - -static int -tsk_individual_table_topological_sort( - tsk_individual_table_t *self, tsk_id_t *traversal_order, tsk_size_t *num_descendants) -{ - int ret = 0; - tsk_id_t i, j, p; - tsk_individual_t individual; - tsk_size_t num_individuals = self->num_rows; - tsk_size_t current_todo = 0; - tsk_size_t todo_insertion_point = 0; - tsk_size_t *incoming_edge_count - = tsk_malloc(num_individuals * sizeof(*incoming_edge_count)); - bool count_descendants = (num_descendants != NULL); - - if (incoming_edge_count == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (i = 0; i < (tsk_id_t) num_individuals; i++) { - incoming_edge_count[i] = 0; - traversal_order[i] = TSK_NULL; - if (count_descendants) { - num_descendants[i] = 0; - } - } - - /* First find the set of individuals that have no children by creating - * an array of incoming edge counts */ - for (i = 0; i < (tsk_id_t) self->parents_length; i++) { - if (self->parents[i] != TSK_NULL) { - incoming_edge_count[self->parents[i]]++; - } - } - /* Use these as the starting points for checking all individuals, - * doing this in reverse makes the sort stable */ - for (i = (tsk_id_t) num_individuals - 1; i >= 0; i--) { - if (incoming_edge_count[i] == 0) { - traversal_order[todo_insertion_point] = i; - todo_insertion_point++; - } - } - - /* Now process individuals from the set that have no children, updating their - * parents' information as we go, and adding their parents to the list if - * this was their last child */ - while (current_todo < todo_insertion_point) { - j = traversal_order[current_todo]; - tsk_individual_table_get_row_unsafe(self, j, &individual); - for (i = 0; i < (tsk_id_t) individual.parents_length; i++) { - p = individual.parents[i]; - if (p != TSK_NULL) { - incoming_edge_count[p]--; - if (count_descendants) { - num_descendants[p] += 1 + num_descendants[j]; - } - if (incoming_edge_count[p] == 0) { - traversal_order[todo_insertion_point] = p; - todo_insertion_point++; - } - } - } - current_todo++; - } - - /* Any edges left are parts of cycles */ - for (i = 0; i < (tsk_id_t) num_individuals; i++) { - if (incoming_edge_count[i] > 0) { - ret = TSK_ERR_INDIVIDUAL_PARENT_CYCLE; - goto out; - } - } - -out: - tsk_safe_free(incoming_edge_count); - return ret; -} - -int -tsk_table_collection_individual_topological_sort( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t i, ret_id; - tsk_individual_table_t copy; - tsk_individual_t individual; - tsk_individual_table_t *individuals = &self->individuals; - tsk_node_table_t *nodes = &self->nodes; - tsk_size_t num_individuals = individuals->num_rows; - tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order)); - tsk_id_t *new_id_map = tsk_malloc(num_individuals * sizeof(*new_id_map)); - - if (new_id_map == NULL || traversal_order == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(new_id_map, 0xff, num_individuals * sizeof(*new_id_map)); - - ret = tsk_individual_table_copy(individuals, ©, 0); - if (ret != 0) { - goto out; - } - - ret_id = tsk_table_collection_check_integrity(self, 0); - if (ret_id != 0) { - ret = (int) ret_id; - goto out; - } - - ret = tsk_individual_table_clear(individuals); - if (ret != 0) { - goto out; - } - - ret = tsk_individual_table_topological_sort(©, traversal_order, NULL); - if (ret != 0) { - goto out; - } - - /* The sorted individuals are in reverse order */ - for (i = (tsk_id_t) num_individuals - 1; i >= 0; i--) { - tsk_individual_table_get_row_unsafe(©, traversal_order[i], &individual); - ret_id = tsk_individual_table_add_row(individuals, individual.flags, - individual.location, individual.location_length, individual.parents, - individual.parents_length, individual.metadata, individual.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - new_id_map[traversal_order[i]] = ret_id; - } - - /* Rewrite the parent ids */ - for (i = 0; i < (tsk_id_t) individuals->parents_length; i++) { - if (individuals->parents[i] != TSK_NULL) { - individuals->parents[i] = new_id_map[individuals->parents[i]]; - } - } - /* Rewrite the node individual ids */ - for (i = 0; i < (tsk_id_t) nodes->num_rows; i++) { - if (nodes->individual[i] != TSK_NULL) { - nodes->individual[i] = new_id_map[nodes->individual[i]]; - } - } - - ret = 0; -out: - tsk_safe_free(traversal_order); - tsk_safe_free(new_id_map); - tsk_individual_table_free(©); - return ret; -} - -static int -tsk_table_sorter_sort_individuals_canonical(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_id_t ret_id, i, j, parent, mapped_parent; - tsk_individual_table_t *individuals = &self->tables->individuals; - tsk_node_table_t *nodes = &self->tables->nodes; - tsk_individual_table_t copy; - tsk_size_t num_individuals = individuals->num_rows; - individual_canonical_sort_t *sorted_individuals - = tsk_malloc(num_individuals * sizeof(*sorted_individuals)); - tsk_id_t *individual_id_map - = tsk_malloc(num_individuals * sizeof(*individual_id_map)); - tsk_size_t *num_descendants = tsk_malloc(num_individuals * sizeof(*num_descendants)); - tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order)); - - if (individual_id_map == NULL || sorted_individuals == NULL - || traversal_order == NULL || num_descendants == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_individual_table_copy(individuals, ©, 0); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_clear(individuals); - if (ret != 0) { - goto out; - } - - ret = tsk_individual_table_topological_sort(©, traversal_order, num_descendants); - if (ret != 0) { - goto out; - } - - for (i = 0; i < (tsk_id_t) num_individuals; i++) { - sorted_individuals[i].num_descendants = num_descendants[i]; - sorted_individuals[i].first_node = (tsk_id_t) nodes->num_rows; - } - - /* find first referring node */ - for (j = 0; j < (tsk_id_t) nodes->num_rows; j++) { - if (nodes->individual[j] != TSK_NULL) { - sorted_individuals[nodes->individual[j]].first_node - = TSK_MIN(j, sorted_individuals[nodes->individual[j]].first_node); - } - } - - for (j = 0; j < (tsk_id_t) num_individuals; j++) { - tsk_individual_table_get_row_unsafe( - ©, (tsk_id_t) j, &sorted_individuals[j].ind); - } - - qsort(sorted_individuals, (size_t) num_individuals, sizeof(*sorted_individuals), - cmp_individual_canonical); - - /* Make a first pass through the sorted individuals to build the ID map. */ - for (j = 0; j < (tsk_id_t) num_individuals; j++) { - individual_id_map[sorted_individuals[j].ind.id] = (tsk_id_t) j; - } - - for (i = 0; i < (tsk_id_t) num_individuals; i++) { - for (j = 0; j < (tsk_id_t) sorted_individuals[i].ind.parents_length; j++) { - parent = sorted_individuals[i].ind.parents[j]; - if (parent != TSK_NULL) { - mapped_parent = individual_id_map[parent]; - sorted_individuals[i].ind.parents[j] = mapped_parent; - } - } - ret_id = tsk_individual_table_add_row(individuals, - sorted_individuals[i].ind.flags, sorted_individuals[i].ind.location, - sorted_individuals[i].ind.location_length, sorted_individuals[i].ind.parents, - sorted_individuals[i].ind.parents_length, sorted_individuals[i].ind.metadata, - sorted_individuals[i].ind.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; - - /* remap individuals in the node table */ - for (i = 0; i < (tsk_id_t) nodes->num_rows; i++) { - j = nodes->individual[i]; - if (j != TSK_NULL) { - nodes->individual[i] = individual_id_map[j]; - } - } - -out: - tsk_safe_free(sorted_individuals); - tsk_safe_free(individual_id_map); - tsk_safe_free(traversal_order); - tsk_safe_free(num_descendants); - tsk_individual_table_free(©); - return ret; -} - -int -tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) -{ - int ret = 0; - tsk_size_t edge_start = 0; - tsk_size_t migration_start = 0; - bool skip_sites = false; - bool skip_individuals = false; - - if (start != NULL) { - if (start->edges > self->tables->edges.num_rows) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } - edge_start = start->edges; - if (start->migrations > self->tables->migrations.num_rows) { - ret = TSK_ERR_MIGRATION_OUT_OF_BOUNDS; - goto out; - } - migration_start = start->migrations; - - /* We only allow sites and mutations to be specified as a way to - * skip sorting them entirely. Both sites and mutations must be - * equal to the number of rows */ - if (start->sites == self->tables->sites.num_rows - && start->mutations == self->tables->mutations.num_rows) { - skip_sites = true; - } else if (start->sites != 0 || start->mutations != 0) { - ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; - goto out; - } - } - /* The indexes will be invalidated, so drop them */ - ret = tsk_table_collection_drop_index(self->tables, 0); - if (ret != 0) { - goto out; - } - - if (self->sort_edges != NULL) { - ret = self->sort_edges(self, edge_start); - if (ret != 0) { - goto out; - } - } - /* Avoid calling sort_migrations in the common case when it's a no-op */ - if (self->tables->migrations.num_rows > 0) { - ret = tsk_table_sorter_sort_migrations(self, migration_start); - if (ret != 0) { - goto out; - } - } - if (!skip_sites) { - ret = tsk_table_sorter_sort_sites(self); - if (ret != 0) { - goto out; - } - ret = self->sort_mutations(self); - if (ret != 0) { - goto out; - } - } - if (!skip_individuals && self->sort_individuals != NULL) { - ret = self->sort_individuals(self); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -int -tsk_table_sorter_init( - tsk_table_sorter_t *self, tsk_table_collection_t *tables, tsk_flags_t options) -{ - int ret = 0; - tsk_id_t ret_id; - - tsk_memset(self, 0, sizeof(tsk_table_sorter_t)); - if (!(options & TSK_NO_CHECK_INTEGRITY)) { - ret_id = tsk_table_collection_check_integrity(tables, 0); - if (ret_id != 0) { - ret = (int) ret_id; - goto out; - } - } - self->tables = tables; - - self->site_id_map = tsk_malloc(self->tables->sites.num_rows * sizeof(tsk_id_t)); - if (self->site_id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* Set the sort_edges and sort_mutations methods to the default. */ - self->sort_edges = tsk_table_sorter_sort_edges; - self->sort_mutations = tsk_table_sorter_sort_mutations; - /* Default sort doesn't touch individuals */ - self->sort_individuals = NULL; -out: - return ret; -} - -int -tsk_table_sorter_free(tsk_table_sorter_t *self) -{ - tsk_safe_free(self->site_id_map); - return 0; -} - -/************************* - * segment overlapper - *************************/ - -typedef struct _interval_list_t { - double left; - double right; - struct _interval_list_t *next; -} interval_list_t; - -typedef struct _mutation_id_list_t { - tsk_id_t mutation; - struct _mutation_id_list_t *next; -} mutation_id_list_t; - -typedef struct _tsk_segment_t { - double left; - double right; - struct _tsk_segment_t *next; - tsk_id_t node; -} tsk_segment_t; - -/* segment overlap finding algorithm */ -typedef struct { - /* The input segments. This buffer is sorted by the algorithm and we also - * assume that there is space for an extra element at the end */ - tsk_segment_t *segments; - tsk_size_t num_segments; - tsk_size_t index; - tsk_size_t num_overlapping; - double left; - double right; - /* Output buffer */ - tsk_size_t max_overlapping; - tsk_segment_t **overlapping; -} segment_overlapper_t; - -typedef struct { - tsk_size_t num_samples; - tsk_flags_t options; - tsk_table_collection_t *tables; - /* Keep a copy of the input tables */ - tsk_table_collection_t input_tables; - /* State for topology */ - tsk_segment_t **ancestor_map_head; - tsk_segment_t **ancestor_map_tail; - /* Mapping of input node IDs to output node IDs. */ - tsk_id_t *node_id_map; - bool *is_sample; - /* Segments for a particular parent that are processed together */ - tsk_segment_t *segment_queue; - tsk_size_t segment_queue_size; - tsk_size_t max_segment_queue_size; - segment_overlapper_t segment_overlapper; - tsk_blkalloc_t segment_heap; - /* Buffer for output edges. For each child we keep a linked list of - * intervals, and also store the actual children that have been buffered. */ - tsk_blkalloc_t interval_list_heap; - interval_list_t **child_edge_map_head; - interval_list_t **child_edge_map_tail; - tsk_id_t *buffered_children; - tsk_size_t num_buffered_children; - /* For each mutation, map its output node. */ - tsk_id_t *mutation_node_map; - /* Map of input nodes to the list of input mutation IDs */ - mutation_id_list_t **node_mutation_list_map_head; - mutation_id_list_t **node_mutation_list_map_tail; - mutation_id_list_t *node_mutation_list_mem; - /* When reducing topology, we need a map positions to their corresponding - * sites.*/ - double *position_lookup; - int64_t edge_sort_offset; -} simplifier_t; - -static int -cmp_segment(const void *a, const void *b) -{ - const tsk_segment_t *ia = (const tsk_segment_t *) a; - const tsk_segment_t *ib = (const tsk_segment_t *) b; - int ret = (ia->left > ib->left) - (ia->left < ib->left); - /* Break ties using the node */ - if (ret == 0) { - ret = (ia->node > ib->node) - (ia->node < ib->node); - } - return ret; -} - -static int TSK_WARN_UNUSED -segment_overlapper_alloc(segment_overlapper_t *self) -{ - int ret = 0; - - tsk_memset(self, 0, sizeof(*self)); - self->max_overlapping = 8; /* Making sure we call tsk_realloc in tests */ - self->overlapping = tsk_malloc(self->max_overlapping * sizeof(*self->overlapping)); - if (self->overlapping == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -static int -segment_overlapper_free(segment_overlapper_t *self) -{ - tsk_safe_free(self->overlapping); - return 0; -} - -/* Initialise the segment overlapper for use. Note that the segments - * array must have space for num_segments + 1 elements! - */ -static int TSK_WARN_UNUSED -segment_overlapper_start( - segment_overlapper_t *self, tsk_segment_t *segments, tsk_size_t num_segments) -{ - int ret = 0; - tsk_segment_t *sentinel; - void *p; - - if (self->max_overlapping < num_segments) { - self->max_overlapping = num_segments; - p = tsk_realloc( - self->overlapping, self->max_overlapping * sizeof(*self->overlapping)); - if (p == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->overlapping = p; - } - self->segments = segments; - self->num_segments = num_segments; - self->index = 0; - self->num_overlapping = 0; - self->left = 0; - self->right = DBL_MAX; - - /* Sort the segments in the buffer by left coordinate */ - qsort( - self->segments, (size_t) self->num_segments, sizeof(tsk_segment_t), cmp_segment); - /* NOTE! We are assuming that there's space for another element on the end - * here. This is to insert a sentinel which simplifies the logic. */ - sentinel = self->segments + self->num_segments; - sentinel->left = DBL_MAX; -out: - return ret; -} - -static int TSK_WARN_UNUSED -segment_overlapper_next(segment_overlapper_t *self, double *left, double *right, - tsk_segment_t ***overlapping, tsk_size_t *num_overlapping) -{ - int ret = 0; - tsk_size_t j, k; - tsk_size_t n = self->num_segments; - tsk_segment_t *S = self->segments; - - if (self->index < n) { - self->left = self->right; - /* Remove any elements of X with right <= left */ - k = 0; - for (j = 0; j < self->num_overlapping; j++) { - if (self->overlapping[j]->right > self->left) { - self->overlapping[k] = self->overlapping[j]; - k++; - } - } - self->num_overlapping = k; - if (k == 0) { - self->left = S[self->index].left; - } - while (self->index < n && S[self->index].left == self->left) { - tsk_bug_assert(self->num_overlapping < self->max_overlapping); - self->overlapping[self->num_overlapping] = &S[self->index]; - self->num_overlapping++; - self->index++; - } - self->index--; - self->right = S[self->index + 1].left; - for (j = 0; j < self->num_overlapping; j++) { - self->right = TSK_MIN(self->right, self->overlapping[j]->right); - } - tsk_bug_assert(self->left < self->right); - self->index++; - ret = 1; - } else { - self->left = self->right; - self->right = DBL_MAX; - k = 0; - for (j = 0; j < self->num_overlapping; j++) { - if (self->overlapping[j]->right > self->left) { - self->right = TSK_MIN(self->right, self->overlapping[j]->right); - self->overlapping[k] = self->overlapping[j]; - k++; - } - } - self->num_overlapping = k; - if (k > 0) { - ret = 1; - } - } - - *left = self->left; - *right = self->right; - *overlapping = self->overlapping; - *num_overlapping = self->num_overlapping; - return ret; -} - -static int -cmp_node_id(const void *a, const void *b) -{ - const tsk_id_t *ia = (const tsk_id_t *) a; - const tsk_id_t *ib = (const tsk_id_t *) b; - return (*ia > *ib) - (*ia < *ib); -} - -/************************* - * Ancestor mapper - *************************/ - -/* NOTE: this struct shares a lot with the simplifier_t, mostly in - * terms of infrastructure for managing the list of intervals, saving - * edges etc. We should try to abstract the common functionality out - * into a separate class, which handles this. - */ -typedef struct { - tsk_id_t *samples; - tsk_size_t num_samples; - tsk_id_t *ancestors; - tsk_size_t num_ancestors; - tsk_table_collection_t *tables; - tsk_edge_table_t *result; - tsk_segment_t **ancestor_map_head; - tsk_segment_t **ancestor_map_tail; - bool *is_sample; - bool *is_ancestor; - tsk_segment_t *segment_queue; - tsk_size_t segment_queue_size; - tsk_size_t max_segment_queue_size; - segment_overlapper_t segment_overlapper; - tsk_blkalloc_t segment_heap; - tsk_blkalloc_t interval_list_heap; - interval_list_t **child_edge_map_head; - interval_list_t **child_edge_map_tail; - tsk_id_t *buffered_children; - tsk_size_t num_buffered_children; - double sequence_length; - double oldest_node_time; -} ancestor_mapper_t; - -static tsk_segment_t *TSK_WARN_UNUSED -ancestor_mapper_alloc_segment( - ancestor_mapper_t *self, double left, double right, tsk_id_t node) -{ - tsk_segment_t *seg = NULL; - - seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); - if (seg == NULL) { - goto out; - } - seg->next = NULL; - seg->left = left; - seg->right = right; - seg->node = node; -out: - return seg; -} - -static interval_list_t *TSK_WARN_UNUSED -ancestor_mapper_alloc_interval_list(ancestor_mapper_t *self, double left, double right) -{ - interval_list_t *x = NULL; - - x = tsk_blkalloc_get(&self->interval_list_heap, sizeof(*x)); - if (x == NULL) { - goto out; - } - x->next = NULL; - x->left = left; - x->right = right; -out: - return x; -} - -static int -ancestor_mapper_flush_edges( - ancestor_mapper_t *self, tsk_id_t parent, tsk_size_t *ret_num_edges) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_id_t child; - interval_list_t *x; - tsk_size_t num_edges = 0; - - qsort(self->buffered_children, (size_t) self->num_buffered_children, - sizeof(tsk_id_t), cmp_node_id); - for (j = 0; j < self->num_buffered_children; j++) { - child = self->buffered_children[j]; - for (x = self->child_edge_map_head[child]; x != NULL; x = x->next) { - ret_id = tsk_edge_table_add_row( - self->result, x->left, x->right, parent, child, NULL, 0); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_edges++; - } - self->child_edge_map_head[child] = NULL; - self->child_edge_map_tail[child] = NULL; - } - self->num_buffered_children = 0; - *ret_num_edges = num_edges; - ret = tsk_blkalloc_reset(&self->interval_list_heap); -out: - return ret; -} - -static int -ancestor_mapper_record_edge( - ancestor_mapper_t *self, double left, double right, tsk_id_t child) -{ - int ret = 0; - interval_list_t *tail, *x; - - tail = self->child_edge_map_tail[child]; - if (tail == NULL) { - tsk_bug_assert(self->num_buffered_children < self->tables->nodes.num_rows); - self->buffered_children[self->num_buffered_children] = child; - self->num_buffered_children++; - x = ancestor_mapper_alloc_interval_list(self, left, right); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->child_edge_map_head[child] = x; - self->child_edge_map_tail[child] = x; - } else { - if (tail->right == left) { - tail->right = right; - } else { - x = ancestor_mapper_alloc_interval_list(self, left, right); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tail->next = x; - self->child_edge_map_tail[child] = x; - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -ancestor_mapper_add_ancestry(ancestor_mapper_t *self, tsk_id_t input_id, double left, - double right, tsk_id_t output_id) -{ - int ret = 0; - tsk_segment_t *tail = self->ancestor_map_tail[input_id]; - tsk_segment_t *x; - - tsk_bug_assert(left < right); - if (tail == NULL) { - x = ancestor_mapper_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->ancestor_map_head[input_id] = x; - self->ancestor_map_tail[input_id] = x; - } else { - if (tail->right == left && tail->node == output_id) { - tail->right = right; - } else { - x = ancestor_mapper_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tail->next = x; - self->ancestor_map_tail[input_id] = x; - } - } -out: - return ret; -} - -static void -ancestor_mapper_find_oldest_node(ancestor_mapper_t *self) -{ - const double *node_time = self->tables->nodes.time; - tsk_size_t j; - double max_time = -1; - - for (j = 0; j < self->num_ancestors; j++) { - max_time = TSK_MAX(max_time, node_time[self->ancestors[j]]); - } - for (j = 0; j < self->num_samples; j++) { - max_time = TSK_MAX(max_time, node_time[self->samples[j]]); - } - - self->oldest_node_time = max_time; -} - -static int -ancestor_mapper_init_samples(ancestor_mapper_t *self, tsk_id_t *samples) -{ - int ret = 0; - tsk_size_t j; - - /* Go through the samples to check for errors. */ - for (j = 0; j < self->num_samples; j++) { - if (samples[j] < 0 || samples[j] > (tsk_id_t) self->tables->nodes.num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (self->is_sample[samples[j]]) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - self->is_sample[samples[j]] = true; - ret = ancestor_mapper_add_ancestry( - self, samples[j], 0, self->tables->sequence_length, samples[j]); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int -ancestor_mapper_init_ancestors(ancestor_mapper_t *self, tsk_id_t *ancestors) -{ - int ret = 0; - tsk_size_t j; - - /* Go through the samples to check for errors. */ - for (j = 0; j < self->num_ancestors; j++) { - if (ancestors[j] < 0 || ancestors[j] > (tsk_id_t) self->tables->nodes.num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (self->is_ancestor[ancestors[j]]) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - self->is_ancestor[ancestors[j]] = true; - } -out: - return ret; -} - -static int -ancestor_mapper_init(ancestor_mapper_t *self, tsk_id_t *samples, tsk_size_t num_samples, - tsk_id_t *ancestors, tsk_size_t num_ancestors, tsk_table_collection_t *tables, - tsk_edge_table_t *result) -{ - int ret = 0; - tsk_size_t num_nodes; - - tsk_memset(self, 0, sizeof(ancestor_mapper_t)); - self->num_samples = num_samples; - self->num_ancestors = num_ancestors; - self->samples = samples; - self->ancestors = ancestors; - self->tables = tables; - self->result = result; - self->sequence_length = self->tables->sequence_length; - - if (samples == NULL || num_samples == 0 || ancestors == NULL || num_ancestors == 0) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - /* Allocate the heaps used for small objects-> Assuming 8K is a good chunk size - */ - ret = tsk_blkalloc_init(&self->segment_heap, 8192); - if (ret != 0) { - goto out; - } - ret = tsk_blkalloc_init(&self->interval_list_heap, 8192); - if (ret != 0) { - goto out; - } - ret = segment_overlapper_alloc(&self->segment_overlapper); - if (ret != 0) { - goto out; - } - - num_nodes = tables->nodes.num_rows; - /* Make the maps and set the intial state */ - self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(tsk_segment_t *)); - self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(tsk_segment_t *)); - self->child_edge_map_head = tsk_calloc(num_nodes, sizeof(interval_list_t *)); - self->child_edge_map_tail = tsk_calloc(num_nodes, sizeof(interval_list_t *)); - self->buffered_children = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->is_sample = tsk_calloc(num_nodes, sizeof(bool)); - self->is_ancestor = tsk_calloc(num_nodes, sizeof(bool)); - self->max_segment_queue_size = 64; - self->segment_queue - = tsk_malloc(self->max_segment_queue_size * sizeof(tsk_segment_t)); - if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL - || self->child_edge_map_head == NULL || self->child_edge_map_tail == NULL - || self->is_sample == NULL || self->is_ancestor == NULL - || self->segment_queue == NULL || self->buffered_children == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - // Clear memory. - ret = ancestor_mapper_init_samples(self, samples); - if (ret != 0) { - goto out; - } - ret = ancestor_mapper_init_ancestors(self, ancestors); - if (ret != 0) { - goto out; - } - ancestor_mapper_find_oldest_node(self); - ret = tsk_edge_table_clear(self->result); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static int -ancestor_mapper_free(ancestor_mapper_t *self) -{ - tsk_blkalloc_free(&self->segment_heap); - tsk_blkalloc_free(&self->interval_list_heap); - segment_overlapper_free(&self->segment_overlapper); - tsk_safe_free(self->ancestor_map_head); - tsk_safe_free(self->ancestor_map_tail); - tsk_safe_free(self->child_edge_map_head); - tsk_safe_free(self->child_edge_map_tail); - tsk_safe_free(self->segment_queue); - tsk_safe_free(self->is_sample); - tsk_safe_free(self->is_ancestor); - tsk_safe_free(self->buffered_children); - return 0; -} - -static int TSK_WARN_UNUSED -ancestor_mapper_enqueue_segment( - ancestor_mapper_t *self, double left, double right, tsk_id_t node) -{ - int ret = 0; - tsk_segment_t *seg; - void *p; - - tsk_bug_assert(left < right); - /* Make sure we always have room for one more segment in the queue so we - * can put a tail sentinel on it */ - if (self->segment_queue_size == self->max_segment_queue_size - 1) { - self->max_segment_queue_size *= 2; - p = tsk_realloc(self->segment_queue, - self->max_segment_queue_size * sizeof(*self->segment_queue)); - if (p == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->segment_queue = p; - } - seg = self->segment_queue + self->segment_queue_size; - seg->left = left; - seg->right = right; - seg->node = node; - self->segment_queue_size++; -out: - return ret; -} - -static int TSK_WARN_UNUSED -ancestor_mapper_merge_ancestors(ancestor_mapper_t *self, tsk_id_t input_id) -{ - int ret = 0; - tsk_segment_t **X, *x; - tsk_size_t j, num_overlapping, num_flushed_edges; - double left, right, prev_right; - bool is_sample = self->is_sample[input_id]; - bool is_ancestor = self->is_ancestor[input_id]; - - if (is_sample) { - /* Free up the existing ancestry mapping. */ - x = self->ancestor_map_tail[input_id]; - tsk_bug_assert(x->left == 0 && x->right == self->sequence_length); - self->ancestor_map_head[input_id] = NULL; - self->ancestor_map_tail[input_id] = NULL; - } - ret = segment_overlapper_start( - &self->segment_overlapper, self->segment_queue, self->segment_queue_size); - if (ret != 0) { - goto out; - } - - prev_right = 0; - while ((ret = segment_overlapper_next( - &self->segment_overlapper, &left, &right, &X, &num_overlapping)) - == 1) { - tsk_bug_assert(left < right); - tsk_bug_assert(num_overlapping > 0); - if (is_ancestor || is_sample) { - for (j = 0; j < num_overlapping; j++) { - ret = ancestor_mapper_record_edge(self, left, right, X[j]->node); - if (ret != 0) { - goto out; - } - } - ret = ancestor_mapper_add_ancestry(self, input_id, left, right, input_id); - if (ret != 0) { - goto out; - } - if (is_sample && left != prev_right) { - /* Fill in any gaps in ancestry for the sample */ - ret = ancestor_mapper_add_ancestry( - self, input_id, prev_right, left, input_id); - if (ret != 0) { - goto out; - } - } - } else { - for (j = 0; j < num_overlapping; j++) { - ret = ancestor_mapper_add_ancestry( - self, input_id, left, right, X[j]->node); - if (ret != 0) { - goto out; - } - } - } - prev_right = right; - } - if (is_sample && prev_right != self->tables->sequence_length) { - /* If a trailing gap exists in the sample ancestry, fill it in. */ - ret = ancestor_mapper_add_ancestry( - self, input_id, prev_right, self->sequence_length, input_id); - if (ret != 0) { - goto out; - } - } - if (input_id != TSK_NULL) { - ret = ancestor_mapper_flush_edges(self, input_id, &num_flushed_edges); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -ancestor_mapper_process_parent_edges( - ancestor_mapper_t *self, tsk_id_t parent, tsk_size_t start, tsk_size_t end) -{ - int ret = 0; - tsk_size_t j; - tsk_segment_t *x; - const tsk_edge_table_t *input_edges = &self->tables->edges; - tsk_id_t child; - double left, right; - - /* Go through the edges and queue up ancestry segments for processing. */ - self->segment_queue_size = 0; - for (j = start; j < end; j++) { - tsk_bug_assert(parent == input_edges->parent[j]); - child = input_edges->child[j]; - left = input_edges->left[j]; - right = input_edges->right[j]; - // printf("C: %i, L: %f, R: %f\n", child, left, right); - for (x = self->ancestor_map_head[child]; x != NULL; x = x->next) { - if (x->right > left && right > x->left) { - ret = ancestor_mapper_enqueue_segment( - self, TSK_MAX(x->left, left), TSK_MIN(x->right, right), x->node); - if (ret != 0) { - goto out; - } - } - } - } - // We can now merge the ancestral segments for the parent - ret = ancestor_mapper_merge_ancestors(self, parent); - if (ret != 0) { - goto out; - } - -out: - return ret; -} - -static int TSK_WARN_UNUSED -ancestor_mapper_run(ancestor_mapper_t *self) -{ - int ret = 0; - tsk_size_t j, start; - tsk_id_t parent, current_parent; - const tsk_edge_table_t *input_edges = &self->tables->edges; - tsk_size_t num_edges = input_edges->num_rows; - const double *node_time = self->tables->nodes.time; - bool early_exit = false; - - if (num_edges > 0) { - start = 0; - current_parent = input_edges->parent[0]; - for (j = 0; j < num_edges; j++) { - parent = input_edges->parent[j]; - if (parent != current_parent) { - ret = ancestor_mapper_process_parent_edges( - self, current_parent, start, j); - if (ret != 0) { - goto out; - } - start = j; - current_parent = parent; - if (node_time[current_parent] > self->oldest_node_time) { - early_exit = true; - break; - } - } - } - if (!early_exit) { - /* If we didn't break out of the loop early, we need to still process - * the final parent */ - ret = ancestor_mapper_process_parent_edges(self, current_parent, start, j); - if (ret != 0) { - goto out; - } - } - } -out: - return ret; -} - -/************************* - * IBD Segments - *************************/ - -/* This maps two positive integers 0 <= a < b < N into the set - * {0, ..., N^2}. For us to overflow an int64, N would need to - * be > sqrt(2^63), ~3 * 10^9. The maximum value for a 32bit int - * is ~2 * 10^9, so this can't happen here, however it is - * theoretically possible with 64 bit IDs. It would require - * a *very* large node table --- assuming 24 bytes per row - * it would be at least 67GiB. To make sure this eventuality - * doesn't happen, we have a tsk_bug_assert in the - * tsk_identity_segments_init. - */ -static inline int64_t -pair_to_integer(tsk_id_t a, tsk_id_t b, tsk_size_t N) -{ - tsk_id_t tmp; - if (a > b) { - tmp = a; - a = b; - b = tmp; - } - return ((int64_t) a) * (int64_t) N + (int64_t) b; -} - -static inline void -integer_to_pair(int64_t index, tsk_size_t N, tsk_id_t *a, tsk_id_t *b) -{ - *a = (tsk_id_t)(index / (int64_t) N); - *b = (tsk_id_t)(index % (int64_t) N); -} - -static int64_t -tsk_identity_segments_get_key( - const tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b) -{ - int64_t ret; - tsk_id_t N = (tsk_id_t) self->num_nodes; - - if (a < 0 || b < 0 || a >= N || b >= N) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (a == b) { - ret = TSK_ERR_SAME_NODES_IN_PAIR; - goto out; - } - ret = pair_to_integer(a, b, self->num_nodes); -out: - return ret; -} - -static tsk_identity_segment_t *TSK_WARN_UNUSED -tsk_identity_segments_alloc_segment( - tsk_identity_segments_t *self, double left, double right, tsk_id_t node) -{ - tsk_identity_segment_t *seg = tsk_blkalloc_get(&self->heap, sizeof(*seg)); - if (seg == NULL) { - goto out; - } - tsk_bug_assert(left < right); - tsk_bug_assert(node >= 0 && node < (tsk_id_t) self->num_nodes); - - seg->next = NULL; - seg->left = left; - seg->right = right; - seg->node = node; -out: - return seg; -} - -static tsk_avl_node_int_t * -tsk_identity_segments_alloc_new_pair(tsk_identity_segments_t *self, int64_t key) -{ - tsk_avl_node_int_t *avl_node = tsk_blkalloc_get(&self->heap, sizeof(*avl_node)); - tsk_identity_segment_list_t *list = tsk_blkalloc_get(&self->heap, sizeof(*list)); - - if (avl_node == NULL || list == NULL) { - return NULL; - } - avl_node->key = key; - avl_node->value = list; - memset(list, 0, sizeof(*list)); - return avl_node; -} - -/* Deliberately not making this a part of the public interface for now, - * so we don't have to worry about the signature */ -static int -tsk_identity_segments_init( - tsk_identity_segments_t *self, tsk_size_t num_nodes, tsk_flags_t options) -{ - int ret = 0; - /* Make sure we don't overflow in the ID mapping. See the comments in pair_to_integer - * for details. */ - double max_num_nodes = sqrt(1ULL << 63); - tsk_bug_assert((double) num_nodes < max_num_nodes); - - memset(self, 0, sizeof(*self)); - self->num_nodes = num_nodes; - /* Storing segments implies storing pairs */ - if (options & TSK_IBD_STORE_SEGMENTS) { - self->store_pairs = true; - self->store_segments = true; - } else if (options & TSK_IBD_STORE_PAIRS) { - self->store_pairs = true; - } - ret = tsk_avl_tree_int_init(&self->pair_map); - if (ret != 0) { - goto out; - } - /* Allocate heap memory in 1MiB blocks */ - ret = tsk_blkalloc_init(&self->heap, 1024 * 1024); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -void -tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out) -{ - tsk_avl_node_int_t **nodes = tsk_malloc(self->pair_map.size * sizeof(*nodes)); - int64_t key; - tsk_identity_segment_list_t *value; - tsk_identity_segment_t *seg; - tsk_size_t j; - tsk_id_t a, b; - - tsk_bug_assert(nodes != NULL); - - fprintf(out, "===\nIBD Result\n===\n"); - fprintf(out, "total_span = %f\n", self->total_span); - fprintf(out, "num_segments = %lld\n", (unsigned long long) self->num_segments); - fprintf(out, "store_pairs = %d\n", self->store_pairs); - fprintf(out, "store_segments = %d\n", self->store_segments); - if (self->store_pairs) { - fprintf(out, "num_keys = %d\n", (int) self->pair_map.size); - tsk_avl_tree_int_ordered_nodes(&self->pair_map, nodes); - for (j = 0; j < self->pair_map.size; j++) { - key = nodes[j]->key; - value = (tsk_identity_segment_list_t *) nodes[j]->value; - integer_to_pair(key, self->num_nodes, &a, &b); - fprintf(out, "%lld\t(%d,%d) n=%d total_span=%f\t", (long long) key, (int) a, - (int) b, (int) value->num_segments, value->total_span); - if (self->store_segments) { - for (seg = value->head; seg != NULL; seg = seg->next) { - fprintf( - out, "(%f, %f)->%d, ", seg->left, seg->right, (int) seg->node); - } - } - fprintf(out, "\n"); - } - } - fprintf(out, "Segment memory\n"); - tsk_blkalloc_print_state(&self->heap, out); - tsk_safe_free(nodes); -} - -tsk_size_t -tsk_identity_segments_get_num_segments(const tsk_identity_segments_t *self) -{ - return self->num_segments; -} - -double -tsk_identity_segments_get_total_span(const tsk_identity_segments_t *self) -{ - return self->total_span; -} - -tsk_size_t -tsk_identity_segments_get_num_pairs(const tsk_identity_segments_t *self) -{ - return self->pair_map.size; -} - -/* Use an inorder traversal on the AVL tree to get the pairs in order. - * Recursion is safe here because it's a balanced tree (see the AVL tree - * code for notes on this). - */ -static int -get_keys_traverse(tsk_avl_node_int_t *node, int index, tsk_size_t N, tsk_id_t *pairs) -{ - tsk_id_t a, b; - - if (node == NULL) { - return index; - } - index = get_keys_traverse(node->llink, index, N, pairs); - integer_to_pair(node->key, N, &a, &b); - pairs[2 * index] = a; - pairs[2 * index + 1] = b; - return get_keys_traverse(node->rlink, index + 1, N, pairs); -} - -int -tsk_identity_segments_get_keys(const tsk_identity_segments_t *self, tsk_id_t *pairs) -{ - if (!self->store_pairs) { - return TSK_ERR_IBD_PAIRS_NOT_STORED; - } - get_keys_traverse( - tsk_avl_tree_int_get_root(&self->pair_map), 0, self->num_nodes, pairs); - return 0; -} - -static int -get_items_traverse(tsk_avl_node_int_t *node, int index, tsk_size_t N, tsk_id_t *pairs, - tsk_identity_segment_list_t **lists) -{ - tsk_id_t a, b; - - if (node == NULL) { - return index; - } - index = get_items_traverse(node->llink, index, N, pairs, lists); - integer_to_pair(node->key, N, &a, &b); - pairs[2 * index] = a; - pairs[2 * index + 1] = b; - lists[index] = node->value; - return get_items_traverse(node->rlink, index + 1, N, pairs, lists); -} - -int -tsk_identity_segments_get_items(const tsk_identity_segments_t *self, tsk_id_t *pairs, - tsk_identity_segment_list_t **lists) -{ - if (!self->store_pairs) { - return TSK_ERR_IBD_PAIRS_NOT_STORED; - } - get_items_traverse( - tsk_avl_tree_int_get_root(&self->pair_map), 0, self->num_nodes, pairs, lists); - return 0; -} - -int -tsk_identity_segments_free(tsk_identity_segments_t *self) -{ - tsk_blkalloc_free(&self->heap); - tsk_avl_tree_int_free(&self->pair_map); - return 0; -} - -static int TSK_WARN_UNUSED -tsk_identity_segments_update_pair(tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, - double left, double right, tsk_id_t node) -{ - int ret = 0; - tsk_identity_segment_t *x; - tsk_identity_segment_list_t *list; - /* skip the error checking here since this an internal API */ - int64_t key = pair_to_integer(a, b, self->num_nodes); - tsk_avl_node_int_t *avl_node = tsk_avl_tree_int_search(&self->pair_map, key); - - if (avl_node == NULL) { - /* We haven't seen this pair before */ - avl_node = tsk_identity_segments_alloc_new_pair(self, key); - if (avl_node == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_avl_tree_int_insert(&self->pair_map, avl_node); - tsk_bug_assert(ret == 0); - } - list = (tsk_identity_segment_list_t *) avl_node->value; - list->num_segments++; - list->total_span += right - left; - if (self->store_segments) { - x = tsk_identity_segments_alloc_segment(self, left, right, node); - if (x == NULL) { - goto out; - } - if (list->tail == NULL) { - list->head = x; - list->tail = x; - } else { - list->tail->next = x; - list->tail = x; - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_identity_segments_add_segment(tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, - double left, double right, tsk_id_t node) -{ - int ret = 0; - - if (self->store_pairs) { - ret = tsk_identity_segments_update_pair(self, a, b, left, right, node); - if (ret != 0) { - goto out; - } - } - self->total_span += right - left; - self->num_segments++; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t sample_a, - tsk_id_t sample_b, tsk_identity_segment_list_t **ret_list) -{ - int ret = 0; - int64_t key = tsk_identity_segments_get_key(self, sample_a, sample_b); - tsk_avl_node_int_t *avl_node; - - if (key < 0) { - ret = (int) key; - goto out; - } - if (!self->store_pairs) { - ret = TSK_ERR_IBD_PAIRS_NOT_STORED; - goto out; - } - avl_node = tsk_avl_tree_int_search(&self->pair_map, key); - *ret_list = NULL; - if (avl_node != NULL) { - *ret_list = (tsk_identity_segment_list_t *) avl_node->value; - } -out: - return ret; -} - -/************************* - * IBD finder - *************************/ - -typedef struct { - tsk_identity_segments_t *result; - double min_span; - double max_time; - const tsk_table_collection_t *tables; - /* Maps nodes to their sample set IDs. Input samples map to set 0 - * in the "within" case. */ - tsk_id_t *sample_set_id; - /* True if we're finding IBD between sample sets, false otherwise. */ - bool finding_between; - tsk_segment_t **ancestor_map_head; - tsk_segment_t **ancestor_map_tail; - tsk_segment_t *segment_queue; - tsk_size_t segment_queue_size; - tsk_size_t max_segment_queue_size; - tsk_blkalloc_t segment_heap; -} tsk_ibd_finder_t; - -static tsk_segment_t *TSK_WARN_UNUSED -tsk_ibd_finder_alloc_segment( - tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) -{ - tsk_segment_t *seg = NULL; - - seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); - if (seg == NULL) { - goto out; - } - seg->next = NULL; - seg->left = left; - seg->right = right; - seg->node = node; - -out: - return seg; -} -static int TSK_WARN_UNUSED -tsk_ibd_finder_add_ancestry(tsk_ibd_finder_t *self, tsk_id_t input_id, double left, - double right, tsk_id_t output_id) -{ - int ret = 0; - tsk_segment_t *tail = self->ancestor_map_tail[input_id]; - tsk_segment_t *x = NULL; - - tsk_bug_assert(left < right); - x = tsk_ibd_finder_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (tail == NULL) { - self->ancestor_map_head[input_id] = x; - self->ancestor_map_tail[input_id] = x; - } else { - tail->next = x; - self->ancestor_map_tail[input_id] = x; - } -out: - return ret; -} - -static int -tsk_ibd_finder_init_samples_from_set( - tsk_ibd_finder_t *self, const tsk_id_t *samples, tsk_size_t num_samples) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t u; - - for (j = 0; j < num_samples; j++) { - u = samples[j]; - - if (u < 0 || u > (tsk_id_t) self->tables->nodes.num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (self->sample_set_id[u] != TSK_NULL) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - self->sample_set_id[u] = 0; - } -out: - return ret; -} - -static void -tsk_ibd_finder_init_samples_from_nodes(tsk_ibd_finder_t *self) -{ - tsk_id_t u; - const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; - const tsk_flags_t *restrict flags = self->tables->nodes.flags; - - for (u = 0; u < num_nodes; u++) { - if (flags[u] & TSK_NODE_IS_SAMPLE) { - self->sample_set_id[u] = 0; - } - } -} - -static int -tsk_ibd_finder_add_sample_ancestry(tsk_ibd_finder_t *self) -{ - - int ret = 0; - tsk_id_t u; - const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; - const double L = self->tables->sequence_length; - - for (u = 0; u < num_nodes; u++) { - if (self->sample_set_id[u] != TSK_NULL) { - ret = tsk_ibd_finder_add_ancestry(self, u, 0, L, u); - if (ret != 0) { - goto out; - } - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_init(tsk_ibd_finder_t *self, const tsk_table_collection_t *tables, - tsk_identity_segments_t *result, double min_span, double max_time) -{ - int ret = 0; - tsk_size_t num_nodes; - - tsk_memset(self, 0, sizeof(tsk_ibd_finder_t)); - - if (min_span < 0) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if (max_time < 0) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - self->tables = tables; - self->result = result; - self->max_time = max_time; - self->min_span = min_span; - - ret = tsk_blkalloc_init(&self->segment_heap, 8192); - if (ret != 0) { - goto out; - } - - num_nodes = tables->nodes.num_rows; - self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(*self->ancestor_map_head)); - self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(*self->ancestor_map_tail)); - self->sample_set_id = tsk_malloc(num_nodes * sizeof(*self->sample_set_id)); - self->segment_queue_size = 0; - self->max_segment_queue_size = 64; - self->segment_queue - = tsk_malloc(self->max_segment_queue_size * sizeof(*self->segment_queue)); - if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL - || self->sample_set_id == NULL || self->segment_queue == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(self->sample_set_id, TSK_NULL, num_nodes * sizeof(*self->sample_set_id)); -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_enqueue_segment( - tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) -{ - int ret = 0; - tsk_segment_t *seg; - void *p; - - if ((right - left) > self->min_span) { - /* Make sure we always have room for one more segment in the queue so we - * can put a tail sentinel on it */ - if (self->segment_queue_size == self->max_segment_queue_size - 1) { - self->max_segment_queue_size *= 2; - p = tsk_realloc(self->segment_queue, - self->max_segment_queue_size * sizeof(*self->segment_queue)); - if (p == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->segment_queue = p; - } - seg = self->segment_queue + self->segment_queue_size; - seg->left = left; - seg->right = right; - seg->node = node; - self->segment_queue_size++; - } -out: - return ret; -} - -static bool -tsk_ibd_finder_passes_filters( - const tsk_ibd_finder_t *self, tsk_id_t a, tsk_id_t b, double left, double right) -{ - if (a == b) { - return false; - } - if ((right - left) <= self->min_span) { - return false; - } - if (self->finding_between) { - return self->sample_set_id[a] != self->sample_set_id[b]; - } else { - return true; - } -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_record_ibd(tsk_ibd_finder_t *self, tsk_id_t parent) -{ - int ret = 0; - tsk_size_t j; - tsk_segment_t *seg0, *seg1; - double left, right; - - for (seg0 = self->ancestor_map_head[parent]; seg0 != NULL; seg0 = seg0->next) { - for (j = 0; j < self->segment_queue_size; j++) { - seg1 = &self->segment_queue[j]; - left = TSK_MAX(seg0->left, seg1->left); - right = TSK_MIN(seg0->right, seg1->right); - if (tsk_ibd_finder_passes_filters( - self, seg0->node, seg1->node, left, right)) { - ret = tsk_identity_segments_add_segment( - self->result, seg0->node, seg1->node, left, right, parent); - if (ret != 0) { - goto out; - } - } - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_add_queued_ancestry(tsk_ibd_finder_t *self, tsk_id_t parent) -{ - int ret = 0; - tsk_size_t j; - tsk_segment_t seg; - - for (j = 0; j < self->segment_queue_size; j++) { - seg = self->segment_queue[j]; - ret = tsk_ibd_finder_add_ancestry(self, parent, seg.left, seg.right, seg.node); - if (ret != 0) { - goto out; - } - } - self->segment_queue_size = 0; -out: - return ret; -} - -static void -tsk_ibd_finder_print_state(tsk_ibd_finder_t *self, FILE *out) -{ - tsk_size_t j; - tsk_segment_t *u = NULL; - - fprintf(out, "--ibd-finder stats--\n"); - fprintf(out, "max_time = %f\n", self->max_time); - fprintf(out, "min_span = %f\n", self->min_span); - fprintf(out, "finding_between = %d\n", self->finding_between); - fprintf(out, "===\nEdges\n===\n"); - for (j = 0; j < self->tables->edges.num_rows; j++) { - fprintf(out, "L:%f, R:%f, P:%lld, C:%lld\n", self->tables->edges.left[j], - self->tables->edges.right[j], (long long) self->tables->edges.parent[j], - (long long) self->tables->edges.child[j]); - } - fprintf(out, "===\nNodes\n===\n"); - for (j = 0; j < self->tables->nodes.num_rows; j++) { - fprintf(out, "ID:%d, Time:%f, Flag:%lld Sample set:%d\n", (int) j, - self->tables->nodes.time[j], (long long) self->tables->nodes.flags[j], - (int) self->sample_set_id[j]); - } - fprintf(out, "===\nAncestral map\n===\n"); - for (j = 0; j < self->tables->nodes.num_rows; j++) { - fprintf(out, "Node %lld: ", (long long) j); - for (u = self->ancestor_map_head[j]; u != NULL; u = u->next) { - fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node); - } - fprintf(out, "\n"); - } - tsk_identity_segments_print_state(self->result, out); -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_init_within( - tsk_ibd_finder_t *self, const tsk_id_t *samples, tsk_size_t num_samples) -{ - int ret; - - if (samples == NULL) { - tsk_ibd_finder_init_samples_from_nodes(self); - } else { - ret = tsk_ibd_finder_init_samples_from_set(self, samples, num_samples); - if (ret != 0) { - goto out; - } - } - self->finding_between = false; - ret = tsk_ibd_finder_add_sample_ancestry(self); -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_init_between(tsk_ibd_finder_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets) -{ - int ret = 0; - tsk_size_t j, k, index; - tsk_id_t u; - - index = 0; - for (j = 0; j < num_sample_sets; j++) { - for (k = 0; k < sample_set_sizes[j]; k++) { - u = sample_sets[index]; - if (u < 0 || u > (tsk_id_t) self->tables->nodes.num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (self->sample_set_id[u] != TSK_NULL) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - self->sample_set_id[u] = (tsk_id_t) j; - index++; - } - } - self->finding_between = true; - ret = tsk_ibd_finder_add_sample_ancestry(self); -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_ibd_finder_run(tsk_ibd_finder_t *self) -{ - const tsk_edge_table_t *input_edges = &self->tables->edges; - const tsk_size_t num_edges = input_edges->num_rows; - int ret = 0; - tsk_size_t j; - tsk_segment_t *s; - tsk_id_t parent, child; - double left, right, intvl_l, intvl_r, time; - - for (j = 0; j < num_edges; j++) { - parent = input_edges->parent[j]; - left = input_edges->left[j]; - right = input_edges->right[j]; - child = input_edges->child[j]; - time = self->tables->nodes.time[parent]; - if (time > self->max_time) { - break; - } - - for (s = self->ancestor_map_head[child]; s != NULL; s = s->next) { - intvl_l = TSK_MAX(left, s->left); - intvl_r = TSK_MIN(right, s->right); - ret = tsk_ibd_finder_enqueue_segment(self, intvl_l, intvl_r, s->node); - if (ret != 0) { - goto out; - } - } - ret = tsk_ibd_finder_record_ibd(self, parent); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_add_queued_ancestry(self, parent); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int -tsk_ibd_finder_free(tsk_ibd_finder_t *self) -{ - tsk_blkalloc_free(&self->segment_heap); - tsk_safe_free(self->sample_set_id); - tsk_safe_free(self->ancestor_map_head); - tsk_safe_free(self->ancestor_map_tail); - tsk_safe_free(self->segment_queue); - return 0; -} - -/************************* - * simplifier - *************************/ - -static void -simplifier_check_state(simplifier_t *self) -{ - tsk_size_t j, k; - tsk_segment_t *u; - mutation_id_list_t *list_node; - tsk_id_t site; - interval_list_t *int_list; - tsk_id_t child; - double position, last_position; - bool found; - tsk_size_t num_intervals; - - for (j = 0; j < self->input_tables.nodes.num_rows; j++) { - tsk_bug_assert((self->ancestor_map_head[j] == NULL) - == (self->ancestor_map_tail[j] == NULL)); - for (u = self->ancestor_map_head[j]; u != NULL; u = u->next) { - tsk_bug_assert(u->left < u->right); - if (u->next != NULL) { - tsk_bug_assert(u->right <= u->next->left); - if (u->right == u->next->left) { - tsk_bug_assert(u->node != u->next->node); - } - } else { - tsk_bug_assert(u == self->ancestor_map_tail[j]); - } - } - } - - for (j = 0; j < self->segment_queue_size; j++) { - tsk_bug_assert(self->segment_queue[j].left < self->segment_queue[j].right); - } - - for (j = 0; j < self->input_tables.nodes.num_rows; j++) { - last_position = -1; - for (list_node = self->node_mutation_list_map_head[j]; list_node != NULL; - list_node = list_node->next) { - tsk_bug_assert( - self->input_tables.mutations.node[list_node->mutation] == (tsk_id_t) j); - site = self->input_tables.mutations.site[list_node->mutation]; - position = self->input_tables.sites.position[site]; - tsk_bug_assert(last_position <= position); - last_position = position; - } - } - - /* check the buffered edges */ - for (j = 0; j < self->input_tables.nodes.num_rows; j++) { - tsk_bug_assert((self->child_edge_map_head[j] == NULL) - == (self->child_edge_map_tail[j] == NULL)); - if (self->child_edge_map_head[j] != NULL) { - /* Make sure that the child is in our list */ - found = false; - for (k = 0; k < self->num_buffered_children; k++) { - if (self->buffered_children[k] == (tsk_id_t) j) { - found = true; - break; - } - } - tsk_bug_assert(found); - } - } - num_intervals = 0; - for (j = 0; j < self->num_buffered_children; j++) { - child = self->buffered_children[j]; - tsk_bug_assert(self->child_edge_map_head[child] != NULL); - for (int_list = self->child_edge_map_head[child]; int_list != NULL; - int_list = int_list->next) { - tsk_bug_assert(int_list->left < int_list->right); - if (int_list->next != NULL) { - tsk_bug_assert(int_list->right < int_list->next->left); - } - num_intervals++; - } - } - tsk_bug_assert( - num_intervals - == self->interval_list_heap.total_allocated / (sizeof(interval_list_t))); -} - -static void -print_segment_chain(tsk_segment_t *head, FILE *out) -{ - tsk_segment_t *u; - - for (u = head; u != NULL; u = u->next) { - fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node); - } -} - -static void -simplifier_print_state(simplifier_t *self, FILE *out) -{ - tsk_size_t j; - tsk_segment_t *u; - mutation_id_list_t *list_node; - interval_list_t *int_list; - tsk_id_t child; - - fprintf(out, "--simplifier state--\n"); - fprintf(out, "options:\n"); - fprintf(out, "\tfilter_unreferenced_sites : %d\n", - !!(self->options & TSK_SIMPLIFY_FILTER_SITES)); - fprintf(out, "\tno_filter_nodes : %d\n", - !!(self->options & TSK_SIMPLIFY_NO_FILTER_NODES)); - fprintf(out, "\treduce_to_site_topology : %d\n", - !!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)); - fprintf(out, "\tkeep_unary : %d\n", - !!(self->options & TSK_SIMPLIFY_KEEP_UNARY)); - fprintf(out, "\tkeep_input_roots : %d\n", - !!(self->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS)); - fprintf(out, "\tkeep_unary_in_individuals : %d\n", - !!(self->options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS)); - - fprintf(out, "===\nInput tables\n==\n"); - tsk_table_collection_print_state(&self->input_tables, out); - fprintf(out, "===\nOutput tables\n==\n"); - tsk_table_collection_print_state(self->tables, out); - fprintf(out, "===\nmemory heaps\n==\n"); - fprintf(out, "segment_heap:\n"); - tsk_blkalloc_print_state(&self->segment_heap, out); - fprintf(out, "interval_list_heap:\n"); - tsk_blkalloc_print_state(&self->interval_list_heap, out); - fprintf(out, "===\nancestors\n==\n"); - for (j = 0; j < self->input_tables.nodes.num_rows; j++) { - fprintf(out, "%lld:\t", (long long) j); - print_segment_chain(self->ancestor_map_head[j], out); - fprintf(out, "\n"); - } - fprintf(out, "===\nnode_id map (input->output)\n==\n"); - for (j = 0; j < self->input_tables.nodes.num_rows; j++) { - if (self->node_id_map[j] != TSK_NULL) { - fprintf( - out, "%lld->%lld\n", (long long) j, (long long) self->node_id_map[j]); - } - } - fprintf(out, "===\nsegment queue\n==\n"); - for (j = 0; j < self->segment_queue_size; j++) { - u = &self->segment_queue[j]; - fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node); - fprintf(out, "\n"); - } - fprintf(out, "===\nbuffered children\n==\n"); - for (j = 0; j < self->num_buffered_children; j++) { - child = self->buffered_children[j]; - fprintf(out, "%lld -> ", (long long) j); - for (int_list = self->child_edge_map_head[child]; int_list != NULL; - int_list = int_list->next) { - fprintf(out, "(%f, %f), ", int_list->left, int_list->right); - } - fprintf(out, "\n"); - } - fprintf(out, "===\nmutation node map\n==\n"); - for (j = 0; j < self->input_tables.mutations.num_rows; j++) { - fprintf(out, "%lld\t-> %lld\n", (long long) j, - (long long) self->mutation_node_map[j]); - } - fprintf(out, "===\nnode mutation id list map\n==\n"); - for (j = 0; j < self->input_tables.nodes.num_rows; j++) { - if (self->node_mutation_list_map_head[j] != NULL) { - fprintf(out, "%lld\t-> [", (long long) j); - for (list_node = self->node_mutation_list_map_head[j]; list_node != NULL; - list_node = list_node->next) { - fprintf(out, "%lld,", (long long) list_node->mutation); - } - fprintf(out, "]\n"); - } - } - if (!!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)) { - fprintf(out, "===\nposition_lookup\n==\n"); - for (j = 0; j < self->input_tables.sites.num_rows + 2; j++) { - fprintf(out, "%lld\t-> %f\n", (long long) j, self->position_lookup[j]); - } - } - simplifier_check_state(self); -} - -static tsk_segment_t *TSK_WARN_UNUSED -simplifier_alloc_segment(simplifier_t *self, double left, double right, tsk_id_t node) -{ - tsk_segment_t *seg = NULL; - - seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); - if (seg == NULL) { - goto out; - } - seg->next = NULL; - seg->left = left; - seg->right = right; - seg->node = node; -out: - return seg; -} - -static interval_list_t *TSK_WARN_UNUSED -simplifier_alloc_interval_list(simplifier_t *self, double left, double right) -{ - interval_list_t *x = NULL; - - x = tsk_blkalloc_get(&self->interval_list_heap, sizeof(*x)); - if (x == NULL) { - goto out; - } - x->next = NULL; - x->left = left; - x->right = right; -out: - return x; -} - -/* Add a new node to the output node table corresponding to the specified input id. - * Returns the new ID. */ -static tsk_id_t TSK_WARN_UNUSED -simplifier_record_node(simplifier_t *self, tsk_id_t input_id) -{ - tsk_node_t node; - bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS); - - tsk_node_table_get_row_unsafe(&self->input_tables.nodes, (tsk_id_t) input_id, &node); - if (update_flags) { - /* Zero out the sample bit */ - node.flags &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE; - if (self->is_sample[input_id]) { - node.flags |= TSK_NODE_IS_SAMPLE; - } - } - self->node_id_map[input_id] = (tsk_id_t) self->tables->nodes.num_rows; - return tsk_node_table_add_row(&self->tables->nodes, node.flags, node.time, - node.population, node.individual, node.metadata, node.metadata_length); -} - -/* Remove the mapping for the last recorded node. */ -static int -simplifier_rewind_node(simplifier_t *self, tsk_id_t input_id, tsk_id_t output_id) -{ - self->node_id_map[input_id] = TSK_NULL; - return tsk_node_table_truncate(&self->tables->nodes, (tsk_size_t) output_id); -} - -static int -simplifier_flush_edges(simplifier_t *self, tsk_id_t parent, tsk_size_t *ret_num_edges) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_id_t child; - interval_list_t *x; - tsk_size_t num_edges = 0; - - qsort(self->buffered_children, (size_t) self->num_buffered_children, - sizeof(tsk_id_t), cmp_node_id); - for (j = 0; j < self->num_buffered_children; j++) { - child = self->buffered_children[j]; - for (x = self->child_edge_map_head[child]; x != NULL; x = x->next) { - ret_id = tsk_edge_table_add_row( - &self->tables->edges, x->left, x->right, parent, child, NULL, 0); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - num_edges++; - } - self->child_edge_map_head[child] = NULL; - self->child_edge_map_tail[child] = NULL; - } - self->num_buffered_children = 0; - *ret_num_edges = num_edges; - ret = tsk_blkalloc_reset(&self->interval_list_heap); -out: - return ret; -} - -/* When we are reducing topology down to what is visible at the sites we need a - * lookup table to find the closest site position for each edge. We do this with - * a sorted array and binary search */ -static int -simplifier_init_position_lookup(simplifier_t *self) -{ - int ret = 0; - tsk_size_t num_sites = self->input_tables.sites.num_rows; - - self->position_lookup = tsk_malloc((num_sites + 2) * sizeof(*self->position_lookup)); - if (self->position_lookup == NULL) { - goto out; - } - self->position_lookup[0] = 0; - self->position_lookup[num_sites + 1] = self->input_tables.sequence_length; - tsk_memcpy(self->position_lookup + 1, self->input_tables.sites.position, - num_sites * sizeof(double)); -out: - return ret; -} -/* - * Find the smallest site position index greater than or equal to left - * and right, i.e., slide each endpoint of an interval to the right - * until they hit a site position. If both left and right map to the - * the same position then we discard this edge. We also discard an - * edge if left = 0 and right is less than the first site position. - */ -static bool -simplifier_map_reduced_coordinates(simplifier_t *self, double *left, double *right) -{ - double *X = self->position_lookup; - tsk_size_t N = self->input_tables.sites.num_rows + 2; - tsk_size_t left_index, right_index; - bool skip = false; - - left_index = tsk_search_sorted(X, N, *left); - right_index = tsk_search_sorted(X, N, *right); - if (left_index == right_index || (left_index == 0 && right_index == 1)) { - skip = true; - } else { - /* Remap back to zero if the left end maps to the first site. */ - if (left_index == 1) { - left_index = 0; - } - *left = X[left_index]; - *right = X[right_index]; - } - return skip; -} - -/* Records the specified edge for the current parent by buffering it */ -static int -simplifier_record_edge(simplifier_t *self, double left, double right, tsk_id_t child) -{ - int ret = 0; - interval_list_t *tail, *x; - bool skip; - - if (self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY) { - skip = simplifier_map_reduced_coordinates(self, &left, &right); - /* NOTE: we exit early here when reduce_coordindates has told us to - * skip this edge, as it is not visible in the reduced tree sequence */ - if (skip) { - goto out; - } - } - - tail = self->child_edge_map_tail[child]; - if (tail == NULL) { - tsk_bug_assert(self->num_buffered_children < self->input_tables.nodes.num_rows); - self->buffered_children[self->num_buffered_children] = child; - self->num_buffered_children++; - x = simplifier_alloc_interval_list(self, left, right); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->child_edge_map_head[child] = x; - self->child_edge_map_tail[child] = x; - } else { - if (tail->right == left) { - tail->right = right; - } else { - x = simplifier_alloc_interval_list(self, left, right); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tail->next = x; - self->child_edge_map_tail[child] = x; - } - } -out: - return ret; -} - -static int -simplifier_init_sites(simplifier_t *self) -{ - int ret = 0; - tsk_id_t node; - mutation_id_list_t *list_node; - tsk_size_t j; - - self->mutation_node_map - = tsk_calloc(self->input_tables.mutations.num_rows, sizeof(tsk_id_t)); - self->node_mutation_list_mem - = tsk_malloc(self->input_tables.mutations.num_rows * sizeof(mutation_id_list_t)); - self->node_mutation_list_map_head - = tsk_calloc(self->input_tables.nodes.num_rows, sizeof(mutation_id_list_t *)); - self->node_mutation_list_map_tail - = tsk_calloc(self->input_tables.nodes.num_rows, sizeof(mutation_id_list_t *)); - if (self->mutation_node_map == NULL || self->node_mutation_list_mem == NULL - || self->node_mutation_list_map_head == NULL - || self->node_mutation_list_map_tail == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(self->mutation_node_map, 0xff, - self->input_tables.mutations.num_rows * sizeof(tsk_id_t)); - - for (j = 0; j < self->input_tables.mutations.num_rows; j++) { - node = self->input_tables.mutations.node[j]; - list_node = self->node_mutation_list_mem + j; - list_node->mutation = (tsk_id_t) j; - list_node->next = NULL; - if (self->node_mutation_list_map_head[node] == NULL) { - self->node_mutation_list_map_head[node] = list_node; - } else { - self->node_mutation_list_map_tail[node]->next = list_node; - } - self->node_mutation_list_map_tail[node] = list_node; - } -out: - return ret; -} - -static void -simplifier_map_mutations( - simplifier_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) -{ - mutation_id_list_t *m_node; - double position; - tsk_id_t site; - - m_node = self->node_mutation_list_map_head[input_id]; - while (m_node != NULL) { - site = self->input_tables.mutations.site[m_node->mutation]; - position = self->input_tables.sites.position[site]; - if (left <= position && position < right) { - self->mutation_node_map[m_node->mutation] = output_id; - } - m_node = m_node->next; - } -} - -static int TSK_WARN_UNUSED -simplifier_add_ancestry( - simplifier_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) -{ - int ret = 0; - tsk_segment_t *tail = self->ancestor_map_tail[input_id]; - tsk_segment_t *x; - - tsk_bug_assert(left < right); - if (tail == NULL) { - x = simplifier_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->ancestor_map_head[input_id] = x; - self->ancestor_map_tail[input_id] = x; - } else { - if (tail->right == left && tail->node == output_id) { - tail->right = right; - } else { - x = simplifier_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tail->next = x; - self->ancestor_map_tail[input_id] = x; - } - } - simplifier_map_mutations(self, input_id, left, right, output_id); -out: - return ret; -} - -/* Sets up the internal working copies of the various tables, as needed - * depending on the specified options. */ -static int -simplifier_init_tables(simplifier_t *self) -{ - int ret; - bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); - bool filter_populations = self->options & TSK_SIMPLIFY_FILTER_POPULATIONS; - bool filter_individuals = self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS; - bool filter_sites = self->options & TSK_SIMPLIFY_FILTER_SITES; - tsk_bookmark_t rows_to_retain; - - /* NOTE: this is a bit inefficient here as we're taking copies of - * the tables even in the no-filter case where the original tables - * won't be touched (beyond references to external tables that may - * need updating). Future versions may do something a bit more - * complicated like temporarily stealing the pointers to the - * underlying column memory in these tables, and then being careful - * not to free the table at the end. - */ - ret = tsk_table_collection_copy(self->tables, &self->input_tables, 0); - if (ret != 0) { - goto out; - } - memset(&rows_to_retain, 0, sizeof(rows_to_retain)); - rows_to_retain.provenances = self->tables->provenances.num_rows; - if (!filter_nodes) { - rows_to_retain.nodes = self->tables->nodes.num_rows; - } - if (!filter_populations) { - rows_to_retain.populations = self->tables->populations.num_rows; - } - if (!filter_individuals) { - rows_to_retain.individuals = self->tables->individuals.num_rows; - } - if (!filter_sites) { - rows_to_retain.sites = self->tables->sites.num_rows; - } - - ret = tsk_table_collection_truncate(self->tables, &rows_to_retain); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static int -simplifier_init_nodes(simplifier_t *self, const tsk_id_t *samples) -{ - int ret = 0; - tsk_id_t node_id; - tsk_size_t j; - const tsk_size_t num_nodes = self->input_tables.nodes.num_rows; - bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); - bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS); - tsk_flags_t *node_flags = self->tables->nodes.flags; - tsk_id_t *node_id_map = self->node_id_map; - - if (filter_nodes) { - tsk_bug_assert(self->tables->nodes.num_rows == 0); - /* The node table has been cleared. Add nodes for the samples. */ - for (j = 0; j < self->num_samples; j++) { - node_id = simplifier_record_node(self, samples[j]); - if (node_id < 0) { - ret = (int) node_id; - goto out; - } - } - } else { - tsk_bug_assert(self->tables->nodes.num_rows == num_nodes); - if (update_flags) { - for (j = 0; j < num_nodes; j++) { - /* Reset the sample flags */ - node_flags[j] &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE; - if (self->is_sample[j]) { - node_flags[j] |= TSK_NODE_IS_SAMPLE; - } - } - } - - for (j = 0; j < num_nodes; j++) { - node_id_map[j] = (tsk_id_t) j; - } - } - /* Add the initial ancestry */ - for (j = 0; j < self->num_samples; j++) { - node_id = samples[j]; - ret = simplifier_add_ancestry(self, node_id, 0, - self->input_tables.sequence_length, self->node_id_map[node_id]); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int -simplifier_init(simplifier_t *self, const tsk_id_t *samples, tsk_size_t num_samples, - tsk_table_collection_t *tables, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t ret_id; - tsk_size_t num_nodes; - - tsk_memset(self, 0, sizeof(simplifier_t)); - self->num_samples = num_samples; - self->options = options; - self->tables = tables; - - /* TODO we can add a flag to skip these checks for when we know they are - * unnecessary */ - /* TODO Current unit tests require TSK_CHECK_SITE_DUPLICATES but it's - * debateable whether we need it. If we remove, we definitely need explicit - * tests to ensure we're doing sensible things with duplicate sites. - * (Particularly, re TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY.) */ - ret_id = tsk_table_collection_check_integrity(tables, - TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES); - if (ret_id != 0) { - ret = (int) ret_id; - goto out; - } - - /* Allocate the heaps used for small objects-> Assuming 8K is a good chunk size - */ - ret = tsk_blkalloc_init(&self->segment_heap, 8192); - if (ret != 0) { - goto out; - } - ret = tsk_blkalloc_init(&self->interval_list_heap, 8192); - if (ret != 0) { - goto out; - } - ret = segment_overlapper_alloc(&self->segment_overlapper); - if (ret != 0) { - goto out; - } - num_nodes = tables->nodes.num_rows; - /* Make the maps and set the intial state */ - self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(tsk_segment_t *)); - self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(tsk_segment_t *)); - self->child_edge_map_head = tsk_calloc(num_nodes, sizeof(interval_list_t *)); - self->child_edge_map_tail = tsk_calloc(num_nodes, sizeof(interval_list_t *)); - self->node_id_map = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->buffered_children = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->is_sample = tsk_calloc(num_nodes, sizeof(bool)); - self->max_segment_queue_size = 64; - self->segment_queue - = tsk_malloc(self->max_segment_queue_size * sizeof(tsk_segment_t)); - if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL - || self->child_edge_map_head == NULL || self->child_edge_map_tail == NULL - || self->node_id_map == NULL || self->is_sample == NULL - || self->segment_queue == NULL || self->buffered_children == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* Go through the samples to check for errors before we clear the tables. */ - for (j = 0; j < self->num_samples; j++) { - if (samples[j] < 0 || samples[j] >= (tsk_id_t) num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (self->is_sample[samples[j]]) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - self->is_sample[samples[j]] = true; - } - tsk_memset(self->node_id_map, 0xff, num_nodes * sizeof(tsk_id_t)); - - ret = simplifier_init_tables(self); - if (ret != 0) { - goto out; - } - ret = simplifier_init_sites(self); - if (ret != 0) { - goto out; - } - ret = simplifier_init_nodes(self, samples); - if (ret != 0) { - goto out; - } - if (self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY) { - ret = simplifier_init_position_lookup(self); - if (ret != 0) { - goto out; - } - } - - self->edge_sort_offset = TSK_NULL; -out: - return ret; -} - -static int -simplifier_free(simplifier_t *self) -{ - tsk_table_collection_free(&self->input_tables); - tsk_blkalloc_free(&self->segment_heap); - tsk_blkalloc_free(&self->interval_list_heap); - segment_overlapper_free(&self->segment_overlapper); - tsk_safe_free(self->ancestor_map_head); - tsk_safe_free(self->ancestor_map_tail); - tsk_safe_free(self->child_edge_map_head); - tsk_safe_free(self->child_edge_map_tail); - tsk_safe_free(self->node_id_map); - tsk_safe_free(self->segment_queue); - tsk_safe_free(self->is_sample); - tsk_safe_free(self->mutation_node_map); - tsk_safe_free(self->node_mutation_list_mem); - tsk_safe_free(self->node_mutation_list_map_head); - tsk_safe_free(self->node_mutation_list_map_tail); - tsk_safe_free(self->buffered_children); - tsk_safe_free(self->position_lookup); - return 0; -} - -static int TSK_WARN_UNUSED -simplifier_enqueue_segment(simplifier_t *self, double left, double right, tsk_id_t node) -{ - int ret = 0; - tsk_segment_t *seg; - void *p; - - tsk_bug_assert(left < right); - /* Make sure we always have room for one more segment in the queue so we - * can put a tail sentinel on it */ - if (self->segment_queue_size == self->max_segment_queue_size - 1) { - self->max_segment_queue_size *= 2; - p = tsk_realloc(self->segment_queue, - self->max_segment_queue_size * sizeof(*self->segment_queue)); - if (p == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - self->segment_queue = p; - } - seg = self->segment_queue + self->segment_queue_size; - seg->left = left; - seg->right = right; - seg->node = node; - self->segment_queue_size++; -out: - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) -{ - int ret = 0; - tsk_segment_t **X, *x; - tsk_size_t j, num_overlapping, num_flushed_edges; - double left, right, prev_right; - tsk_id_t ancestry_node; - tsk_id_t output_id = self->node_id_map[input_id]; - bool is_sample = self->is_sample[input_id]; - bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); - bool keep_unary = self->options & TSK_SIMPLIFY_KEEP_UNARY; - - if ((self->options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS) - && (self->input_tables.nodes.individual[input_id] != TSK_NULL)) { - keep_unary = true; - } - - if (is_sample) { - /* Free up the existing ancestry mapping. */ - x = self->ancestor_map_tail[input_id]; - tsk_bug_assert(x->left == 0 && x->right == self->tables->sequence_length); - self->ancestor_map_head[input_id] = NULL; - self->ancestor_map_tail[input_id] = NULL; - } - - ret = segment_overlapper_start( - &self->segment_overlapper, self->segment_queue, self->segment_queue_size); - if (ret != 0) { - goto out; - } - prev_right = 0; - while ((ret = segment_overlapper_next( - &self->segment_overlapper, &left, &right, &X, &num_overlapping)) - == 1) { - tsk_bug_assert(left < right); - tsk_bug_assert(num_overlapping > 0); - if (num_overlapping == 1) { - ancestry_node = X[0]->node; - if (is_sample) { - ret = simplifier_record_edge(self, left, right, ancestry_node); - if (ret != 0) { - goto out; - } - ancestry_node = output_id; - } else if (keep_unary) { - if (output_id == TSK_NULL) { - output_id = simplifier_record_node(self, input_id); - } - ret = simplifier_record_edge(self, left, right, ancestry_node); - if (ret != 0) { - goto out; - } - } - } else { - if (output_id == TSK_NULL) { - output_id = simplifier_record_node(self, input_id); - if (output_id < 0) { - ret = (int) output_id; - goto out; - } - } - ancestry_node = output_id; - for (j = 0; j < num_overlapping; j++) { - ret = simplifier_record_edge(self, left, right, X[j]->node); - if (ret != 0) { - goto out; - } - } - } - if (is_sample && left != prev_right) { - /* Fill in any gaps in ancestry for the sample */ - ret = simplifier_add_ancestry(self, input_id, prev_right, left, output_id); - if (ret != 0) { - goto out; - } - } - if (keep_unary) { - ancestry_node = output_id; - } - ret = simplifier_add_ancestry(self, input_id, left, right, ancestry_node); - if (ret != 0) { - goto out; - } - prev_right = right; - } - /* Check for errors occuring in the loop condition */ - if (ret != 0) { - goto out; - } - if (is_sample && prev_right != self->tables->sequence_length) { - /* If a trailing gap exists in the sample ancestry, fill it in. */ - ret = simplifier_add_ancestry( - self, input_id, prev_right, self->tables->sequence_length, output_id); - if (ret != 0) { - goto out; - } - } - if (output_id != TSK_NULL) { - ret = simplifier_flush_edges(self, output_id, &num_flushed_edges); - if (ret != 0) { - goto out; - } - if (filter_nodes && (num_flushed_edges == 0) && !is_sample) { - ret = simplifier_rewind_node(self, input_id, output_id); - } - } -out: - return ret; -} - -/* Extract the ancestry for the specified input node over the specified - * interval and queue it up for merging. - */ -static int TSK_WARN_UNUSED -simplifier_extract_ancestry( - simplifier_t *self, double left, double right, tsk_id_t input_id) -{ - int ret = 0; - tsk_segment_t *x = self->ancestor_map_head[input_id]; - tsk_segment_t y; /* y is the segment that has been removed */ - tsk_segment_t *x_head, *x_prev, *seg_left, *seg_right; - - x_head = NULL; - x_prev = NULL; - while (x != NULL) { - if (x->right > left && right > x->left) { - y.left = TSK_MAX(x->left, left); - y.right = TSK_MIN(x->right, right); - y.node = x->node; - ret = simplifier_enqueue_segment(self, y.left, y.right, y.node); - if (ret != 0) { - goto out; - } - seg_left = NULL; - seg_right = NULL; - if (x->left != y.left) { - seg_left = simplifier_alloc_segment(self, x->left, y.left, x->node); - if (seg_left == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (x_prev == NULL) { - x_head = seg_left; - } else { - x_prev->next = seg_left; - } - x_prev = seg_left; - } - if (x->right != y.right) { - x->left = y.right; - seg_right = x; - } else { - seg_right = x->next; - // TODO free x - } - if (x_prev == NULL) { - x_head = seg_right; - } else { - x_prev->next = seg_right; - } - x = seg_right; - } else { - if (x_prev == NULL) { - x_head = x; - } - x_prev = x; - x = x->next; - } - } - - self->ancestor_map_head[input_id] = x_head; - self->ancestor_map_tail[input_id] = x_prev; -out: - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_process_parent_edges( - simplifier_t *self, tsk_id_t parent, tsk_size_t start, tsk_size_t end) -{ - int ret = 0; - tsk_size_t j; - const tsk_edge_table_t *input_edges = &self->input_tables.edges; - tsk_id_t child; - double left, right; - - /* Go through the edges and queue up ancestry segments for processing. */ - self->segment_queue_size = 0; - for (j = start; j < end; j++) { - tsk_bug_assert(parent == input_edges->parent[j]); - child = input_edges->child[j]; - left = input_edges->left[j]; - right = input_edges->right[j]; - ret = simplifier_extract_ancestry(self, left, right, child); - if (ret != 0) { - goto out; - } - } - /* We can now merge the ancestral segments for the parent */ - ret = simplifier_merge_ancestors(self, parent); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_finalise_site_references( - simplifier_t *self, const bool *site_referenced, tsk_id_t *site_id_map) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_site_t site; - const tsk_size_t num_sites = self->input_tables.sites.num_rows; - - if (self->options & TSK_SIMPLIFY_FILTER_SITES) { - for (j = 0; j < num_sites; j++) { - tsk_site_table_get_row_unsafe( - &self->input_tables.sites, (tsk_id_t) j, &site); - site_id_map[j] = TSK_NULL; - if (site_referenced[j]) { - ret_id = tsk_site_table_add_row(&self->tables->sites, site.position, - site.ancestral_state, site.ancestral_state_length, site.metadata, - site.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - site_id_map[j] = ret_id; - } - } - } else { - tsk_bug_assert(self->tables->sites.num_rows == num_sites); - for (j = 0; j < num_sites; j++) { - site_id_map[j] = (tsk_id_t) j; - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_finalise_population_references(simplifier_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t pop_id, ret_id; - tsk_population_t pop; - tsk_id_t *node_population = self->tables->nodes.population; - const tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_size_t num_populations = self->input_tables.populations.num_rows; - bool *population_referenced - = tsk_calloc(num_populations, sizeof(*population_referenced)); - tsk_id_t *population_id_map - = tsk_malloc(num_populations * sizeof(*population_id_map)); - - tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_POPULATIONS); - - if (population_referenced == NULL || population_id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (j = 0; j < num_nodes; j++) { - pop_id = node_population[j]; - if (pop_id != TSK_NULL) { - population_referenced[pop_id] = true; - } - } - - for (j = 0; j < num_populations; j++) { - tsk_population_table_get_row_unsafe( - &self->input_tables.populations, (tsk_id_t) j, &pop); - population_id_map[j] = TSK_NULL; - if (population_referenced[j]) { - ret_id = tsk_population_table_add_row( - &self->tables->populations, pop.metadata, pop.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - population_id_map[j] = ret_id; - } - } - - /* Remap the IDs in the node table */ - for (j = 0; j < num_nodes; j++) { - pop_id = node_population[j]; - if (pop_id != TSK_NULL) { - node_population[j] = population_id_map[pop_id]; - } - } -out: - tsk_safe_free(population_id_map); - tsk_safe_free(population_referenced); - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_finalise_individual_references(simplifier_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t pop_id, ret_id; - tsk_individual_t ind; - tsk_id_t *node_individual = self->tables->nodes.individual; - tsk_id_t *parents; - const tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_size_t num_individuals = self->input_tables.individuals.num_rows; - bool *individual_referenced - = tsk_calloc(num_individuals, sizeof(*individual_referenced)); - tsk_id_t *individual_id_map - = tsk_malloc(num_individuals * sizeof(*individual_id_map)); - - tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS); - - if (individual_referenced == NULL || individual_id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (j = 0; j < num_nodes; j++) { - pop_id = node_individual[j]; - if (pop_id != TSK_NULL) { - individual_referenced[pop_id] = true; - } - } - - for (j = 0; j < num_individuals; j++) { - tsk_individual_table_get_row_unsafe( - &self->input_tables.individuals, (tsk_id_t) j, &ind); - individual_id_map[j] = TSK_NULL; - if (individual_referenced[j]) { - /* Can't remap the parents inline here because we have no - * guarantees about sortedness */ - ret_id = tsk_individual_table_add_row(&self->tables->individuals, ind.flags, - ind.location, ind.location_length, ind.parents, ind.parents_length, - ind.metadata, ind.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - individual_id_map[j] = ret_id; - } - } - - /* Remap the IDs in the node table */ - for (j = 0; j < num_nodes; j++) { - pop_id = node_individual[j]; - if (pop_id != TSK_NULL) { - node_individual[j] = individual_id_map[pop_id]; - } - } - - /* Remap parent IDs. * - * NOTE! must take the pointer reference here as it can change from - * the start of the function */ - parents = self->tables->individuals.parents; - for (j = 0; j < self->tables->individuals.parents_length; j++) { - if (parents[j] != TSK_NULL) { - parents[j] = individual_id_map[parents[j]]; - } - } - -out: - tsk_safe_free(individual_id_map); - tsk_safe_free(individual_referenced); - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_output_sites(simplifier_t *self) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - tsk_mutation_t mutation; - const tsk_size_t num_sites = self->input_tables.sites.num_rows; - const tsk_size_t num_mutations = self->input_tables.mutations.num_rows; - bool *site_referenced = tsk_calloc(num_sites, sizeof(*site_referenced)); - tsk_id_t *site_id_map = tsk_malloc(num_sites * sizeof(*site_id_map)); - tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); - const tsk_id_t *mutation_node_map = self->mutation_node_map; - const tsk_id_t *mutation_site = self->input_tables.mutations.site; - - if (site_referenced == NULL || site_id_map == NULL || mutation_id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (j = 0; j < num_mutations; j++) { - if (mutation_node_map[j] != TSK_NULL) { - site_referenced[mutation_site[j]] = true; - } - } - ret = simplifier_finalise_site_references(self, site_referenced, site_id_map); - if (ret != 0) { - goto out; - } - - for (j = 0; j < num_mutations; j++) { - mutation_id_map[j] = TSK_NULL; - if (mutation_node_map[j] != TSK_NULL) { - tsk_mutation_table_get_row_unsafe( - &self->input_tables.mutations, (tsk_id_t) j, &mutation); - mutation.node = mutation_node_map[j]; - mutation.site = site_id_map[mutation.site]; - if (mutation.parent != TSK_NULL) { - mutation.parent = mutation_id_map[mutation.parent]; - } - ret_id = tsk_mutation_table_add_row(&self->tables->mutations, mutation.site, - mutation.node, mutation.parent, mutation.time, mutation.derived_state, - mutation.derived_state_length, mutation.metadata, - mutation.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - mutation_id_map[j] = ret_id; - } - } -out: - tsk_safe_free(site_referenced); - tsk_safe_free(site_id_map); - tsk_safe_free(mutation_id_map); - return ret; -} - -/* Flush the remaining non-edge and node data in the model to the - * output tables. */ -static int TSK_WARN_UNUSED -simplifier_flush_output(simplifier_t *self) -{ - int ret = 0; - - /* TODO Migrations fit reasonably neatly into the pattern that we have here. We - * can consider references to populations from migration objects in the same way - * as from nodes, so that we only remove a population if its referenced by - * neither. Mapping the population IDs in migrations is then easy. In principle - * nodes are similar, but the semantics are slightly different because we've - * already allocated all the nodes by their references from edges. We then - * need to decide whether we remove migrations that reference unmapped nodes - * or whether to add these nodes back in (probably the former is the correct - * approach).*/ - if (self->input_tables.migrations.num_rows != 0) { - ret = TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED; - goto out; - } - - ret = simplifier_output_sites(self); - if (ret != 0) { - goto out; - } - - if (self->options & TSK_SIMPLIFY_FILTER_POPULATIONS) { - ret = simplifier_finalise_population_references(self); - if (ret != 0) { - goto out; - } - } - if (self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS) { - ret = simplifier_finalise_individual_references(self); - if (ret != 0) { - goto out; - } - } - -out: - return ret; -} - -static void -simplifier_set_edge_sort_offset(simplifier_t *self, double youngest_root_time) -{ - const tsk_edge_table_t edges = self->tables->edges; - const double *node_time = self->tables->nodes.time; - int64_t offset; - - for (offset = 0; offset < (int64_t) edges.num_rows; offset++) { - if (node_time[edges.parent[offset]] >= youngest_root_time) { - break; - } - } - self->edge_sort_offset = offset; -} - -static int TSK_WARN_UNUSED -simplifier_sort_edges(simplifier_t *self) -{ - /* designated initialisers are guaranteed to set any missing fields to - * zero, so we don't need to set the rest of them. */ - tsk_bookmark_t bookmark = { - .edges = (tsk_size_t) self->edge_sort_offset, - .sites = self->tables->sites.num_rows, - .mutations = self->tables->mutations.num_rows, - }; - tsk_bug_assert(self->edge_sort_offset >= 0); - return tsk_table_collection_sort(self->tables, &bookmark, 0); -} - -static int TSK_WARN_UNUSED -simplifier_insert_input_roots(simplifier_t *self) -{ - int ret = 0; - tsk_id_t input_id, output_id; - tsk_segment_t *x; - tsk_size_t num_flushed_edges; - double youngest_root_time = DBL_MAX; - const double *node_time = self->tables->nodes.time; - - for (input_id = 0; input_id < (tsk_id_t) self->input_tables.nodes.num_rows; - input_id++) { - x = self->ancestor_map_head[input_id]; - if (x != NULL) { - output_id = self->node_id_map[input_id]; - if (output_id == TSK_NULL) { - output_id = simplifier_record_node(self, input_id); - if (output_id < 0) { - ret = (int) output_id; - goto out; - } - } - youngest_root_time = TSK_MIN(youngest_root_time, node_time[output_id]); - while (x != NULL) { - if (x->node != output_id) { - ret = simplifier_record_edge(self, x->left, x->right, x->node); - if (ret != 0) { - goto out; - } - simplifier_map_mutations( - self, input_id, x->left, x->right, output_id); - } - x = x->next; - } - ret = simplifier_flush_edges(self, output_id, &num_flushed_edges); - if (ret != 0) { - goto out; - } - } - } - if (youngest_root_time != DBL_MAX) { - simplifier_set_edge_sort_offset(self, youngest_root_time); - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -simplifier_run(simplifier_t *self, tsk_id_t *node_map) -{ - int ret = 0; - tsk_size_t j, start; - tsk_id_t parent, current_parent; - const tsk_edge_table_t *input_edges = &self->input_tables.edges; - tsk_size_t num_edges = input_edges->num_rows; - - if (num_edges > 0) { - start = 0; - current_parent = input_edges->parent[0]; - for (j = 0; j < num_edges; j++) { - parent = input_edges->parent[j]; - if (parent != current_parent) { - ret = simplifier_process_parent_edges(self, current_parent, start, j); - if (ret != 0) { - goto out; - } - current_parent = parent; - start = j; - } - } - ret = simplifier_process_parent_edges(self, current_parent, start, num_edges); - if (ret != 0) { - goto out; - } - } - if (self->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS) { - ret = simplifier_insert_input_roots(self); - if (ret != 0) { - goto out; - } - } - ret = simplifier_flush_output(self); - if (ret != 0) { - goto out; - } - if (node_map != NULL) { - /* Finally, output the new IDs for the nodes, if required. */ - tsk_memcpy(node_map, self->node_id_map, - self->input_tables.nodes.num_rows * sizeof(tsk_id_t)); - } - if (self->edge_sort_offset != TSK_NULL) { - tsk_bug_assert(self->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS); - ret = simplifier_sort_edges(self); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -/************************* - * table_collection - *************************/ - -typedef struct { - tsk_id_t index; - /* These are the sort keys in order */ - double first; - double second; - tsk_id_t third; - tsk_id_t fourth; -} index_sort_t; - -static int -cmp_index_sort(const void *a, const void *b) -{ - const index_sort_t *ca = (const index_sort_t *) a; - const index_sort_t *cb = (const index_sort_t *) b; - int ret = (ca->first > cb->first) - (ca->first < cb->first); - if (ret == 0) { - ret = (ca->second > cb->second) - (ca->second < cb->second); - if (ret == 0) { - ret = (ca->third > cb->third) - (ca->third < cb->third); - if (ret == 0) { - ret = (ca->fourth > cb->fourth) - (ca->fourth < cb->fourth); - } - } - } - return ret; -} - -static int -tsk_table_collection_check_offsets(const tsk_table_collection_t *self) -{ - int ret = 0; - - ret = check_offsets(self->nodes.num_rows, self->nodes.metadata_offset, - self->nodes.metadata_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->sites.num_rows, self->sites.ancestral_state_offset, - self->sites.ancestral_state_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->sites.num_rows, self->sites.metadata_offset, - self->sites.metadata_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->mutations.num_rows, self->mutations.derived_state_offset, - self->mutations.derived_state_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->mutations.num_rows, self->mutations.metadata_offset, - self->mutations.metadata_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->individuals.num_rows, self->individuals.metadata_offset, - self->individuals.metadata_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->provenances.num_rows, self->provenances.timestamp_offset, - self->provenances.timestamp_length, true); - if (ret != 0) { - goto out; - } - ret = check_offsets(self->provenances.num_rows, self->provenances.record_offset, - self->provenances.record_length, true); - if (ret != 0) { - goto out; - } - ret = 0; -out: - return ret; -} - -static int -tsk_table_collection_check_node_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j; - double node_time; - tsk_id_t population, individual; - tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows; - tsk_id_t num_individuals = (tsk_id_t) self->individuals.num_rows; - const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS); - - for (j = 0; j < self->nodes.num_rows; j++) { - node_time = self->nodes.time[j]; - if (!tsk_isfinite(node_time)) { - ret = TSK_ERR_TIME_NONFINITE; - goto out; - } - if (check_population_refs) { - population = self->nodes.population[j]; - if (population < TSK_NULL || population >= num_populations) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; - } - } - individual = self->nodes.individual[j]; - if (individual < TSK_NULL || individual >= num_individuals) { - ret = TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS; - goto out; - } - } -out: - return ret; -} - -static int -tsk_table_collection_check_edge_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t parent, last_parent, child, last_child; - double left, last_left, right; - const double *time = self->nodes.time; - const double L = self->sequence_length; - const tsk_edge_table_t edges = self->edges; - const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; - const bool check_ordering = !!(options & TSK_CHECK_EDGE_ORDERING); - bool *parent_seen = NULL; - - if (check_ordering) { - parent_seen = tsk_calloc((tsk_size_t) num_nodes, sizeof(*parent_seen)); - if (parent_seen == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } - - /* Just keeping compiler happy; these values don't matter. */ - last_left = 0; - last_parent = 0; - last_child = 0; - for (j = 0; j < edges.num_rows; j++) { - parent = edges.parent[j]; - child = edges.child[j]; - left = edges.left[j]; - right = edges.right[j]; - /* Node ID integrity */ - if (parent == TSK_NULL) { - ret = TSK_ERR_NULL_PARENT; - goto out; - } - if (parent < 0 || parent >= num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (child == TSK_NULL) { - ret = TSK_ERR_NULL_CHILD; - goto out; - } - if (child < 0 || child >= num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - /* Spatial requirements for edges */ - if (!(tsk_isfinite(left) && tsk_isfinite(right))) { - ret = TSK_ERR_GENOME_COORDS_NONFINITE; - goto out; - } - if (left < 0) { - ret = TSK_ERR_LEFT_LESS_ZERO; - goto out; - } - if (right > L) { - ret = TSK_ERR_RIGHT_GREATER_SEQ_LENGTH; - goto out; - } - if (left >= right) { - ret = TSK_ERR_BAD_EDGE_INTERVAL; - goto out; - } - /* time[child] must be < time[parent] */ - if (time[child] >= time[parent]) { - ret = TSK_ERR_BAD_NODE_TIME_ORDERING; - goto out; - } - - if (check_ordering) { - if (parent_seen[parent]) { - ret = TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS; - goto out; - } - if (j > 0) { - /* Input data must sorted by (time[parent], parent, child, left). */ - if (time[parent] < time[last_parent]) { - ret = TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME; - goto out; - } - if (time[parent] == time[last_parent]) { - if (parent == last_parent) { - if (child < last_child) { - ret = TSK_ERR_EDGES_NOT_SORTED_CHILD; - goto out; - } - if (child == last_child) { - if (left == last_left) { - ret = TSK_ERR_DUPLICATE_EDGES; - goto out; - } else if (left < last_left) { - ret = TSK_ERR_EDGES_NOT_SORTED_LEFT; - goto out; - } - } - } else { - parent_seen[last_parent] = true; - } - } - } - last_parent = parent; - last_child = child; - last_left = left; - } - } -out: - tsk_safe_free(parent_seen); - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_check_site_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j; - double position; - const double L = self->sequence_length; - const tsk_site_table_t sites = self->sites; - const bool check_site_ordering = !!(options & TSK_CHECK_SITE_ORDERING); - const bool check_site_duplicates = !!(options & TSK_CHECK_SITE_DUPLICATES); - - for (j = 0; j < sites.num_rows; j++) { - position = sites.position[j]; - /* Spatial requirements */ - if (!tsk_isfinite(position)) { - ret = TSK_ERR_BAD_SITE_POSITION; - goto out; - } - if (position < 0 || position >= L) { - ret = TSK_ERR_BAD_SITE_POSITION; - goto out; - } - if (j > 0) { - if (check_site_duplicates && sites.position[j - 1] == position) { - ret = TSK_ERR_DUPLICATE_SITE_POSITION; - goto out; - } - if (check_site_ordering && sites.position[j - 1] > position) { - ret = TSK_ERR_UNSORTED_SITES; - goto out; - } - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_check_mutation_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t parent_mut; - double mutation_time; - double last_known_time = INFINITY; - const tsk_mutation_table_t mutations = self->mutations; - const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; - const tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows; - const tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows; - const double *node_time = self->nodes.time; - const bool check_mutation_ordering = !!(options & TSK_CHECK_MUTATION_ORDERING); - bool unknown_time; - int num_known_times = 0; - int num_unknown_times = 0; - - for (j = 0; j < mutations.num_rows; j++) { - /* Basic reference integrity */ - if (mutations.site[j] < 0 || mutations.site[j] >= num_sites) { - ret = TSK_ERR_SITE_OUT_OF_BOUNDS; - goto out; - } - if (mutations.node[j] < 0 || mutations.node[j] >= num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - /* Integrity check for mutation parent */ - parent_mut = mutations.parent[j]; - if (parent_mut < TSK_NULL || parent_mut >= num_mutations) { - ret = TSK_ERR_MUTATION_OUT_OF_BOUNDS; - goto out; - } - if (parent_mut == (tsk_id_t) j) { - ret = TSK_ERR_MUTATION_PARENT_EQUAL; - goto out; - } - /* Check that time is finite and not more recent than node time */ - mutation_time = mutations.time[j]; - unknown_time = tsk_is_unknown_time(mutation_time); - if (!unknown_time) { - if (!tsk_isfinite(mutation_time)) { - ret = TSK_ERR_TIME_NONFINITE; - goto out; - } - if (mutation_time < node_time[mutations.node[j]]) { - ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; - goto out; - } - } - - /* reset checks when reaching a new site */ - if (j > 0 && mutations.site[j - 1] != mutations.site[j]) { - last_known_time = INFINITY; - num_known_times = 0; - num_unknown_times = 0; - } - - /* Check known/unknown times are not both present on a site */ - if (unknown_time) { - num_unknown_times++; - } else { - num_known_times++; - } - if ((num_unknown_times > 0) && (num_known_times > 0)) { - ret = TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN; - goto out; - } - - /* check parent site agrees */ - if (parent_mut != TSK_NULL) { - if (mutations.site[parent_mut] != mutations.site[j]) { - ret = TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE; - goto out; - } - /* If this mutation time is known, then the parent time - * must also be, or else the - * TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN check - * above will fail. */ - if (!unknown_time && mutation_time > mutations.time[parent_mut]) { - ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; - goto out; - } - } - - if (check_mutation_ordering) { - /* Check site ordering */ - if (j > 0 && mutations.site[j - 1] > mutations.site[j]) { - ret = TSK_ERR_UNSORTED_MUTATIONS; - goto out; - } - - /* Check if parents are listed before their children */ - if (parent_mut != TSK_NULL && parent_mut > (tsk_id_t) j) { - ret = TSK_ERR_MUTATION_PARENT_AFTER_CHILD; - goto out; - } - - /* Check time ordering. We do this after the other checks above, - * so that more specific errors trigger first */ - if (!unknown_time) { - if (mutation_time > last_known_time) { - ret = TSK_ERR_UNSORTED_MUTATIONS; - goto out; - } - last_known_time = mutation_time; - } - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_check_migration_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j; - double left, right, time; - const double L = self->sequence_length; - const tsk_migration_table_t migrations = self->migrations; - const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; - const tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows; - const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS); - const bool check_migration_ordering = !!(options & TSK_CHECK_MIGRATION_ORDERING); - - for (j = 0; j < migrations.num_rows; j++) { - if (migrations.node[j] < 0 || migrations.node[j] >= num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (check_population_refs) { - if (migrations.source[j] < 0 || migrations.source[j] >= num_populations) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; - } - if (migrations.dest[j] < 0 || migrations.dest[j] >= num_populations) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; - } - } - time = migrations.time[j]; - if (!tsk_isfinite(time)) { - ret = TSK_ERR_TIME_NONFINITE; - goto out; - } - if (j > 0) { - if (check_migration_ordering && migrations.time[j - 1] > time) { - ret = TSK_ERR_UNSORTED_MIGRATIONS; - goto out; - } - } - left = migrations.left[j]; - right = migrations.right[j]; - /* Spatial requirements */ - /* TODO it's a bit misleading to use the edge-specific errors here. */ - if (!(tsk_isfinite(left) && tsk_isfinite(right))) { - ret = TSK_ERR_GENOME_COORDS_NONFINITE; - goto out; - } - if (left < 0) { - ret = TSK_ERR_LEFT_LESS_ZERO; - goto out; - } - if (right > L) { - ret = TSK_ERR_RIGHT_GREATER_SEQ_LENGTH; - goto out; - } - if (left >= right) { - ret = TSK_ERR_BAD_EDGE_INTERVAL; - goto out; - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_check_individual_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_size_t j, k; - const tsk_individual_table_t individuals = self->individuals; - const tsk_id_t num_individuals = (tsk_id_t) individuals.num_rows; - const bool check_individual_ordering = options & TSK_CHECK_INDIVIDUAL_ORDERING; - - for (j = 0; j < (tsk_size_t) num_individuals; j++) { - for (k = individuals.parents_offset[j]; k < individuals.parents_offset[j + 1]; - k++) { - /* Check parent references are valid */ - if (individuals.parents[k] != TSK_NULL - && (individuals.parents[k] < 0 - || individuals.parents[k] >= num_individuals)) { - ret = TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS; - goto out; - } - /* Check no-one is their own parent */ - if (individuals.parents[k] == (tsk_id_t) j) { - ret = TSK_ERR_INDIVIDUAL_SELF_PARENT; - goto out; - } - /* Check parents are ordered */ - if (check_individual_ordering && individuals.parents[k] != TSK_NULL - && individuals.parents[k] >= (tsk_id_t) j) { - ret = TSK_ERR_UNSORTED_INDIVIDUALS; - goto out; - } - } - } -out: - return ret; -} - -static tsk_id_t TSK_WARN_UNUSED -tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) -{ - tsk_id_t ret = 0; - tsk_size_t j, k; - tsk_id_t e, u, site, mutation; - double tree_left, tree_right; - const double sequence_length = self->sequence_length; - const tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows; - const tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows; - const tsk_size_t num_edges = self->edges.num_rows; - const double *restrict site_position = self->sites.position; - const tsk_id_t *restrict mutation_site = self->mutations.site; - const tsk_id_t *restrict mutation_node = self->mutations.node; - const double *restrict mutation_time = self->mutations.time; - const double *restrict node_time = self->nodes.time; - const tsk_id_t *restrict I = self->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->indexes.edge_removal_order; - const double *restrict edge_right = self->edges.right; - const double *restrict edge_left = self->edges.left; - const tsk_id_t *restrict edge_child = self->edges.child; - const tsk_id_t *restrict edge_parent = self->edges.parent; - tsk_id_t *restrict parent = NULL; - int8_t *restrict used_edges = NULL; - tsk_id_t num_trees = 0; - - parent = tsk_malloc(self->nodes.num_rows * sizeof(*parent)); - used_edges = tsk_malloc(num_edges * sizeof(*used_edges)); - if (parent == NULL || used_edges == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, self->nodes.num_rows * sizeof(*parent)); - tsk_memset(used_edges, 0, num_edges * sizeof(*used_edges)); - - tree_left = 0; - num_trees = 0; - j = 0; - k = 0; - site = 0; - mutation = 0; - tsk_bug_assert(I != NULL && O != NULL); - tsk_bug_assert(self->indexes.num_edges == num_edges); - - while (j < num_edges || tree_left < sequence_length) { - while (k < num_edges && edge_right[O[k]] == tree_left) { - e = O[k]; - if (used_edges[e] != 1) { - ret = TSK_ERR_TABLES_BAD_INDEXES; - goto out; - } - parent[edge_child[e]] = TSK_NULL; - used_edges[e]++; - k++; - } - while (j < num_edges && edge_left[I[j]] == tree_left) { - e = I[j]; - if (used_edges[e] != 0) { - ret = TSK_ERR_TABLES_BAD_INDEXES; - goto out; - } - used_edges[e]++; - u = edge_child[e]; - if (parent[u] != TSK_NULL) { - ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; - goto out; - } - parent[u] = edge_parent[e]; - j++; - } - tree_right = sequence_length; - if (j < num_edges) { - tree_right = TSK_MIN(tree_right, edge_left[I[j]]); - } - if (k < num_edges) { - tree_right = TSK_MIN(tree_right, edge_right[O[k]]); - } - while (site < num_sites && site_position[site] < tree_right) { - while (mutation < num_mutations && mutation_site[mutation] == site) { - if (!tsk_is_unknown_time(mutation_time[mutation]) - && parent[mutation_node[mutation]] != TSK_NULL - && node_time[parent[mutation_node[mutation]]] - <= mutation_time[mutation]) { - ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE; - goto out; - } - mutation++; - } - site++; - } - if (tree_right <= tree_left) { - ret = TSK_ERR_TABLES_BAD_INDEXES; - goto out; - } - tree_left = tree_right; - /* This is technically possible; if we have 2**31 edges each defining - * a single tree, and there's a gap between each of these edges we - * would overflow this counter. */ - if (num_trees == TSK_MAX_ID) { - ret = TSK_ERR_TREE_OVERFLOW; - goto out; - } - num_trees++; - } - tsk_bug_assert(j == num_edges); - while (k < num_edges) { - /* At this point it must be that used_edges[O[k]] == 1, - * since otherwise we would have added a different edge twice, - * and so hit the error above. */ - e = O[k]; - if (edge_right[e] != sequence_length) { - ret = TSK_ERR_TABLES_BAD_INDEXES; - goto out; - } - used_edges[e]++; - k++; - } - ret = num_trees; -out: - /* Can't use tsk_safe_free because of restrict*/ - if (parent != NULL) { - free(parent); - } - if (used_edges != NULL) { - free(used_edges); - } - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_check_index_integrity(const tsk_table_collection_t *self) -{ - int ret = 0; - tsk_id_t j; - const tsk_id_t num_edges = (tsk_id_t) self->edges.num_rows; - const tsk_id_t *edge_insertion_order = self->indexes.edge_insertion_order; - const tsk_id_t *edge_removal_order = self->indexes.edge_removal_order; - - if (!tsk_table_collection_has_index(self, 0)) { - ret = TSK_ERR_TABLES_NOT_INDEXED; - goto out; - } - for (j = 0; j < num_edges; j++) { - if (edge_insertion_order[j] < 0 || edge_insertion_order[j] >= num_edges) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } - if (edge_removal_order[j] < 0 || edge_removal_order[j] >= num_edges) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } - } -out: - return ret; -} - -tsk_id_t TSK_WARN_UNUSED -tsk_table_collection_check_integrity( - const tsk_table_collection_t *self, tsk_flags_t options) -{ - tsk_id_t ret = 0; - - if (options & TSK_CHECK_TREES) { - /* Checking the trees implies these checks */ - options |= TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING - | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING - | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; - } - - if (self->sequence_length <= 0) { - ret = TSK_ERR_BAD_SEQUENCE_LENGTH; - goto out; - } - ret = tsk_table_collection_check_offsets(self); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_check_node_integrity(self, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_check_edge_integrity(self, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_check_site_integrity(self, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_check_mutation_integrity(self, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_check_migration_integrity(self, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_check_individual_integrity(self, options); - if (ret != 0) { - goto out; - } - - if (options & TSK_CHECK_INDEXES) { - ret = tsk_table_collection_check_index_integrity(self); - if (ret != 0) { - goto out; - } - } - if (options & TSK_CHECK_TREES) { - ret = tsk_table_collection_check_tree_integrity(self); - if (ret < 0) { - goto out; - } - } -out: - return ret; -} - -void -tsk_table_collection_print_state(const tsk_table_collection_t *self, FILE *out) -{ - fprintf(out, "Table collection state\n"); - fprintf(out, "sequence_length = %f\n", self->sequence_length); - - write_metadata_schema_header( - out, self->metadata_schema, self->metadata_schema_length); - fprintf(out, "#metadata#\n"); - fprintf(out, "%.*s\n", (int) self->metadata_length, self->metadata); - fprintf(out, "#end#metadata\n"); - fprintf(out, "#time_units#\n"); - fprintf(out, "%.*s\n", (int) self->time_units_length, self->time_units); - fprintf(out, "#end#time_units\n"); - tsk_individual_table_print_state(&self->individuals, out); - tsk_node_table_print_state(&self->nodes, out); - tsk_edge_table_print_state(&self->edges, out); - tsk_migration_table_print_state(&self->migrations, out); - tsk_site_table_print_state(&self->sites, out); - tsk_mutation_table_print_state(&self->mutations, out); - tsk_population_table_print_state(&self->populations, out); - tsk_provenance_table_print_state(&self->provenances, out); -} - -int TSK_WARN_UNUSED -tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_flags_t edge_options = 0; - - tsk_memset(self, 0, sizeof(*self)); - if (options & TSK_TC_NO_EDGE_METADATA) { - edge_options |= TSK_TABLE_NO_METADATA; - } - - /* Set default time_units value */ - ret = tsk_table_collection_set_time_units( - self, TSK_TIME_UNITS_UNKNOWN, strlen(TSK_TIME_UNITS_UNKNOWN)); - if (ret != 0) { - goto out; - } - - ret = tsk_node_table_init(&self->nodes, 0); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_init(&self->edges, edge_options); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_init(&self->migrations, 0); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_init(&self->sites, 0); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_init(&self->mutations, 0); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_init(&self->individuals, 0); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_init(&self->populations, 0); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_init(&self->provenances, 0); - if (ret != 0) { - goto out; - } - ret = tsk_reference_sequence_init(&self->reference_sequence, 0); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int -tsk_table_collection_free(tsk_table_collection_t *self) -{ - tsk_individual_table_free(&self->individuals); - tsk_node_table_free(&self->nodes); - tsk_edge_table_free(&self->edges); - tsk_migration_table_free(&self->migrations); - tsk_site_table_free(&self->sites); - tsk_mutation_table_free(&self->mutations); - tsk_population_table_free(&self->populations); - tsk_provenance_table_free(&self->provenances); - tsk_reference_sequence_free(&self->reference_sequence); - tsk_safe_free(self->indexes.edge_insertion_order); - tsk_safe_free(self->indexes.edge_removal_order); - tsk_safe_free(self->file_uuid); - tsk_safe_free(self->time_units); - tsk_safe_free(self->metadata); - tsk_safe_free(self->metadata_schema); - return 0; -} - -bool -tsk_table_collection_equals(const tsk_table_collection_t *self, - const tsk_table_collection_t *other, tsk_flags_t options) -{ - bool ret = self->sequence_length == other->sequence_length - && self->time_units_length == other->time_units_length - && tsk_memcmp(self->time_units, other->time_units, - self->time_units_length * sizeof(char)) - == 0; - if (!(options & TSK_CMP_IGNORE_TABLES)) { - ret = ret - && tsk_individual_table_equals( - &self->individuals, &other->individuals, options) - && tsk_node_table_equals(&self->nodes, &other->nodes, options) - && tsk_edge_table_equals(&self->edges, &other->edges, options) - && tsk_migration_table_equals( - &self->migrations, &other->migrations, options) - && tsk_site_table_equals(&self->sites, &other->sites, options) - && tsk_mutation_table_equals(&self->mutations, &other->mutations, options) - && tsk_population_table_equals( - &self->populations, &other->populations, options); - /* TSK_CMP_IGNORE_TABLES implies TSK_CMP_IGNORE_PROVENANCE */ - if (!(options & TSK_CMP_IGNORE_PROVENANCE)) { - ret = ret - && tsk_provenance_table_equals( - &self->provenances, &other->provenances, options); - } - } - /* TSK_CMP_IGNORE_TS_METADATA is implied by TSK_CMP_IGNORE_METADATA */ - if (options & TSK_CMP_IGNORE_METADATA) { - options |= TSK_CMP_IGNORE_TS_METADATA; - } - if (!(options & TSK_CMP_IGNORE_TS_METADATA)) { - ret = ret - && (self->metadata_length == other->metadata_length - && self->metadata_schema_length == other->metadata_schema_length - && tsk_memcmp(self->metadata, other->metadata, - self->metadata_length * sizeof(char)) - == 0 - && tsk_memcmp(self->metadata_schema, other->metadata_schema, - self->metadata_schema_length * sizeof(char)) - == 0); - } - - if (!(options & TSK_CMP_IGNORE_REFERENCE_SEQUENCE)) { - ret = ret - && tsk_reference_sequence_equals( - &self->reference_sequence, &other->reference_sequence, options); - } - return ret; -} - -int -tsk_table_collection_set_time_units( - tsk_table_collection_t *self, const char *time_units, tsk_size_t time_units_length) -{ - return replace_string( - &self->time_units, &self->time_units_length, time_units, time_units_length); -} - -int -tsk_table_collection_set_metadata( - tsk_table_collection_t *self, const char *metadata, tsk_size_t metadata_length) -{ - return replace_string( - &self->metadata, &self->metadata_length, metadata, metadata_length); -} - -int -tsk_table_collection_takeset_metadata( - tsk_table_collection_t *self, char *metadata, tsk_size_t metadata_length) -{ - return takeset_string( - &self->metadata, &self->metadata_length, metadata, metadata_length); -} - -int -tsk_table_collection_set_metadata_schema(tsk_table_collection_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length) -{ - return replace_string(&self->metadata_schema, &self->metadata_schema_length, - metadata_schema, metadata_schema_length); -} - -int -tsk_table_collection_set_indexes(tsk_table_collection_t *self, - tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order) -{ - int ret = 0; - tsk_size_t index_size = self->edges.num_rows * sizeof(tsk_id_t); - - tsk_table_collection_drop_index(self, 0); - self->indexes.edge_insertion_order = tsk_malloc(index_size); - self->indexes.edge_removal_order = tsk_malloc(index_size); - if (self->indexes.edge_insertion_order == NULL - || self->indexes.edge_removal_order == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(self->indexes.edge_insertion_order, edge_insertion_order, index_size); - tsk_memcpy(self->indexes.edge_removal_order, edge_removal_order, index_size); - self->indexes.num_edges = self->edges.num_rows; -out: - return ret; -} - -int -tsk_table_collection_takeset_indexes(tsk_table_collection_t *self, - tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order) -{ - int ret = 0; - - if (edge_insertion_order == NULL || edge_removal_order == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - tsk_table_collection_drop_index(self, 0); - self->indexes.edge_insertion_order = edge_insertion_order; - self->indexes.edge_removal_order = edge_removal_order; - self->indexes.num_edges = self->edges.num_rows; -out: - return ret; -} - -bool -tsk_table_collection_has_index( - const tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - return self->indexes.edge_insertion_order != NULL - && self->indexes.edge_removal_order != NULL - && self->indexes.num_edges == self->edges.num_rows; -} - -bool -tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self) -{ - return !tsk_reference_sequence_is_null(&self->reference_sequence); -} - -int -tsk_table_collection_drop_index( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - tsk_safe_free(self->indexes.edge_insertion_order); - tsk_safe_free(self->indexes.edge_removal_order); - self->indexes.edge_insertion_order = NULL; - self->indexes.edge_removal_order = NULL; - self->indexes.num_edges = 0; - return 0; -} - -int TSK_WARN_UNUSED -tsk_table_collection_build_index( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = TSK_ERR_GENERIC; - tsk_id_t ret_id; - tsk_size_t j; - double *time = self->nodes.time; - index_sort_t *sort_buff = NULL; - tsk_id_t parent; - - /* For build indexes to make sense we must have referential integrity and - * sorted edges */ - ret_id = tsk_table_collection_check_integrity(self, TSK_CHECK_EDGE_ORDERING); - if (ret_id != 0) { - ret = (int) ret_id; - goto out; - } - - tsk_table_collection_drop_index(self, 0); - self->indexes.edge_insertion_order - = tsk_malloc(self->edges.num_rows * sizeof(tsk_id_t)); - self->indexes.edge_removal_order - = tsk_malloc(self->edges.num_rows * sizeof(tsk_id_t)); - sort_buff = tsk_malloc(self->edges.num_rows * sizeof(index_sort_t)); - if (self->indexes.edge_insertion_order == NULL - || self->indexes.edge_removal_order == NULL || sort_buff == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* sort by left and increasing time to give us the order in which - * records should be inserted */ - for (j = 0; j < self->edges.num_rows; j++) { - sort_buff[j].index = (tsk_id_t) j; - sort_buff[j].first = self->edges.left[j]; - parent = self->edges.parent[j]; - sort_buff[j].second = time[parent]; - sort_buff[j].third = parent; - sort_buff[j].fourth = self->edges.child[j]; - } - qsort( - sort_buff, (size_t) self->edges.num_rows, sizeof(index_sort_t), cmp_index_sort); - for (j = 0; j < self->edges.num_rows; j++) { - self->indexes.edge_insertion_order[j] = sort_buff[j].index; - } - /* sort by right and decreasing parent time to give us the order in which - * records should be removed. */ - for (j = 0; j < self->edges.num_rows; j++) { - sort_buff[j].index = (tsk_id_t) j; - sort_buff[j].first = self->edges.right[j]; - parent = self->edges.parent[j]; - sort_buff[j].second = -time[parent]; - sort_buff[j].third = -parent; - sort_buff[j].fourth = -self->edges.child[j]; - } - qsort( - sort_buff, (size_t) self->edges.num_rows, sizeof(index_sort_t), cmp_index_sort); - for (j = 0; j < self->edges.num_rows; j++) { - self->indexes.edge_removal_order[j] = sort_buff[j].index; - } - self->indexes.num_edges = self->edges.num_rows; - ret = 0; -out: - tsk_safe_free(sort_buff); - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_set_file_uuid(tsk_table_collection_t *self, const char *uuid) -{ - int ret = 0; - - tsk_safe_free(self->file_uuid); - self->file_uuid = NULL; - - if (uuid != NULL) { - /* Allow space for \0 so we can print it as a string */ - self->file_uuid = tsk_malloc(TSK_UUID_SIZE + 1); - if (self->file_uuid == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memcpy(self->file_uuid, uuid, TSK_UUID_SIZE); - self->file_uuid[TSK_UUID_SIZE] = '\0'; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_copy(const tsk_table_collection_t *self, - tsk_table_collection_t *dest, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_table_collection_init(dest, options); - if (ret != 0) { - goto out; - } - } - ret = tsk_node_table_copy(&self->nodes, &dest->nodes, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_copy(&self->edges, &dest->edges, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_copy(&self->migrations, &dest->migrations, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_copy(&self->sites, &dest->sites, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_copy(&self->mutations, &dest->mutations, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_copy(&self->individuals, &dest->individuals, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_copy(&self->populations, &dest->populations, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_copy(&self->provenances, &dest->provenances, TSK_NO_INIT); - if (ret != 0) { - goto out; - } - dest->sequence_length = self->sequence_length; - if (tsk_table_collection_has_index(self, 0)) { - ret = tsk_table_collection_set_indexes( - dest, self->indexes.edge_insertion_order, self->indexes.edge_removal_order); - if (ret != 0) { - goto out; - } - } - ret = tsk_table_collection_set_time_units( - dest, self->time_units, self->time_units_length); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_set_metadata(dest, self->metadata, self->metadata_length); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_set_metadata_schema( - dest, self->metadata_schema, self->metadata_schema_length); - if (ret != 0) { - goto out; - } - ret = tsk_reference_sequence_copy( - &self->reference_sequence, &dest->reference_sequence, options); - if (ret != 0) { - goto out; - } - if (options & TSK_COPY_FILE_UUID) { - /* The UUID should only be generated on writing to a file (see the call - * to generate_uuid in tsk_table_collection_write_format_data) and - * no other writing access is supported. We only read the value from - * the file, and raise an error if it's the wrong length there. Thus, - * finding a UUID value of any other length here is undefined behaviour. - */ - tsk_bug_assert( - self->file_uuid == NULL || strlen(self->file_uuid) == TSK_UUID_SIZE); - ret = tsk_table_collection_set_file_uuid(dest, self->file_uuid); - if (ret != 0) { - goto out; - } - } -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *store) -{ - int ret = 0; - size_t len; - uint32_t *version = NULL; - int8_t *format_name = NULL; - int8_t *uuid = NULL; - double *L = NULL; - char *time_units = NULL; - char *metadata = NULL; - char *metadata_schema = NULL; - size_t time_units_length, metadata_length, metadata_schema_length; - /* TODO we could simplify this function quite a bit if we use the - * read_table_properties infrastructure. We would need to add the - * ability to have non-optional columns to that though. */ - - ret = kastore_gets_int8(store, "format/name", &format_name, &len); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (len != TSK_FILE_FORMAT_NAME_LENGTH) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - if (tsk_memcmp(TSK_FILE_FORMAT_NAME, format_name, TSK_FILE_FORMAT_NAME_LENGTH) - != 0) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - - ret = kastore_gets_uint32(store, "format/version", &version, &len); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (len != 2) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - if (version[0] < TSK_FILE_FORMAT_VERSION_MAJOR) { - ret = TSK_ERR_FILE_VERSION_TOO_OLD; - goto out; - } - if (version[0] > TSK_FILE_FORMAT_VERSION_MAJOR) { - ret = TSK_ERR_FILE_VERSION_TOO_NEW; - goto out; - } - - ret = kastore_gets_float64(store, "sequence_length", &L, &len); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (len != 1) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - if (L[0] <= 0.0) { - ret = TSK_ERR_BAD_SEQUENCE_LENGTH; - goto out; - } - self->sequence_length = L[0]; - - ret = kastore_gets_int8(store, "uuid", &uuid, &len); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (len != TSK_UUID_SIZE) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - ret = tsk_table_collection_set_file_uuid(self, (const char *) uuid); - if (ret != 0) { - goto out; - } - - ret = kastore_containss(store, "time_units"); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (ret == 1) { - ret = kastore_gets_int8( - store, "time_units", (int8_t **) &time_units, &time_units_length); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - ret = tsk_table_collection_set_time_units( - self, time_units, (tsk_size_t) time_units_length); - if (ret != 0) { - goto out; - } - } - ret = kastore_containss(store, "metadata"); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (ret == 1) { - ret = kastore_gets_int8( - store, "metadata", (int8_t **) &metadata, &metadata_length); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - ret = tsk_table_collection_takeset_metadata( - self, metadata, (tsk_size_t) metadata_length); - if (ret != 0) { - goto out; - } - metadata = NULL; - } - - ret = kastore_containss(store, "metadata_schema"); - if (ret < 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - if (ret == 1) { - ret = kastore_gets_int8(store, "metadata_schema", (int8_t **) &metadata_schema, - (size_t *) &metadata_schema_length); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - ret = tsk_table_collection_set_metadata_schema( - self, metadata_schema, (tsk_size_t) metadata_schema_length); - if (ret != 0) { - goto out; - } - } - -out: - if ((ret ^ (1 << TSK_KAS_ERR_BIT)) == KAS_ERR_KEY_NOT_FOUND) { - ret = TSK_ERR_REQUIRED_COL_NOT_FOUND; - } - tsk_safe_free(version); - tsk_safe_free(format_name); - tsk_safe_free(uuid); - tsk_safe_free(L); - tsk_safe_free(time_units); - tsk_safe_free(metadata_schema); - tsk_safe_free(metadata); - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_dump_indexes(const tsk_table_collection_t *self, kastore_t *store, - tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - write_table_col_t cols[] = { - { "indexes/edge_insertion_order", NULL, self->indexes.num_edges, - TSK_ID_STORAGE_TYPE }, - { "indexes/edge_removal_order", NULL, self->indexes.num_edges, - TSK_ID_STORAGE_TYPE }, - { .name = NULL }, - }; - - if (tsk_table_collection_has_index(self, 0)) { - cols[0].array = self->indexes.edge_insertion_order; - cols[1].array = self->indexes.edge_removal_order; - ret = write_table_cols(store, cols, 0); - } - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_load_indexes(tsk_table_collection_t *self, kastore_t *store) -{ - int ret = 0; - tsk_id_t *edge_insertion_order = NULL; - tsk_id_t *edge_removal_order = NULL; - tsk_size_t num_rows; - - read_table_col_t cols[] = { - { "indexes/edge_insertion_order", (void **) &edge_insertion_order, - TSK_ID_STORAGE_TYPE, TSK_COL_OPTIONAL }, - { "indexes/edge_removal_order", (void **) &edge_removal_order, - TSK_ID_STORAGE_TYPE, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - num_rows = TSK_NUM_ROWS_UNSET; - ret = read_table_cols(store, &num_rows, cols, 0); - if (ret != 0) { - goto out; - } - - if ((edge_insertion_order == NULL) != (edge_removal_order == NULL)) { - ret = TSK_ERR_BOTH_COLUMNS_REQUIRED; - goto out; - } - if (edge_insertion_order != NULL) { - if (num_rows != self->edges.num_rows) { - ret = TSK_ERR_FILE_FORMAT; - goto out; - } - ret = tsk_table_collection_takeset_indexes( - self, edge_insertion_order, edge_removal_order); - if (ret != 0) { - goto out; - } - } - edge_insertion_order = NULL; - edge_removal_order = NULL; -out: - tsk_safe_free(edge_insertion_order); - tsk_safe_free(edge_removal_order); - return ret; -} - -static int -tsk_table_collection_load_reference_sequence( - tsk_table_collection_t *self, kastore_t *store) -{ - int ret = 0; - char *data = NULL; - char *url = NULL; - char *metadata = NULL; - char *metadata_schema = NULL; - tsk_size_t data_length = 0, url_length, metadata_length, metadata_schema_length; - - read_table_property_t properties[] = { - { "reference_sequence/data", (void **) &data, &data_length, KAS_UINT8, - TSK_COL_OPTIONAL }, - { "reference_sequence/url", (void **) &url, &url_length, KAS_UINT8, - TSK_COL_OPTIONAL }, - { "reference_sequence/metadata", (void **) &metadata, &metadata_length, - KAS_UINT8, TSK_COL_OPTIONAL }, - { "reference_sequence/metadata_schema", (void **) &metadata_schema, - &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, - { .name = NULL }, - }; - - ret = read_table_properties(store, properties, 0); - if (ret != 0) { - goto out; - } - if (data != NULL) { - ret = tsk_reference_sequence_takeset_data( - &self->reference_sequence, data, (tsk_size_t) data_length); - if (ret != 0) { - goto out; - } - data = NULL; - } - if (metadata != NULL) { - ret = tsk_reference_sequence_takeset_metadata( - &self->reference_sequence, metadata, (tsk_size_t) metadata_length); - if (ret != 0) { - goto out; - } - metadata = NULL; - } - if (metadata_schema != NULL) { - ret = tsk_reference_sequence_set_metadata_schema(&self->reference_sequence, - metadata_schema, (tsk_size_t) metadata_schema_length); - if (ret != 0) { - goto out; - } - } - if (url != NULL) { - ret = tsk_reference_sequence_set_url( - &self->reference_sequence, url, (tsk_size_t) url_length); - if (ret != 0) { - goto out; - } - } - -out: - free_read_table_mem(NULL, NULL, properties); - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_loadf_inited( - tsk_table_collection_t *self, FILE *file, tsk_flags_t options) -{ - int ret = 0; - kastore_t store; - - int kas_flags = KAS_READ_ALL; - if ((options & TSK_LOAD_SKIP_TABLES) - || (options & TSK_LOAD_SKIP_REFERENCE_SEQUENCE)) { - kas_flags = 0; - } - kas_flags = kas_flags | KAS_GET_TAKES_OWNERSHIP; - ret = kastore_openf(&store, file, "r", kas_flags); - - if (ret != 0) { - if (ret == KAS_ERR_EOF) { - /* KAS_ERR_EOF means that we tried to read a store from the stream - * and we hit EOF immediately without reading any bytes. We signal - * this back to the client, which allows it to read an indefinite - * number of stores from a stream */ - ret = TSK_ERR_EOF; - } else { - ret = tsk_set_kas_error(ret); - } - goto out; - } - ret = tsk_table_collection_read_format_data(self, &store); - if (ret != 0) { - goto out; - } - if (!(options & TSK_LOAD_SKIP_TABLES)) { - ret = tsk_node_table_load(&self->nodes, &store); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_load(&self->edges, &store); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_load(&self->sites, &store); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_load(&self->mutations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_load(&self->migrations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_load(&self->individuals, &store); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_load(&self->populations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_load(&self->provenances, &store); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_load_indexes(self, &store); - if (ret != 0) { - goto out; - } - } else { - ret = tsk_table_collection_build_index(self, 0); - if (ret != 0) { - goto out; - } - } - if (!(options & TSK_LOAD_SKIP_REFERENCE_SEQUENCE)) { - ret = tsk_table_collection_load_reference_sequence(self, &store); - if (ret != 0) { - goto out; - } - } - ret = kastore_close(&store); - if (ret != 0) { - goto out; - } -out: - /* If we're exiting on an error, we ignore any further errors that might come - * from kastore. In the nominal case, closing an already-closed store is a - * safe noop */ - kastore_close(&store); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t options) -{ - int ret = 0; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_table_collection_init(self, options); - if (ret != 0) { - goto out; - } - } - ret = tsk_table_collection_loadf_inited(self, file, options); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_load( - tsk_table_collection_t *self, const char *filename, tsk_flags_t options) -{ - int ret = 0; - FILE *file = NULL; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_table_collection_init(self, options); - if (ret != 0) { - goto out; - } - } - file = fopen(filename, "rb"); - if (file == NULL) { - ret = TSK_ERR_IO; - goto out; - } - ret = tsk_table_collection_loadf_inited(self, file, options); - if (ret != 0) { - goto out; - } - if (fclose(file) != 0) { - ret = TSK_ERR_IO; - goto out; - } - file = NULL; -out: - if (file != NULL) { - /* Ignore any additional errors we might get when closing the file - * in error conditions */ - fclose(file); - } - return ret; -} - -static int TSK_WARN_UNUSED -tsk_table_collection_dump_reference_sequence(const tsk_table_collection_t *self, - kastore_t *store, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - const tsk_reference_sequence_t *ref = &self->reference_sequence; - write_table_col_t write_cols[] = { - { "reference_sequence/data", (void *) ref->data, ref->data_length, KAS_UINT8 }, - { "reference_sequence/url", (void *) ref->url, ref->url_length, KAS_UINT8 }, - { "reference_sequence/metadata", (void *) ref->metadata, ref->metadata_length, - KAS_UINT8 }, - { "reference_sequence/metadata_schema", (void *) ref->metadata_schema, - ref->metadata_schema_length, KAS_UINT8 }, - { .name = NULL }, - }; - if (tsk_table_collection_has_reference_sequence(self)) { - ret = write_table_cols(store, write_cols, 0); - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_dump( - const tsk_table_collection_t *self, const char *filename, tsk_flags_t options) -{ - int ret = 0; - FILE *file = fopen(filename, "wb"); - - if (file == NULL) { - ret = TSK_ERR_IO; - goto out; - } - ret = tsk_table_collection_dumpf(self, file, options); - if (ret != 0) { - goto out; - } - if (fclose(file) != 0) { - ret = TSK_ERR_IO; - goto out; - } - file = NULL; -out: - if (file != NULL) { - /* Ignore any additional errors we might get when closing the file - * in error conditions */ - fclose(file); - /* If an error occurred make sure that the filename is removed */ - remove(filename); - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_dumpf( - const tsk_table_collection_t *self, FILE *file, tsk_flags_t options) -{ - int ret = 0; - kastore_t store; - char uuid[TSK_UUID_SIZE + 1]; // Must include space for trailing null. - write_table_col_t format_columns[] = { - { "format/name", (const void *) &TSK_FILE_FORMAT_NAME, - TSK_FILE_FORMAT_NAME_LENGTH, KAS_INT8 }, - { "format/version", - (const void *) &(uint32_t[]){ - TSK_FILE_FORMAT_VERSION_MAJOR, TSK_FILE_FORMAT_VERSION_MINOR }, - 2, KAS_UINT32 }, - { "sequence_length", (const void *) &self->sequence_length, 1, KAS_FLOAT64 }, - { "uuid", (void *) uuid, TSK_UUID_SIZE, KAS_INT8 }, - { "time_units", (void *) self->time_units, self->time_units_length, KAS_INT8 }, - { "metadata", (void *) self->metadata, self->metadata_length, KAS_INT8 }, - { "metadata_schema", (void *) self->metadata_schema, - self->metadata_schema_length, KAS_INT8 }, - { .name = NULL }, - }; - - tsk_memset(&store, 0, sizeof(store)); - - ret = kastore_openf(&store, file, "w", 0); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } - - /* Write format data */ - ret = tsk_generate_uuid(uuid, 0); - if (ret != 0) { - goto out; - } - - ret = write_table_cols(&store, format_columns, options); - if (ret != 0) { - goto out; - } - - /* All of these functions will set the kas_error internally, so we don't have - * to modify the return value. */ - ret = tsk_node_table_dump(&self->nodes, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_dump(&self->edges, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_dump(&self->sites, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_dump(&self->migrations, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_dump(&self->mutations, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_dump(&self->individuals, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_dump(&self->populations, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_dump(&self->provenances, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_dump_indexes(self, &store, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_dump_reference_sequence(self, &store, options); - if (ret != 0) { - goto out; - } - - ret = kastore_close(&store); - if (ret != 0) { - ret = tsk_set_kas_error(ret); - goto out; - } -out: - /* It's safe to close a kastore twice. */ - if (ret != 0) { - kastore_close(&store); - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samples, - tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map) -{ - int ret = 0; - simplifier_t simplifier; - tsk_id_t *local_samples = NULL; - tsk_id_t u; - - /* Avoid calling to simplifier_free with uninit'd memory on error branches */ - tsk_memset(&simplifier, 0, sizeof(simplifier_t)); - - if ((options & TSK_SIMPLIFY_KEEP_UNARY) - && (options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS)) { - ret = TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE; - goto out; - } - - /* For now we don't bother with edge metadata, but it can easily be - * implemented. */ - if (self->edges.metadata_length > 0) { - ret = TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA; - goto out; - } - - if (samples == NULL) { - local_samples = tsk_malloc(self->nodes.num_rows * sizeof(*local_samples)); - if (local_samples == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - num_samples = 0; - for (u = 0; u < (tsk_id_t) self->nodes.num_rows; u++) { - if (!!(self->nodes.flags[u] & TSK_NODE_IS_SAMPLE)) { - local_samples[num_samples] = u; - num_samples++; - } - } - samples = local_samples; - } - - ret = simplifier_init(&simplifier, samples, num_samples, self, options); - if (ret != 0) { - goto out; - } - ret = simplifier_run(&simplifier, node_map); - if (ret != 0) { - goto out; - } - if (!!(options & TSK_DEBUG)) { - simplifier_print_state(&simplifier, tsk_get_debug_stream()); - } - /* The indexes are invalidated now so drop them */ - ret = tsk_table_collection_drop_index(self, 0); -out: - simplifier_free(&simplifier); - tsk_safe_free(local_samples); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_link_ancestors(tsk_table_collection_t *self, tsk_id_t *samples, - tsk_size_t num_samples, tsk_id_t *ancestors, tsk_size_t num_ancestors, - tsk_flags_t TSK_UNUSED(options), tsk_edge_table_t *result) -{ - int ret = 0; - ancestor_mapper_t ancestor_mapper; - - tsk_memset(&ancestor_mapper, 0, sizeof(ancestor_mapper_t)); - - if (self->edges.metadata_length > 0) { - ret = TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA; - goto out; - } - - ret = ancestor_mapper_init( - &ancestor_mapper, samples, num_samples, ancestors, num_ancestors, self, result); - if (ret != 0) { - goto out; - } - ret = ancestor_mapper_run(&ancestor_mapper); - if (ret != 0) { - goto out; - } -out: - ancestor_mapper_free(&ancestor_mapper); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_ibd_within(const tsk_table_collection_t *self, - tsk_identity_segments_t *result, const tsk_id_t *samples, tsk_size_t num_samples, - double min_span, double max_time, tsk_flags_t options) -{ - int ret = 0; - tsk_ibd_finder_t ibd_finder; - - ret = tsk_identity_segments_init(result, self->nodes.num_rows, options); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_init(&ibd_finder, self, result, min_span, max_time); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_init_within(&ibd_finder, samples, num_samples); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_run(&ibd_finder); - if (ret != 0) { - goto out; - } - if (!!(options & TSK_DEBUG)) { - tsk_ibd_finder_print_state(&ibd_finder, tsk_get_debug_stream()); - } -out: - tsk_ibd_finder_free(&ibd_finder); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_ibd_between(const tsk_table_collection_t *self, - tsk_identity_segments_t *result, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, double min_span, - double max_time, tsk_flags_t options) -{ - int ret = 0; - tsk_ibd_finder_t ibd_finder; - - ret = tsk_identity_segments_init(result, self->nodes.num_rows, options); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_init(&ibd_finder, self, result, min_span, max_time); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_init_between( - &ibd_finder, num_sample_sets, sample_set_sizes, sample_sets); - if (ret != 0) { - goto out; - } - ret = tsk_ibd_finder_run(&ibd_finder); - if (ret != 0) { - goto out; - } - if (!!(options & TSK_DEBUG)) { - tsk_ibd_finder_print_state(&ibd_finder, tsk_get_debug_stream()); - } -out: - tsk_ibd_finder_free(&ibd_finder); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_sort( - tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options) -{ - int ret = 0; - tsk_table_sorter_t sorter; - - ret = tsk_table_sorter_init(&sorter, self, options); - if (ret != 0) { - goto out; - } - ret = tsk_table_sorter_run(&sorter, start); - if (ret != 0) { - goto out; - } -out: - tsk_table_sorter_free(&sorter); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_id_t k; - tsk_id_t *nodes = NULL; - tsk_table_sorter_t sorter; - tsk_flags_t subset_options = options & TSK_SUBSET_KEEP_UNREFERENCED; - - ret = tsk_table_sorter_init(&sorter, self, 0); - if (ret != 0) { - goto out; - } - sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical; - sorter.sort_individuals = tsk_table_sorter_sort_individuals_canonical; - - nodes = tsk_malloc(self->nodes.num_rows * sizeof(*nodes)); - if (nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (k = 0; k < (tsk_id_t) self->nodes.num_rows; k++) { - nodes[k] = k; - } - ret = tsk_table_collection_subset(self, nodes, self->nodes.num_rows, subset_options); - if (ret != 0) { - goto out; - } - ret = tsk_table_sorter_run(&sorter, NULL); - if (ret != 0) { - goto out; - } -out: - tsk_safe_free(nodes); - tsk_table_sorter_free(&sorter); - return ret; -} - -/* - * Remove any sites with duplicate positions, retaining only the *first* - * one. Assumes the tables have been sorted, throwing an error if not. - */ -int TSK_WARN_UNUSED -tsk_table_collection_deduplicate_sites( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t ret_id; - tsk_size_t j; - /* Map of old site IDs to new site IDs. */ - tsk_id_t *site_id_map = NULL; - tsk_site_table_t copy; - tsk_site_t row, last_row; - - /* Early exit if there's 0 rows. We don't exit early for one row because - * we would then skip error checking, making the semantics inconsistent. */ - if (self->sites.num_rows == 0) { - return 0; - } - - /* Must allocate the site table first for tsk_site_table_free to be safe */ - ret = tsk_site_table_copy(&self->sites, ©, 0); - if (ret != 0) { - goto out; - } - ret_id = tsk_table_collection_check_integrity(self, TSK_CHECK_SITE_ORDERING); - if (ret_id != 0) { - ret = (int) ret_id; - goto out; - } - - site_id_map = tsk_malloc(copy.num_rows * sizeof(*site_id_map)); - if (site_id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_site_table_clear(&self->sites); - if (ret != 0) { - goto out; - } - - last_row.position = -1; - site_id_map[0] = 0; - for (j = 0; j < copy.num_rows; j++) { - tsk_site_table_get_row_unsafe(©, (tsk_id_t) j, &row); - if (row.position != last_row.position) { - ret_id - = tsk_site_table_add_row(&self->sites, row.position, row.ancestral_state, - row.ancestral_state_length, row.metadata, row.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - site_id_map[j] = (tsk_id_t) self->sites.num_rows - 1; - last_row = row; - } - - if (self->sites.num_rows < copy.num_rows) { - // Remap sites in the mutation table - // (but only if there's been any changed sites) - for (j = 0; j < self->mutations.num_rows; j++) { - self->mutations.site[j] = site_id_map[self->mutations.site[j]]; - } - } - ret = 0; -out: - tsk_site_table_free(©); - tsk_safe_free(site_id_map); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_compute_mutation_parents( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t num_trees; - const tsk_id_t *I, *O; - const tsk_edge_table_t edges = self->edges; - const tsk_node_table_t nodes = self->nodes; - const tsk_site_table_t sites = self->sites; - const tsk_mutation_table_t mutations = self->mutations; - const tsk_id_t M = (tsk_id_t) edges.num_rows; - tsk_id_t tj, tk; - tsk_id_t *parent = NULL; - tsk_id_t *bottom_mutation = NULL; - tsk_id_t u; - double left, right; - tsk_id_t site; - /* Using unsigned values here avoids potentially undefined behaviour */ - tsk_size_t j, mutation, first_mutation; - - /* Set the mutation parent to TSK_NULL so that we don't check the - * parent values we are about to write over. */ - tsk_memset(mutations.parent, 0xff, mutations.num_rows * sizeof(*mutations.parent)); - num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); - bottom_mutation = tsk_malloc(nodes.num_rows * sizeof(*bottom_mutation)); - if (parent == NULL || bottom_mutation == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); - tsk_memset(bottom_mutation, 0xff, nodes.num_rows * sizeof(*bottom_mutation)); - tsk_memset(mutations.parent, 0xff, self->mutations.num_rows * sizeof(tsk_id_t)); - - I = self->indexes.edge_insertion_order; - O = self->indexes.edge_removal_order; - tj = 0; - tk = 0; - site = 0; - mutation = 0; - left = 0; - while (tj < M || left < self->sequence_length) { - while (tk < M && edges.right[O[tk]] == left) { - parent[edges.child[O[tk]]] = TSK_NULL; - tk++; - } - while (tj < M && edges.left[I[tj]] == left) { - parent[edges.child[I[tj]]] = edges.parent[I[tj]]; - tj++; - } - right = self->sequence_length; - if (tj < M) { - right = TSK_MIN(right, edges.left[I[tj]]); - } - if (tk < M) { - right = TSK_MIN(right, edges.right[O[tk]]); - } - - /* Tree is now ready. We look at each site on this tree in turn */ - while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { - /* Create a mapping from mutations to nodes. If we see more than one - * mutation at a node, the previously seen one must be the parent - * of the current since we assume they are in order. */ - first_mutation = mutation; - while (mutation < mutations.num_rows && mutations.site[mutation] == site) { - u = mutations.node[mutation]; - if (bottom_mutation[u] != TSK_NULL) { - mutations.parent[mutation] = bottom_mutation[u]; - } - bottom_mutation[u] = (tsk_id_t) mutation; - mutation++; - } - /* Make the common case of 1 mutation fast */ - if (mutation > first_mutation + 1) { - /* If we have more than one mutation, compute the parent for each - * one by traversing up the tree until we find a node that has a - * mutation. */ - for (j = first_mutation; j < mutation; j++) { - if (mutations.parent[j] == TSK_NULL) { - u = parent[mutations.node[j]]; - while (u != TSK_NULL && bottom_mutation[u] == TSK_NULL) { - u = parent[u]; - } - if (u != TSK_NULL) { - mutations.parent[j] = bottom_mutation[u]; - } - } - } - } - /* Reset the mapping for the next site */ - for (j = first_mutation; j < mutation; j++) { - u = mutations.node[j]; - bottom_mutation[u] = TSK_NULL; - /* Check that we haven't violated the sortedness property */ - if (mutations.parent[j] > (tsk_id_t) j) { - ret = TSK_ERR_MUTATION_PARENT_AFTER_CHILD; - goto out; - } - } - site++; - } - /* Move on to the next tree */ - left = right; - } - -out: - tsk_safe_free(parent); - tsk_safe_free(bottom_mutation); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_compute_mutation_times( - tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_id_t num_trees; - const tsk_id_t *restrict I = self->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->indexes.edge_removal_order; - const tsk_edge_table_t edges = self->edges; - const tsk_node_table_t nodes = self->nodes; - const tsk_site_table_t sites = self->sites; - const tsk_mutation_table_t mutations = self->mutations; - const tsk_id_t M = (tsk_id_t) edges.num_rows; - tsk_id_t tj, tk; - tsk_id_t *parent = NULL; - double *numerator = NULL; - double *denominator = NULL; - tsk_id_t u; - double left, right, parent_time; - tsk_id_t site; - /* Using unsigned values here avoids potentially undefined behaviour */ - tsk_size_t j, mutation, first_mutation; - tsk_bookmark_t skip_edges = { 0, 0, self->edges.num_rows, 0, 0, 0, 0, 0 }; - - /* The random param is for future usage */ - if (random != NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - - /* First set the times to TSK_UNKNOWN_TIME so that check will succeed */ - for (j = 0; j < mutations.num_rows; j++) { - mutations.time[j] = TSK_UNKNOWN_TIME; - } - num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); - numerator = tsk_malloc(nodes.num_rows * sizeof(*numerator)); - denominator = tsk_malloc(nodes.num_rows * sizeof(*denominator)); - if (parent == NULL || numerator == NULL || denominator == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); - tsk_memset(numerator, 0, nodes.num_rows * sizeof(*numerator)); - tsk_memset(denominator, 0, nodes.num_rows * sizeof(*denominator)); - - tj = 0; - tk = 0; - site = 0; - mutation = 0; - left = 0; - while (tj < M || left < self->sequence_length) { - while (tk < M && edges.right[O[tk]] == left) { - parent[edges.child[O[tk]]] = TSK_NULL; - tk++; - } - while (tj < M && edges.left[I[tj]] == left) { - parent[edges.child[I[tj]]] = edges.parent[I[tj]]; - tj++; - } - right = self->sequence_length; - if (tj < M) { - right = TSK_MIN(right, edges.left[I[tj]]); - } - if (tk < M) { - right = TSK_MIN(right, edges.right[O[tk]]); - } - - /* Tree is now ready. We look at each site on this tree in turn */ - while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { - first_mutation = mutation; - /* Count how many mutations each edge has to get our - denominator */ - while (mutation < mutations.num_rows && mutations.site[mutation] == site) { - denominator[mutations.node[mutation]]++; - mutation++; - } - /* Go over the mutations again assigning times. As the sorting - requirements guarantee that parents are before children, we assign - oldest first */ - for (j = first_mutation; j < mutation; j++) { - u = mutations.node[j]; - numerator[u]++; - if (parent[u] == TSK_NULL) { - /* This mutation is above a root */ - mutations.time[j] = nodes.time[u]; - } else { - parent_time = nodes.time[parent[u]]; - mutations.time[j] = parent_time - - (parent_time - nodes.time[u]) * numerator[u] - / (denominator[u] + 1); - } - } - /* Reset the book-keeping for the next site */ - for (j = first_mutation; j < mutation; j++) { - u = mutations.node[j]; - numerator[u] = 0; - denominator[u] = 0; - } - site++; - } - /* Move on to the next tree */ - left = right; - } - - /* Now that mutations have times their sort order may have been invalidated, so - * re-sort. Safe to cast the result to an int here because we're not counting - * trees. */ - ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_MUTATION_ORDERING); - if (ret == TSK_ERR_UNSORTED_MUTATIONS) { - ret = tsk_table_collection_sort(self, &skip_edges, 0); - if (ret != 0) { - goto out; - } - } else if (ret < 0) { - goto out; - } - -out: - tsk_safe_free(parent); - tsk_safe_free(numerator); - tsk_safe_free(denominator); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_delete_older( - tsk_table_collection_t *self, double time, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_edge_t edge; - tsk_mutation_t mutation; - tsk_migration_t migration; - tsk_edge_table_t edges; - tsk_mutation_table_t mutations; - tsk_migration_table_t migrations; - const double *restrict node_time = self->nodes.time; - tsk_id_t j, ret_id, parent; - double mutation_time; - tsk_id_t *mutation_map = NULL; - - memset(&edges, 0, sizeof(edges)); - memset(&mutations, 0, sizeof(mutations)); - memset(&migrations, 0, sizeof(migrations)); - - ret = tsk_edge_table_copy(&self->edges, &edges, 0); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_clear(&self->edges); - if (ret != 0) { - goto out; - } - for (j = 0; j < (tsk_id_t) edges.num_rows; j++) { - tsk_edge_table_get_row_unsafe(&edges, j, &edge); - if (node_time[edge.parent] <= time) { - ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, - edge.parent, edge.child, edge.metadata, edge.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - } - /* Calling x_table_free multiple times is safe, so get rid of the - * extra edge table memory as soon as we can. */ - tsk_edge_table_free(&edges); - - mutation_map = tsk_malloc(self->mutations.num_rows * sizeof(*mutation_map)); - if (mutation_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_mutation_table_copy(&self->mutations, &mutations, 0); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_clear(&self->mutations); - if (ret != 0) { - goto out; - } - for (j = 0; j < (tsk_id_t) mutations.num_rows; j++) { - tsk_mutation_table_get_row_unsafe(&mutations, j, &mutation); - mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node] - : mutation.time; - mutation_map[j] = TSK_NULL; - if (mutation_time < time) { - ret_id = tsk_mutation_table_add_row(&self->mutations, mutation.site, - mutation.node, mutation.parent, mutation.time, mutation.derived_state, - mutation.derived_state_length, mutation.metadata, - mutation.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - mutation_map[j] = ret_id; - } - } - tsk_mutation_table_free(&mutations); - for (j = 0; j < (tsk_id_t) self->mutations.num_rows; j++) { - parent = self->mutations.parent[j]; - if (parent != TSK_NULL) { - self->mutations.parent[j] = mutation_map[parent]; - } - } - - ret = tsk_migration_table_copy(&self->migrations, &migrations, 0); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_clear(&self->migrations); - if (ret != 0) { - goto out; - } - for (j = 0; j < (tsk_id_t) migrations.num_rows; j++) { - tsk_migration_table_get_row_unsafe(&migrations, j, &migration); - if (migration.time < time) { - ret_id = tsk_migration_table_add_row(&self->migrations, migration.left, - migration.right, migration.node, migration.source, migration.dest, - migration.time, migration.metadata, migration.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - } - tsk_migration_table_free(&migrations); -out: - tsk_edge_table_free(&edges); - tsk_mutation_table_free(&mutations); - tsk_migration_table_free(&migrations); - tsk_safe_free(mutation_map); - return ret; -} - -int -tsk_table_collection_record_num_rows( - const tsk_table_collection_t *self, tsk_bookmark_t *position) -{ - position->individuals = self->individuals.num_rows; - position->nodes = self->nodes.num_rows; - position->edges = self->edges.num_rows; - position->migrations = self->migrations.num_rows; - position->sites = self->sites.num_rows; - position->mutations = self->mutations.num_rows; - position->populations = self->populations.num_rows; - position->provenances = self->provenances.num_rows; - return 0; -} - -int TSK_WARN_UNUSED -tsk_table_collection_truncate(tsk_table_collection_t *tables, tsk_bookmark_t *position) -{ - int ret = 0; - - ret = tsk_table_collection_drop_index(tables, 0); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_truncate(&tables->individuals, position->individuals); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_truncate(&tables->nodes, position->nodes); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_truncate(&tables->edges, position->edges); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_truncate(&tables->migrations, position->migrations); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_truncate(&tables->sites, position->sites); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_truncate(&tables->mutations, position->mutations); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_truncate(&tables->populations, position->populations); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_truncate(&tables->provenances, position->provenances); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - bool clear_provenance = !!(options & TSK_CLEAR_PROVENANCE); - bool clear_metadata_schemas = !!(options & TSK_CLEAR_METADATA_SCHEMAS); - bool clear_ts_metadata = !!(options & TSK_CLEAR_TS_METADATA_AND_SCHEMA); - tsk_bookmark_t rows_to_retain - = { .provenances = clear_provenance ? 0 : self->provenances.num_rows }; - - ret = tsk_table_collection_truncate(self, &rows_to_retain); - if (ret != 0) { - goto out; - } - - if (clear_metadata_schemas) { - ret = tsk_individual_table_set_metadata_schema(&self->individuals, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_node_table_set_metadata_schema(&self->nodes, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_set_metadata_schema(&self->edges, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_set_metadata_schema(&self->migrations, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_set_metadata_schema(&self->sites, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_set_metadata_schema(&self->mutations, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_set_metadata_schema(&self->populations, "", 0); - if (ret != 0) { - goto out; - } - } - - if (clear_ts_metadata) { - ret = tsk_table_collection_set_metadata(self, "", 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_set_metadata_schema(self, "", 0); - if (ret != 0) { - goto out; - } - } - -out: - return ret; -} - -static int -tsk_table_collection_add_and_remap_node(tsk_table_collection_t *self, - const tsk_table_collection_t *other, tsk_id_t node_id, tsk_id_t *individual_map, - tsk_id_t *population_map, tsk_id_t *node_map, bool add_populations) -{ - int ret = 0; - tsk_id_t ret_id, new_ind, new_pop; - tsk_node_t node; - tsk_individual_t ind; - tsk_population_t pop; - - ret = tsk_node_table_get_row(&other->nodes, node_id, &node); - if (ret < 0) { - goto out; - } - new_ind = TSK_NULL; - if (node.individual != TSK_NULL) { - if (individual_map[node.individual] == TSK_NULL) { - ret = tsk_individual_table_get_row( - &other->individuals, node.individual, &ind); - if (ret < 0) { - goto out; - } - ret_id = tsk_individual_table_add_row(&self->individuals, ind.flags, - ind.location, ind.location_length, ind.parents, ind.parents_length, - ind.metadata, ind.metadata_length); - if (ret < 0) { - ret = (int) ret_id; - goto out; - } - individual_map[node.individual] = ret_id; - } - new_ind = individual_map[node.individual]; - } - new_pop = TSK_NULL; - if (node.population != TSK_NULL) { - // keep same pops if add_populations is False - if (!add_populations) { - population_map[node.population] = node.population; - } - if (population_map[node.population] == TSK_NULL) { - ret = tsk_population_table_get_row( - &other->populations, node.population, &pop); - if (ret < 0) { - goto out; - } - ret_id = tsk_population_table_add_row( - &self->populations, pop.metadata, pop.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - population_map[node.population] = ret_id; - } - new_pop = population_map[node.population]; - } - ret_id = tsk_node_table_add_row(&self->nodes, node.flags, node.time, new_pop, - new_ind, node.metadata, node.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - node_map[node.id] = ret_id; - -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, - tsk_size_t num_nodes, tsk_flags_t options) -{ - int ret = 0; - tsk_id_t ret_id, j, k, parent_ind, new_parent, new_child, new_node, site_id; - tsk_size_t num_parents; - tsk_individual_t ind; - tsk_edge_t edge; - tsk_id_t *node_map = NULL; - tsk_id_t *individual_map = NULL; - tsk_id_t *population_map = NULL; - tsk_id_t *site_map = NULL; - tsk_id_t *mutation_map = NULL; - tsk_table_collection_t tables; - tsk_population_t pop; - tsk_site_t site; - tsk_mutation_t mut; - bool keep_unreferenced = !!(options & TSK_SUBSET_KEEP_UNREFERENCED); - bool no_change_populations = !!(options & TSK_SUBSET_NO_CHANGE_POPULATIONS); - - ret = tsk_table_collection_copy(self, &tables, 0); - if (ret != 0) { - goto out; - } - /* Not calling TSK_CHECK_TREES so casting to int is safe */ - ret = (int) tsk_table_collection_check_integrity(self, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_clear(self, 0); - if (ret != 0) { - goto out; - } - - node_map = tsk_malloc(tables.nodes.num_rows * sizeof(*node_map)); - individual_map = tsk_malloc(tables.individuals.num_rows * sizeof(*individual_map)); - population_map = tsk_malloc(tables.populations.num_rows * sizeof(*population_map)); - site_map = tsk_malloc(tables.sites.num_rows * sizeof(*site_map)); - mutation_map = tsk_malloc(tables.mutations.num_rows * sizeof(*mutation_map)); - if (node_map == NULL || individual_map == NULL || population_map == NULL - || site_map == NULL || mutation_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(node_map, 0xff, tables.nodes.num_rows * sizeof(*node_map)); - tsk_memset( - individual_map, 0xff, tables.individuals.num_rows * sizeof(*individual_map)); - tsk_memset( - population_map, 0xff, tables.populations.num_rows * sizeof(*population_map)); - tsk_memset(site_map, 0xff, tables.sites.num_rows * sizeof(*site_map)); - tsk_memset(mutation_map, 0xff, tables.mutations.num_rows * sizeof(*mutation_map)); - - if (no_change_populations) { - ret = tsk_population_table_copy( - &tables.populations, &self->populations, TSK_NO_INIT); - if (ret < 0) { - goto out; - } - for (k = 0; k < (tsk_id_t) tables.populations.num_rows; k++) { - population_map[k] = k; - } - } - - // First do individuals so they stay in the same order. - // So we can remap individual parents and not rely on sortedness, - // we first check who to keep; then build the individual map, and - // finally populate the tables. - if (keep_unreferenced) { - for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { - // put a non-NULL value here; fill in the actual order next - individual_map[k] = 0; - } - } else { - for (k = 0; k < (tsk_id_t) num_nodes; k++) { - if (nodes[k] < 0 || nodes[k] >= (tsk_id_t) tables.nodes.num_rows) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - j = tables.nodes.individual[nodes[k]]; - if (j != TSK_NULL) { - individual_map[j] = 0; - } - } - } - j = 0; - for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { - if (individual_map[k] != TSK_NULL) { - individual_map[k] = j; - j++; - } - } - for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { - if (individual_map[k] != TSK_NULL) { - tsk_individual_table_get_row_unsafe(&tables.individuals, k, &ind); - num_parents = 0; - for (j = 0; j < (tsk_id_t) ind.parents_length; j++) { - parent_ind = ind.parents[j]; - new_parent = parent_ind; - if (parent_ind != TSK_NULL) { - new_parent = individual_map[parent_ind]; - } - if ((parent_ind == TSK_NULL) || (new_parent != TSK_NULL)) { - /* Beware: this modifies the parents column of tables.individuals - * in-place! But it's OK as we don't use it again. */ - ind.parents[num_parents] = new_parent; - num_parents++; - } - } - ret_id = tsk_individual_table_add_row(&self->individuals, ind.flags, - ind.location, ind.location_length, ind.parents, num_parents, - ind.metadata, ind.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - tsk_bug_assert(individual_map[k] == ret_id); - } - } - - // Nodes and populations - for (k = 0; k < (tsk_id_t) num_nodes; k++) { - ret = tsk_table_collection_add_and_remap_node( - self, &tables, nodes[k], individual_map, population_map, node_map, true); - if (ret < 0) { - goto out; - } - } - - /* TODO: Subset the migrations table. We would need to make sure - * that we don't remove populations that are referenced, so it would - * need to be done before the next code block. */ - if (tables.migrations.num_rows != 0) { - ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; - goto out; - } - - if (keep_unreferenced) { - // Keep unused populations - for (k = 0; k < (tsk_id_t) tables.populations.num_rows; k++) { - if (population_map[k] == TSK_NULL) { - tsk_population_table_get_row_unsafe(&tables.populations, k, &pop); - ret_id = tsk_population_table_add_row( - &self->populations, pop.metadata, pop.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - } - } - - // Edges - for (k = 0; k < (tsk_id_t) tables.edges.num_rows; k++) { - tsk_edge_table_get_row_unsafe(&tables.edges, k, &edge); - new_parent = node_map[edge.parent]; - new_child = node_map[edge.child]; - if ((new_parent != TSK_NULL) && (new_child != TSK_NULL)) { - ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, - new_parent, new_child, edge.metadata, edge.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - } - - // Mutations and sites - // Make a first pass through to build the mutation_map so that - // mutation parent can be remapped even if the table is not in order. - j = 0; - for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { - if (node_map[tables.mutations.node[k]] != TSK_NULL) { - mutation_map[k] = j; - j++; - site_id = tables.mutations.site[k]; - if (site_map[site_id] == TSK_NULL) { - // Insert a temporary non-NULL value - site_map[site_id] = 1; - } - } - } - // Keep retained sites in their original order - j = 0; - for (k = 0; k < (tsk_id_t) tables.sites.num_rows; k++) { - if (keep_unreferenced || site_map[k] != TSK_NULL) { - tsk_site_table_get_row_unsafe(&tables.sites, k, &site); - ret_id = tsk_site_table_add_row(&self->sites, site.position, - site.ancestral_state, site.ancestral_state_length, site.metadata, - site.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - site_map[k] = j; - j++; - } - } - for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { - tsk_mutation_table_get_row_unsafe(&tables.mutations, k, &mut); - new_node = node_map[mut.node]; - if (new_node != TSK_NULL) { - new_parent = TSK_NULL; - if (mut.parent != TSK_NULL) { - new_parent = mutation_map[mut.parent]; - } - ret_id = tsk_mutation_table_add_row(&self->mutations, site_map[mut.site], - new_node, new_parent, mut.time, mut.derived_state, - mut.derived_state_length, mut.metadata, mut.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - tsk_bug_assert(mutation_map[mut.id] == ret_id); - } - if (ret < 0) { - goto out; - } - } - - ret = 0; -out: - tsk_safe_free(node_map); - tsk_safe_free(individual_map); - tsk_safe_free(population_map); - tsk_safe_free(site_map); - tsk_safe_free(mutation_map); - tsk_table_collection_free(&tables); - return ret; -} - -static int -tsk_check_subset_equality(tsk_table_collection_t *self, - const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, - tsk_size_t num_shared_nodes) -{ - int ret = 0; - tsk_id_t k, i; - tsk_id_t *self_nodes = NULL; - tsk_id_t *other_nodes = NULL; - tsk_table_collection_t self_copy; - tsk_table_collection_t other_copy; - - tsk_memset(&self_copy, 0, sizeof(self_copy)); - tsk_memset(&other_copy, 0, sizeof(other_copy)); - self_nodes = tsk_malloc(num_shared_nodes * sizeof(*self_nodes)); - other_nodes = tsk_malloc(num_shared_nodes * sizeof(*other_nodes)); - if (self_nodes == NULL || other_nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - i = 0; - for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { - if (other_node_mapping[k] != TSK_NULL) { - self_nodes[i] = other_node_mapping[k]; - other_nodes[i] = k; - i++; - } - } - - ret = tsk_table_collection_copy(self, &self_copy, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_copy(other, &other_copy, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_subset(&self_copy, self_nodes, num_shared_nodes, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_subset(&other_copy, other_nodes, num_shared_nodes, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_canonicalise(&self_copy, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_canonicalise(&other_copy, 0); - if (ret != 0) { - goto out; - } - if (!tsk_table_collection_equals(&self_copy, &other_copy, - TSK_CMP_IGNORE_TS_METADATA | TSK_CMP_IGNORE_PROVENANCE - | TSK_CMP_IGNORE_REFERENCE_SEQUENCE)) { - ret = TSK_ERR_UNION_DIFF_HISTORIES; - goto out; - } - -out: - tsk_table_collection_free(&self_copy); - tsk_table_collection_free(&other_copy); - tsk_safe_free(other_nodes); - tsk_safe_free(self_nodes); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_union(tsk_table_collection_t *self, - const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, - tsk_flags_t options) -{ - int ret = 0; - tsk_id_t ret_id, k, i, new_parent, new_child; - tsk_size_t num_shared_nodes = 0; - tsk_size_t num_individuals_self = self->individuals.num_rows; - tsk_edge_t edge; - tsk_mutation_t mut; - tsk_site_t site; - tsk_id_t *node_map = NULL; - tsk_id_t *individual_map = NULL; - tsk_id_t *population_map = NULL; - tsk_id_t *site_map = NULL; - bool add_populations = !(options & TSK_UNION_NO_ADD_POP); - bool check_shared_portion = !(options & TSK_UNION_NO_CHECK_SHARED); - - /* Not calling TSK_CHECK_TREES so casting to int is safe */ - ret = (int) tsk_table_collection_check_integrity(self, 0); - if (ret != 0) { - goto out; - } - ret = (int) tsk_table_collection_check_integrity(other, 0); - if (ret != 0) { - goto out; - } - for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { - if (other_node_mapping[k] >= (tsk_id_t) self->nodes.num_rows - || other_node_mapping[k] < TSK_NULL) { - ret = TSK_ERR_UNION_BAD_MAP; - goto out; - } - if (other_node_mapping[k] != TSK_NULL) { - num_shared_nodes++; - } - } - - if (check_shared_portion) { - ret = tsk_check_subset_equality( - self, other, other_node_mapping, num_shared_nodes); - if (ret != 0) { - goto out; - } - } - - // Maps relating the IDs in other to the new IDs in self. - node_map = tsk_malloc(other->nodes.num_rows * sizeof(*node_map)); - individual_map = tsk_malloc(other->individuals.num_rows * sizeof(*individual_map)); - population_map = tsk_malloc(other->populations.num_rows * sizeof(*population_map)); - site_map = tsk_malloc(other->sites.num_rows * sizeof(*site_map)); - if (node_map == NULL || individual_map == NULL || population_map == NULL - || site_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(node_map, 0xff, other->nodes.num_rows * sizeof(*node_map)); - tsk_memset( - individual_map, 0xff, other->individuals.num_rows * sizeof(*individual_map)); - tsk_memset( - population_map, 0xff, other->populations.num_rows * sizeof(*population_map)); - tsk_memset(site_map, 0xff, other->sites.num_rows * sizeof(*site_map)); - - /* We have to map the individuals who are linked to nodes in the intersection first - as otherwise an individual linked to one node in the intersection and one in - `other` would be duplicated. We assume that the individual in `self` takes - priority. - */ - for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { - if (other_node_mapping[k] != TSK_NULL - && other->nodes.individual[k] != TSK_NULL) { - individual_map[other->nodes.individual[k]] - = self->nodes.individual[other_node_mapping[k]]; - } - } - // nodes, individuals, populations - for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { - if (other_node_mapping[k] != TSK_NULL) { - node_map[k] = other_node_mapping[k]; - } else { - ret = tsk_table_collection_add_and_remap_node(self, other, k, individual_map, - population_map, node_map, add_populations); - if (ret < 0) { - goto out; - } - } - } - - /* Now we know the full individual map we can remap the parents of the new - * individuals*/ - for (k = (tsk_id_t) self->individuals.parents_offset[num_individuals_self]; - k < (tsk_id_t) self->individuals.parents_length; k++) { - if (self->individuals.parents[k] != TSK_NULL) { - self->individuals.parents[k] = individual_map[self->individuals.parents[k]]; - } - } - - // edges - for (k = 0; k < (tsk_id_t) other->edges.num_rows; k++) { - tsk_edge_table_get_row_unsafe(&other->edges, k, &edge); - if ((other_node_mapping[edge.parent] == TSK_NULL) - || (other_node_mapping[edge.child] == TSK_NULL)) { - new_parent = node_map[edge.parent]; - new_child = node_map[edge.child]; - ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, - new_parent, new_child, edge.metadata, edge.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - } - - // mutations and sites - i = 0; - for (k = 0; k < (tsk_id_t) other->sites.num_rows; k++) { - tsk_site_table_get_row_unsafe(&other->sites, k, &site); - while ((i < (tsk_id_t) other->mutations.num_rows) - && (other->mutations.site[i] == site.id)) { - tsk_mutation_table_get_row_unsafe(&other->mutations, i, &mut); - if (other_node_mapping[mut.node] == TSK_NULL) { - if (site_map[site.id] == TSK_NULL) { - ret_id = tsk_site_table_add_row(&self->sites, site.position, - site.ancestral_state, site.ancestral_state_length, site.metadata, - site.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - site_map[site.id] = ret_id; - } - // the parents will be recomputed later - new_parent = TSK_NULL; - ret_id = tsk_mutation_table_add_row(&self->mutations, site_map[site.id], - node_map[mut.node], new_parent, mut.time, mut.derived_state, - mut.derived_state_length, mut.metadata, mut.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - i++; - } - } - - /* TODO: Union of the Migrations Table. The only hindrance to performing the - * union operation on Migrations Tables is that tsk_table_collection_sort - * does not sort migrations by time, and instead throws an error. */ - if (self->migrations.num_rows != 0 || other->migrations.num_rows != 0) { - ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; - goto out; - } - - // sorting, deduplicating, and computing parents - ret = tsk_table_collection_sort(self, 0, 0); - if (ret < 0) { - goto out; - } - - ret = tsk_table_collection_deduplicate_sites(self, 0); - if (ret < 0) { - goto out; - } - - // need to sort again since after deduplicating sites, mutations - // may not be sorted by time within sites - ret = tsk_table_collection_sort(self, 0, 0); - if (ret < 0) { - goto out; - } - - ret = tsk_table_collection_build_index(self, 0); - if (ret < 0) { - goto out; - } - - ret = tsk_table_collection_compute_mutation_parents(self, 0); - if (ret < 0) { - goto out; - } - -out: - tsk_safe_free(node_map); - tsk_safe_free(individual_map); - tsk_safe_free(population_map); - tsk_safe_free(site_map); - return ret; -} - -static int -cmp_edge_cl(const void *a, const void *b) -{ - const tsk_edge_t *ia = (const tsk_edge_t *) a; - const tsk_edge_t *ib = (const tsk_edge_t *) b; - int ret = (ia->parent > ib->parent) - (ia->parent < ib->parent); - if (ret == 0) { - ret = (ia->child > ib->child) - (ia->child < ib->child); - if (ret == 0) { - ret = (ia->left > ib->left) - (ia->left < ib->left); - } - } - return ret; -} - -/* Squash the edges in the specified array in place. The output edges will - * be sorted by (child_id, left). - */ - -int TSK_WARN_UNUSED -tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output_edges) -{ - int ret = 0; - tsk_size_t j, k, l; - - if (num_edges < 2) { - *num_output_edges = num_edges; - return ret; - } - - qsort(edges, (size_t) num_edges, sizeof(tsk_edge_t), cmp_edge_cl); - j = 0; - l = 0; - for (k = 1; k < num_edges; k++) { - if (edges[k - 1].metadata_length > 0) { - ret = TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA; - goto out; - } - - /* Check for overlapping edges. */ - if (edges[k - 1].parent == edges[k].parent - && edges[k - 1].child == edges[k].child - && edges[k - 1].right > edges[k].left) { - ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; - goto out; - } - - /* Add squashed edge. */ - if (edges[k - 1].parent != edges[k].parent || edges[k - 1].right != edges[k].left - || edges[j].child != edges[k].child) { - - edges[l].left = edges[j].left; - edges[l].right = edges[k - 1].right; - edges[l].parent = edges[j].parent; - edges[l].child = edges[j].child; - - j = k; - l++; - } - } - edges[l].left = edges[j].left; - edges[l].right = edges[k - 1].right; - edges[l].parent = edges[j].parent; - edges[l].child = edges[j].child; - - *num_output_edges = (tsk_size_t) l + 1; - -out: - return ret; -} - -/* ======================================================== * - * Tree diff iterator. - * ======================================================== */ - -int TSK_WARN_UNUSED -tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, - tsk_id_t num_trees, tsk_flags_t options) -{ - int ret = 0; - - tsk_bug_assert(tables != NULL); - tsk_memset(self, 0, sizeof(tsk_diff_iter_t)); - self->num_nodes = tables->nodes.num_rows; - self->num_edges = tables->edges.num_rows; - self->tables = tables; - self->insertion_index = 0; - self->removal_index = 0; - self->tree_left = 0; - self->tree_index = -1; - if (num_trees < 0) { - num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - } - self->last_index = num_trees; - - if (options & TSK_INCLUDE_TERMINAL) { - self->last_index = self->last_index + 1; - } - self->edge_list_nodes = tsk_malloc(self->num_edges * sizeof(*self->edge_list_nodes)); - if (self->edge_list_nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -int -tsk_diff_iter_free(tsk_diff_iter_t *self) -{ - tsk_safe_free(self->edge_list_nodes); - return 0; -} - -void -tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out) -{ - fprintf(out, "tree_diff_iterator state\n"); - fprintf(out, "num_edges = %lld\n", (long long) self->num_edges); - fprintf(out, "insertion_index = %lld\n", (long long) self->insertion_index); - fprintf(out, "removal_index = %lld\n", (long long) self->removal_index); - fprintf(out, "tree_left = %f\n", self->tree_left); - fprintf(out, "tree_index = %lld\n", (long long) self->tree_index); -} - -int TSK_WARN_UNUSED -tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, - tsk_edge_list_t *edges_out_ret, tsk_edge_list_t *edges_in_ret) -{ - int ret = 0; - tsk_id_t k; - const double sequence_length = self->tables->sequence_length; - double left = self->tree_left; - double right = sequence_length; - tsk_size_t next_edge_list_node = 0; - tsk_edge_list_node_t *out_head = NULL; - tsk_edge_list_node_t *out_tail = NULL; - tsk_edge_list_node_t *in_head = NULL; - tsk_edge_list_node_t *in_tail = NULL; - tsk_edge_list_node_t *w = NULL; - tsk_edge_list_t edges_out; - tsk_edge_list_t edges_in; - const tsk_edge_table_t *edges = &self->tables->edges; - const tsk_id_t *insertion_order = self->tables->indexes.edge_insertion_order; - const tsk_id_t *removal_order = self->tables->indexes.edge_removal_order; - - tsk_memset(&edges_out, 0, sizeof(edges_out)); - tsk_memset(&edges_in, 0, sizeof(edges_in)); - - if (self->tree_index + 1 < self->last_index) { - /* First we remove the stale records */ - while (self->removal_index < (tsk_id_t) self->num_edges - && left == edges->right[removal_order[self->removal_index]]) { - k = removal_order[self->removal_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (out_head == NULL) { - out_head = w; - out_tail = w; - } else { - out_tail->next = w; - w->prev = out_tail; - out_tail = w; - } - self->removal_index++; - } - edges_out.head = out_head; - edges_out.tail = out_tail; - - /* Now insert the new records */ - while (self->insertion_index < (tsk_id_t) self->num_edges - && left == edges->left[insertion_order[self->insertion_index]]) { - k = insertion_order[self->insertion_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (in_head == NULL) { - in_head = w; - in_tail = w; - } else { - in_tail->next = w; - w->prev = in_tail; - in_tail = w; - } - self->insertion_index++; - } - edges_in.head = in_head; - edges_in.tail = in_tail; - - right = sequence_length; - if (self->insertion_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); - } - if (self->removal_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); - } - self->tree_index++; - ret = TSK_TREE_OK; - } - *edges_out_ret = edges_out; - *edges_in_ret = edges_in; - *ret_left = left; - *ret_right = right; - /* Set the left coordinate for the next tree */ - self->tree_left = right; - return ret; -} diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h deleted file mode 100644 index 38f3096c9..000000000 --- a/subprojects/tskit/tskit/tables.h +++ /dev/null @@ -1,4790 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * Copyright (c) 2017-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file tables.h - * @brief Tskit Tables API. - */ -#ifndef TSK_TABLES_H -#define TSK_TABLES_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -#include - -#include - -/****************************************************************************/ -/* Definitions for the basic objects */ -/****************************************************************************/ - -/** -@brief A single individual defined by a row in the individual table. - -@rst -See the :ref:`data model ` section for the definition of -an individual and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Bitwise flags. */ - tsk_flags_t flags; - /** @brief Spatial location. The number of dimensions is defined by - * ``location_length``. */ - const double *location; - /** @brief Number of spatial dimensions. */ - tsk_size_t location_length; - /** @brief IDs of the parents. The number of parents given by ``parents_length``*/ - tsk_id_t *parents; - /** @brief Number of parents. */ - tsk_size_t parents_length; - /** @brief Metadata. */ - const char *metadata; - /** @brief Size of the metadata in bytes. */ - tsk_size_t metadata_length; - /** @brief An array of the nodes associated with this individual */ - const tsk_id_t *nodes; - /** @brief The number of nodes associated with this individual*/ - tsk_size_t nodes_length; -} tsk_individual_t; - -/** -@brief A single node defined by a row in the node table. - -@rst -See the :ref:`data model ` section for the definition of -a node and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Bitwise flags. */ - tsk_flags_t flags; - /** @brief Time. */ - double time; - /** @brief Population ID. */ - tsk_id_t population; - /** @brief Individual ID. */ - tsk_id_t individual; - /** @brief Metadata. */ - const char *metadata; - /** @brief Size of the metadata in bytes. */ - tsk_size_t metadata_length; -} tsk_node_t; - -/** -@brief A single edge defined by a row in the edge table. - -@rst -See the :ref:`data model ` section for the definition of -an edge and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Parent node ID. */ - tsk_id_t parent; - /** @brief Child node ID. */ - tsk_id_t child; - /** @brief Left coordinate. */ - double left; - /** @brief Right coordinate. */ - double right; - /** @brief Metadata. */ - const char *metadata; - /** @brief Size of the metadata in bytes. */ - tsk_size_t metadata_length; -} tsk_edge_t; - -/** -@brief A single mutation defined by a row in the mutation table. - -@rst -See the :ref:`data model ` section for the definition of -a mutation and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Site ID. */ - tsk_id_t site; - /** @brief Node ID. */ - tsk_id_t node; - /** @brief Parent mutation ID. */ - tsk_id_t parent; - /** @brief Mutation time. */ - double time; - /** @brief Derived state. */ - const char *derived_state; - /** @brief Size of the derived state in bytes. */ - tsk_size_t derived_state_length; - /** @brief Metadata. */ - const char *metadata; - /** @brief Size of the metadata in bytes. */ - tsk_size_t metadata_length; - /** @brief The ID of the edge that this mutation lies on, or TSK_NULL - if there is no corresponding edge.*/ - tsk_id_t edge; -} tsk_mutation_t; - -/** -@brief A single site defined by a row in the site table. - -@rst -See the :ref:`data model ` section for the definition of -a site and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Position coordinate. */ - double position; - /** @brief Ancestral state. */ - const char *ancestral_state; - /** @brief Ancestral state length in bytes. */ - tsk_size_t ancestral_state_length; - /** @brief Metadata. */ - const char *metadata; - /** @brief Metadata length in bytes. */ - tsk_size_t metadata_length; - /** @brief An array of this site's mutations */ - const tsk_mutation_t *mutations; - /** @brief The number of mutations at this site */ - tsk_size_t mutations_length; -} tsk_site_t; - -/** -@brief A single migration defined by a row in the migration table. - -@rst -See the :ref:`data model ` section for the definition of -a migration and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Source population ID. */ - tsk_id_t source; - /** @brief Destination population ID. */ - tsk_id_t dest; - /** @brief Node ID. */ - tsk_id_t node; - /** @brief Left coordinate. */ - double left; - /** @brief Right coordinate. */ - double right; - /** @brief Time. */ - double time; - /** @brief Metadata. */ - const char *metadata; - /** @brief Size of the metadata in bytes. */ - tsk_size_t metadata_length; - -} tsk_migration_t; - -/** -@brief A single population defined by a row in the population table. - -@rst -See the :ref:`data model ` section for the definition of -a population and its properties. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief Metadata. */ - const char *metadata; - /** @brief Metadata length in bytes. */ - tsk_size_t metadata_length; -} tsk_population_t; - -/** -@brief A single provenance defined by a row in the provenance table. - -@rst -See the :ref:`data model ` section for the definition of -a provenance object and its properties. See the :ref:`sec_provenance` section -for more information on how provenance records should be structured. -@endrst -*/ -typedef struct { - /** @brief Non-negative ID value corresponding to table row. */ - tsk_id_t id; - /** @brief The timestamp. */ - const char *timestamp; - /** @brief The timestamp length in bytes. */ - tsk_size_t timestamp_length; - /** @brief The record. */ - const char *record; - /** @brief The record length in bytes. */ - tsk_size_t record_length; -} tsk_provenance_t; - -/****************************************************************************/ -/* Table definitions */ -/****************************************************************************/ - -/** -@brief The individual table. - -@rst -See the individual :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - /** @brief The total length of the location column. */ - tsk_size_t location_length; - tsk_size_t max_location_length; - tsk_size_t max_location_length_increment; - /** @brief The total length of the parent column. */ - tsk_size_t parents_length; - tsk_size_t max_parents_length; - tsk_size_t max_parents_length_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The flags column. */ - tsk_flags_t *flags; - /** @brief The location column. */ - double *location; - /** @brief The location_offset column. */ - tsk_size_t *location_offset; - /** @brief The parents column. */ - tsk_id_t *parents; - /** @brief The parents_offset column. */ - tsk_size_t *parents_offset; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; -} tsk_individual_table_t; - -/** -@brief The node table. - -@rst -See the node :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The flags column. */ - tsk_flags_t *flags; - /** @brief The time column. */ - double *time; - /** @brief The population column. */ - tsk_id_t *population; - /** @brief The individual column. */ - tsk_id_t *individual; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; -} tsk_node_table_t; - -/** -@brief The edge table. - -@rst -See the edge :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The left column. */ - double *left; - /** @brief The right column. */ - double *right; - /** @brief The parent column. */ - tsk_id_t *parent; - /** @brief The child column. */ - tsk_id_t *child; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; - /** @brief Flags for this table */ - tsk_flags_t options; -} tsk_edge_table_t; - -/** -@brief The migration table. - -@rst -See the migration :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The source column. */ - tsk_id_t *source; - /** @brief The dest column. */ - tsk_id_t *dest; - /** @brief The node column. */ - tsk_id_t *node; - /** @brief The left column. */ - double *left; - /** @brief The right column. */ - double *right; - /** @brief The time column. */ - double *time; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; -} tsk_migration_table_t; - -/** -@brief The site table. - -@rst -See the site :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - tsk_size_t ancestral_state_length; - tsk_size_t max_ancestral_state_length; - tsk_size_t max_ancestral_state_length_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The position column. */ - double *position; - /** @brief The ancestral_state column. */ - char *ancestral_state; - /** @brief The ancestral_state_offset column. */ - tsk_size_t *ancestral_state_offset; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; -} tsk_site_table_t; - -/** -@brief The mutation table. - -@rst -See the mutation :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - tsk_size_t derived_state_length; - tsk_size_t max_derived_state_length; - tsk_size_t max_derived_state_length_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The node column. */ - tsk_id_t *node; - /** @brief The site column. */ - tsk_id_t *site; - /** @brief The parent column. */ - tsk_id_t *parent; - /** @brief The time column. */ - double *time; - /** @brief The derived_state column. */ - char *derived_state; - /** @brief The derived_state_offset column. */ - tsk_size_t *derived_state_offset; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; -} tsk_mutation_table_t; - -/** -@brief The population table. - -@rst -See the population :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - /** @brief The total length of the metadata column. */ - tsk_size_t metadata_length; - tsk_size_t max_metadata_length; - tsk_size_t max_metadata_length_increment; - tsk_size_t metadata_schema_length; - /** @brief The metadata column. */ - char *metadata; - /** @brief The metadata_offset column. */ - tsk_size_t *metadata_offset; - /** @brief The metadata schema */ - char *metadata_schema; -} tsk_population_table_t; - -/** -@brief The provenance table. - -@rst -See the provenance :ref:`table definition ` for -details of the columns in this table. -@endrst -*/ -typedef struct { - /** @brief The number of rows in this table. */ - tsk_size_t num_rows; - tsk_size_t max_rows; - tsk_size_t max_rows_increment; - /** @brief The total length of the timestamp column. */ - tsk_size_t timestamp_length; - tsk_size_t max_timestamp_length; - tsk_size_t max_timestamp_length_increment; - /** @brief The total length of the record column. */ - tsk_size_t record_length; - tsk_size_t max_record_length; - tsk_size_t max_record_length_increment; - /** @brief The timestamp column. */ - char *timestamp; - /** @brief The timestamp_offset column. */ - tsk_size_t *timestamp_offset; - /** @brief The record column. */ - char *record; - /** @brief The record_offset column. */ - tsk_size_t *record_offset; -} tsk_provenance_table_t; - -typedef struct { - char *data; - tsk_size_t data_length; - char *url; - tsk_size_t url_length; - char *metadata; - tsk_size_t metadata_length; - char *metadata_schema; - tsk_size_t metadata_schema_length; -} tsk_reference_sequence_t; - -/** -@brief A collection of tables defining the data for a tree sequence. -*/ -typedef struct { - /** @brief The sequence length defining the tree sequence's coordinate space */ - double sequence_length; - char *file_uuid; - /** @brief The units of the time dimension */ - char *time_units; - tsk_size_t time_units_length; - /** @brief The tree-sequence metadata */ - char *metadata; - tsk_size_t metadata_length; - /** @brief The metadata schema */ - char *metadata_schema; - tsk_size_t metadata_schema_length; - tsk_reference_sequence_t reference_sequence; - /** @brief The individual table */ - tsk_individual_table_t individuals; - /** @brief The node table */ - tsk_node_table_t nodes; - /** @brief The edge table */ - tsk_edge_table_t edges; - /** @brief The migration table */ - tsk_migration_table_t migrations; - /** @brief The site table */ - tsk_site_table_t sites; - /** @brief The mutation table */ - tsk_mutation_table_t mutations; - /** @brief The population table */ - tsk_population_table_t populations; - /** @brief The provenance table */ - tsk_provenance_table_t provenances; - struct { - tsk_id_t *edge_insertion_order; - tsk_id_t *edge_removal_order; - tsk_size_t num_edges; - } indexes; -} tsk_table_collection_t; - -/** -@brief A bookmark recording the position of all the tables in a table collection. -*/ -typedef struct { - /** @brief The position in the individual table. */ - tsk_size_t individuals; - /** @brief The position in the node table. */ - tsk_size_t nodes; - /** @brief The position in the edge table. */ - tsk_size_t edges; - /** @brief The position in the migration table. */ - tsk_size_t migrations; - /** @brief The position in the site table. */ - tsk_size_t sites; - /** @brief The position in the mutation table. */ - tsk_size_t mutations; - /** @brief The position in the population table. */ - tsk_size_t populations; - /** @brief The position in the provenance table. */ - tsk_size_t provenances; -} tsk_bookmark_t; - -/** -@brief Low-level table sorting method. -*/ -typedef struct _tsk_table_sorter_t { - /** @brief The input tables that are being sorted. */ - tsk_table_collection_t *tables; - /** @brief The edge sorting function. If set to NULL, edges are not sorted. */ - int (*sort_edges)(struct _tsk_table_sorter_t *self, tsk_size_t start); - /** @brief The mutation sorting function. */ - int (*sort_mutations)(struct _tsk_table_sorter_t *self); - /** @brief The individual sorting function. */ - int (*sort_individuals)(struct _tsk_table_sorter_t *self); - /** @brief An opaque pointer for use by client code */ - void *user_data; - /** @brief Mapping from input site IDs to output site IDs */ - tsk_id_t *site_id_map; -} tsk_table_sorter_t; - -/* Structs for IBD finding. - * TODO: document properly - * */ - -/* Note for tskit developers: it's perhaps a bit confusing/pointless to - * have the tsk_identity_segment_t struct as well as the internal tsk_segment_t - * struct (which is identical). However, we may want to implement either - * segment type differently in future, and since the tsk_identity_segment_t - * is part of the public API we want to allow the freedom for the different - * structures to evolve over time */ -typedef struct _tsk_identity_segment_t { - double left; - double right; - struct _tsk_identity_segment_t *next; - tsk_id_t node; -} tsk_identity_segment_t; - -typedef struct { - tsk_size_t num_segments; - double total_span; - tsk_identity_segment_t *head; - tsk_identity_segment_t *tail; -} tsk_identity_segment_list_t; - -typedef struct { - tsk_size_t num_nodes; - tsk_avl_tree_int_t pair_map; - tsk_size_t num_segments; - double total_span; - tsk_blkalloc_t heap; - bool store_segments; - bool store_pairs; -} tsk_identity_segments_t; - -/* Diff iterator. */ -typedef struct _tsk_edge_list_node_t { - tsk_edge_t edge; - struct _tsk_edge_list_node_t *next; - struct _tsk_edge_list_node_t *prev; -} tsk_edge_list_node_t; - -typedef struct { - tsk_edge_list_node_t *head; - tsk_edge_list_node_t *tail; -} tsk_edge_list_t; - -typedef struct { - tsk_size_t num_nodes; - tsk_size_t num_edges; - double tree_left; - const tsk_table_collection_t *tables; - tsk_id_t insertion_index; - tsk_id_t removal_index; - tsk_id_t tree_index; - tsk_id_t last_index; - tsk_edge_list_node_t *edge_list_nodes; -} tsk_diff_iter_t; - -/****************************************************************************/ -/* Common function options */ -/****************************************************************************/ - -/** -@defgroup API_FLAGS_SIMPLIFY_GROUP :c:func:`tsk_table_collection_simplify` and -:c:func:`tsk_treeseq_simplify` specific flags. -@{ -*/ -/** Remove sites from the output if there are no mutations that reference them.*/ -#define TSK_SIMPLIFY_FILTER_SITES (1 << 0) -/** Remove populations from the output if there are no nodes or migrations that -reference them. */ -#define TSK_SIMPLIFY_FILTER_POPULATIONS (1 << 1) -/** Remove individuals from the output if there are no nodes that reference them.*/ -#define TSK_SIMPLIFY_FILTER_INDIVIDUALS (1 << 2) -/** Do not remove nodes from the output if there are no edges that reference -them and do not reorder nodes so that the samples are nodes 0 to num_samples - 1. -Note that this flag is negated compared to other filtering options because -the default behaviour is to filter unreferenced nodes and reorder to put samples -first. -*/ -#define TSK_SIMPLIFY_NO_FILTER_NODES (1 << 7) -/** -Do not update the sample status of nodes as a result of simplification. -*/ -#define TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS (1 << 8) -/** -Reduce the topological information in the tables to the minimum necessary to -represent the trees that contain sites. If there are zero sites this will -result in an zero output edges. When the number of sites is greater than zero, -every tree in the output tree sequence will contain at least one site. -For a given site, the topology of the tree containing that site will be -identical (up to node ID remapping) to the topology of the corresponding tree -in the input. -*/ -#define TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY (1 << 3) -/** -By default simplify removes unary nodes (i.e., nodes with exactly one child) -along the path from samples to root. If this option is specified such unary -nodes will be preserved in the output. -*/ -#define TSK_SIMPLIFY_KEEP_UNARY (1 << 4) -/** -By default simplify removes all topology ancestral the MRCAs of the samples. -This option inserts edges from these MRCAs back to the roots of the input -trees. -*/ -#define TSK_SIMPLIFY_KEEP_INPUT_ROOTS (1 << 5) -/** -@rst -This acts like :c:macro:`TSK_SIMPLIFY_KEEP_UNARY` (and is mutually exclusive with that -flag). It keeps unary nodes, but only if the unary node is referenced from an individual. -@endrst -*/ -#define TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS (1 << 6) -/** @} */ - -/** -@defgroup API_FLAGS_SUBSET_GROUP :c:func:`tsk_table_collection_subset` specific flags. -@{ -*/ -/**If this flag is provided, the population table will not be changed in any way.*/ -#define TSK_SUBSET_NO_CHANGE_POPULATIONS (1 << 0) -/** -@rst -If this flag is provided, then unreferenced sites, individuals, and populations -will not be removed. If so, the site and individual tables will not be changed, -and (unless :c:macro:`TSK_SUBSET_NO_CHANGE_POPULATIONS` is also provided) unreferenced -populations will be placed last, in their original order. -@endrst -*/ -#define TSK_SUBSET_KEEP_UNREFERENCED (1 << 1) -/** @} */ - -/** -@defgroup API_FLAGS_CHECK_INTEGRITY_GROUP :c:func:`tsk_table_collection_check_integrity` -specific flags. -@{ -*/ -/** Check edge ordering constraints for a tree sequence. */ -#define TSK_CHECK_EDGE_ORDERING (1 << 0) -/** Check that sites are in non-decreasing position order. */ -#define TSK_CHECK_SITE_ORDERING (1 << 1) -/**Check for any duplicate site positions. */ -#define TSK_CHECK_SITE_DUPLICATES (1 << 2) -/** -Check constraints on the ordering of mutations. Any non-null -mutation parents and known times are checked for ordering -constraints. -*/ -#define TSK_CHECK_MUTATION_ORDERING (1 << 3) -/**Check individual parents are before children, where specified. */ -#define TSK_CHECK_INDIVIDUAL_ORDERING (1 << 4) -/**Check migrations are ordered by time. */ -#define TSK_CHECK_MIGRATION_ORDERING (1 << 5) -/**Check that the table indexes exist, and contain valid edge references. */ -#define TSK_CHECK_INDEXES (1 << 6) -/** -All checks needed to define a valid tree sequence. Note that -this implies all of the above checks. -*/ -#define TSK_CHECK_TREES (1 << 7) - -/* Leave room for more positive check flags */ -/** -Do not check integrity of references to populations. This -can be safely combined with the other checks. -*/ -#define TSK_NO_CHECK_POPULATION_REFS (1 << 12) -/** @} */ - -/** -@defgroup API_FLAGS_LOAD_INIT_GROUP Flags used by load and init methods. -@{ -*/ -/* These flags are for table collection load or init, or used as - flags on table collection or individual tables. - * As flags are passed though from load to init they share a namespace */ -/** Skip reading tables, and only load top-level information. */ -#define TSK_LOAD_SKIP_TABLES (1 << 0) -/** Do not load reference sequence. */ -#define TSK_LOAD_SKIP_REFERENCE_SEQUENCE (1 << 1) -/** -@rst -Do not allocate space to store metadata in this table. Operations -attempting to add non-empty metadata to the table will fail -with error TSK_ERR_METADATA_DISABLED. -@endrst -*/ -#define TSK_TABLE_NO_METADATA (1 << 2) -/** -@rst -Do not allocate space to store metadata in the edge table. Operations -attempting to add non-empty metadata to the edge table will fail -with error TSK_ERR_METADATA_DISABLED. -@endrst -*/ -#define TSK_TC_NO_EDGE_METADATA (1 << 3) -/** @} */ - -/* Flags for dump tables */ -/* We may not want to document this flag, but it's useful for testing - * so we put it high up in the bit space, below the common options */ -#define TSK_DUMP_FORCE_OFFSET_64 (1 << 27) - -/** -@defgroup API_FLAGS_COPY_GROUP Flags used by :c:func:`tsk_table_collection_copy`. -@{ -*/ -/** Copy the file uuid, by default this is not copied. */ -#define TSK_COPY_FILE_UUID (1 << 0) -/** @} */ - -/** -@defgroup API_FLAGS_UNION_GROUP Flags used by :c:func:`tsk_table_collection_union`. -@{ -*/ -/** -By default, union checks that the portion of shared history between -``self`` and ``other``, as implied by ``other_node_mapping``, are indeed -equivalent. It does so by subsetting both ``self`` and ``other`` on the -equivalent nodes specified in ``other_node_mapping``, and then checking for -equality of the subsets. -*/ -#define TSK_UNION_NO_CHECK_SHARED (1 << 0) -/** - By default, all nodes new to ``self`` are assigned new populations. If this -option is specified, nodes that are added to ``self`` will retain the -population IDs they have in ``other``. - */ -#define TSK_UNION_NO_ADD_POP (1 << 1) -/** @} */ - -/** -@defgroup API_FLAGS_CMP_GROUP Flags used by :c:func:`tsk_table_collection_equals`. -@{ -*/ -/** -Do not include the top-level tree sequence metadata and metadata schemas -in the comparison. -*/ -#define TSK_CMP_IGNORE_TS_METADATA (1 << 0) -/** Do not include the provenance table in comparison. */ -#define TSK_CMP_IGNORE_PROVENANCE (1 << 1) -/** -@rst -Do not include metadata when comparing the table collections. -This includes both the top-level tree sequence metadata as well as the -metadata for each of the tables (i.e, :c:macro:`TSK_CMP_IGNORE_TS_METADATA` is implied). -All metadata schemas are also ignored. -@endrst -*/ -#define TSK_CMP_IGNORE_METADATA (1 << 2) -/** -@rst -Do not include the timestamp information when comparing the provenance -tables. This has no effect if :c:macro:`TSK_CMP_IGNORE_PROVENANCE` is specified. -@endrst -*/ -#define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3) -/** -Do not include any tables in the comparison, thus comparing only the -top-level information of the table collections being compared. -*/ -#define TSK_CMP_IGNORE_TABLES (1 << 4) -/** Do not include the reference sequence in the comparison. */ -#define TSK_CMP_IGNORE_REFERENCE_SEQUENCE (1 << 5) -/** @} */ - -/** -@defgroup API_FLAGS_CLEAR_GROUP Flags used by :c:func:`tsk_table_collection_clear`. -@{ -*/ -/** Additionally clear the table metadata schemas*/ -#define TSK_CLEAR_METADATA_SCHEMAS (1 << 0) -/** Additionally clear the tree-sequence metadata and schema*/ -#define TSK_CLEAR_TS_METADATA_AND_SCHEMA (1 << 1) -/** Additionally clear the provenance table*/ -#define TSK_CLEAR_PROVENANCE (1 << 2) -/** @} */ - -/* For the edge diff iterator */ -#define TSK_INCLUDE_TERMINAL (1 << 0) - -/** @brief Value returned by seeking methods when they have successfully - seeked to a non-null tree. - - @ingroup TREE_API_SEEKING_GROUP -*/ -#define TSK_TREE_OK 1 - -/****************************************************************************/ -/* Function signatures */ -/****************************************************************************/ - -/** -@defgroup INDIVIDUAL_TABLE_API_GROUP Individual table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_individual_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_init(tsk_individual_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_individual_table_t object. -@return Always returns 0. -*/ -int tsk_individual_table_free(tsk_individual_table_t *self); - -/** -@brief Adds a row to this individual table. - -@rst -Add a new individual with the specified ``flags``, ``location``, ``parents`` and -``metadata`` to the table. Copies of the ``location``, ``parents`` and ``metadata`` -parameters are taken immediately. See the :ref:`table definition -` for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param flags The bitwise flags for the new individual. -@param location A pointer to a double array representing the spatial location - of the new individual. Can be ``NULL`` if ``location_length`` is 0. -@param location_length The number of dimensions in the locations position. - Note this the number of elements in the corresponding double array - not the number of bytes. -@param parents A pointer to a ``tsk_id`` array representing the parents - of the new individual. Can be ``NULL`` if ``parents_length`` is 0. -@param parents_length The number of parents. - Note this the number of elements in the corresponding ``tsk_id`` array - not the number of bytes. -@param metadata The metadata to be associated with the new individual. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return the ID of the newly added individual on success, - or a negative value on failure. -*/ -tsk_id_t tsk_individual_table_add_row(tsk_individual_table_t *self, tsk_flags_t flags, - const double *location, tsk_size_t location_length, const tsk_id_t *parents, - tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. Copies of the ``location``, ``parents`` and ``metadata`` -parameters are taken immediately. See the :ref:`table definition -` for details of the columns in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param index The row to update. -@param flags The bitwise flags for the individual. -@param location A pointer to a double array representing the spatial location - of the new individual. Can be ``NULL`` if ``location_length`` is 0. -@param location_length The number of dimensions in the locations position. - Note this the number of elements in the corresponding double array - not the number of bytes. -@param parents A pointer to a ``tsk_id`` array representing the parents - of the new individual. Can be ``NULL`` if ``parents_length`` is 0. -@param parents_length The number of parents. - Note this the number of elements in the corresponding ``tsk_id`` array - not the number of bytes. -@param metadata The metadata to be associated with the new individual. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_update_row(tsk_individual_table_t *self, tsk_id_t index, - tsk_flags_t flags, const double *location, tsk_size_t location_length, - const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_individual_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_clear(tsk_individual_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_individual_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_truncate(tsk_individual_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is -and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_individual_table_t object where rows are to be added. -@param other A pointer to a tsk_individual_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_extend(tsk_individual_table_t *self, - const tsk_individual_table_t *other, tsk_size_t num_rows, - const tsk_id_t *row_indexes, tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -The values in the ``parents`` column are updated according to this map, so that -reference integrity within the table is maintained. As a consequence of this, -the values in the ``parents`` column for kept rows are bounds-checked and an -error raised if they are not valid. Rows that are deleted are not checked for -parent ID integrity. - -If an attempt is made to delete rows that are referred to by the ``parents`` -column of rows that are retained, an error is raised. - -These error conditions are checked before any alterations to the table are -made. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_keep_rows(tsk_individual_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param other A pointer to a tsk_individual_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_individual_table_equals(const tsk_individual_table_t *self, - const tsk_individual_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. - -Indexes that are present are also copied to the destination table. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param dest A pointer to a tsk_individual_table_t object. If the TSK_NO_INIT -option is specified, this must be an initialised individual table. If not, it must be an -uninitialised individual table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_copy(const tsk_individual_table_t *self, - tsk_individual_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified individual struct to reflect the values in the specified row. -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_individual_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_get_row( - const tsk_individual_table_t *self, tsk_id_t index, tsk_individual_t *row); - -/** -@brief Set the metadata schema - -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_set_metadata_schema(tsk_individual_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_individual_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_individual_table_print_state(const tsk_individual_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param flags The array of tsk_flag_t flag values to be copied. -@param location The array of double location values to be copied. -@param location_offset The array of tsk_size_t location offset values to be copied. -@param parents The array of tsk_id_t parent values to be copied. -@param parents_offset The array of tsk_size_t parent offset values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_set_columns(tsk_individual_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset, - const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata, - const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param num_rows The number of rows to copy from the specifed arrays -@param flags The array of tsk_flag_t flag values to be copied. -@param location The array of double location values to be copied. -@param location_offset The array of tsk_size_t location offset values to be copied. -@param parents The array of tsk_id_t parent values to be copied. -@param parents_offset The array of tsk_size_t parent offset values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_append_columns(tsk_individual_table_t *self, - tsk_size_t num_rows, const tsk_flags_t *flags, const double *location, - const tsk_size_t *location_offset, const tsk_id_t *parents, - const tsk_size_t *parents_offset, const char *metadata, - const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_set_max_rows_increment( - tsk_individual_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_set_max_metadata_length_increment( - tsk_individual_table_t *self, tsk_size_t max_metadata_length_increment); - -/** -@brief Controls the pre-allocation strategy for the location column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param max_location_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_set_max_location_length_increment( - tsk_individual_table_t *self, tsk_size_t max_location_length_increment); - -/** -@brief Controls the pre-allocation strategy for the parents column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param max_parents_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_individual_table_set_max_parents_length_increment( - tsk_individual_table_t *self, tsk_size_t max_parents_length_increment); - -/** @} */ - -/* Undocumented methods */ - -int tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out); -/** -@defgroup NODE_TABLE_API_GROUP Node table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_node_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_init(tsk_node_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_node_table_t object. -@return Always returns 0. -*/ -int tsk_node_table_free(tsk_node_table_t *self); - -/** -@brief Adds a row to this node table. - -@rst -Add a new node with the specified ``flags``, ``time``, ``population``, -``individual`` and ``metadata`` to the table. A copy of the ``metadata`` parameter -is taken immediately. See the :ref:`table definition ` -for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param flags The bitwise flags for the new node. -@param time The time for the new node. -@param population The population for the new node. Set to TSK_NULL if not -known. -@param individual The individual for the new node. Set to TSK_NULL if not -known. -@param metadata The metadata to be associated with the new node. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return the ID of the newly added node on success, - or a negative value on failure. -*/ -tsk_id_t tsk_node_table_add_row(tsk_node_table_t *self, tsk_flags_t flags, double time, - tsk_id_t population, tsk_id_t individual, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. A copy of the ``metadata`` parameter is taken immediately. See the -:ref:`table definition ` for details of the columns -in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param index The row to update. -@param flags The bitwise flags for the node. -@param time The time for the node. -@param population The population for the node. Set to TSK_NULL if not known. -@param individual The individual for the node. Set to TSK_NULL if not known. -@param metadata The metadata to be associated with the node. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_update_row(tsk_node_table_t *self, tsk_id_t index, tsk_flags_t flags, - double time, tsk_id_t population, tsk_id_t individual, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_node_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_clear(tsk_node_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_node_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_truncate(tsk_node_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is -and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_node_table_t object where rows are to be added. -@param other A pointer to a tsk_node_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_extend(tsk_node_table_t *self, const tsk_node_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_keep_rows(tsk_node_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param other A pointer to a tsk_node_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_node_table_equals( - const tsk_node_table_t *self, const tsk_node_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the TSK_NO_INIT option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param dest A pointer to a tsk_node_table_t object. If the TSK_NO_INIT option - is specified, this must be an initialised node table. If not, it must - be an uninitialised node table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_copy( - const tsk_node_table_t *self, tsk_node_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified node struct to reflect the values in the specified row. -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_node_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_get_row( - const tsk_node_table_t *self, tsk_id_t index, tsk_node_t *row); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst -@param self A pointer to a tsk_node_table_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_set_metadata_schema(tsk_node_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_node_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_node_table_print_state(const tsk_node_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param flags The array of tsk_flag_t values to be copied. -@param time The array of double time values to be copied. -@param population The array of tsk_id_t population values to be copied. -@param individual The array of tsk_id_t individual values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_set_columns(tsk_node_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *time, const tsk_id_t *population, - const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param num_rows The number of rows to copy from the specifed arrays -@param flags The array of tsk_flag_t values to be copied. -@param time The array of double time values to be copied. -@param population The array of tsk_id_t population values to be copied. -@param individual The array of tsk_id_t individual values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_append_columns(tsk_node_table_t *self, tsk_size_t num_rows, - const tsk_flags_t *flags, const double *time, const tsk_id_t *population, - const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ - -int tsk_node_table_set_max_rows_increment( - tsk_node_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_node_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_node_table_set_max_metadata_length_increment( - tsk_node_table_t *self, tsk_size_t max_metadata_length_increment); - -/** @} */ - -/* Undocumented methods */ - -int tsk_node_table_dump_text(const tsk_node_table_t *self, FILE *out); - -/** -@defgroup EDGE_TABLE_API_GROUP Edge table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. - -**Options** - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_TABLE_NO_METADATA` -@endrst - -@param self A pointer to an uninitialised tsk_edge_table_t object. -@param options Allocation time options. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_init(tsk_edge_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_edge_table_t object. -@return Always returns 0. -*/ -int tsk_edge_table_free(tsk_edge_table_t *self); - -/** -@brief Adds a row to this edge table. - -@rst -Add a new edge with the specified ``left``, ``right``, ``parent``, ``child`` and -``metadata`` to the table. See the :ref:`table definition ` -for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param left The left coordinate for the new edge. -@param right The right coordinate for the new edge. -@param parent The parent node for the new edge. -@param child The child node for the new edge. -@param metadata The metadata to be associated with the new edge. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. - -@return Return the ID of the newly added edge on success, - or a negative value on failure. -*/ -tsk_id_t tsk_edge_table_add_row(tsk_edge_table_t *self, double left, double right, - tsk_id_t parent, tsk_id_t child, const char *metadata, tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. A copy of the ``metadata`` parameter is taken immediately. See the -:ref:`table definition ` for details of the columns -in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param index The row to update. -@param left The left coordinate for the edge. -@param right The right coordinate for the edge. -@param parent The parent node for the edge. -@param child The child node for the edge. -@param metadata The metadata to be associated with the edge. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_update_row(tsk_edge_table_t *self, tsk_id_t index, double left, - double right, tsk_id_t parent, tsk_id_t child, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_edge_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_clear(tsk_edge_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_edge_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_truncate(tsk_edge_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is ``NULL``, -append the first ``num_rows`` from ``other`` to this table. Note that metadata is copied -as-is and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_edge_table_t object where rows are to be added. -@param other A pointer to a tsk_edge_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_extend(tsk_edge_table_t *self, const tsk_edge_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_keep_rows(tsk_edge_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param other A pointer to a tsk_edge_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_edge_table_equals( - const tsk_edge_table_t *self, const tsk_edge_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param dest A pointer to a tsk_edge_table_t object. If the TSK_NO_INIT option - is specified, this must be an initialised edge table. If not, it must - be an uninitialised edge table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_copy( - const tsk_edge_table_t *self, tsk_edge_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified edge struct to reflect the values in the specified row. -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_edge_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_get_row( - const tsk_edge_table_t *self, tsk_id_t index, tsk_edge_t *row); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst -@param self A pointer to a tsk_edge_table_t object. -@param metadata_schema A pointer to a char array -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_set_metadata_schema(tsk_edge_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_edge_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_edge_table_print_state(const tsk_edge_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param left The array of double left values to be copied. -@param right The array of double right values to be copied. -@param parent The array of tsk_id_t parent values to be copied. -@param child The array of tsk_id_t child values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_set_columns(tsk_edge_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *parent, - const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param left The array of double left values to be copied. -@param right The array of double right values to be copied. -@param parent The array of tsk_id_t parent values to be copied. -@param child The array of tsk_id_t child values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -*/ -int tsk_edge_table_append_columns(tsk_edge_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *parent, - const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_set_max_rows_increment( - tsk_edge_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_set_max_metadata_length_increment( - tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment); - -/** -@brief Squash adjacent edges in-place - -@rst -Sorts, then condenses the table into the smallest possible number of rows by -combining any adjacent edges. A pair of edges is said to be `adjacent` if -they have the same parent and child nodes, and if the left coordinate of -one of the edges is equal to the right coordinate of the other edge. -This process is performed in-place so that any set of adjacent edges is -replaced by a single edge. The new edge will have the same parent and child -node, a left coordinate equal to the smallest left coordinate in the set, -and a right coordinate equal to the largest right coordinate in the set. -The new edge table will be sorted in the canonical order (P, C, L, R). - -.. note:: - Note that this method will fail if any edges have non-empty metadata. - -@endrst - -@param self A pointer to a tsk_edge_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_edge_table_squash(tsk_edge_table_t *self); - -/** @} */ - -/* Undocumented methods */ - -int tsk_edge_table_dump_text(const tsk_edge_table_t *self, FILE *out); - -/** -@defgroup MIGRATION_TABLE_API_GROUP Migration table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_migration_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_init(tsk_migration_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_migration_table_t object. -@return Always returns 0. -*/ -int tsk_migration_table_free(tsk_migration_table_t *self); - -/** -@brief Adds a row to this migration table. - -@rst -Add a new migration with the specified ``left``, ``right``, ``node``, -``source``, ``dest``, ``time`` and ``metadata`` to the table. -See the :ref:`table definition ` -for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param left The left coordinate for the new migration. -@param right The right coordinate for the new migration. -@param node The node ID for the new migration. -@param source The source population ID for the new migration. -@param dest The destination population ID for the new migration. -@param time The time for the new migration. -@param metadata The metadata to be associated with the new migration. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. - -@return Return the ID of the newly added migration on success, - or a negative value on failure. -*/ -tsk_id_t tsk_migration_table_add_row(tsk_migration_table_t *self, double left, - double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time, - const char *metadata, tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. A copy of the ``metadata`` parameter is taken immediately. See the -:ref:`table definition ` for details of the columns -in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param index The row to update. -@param left The left coordinate for the migration. -@param right The right coordinate for the migration. -@param node The node ID for the migration. -@param source The source population ID for the migration. -@param dest The destination population ID for the migration. -@param time The time for the migration. -@param metadata The metadata to be associated with the migration. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_update_row(tsk_migration_table_t *self, tsk_id_t index, - double left, double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, - double time, const char *metadata, tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_migration_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_clear(tsk_migration_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_migration_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_truncate(tsk_migration_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is -and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_migration_table_t object where rows are to be added. -@param other A pointer to a tsk_migration_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ - -int tsk_migration_table_extend(tsk_migration_table_t *self, - const tsk_migration_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, - tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_keep_rows(tsk_migration_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param other A pointer to a tsk_migration_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_migration_table_equals(const tsk_migration_table_t *self, - const tsk_migration_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param dest A pointer to a tsk_migration_table_t object. If the TSK_NO_INIT -option is specified, this must be an initialised migration table. If not, it must be an -uninitialised migration table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_copy( - const tsk_migration_table_t *self, tsk_migration_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified migration struct to reflect the values in the specified row. -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_migration_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_get_row( - const tsk_migration_table_t *self, tsk_id_t index, tsk_migration_t *row); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst -@param self A pointer to a tsk_migration_table_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_set_metadata_schema(tsk_migration_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_migration_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_migration_table_print_state(const tsk_migration_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param left The array of double left values to be copied. -@param right The array of double right values to be copied. -@param node The array of tsk_id_t node values to be copied. -@param source The array of tsk_id_t source values to be copied. -@param dest The array of tsk_id_t dest values to be copied. -@param time The array of double time values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_set_columns(tsk_migration_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *node, - const tsk_id_t *source, const tsk_id_t *dest, const double *time, - const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param num_rows The number of rows to copy from the specifed arrays -@param left The array of double left values to be copied. -@param right The array of double right values to be copied. -@param node The array of tsk_id_t node values to be copied. -@param source The array of tsk_id_t source values to be copied. -@param dest The array of tsk_id_t dest values to be copied. -@param time The array of double time values to be copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_append_columns(tsk_migration_table_t *self, tsk_size_t num_rows, - const double *left, const double *right, const tsk_id_t *node, - const tsk_id_t *source, const tsk_id_t *dest, const double *time, - const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_set_max_rows_increment( - tsk_migration_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_migration_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_migration_table_set_max_metadata_length_increment( - tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment); - -/** @} */ - -/* Undocumented methods */ - -int tsk_migration_table_dump_text(const tsk_migration_table_t *self, FILE *out); - -/** -@defgroup SITE_TABLE_API_GROUP Site table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_site_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_init(tsk_site_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_site_table_t object. -@return Always returns 0. -*/ -int tsk_site_table_free(tsk_site_table_t *self); - -/** -@brief Adds a row to this site table. - -@rst -Add a new site with the specified ``position``, ``ancestral_state`` -and ``metadata`` to the table. Copies of ``ancestral_state`` and ``metadata`` -are immediately taken. See the :ref:`table definition ` -for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param position The position coordinate for the new site. -@param ancestral_state The ancestral_state for the new site. -@param ancestral_state_length The length of the ancestral_state in bytes. -@param metadata The metadata to be associated with the new site. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return the ID of the newly added site on success, - or a negative value on failure. -*/ -tsk_id_t tsk_site_table_add_row(tsk_site_table_t *self, double position, - const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. Copies of the ``ancestral_state`` and ``metadata`` parameters are taken -immediately. See the :ref:`table definition ` for -details of the columns in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param index The row to update. -@param position The position coordinate for the site. -@param ancestral_state The ancestral_state for the site. -@param ancestral_state_length The length of the ancestral_state in bytes. -@param metadata The metadata to be associated with the site. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_update_row(tsk_site_table_t *self, tsk_id_t index, double position, - const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_site_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_clear(tsk_site_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_site_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_truncate(tsk_site_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is -and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_site_table_t object where rows are to be added. -@param other A pointer to a tsk_site_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_extend(tsk_site_table_t *self, const tsk_site_table_t *other, - tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_keep_rows(tsk_site_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param other A pointer to a tsk_site_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_site_table_equals( - const tsk_site_table_t *self, const tsk_site_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param dest A pointer to a tsk_site_table_t object. If the TSK_NO_INIT option - is specified, this must be an initialised site table. If not, it must - be an uninitialised site table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_copy( - const tsk_site_table_t *self, tsk_site_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified site struct to reflect the values in the specified row. - -This function always sets the ``mutations`` and ``mutations_length`` -fields in the parameter :c:struct:`tsk_site_t` to ``NULL`` and ``0`` respectively. -To get access to the mutations for a particular site, please use the -tree sequence method, :c:func:`tsk_treeseq_get_site`. - -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_site_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_get_row( - const tsk_site_table_t *self, tsk_id_t index, tsk_site_t *row); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst -@param self A pointer to a tsk_site_table_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_set_metadata_schema(tsk_site_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_site_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_site_table_print_state(const tsk_site_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param position The array of double position values to be copied. -@param ancestral_state The array of char ancestral state values to be copied. -@param ancestral_state_offset The array of tsk_size_t ancestral state offset values to be - copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_set_columns(tsk_site_table_t *self, tsk_size_t num_rows, - const double *position, const char *ancestral_state, - const tsk_size_t *ancestral_state_offset, const char *metadata, - const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param position The array of double position values to be copied. -@param ancestral_state The array of char ancestral state values to be copied. -@param ancestral_state_offset The array of tsk_size_t ancestral state offset values to be - copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_append_columns(tsk_site_table_t *self, tsk_size_t num_rows, - const double *position, const char *ancestral_state, - const tsk_size_t *ancestral_state_offset, const char *metadata, - const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_set_max_rows_increment( - tsk_site_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ - -int tsk_site_table_set_max_metadata_length_increment( - tsk_site_table_t *self, tsk_size_t max_metadata_length_increment); - -/** -@brief Controls the pre-allocation strategy for the ancestral_state column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_site_table_t object. -@param max_ancestral_state_length_increment The number of bytes to pre-allocate, or zero -for the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_site_table_set_max_ancestral_state_length_increment( - tsk_site_table_t *self, tsk_size_t max_ancestral_state_length_increment); - -/** @} */ - -/* Undocumented methods */ - -int tsk_site_table_dump_text(const tsk_site_table_t *self, FILE *out); - -/** -@defgroup MUTATION_TABLE_API_GROUP Mutation table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_mutation_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_init(tsk_mutation_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_mutation_table_t object. -@return Always returns 0. -*/ -int tsk_mutation_table_free(tsk_mutation_table_t *self); - -/** -@brief Adds a row to this mutation table. - -@rst -Add a new mutation with the specified ``site``, ``parent``, ``derived_state`` -and ``metadata`` to the table. Copies of ``derived_state`` and ``metadata`` -are immediately taken. See the :ref:`table definition ` -for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param site The site ID for the new mutation. -@param node The ID of the node this mutation occurs over. -@param parent The ID of the parent mutation. -@param time The time of the mutation. -@param derived_state The derived_state for the new mutation. -@param derived_state_length The length of the derived_state in bytes. -@param metadata The metadata to be associated with the new mutation. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return the ID of the newly added mutation on success, - or a negative value on failure. -*/ -tsk_id_t tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, - tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, - tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. Copies of the ``derived_state`` and ``metadata`` parameters are taken -immediately. See the :ref:`table definition ` for -details of the columns in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param index The row to update. -@param site The site ID for the mutation. -@param node The ID of the node this mutation occurs over. -@param parent The ID of the parent mutation. -@param time The time of the mutation. -@param derived_state The derived_state for the mutation. -@param derived_state_length The length of the derived_state in bytes. -@param metadata The metadata to be associated with the mutation. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_update_row(tsk_mutation_table_t *self, tsk_id_t index, - tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, - const char *derived_state, tsk_size_t derived_state_length, const char *metadata, - tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_mutation_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_clear(tsk_mutation_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_mutation_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_truncate(tsk_mutation_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is -and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_mutation_table_t object where rows are to be added. -@param other A pointer to a tsk_mutation_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_extend(tsk_mutation_table_t *self, - const tsk_mutation_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, - tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -The values in the ``parent`` column are updated according to this map, so that -reference integrity within the table is maintained. As a consequence of this, -the values in the ``parent`` column for kept rows are bounds-checked and an -error raised if they are not valid. Rows that are deleted are not checked for -parent ID integrity. - -If an attempt is made to delete rows that are referred to by the ``parent`` -column of rows that are retained, an error is raised. - -These error conditions are checked before any alterations to the table are -made. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_keep_rows(tsk_mutation_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param other A pointer to a tsk_mutation_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_mutation_table_equals(const tsk_mutation_table_t *self, - const tsk_mutation_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param dest A pointer to a tsk_mutation_table_t object. If the TSK_NO_INIT -option is specified, this must be an initialised mutation table. If not, it must be an -uninitialised mutation table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_copy( - const tsk_mutation_table_t *self, tsk_mutation_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified mutation struct to reflect the values in the specified row. - -This function always sets the ``edge`` field in parameter -:c:struct:`tsk_mutation_t` to ``TSK_NULL``. To determine the ID of -the edge associated with a particular mutation, please use the -tree sequence method, :c:func:`tsk_treeseq_get_mutation`. - -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_mutation_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_get_row( - const tsk_mutation_table_t *self, tsk_id_t index, tsk_mutation_t *row); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst -@param self A pointer to a tsk_mutation_table_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_set_metadata_schema(tsk_mutation_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_mutation_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_mutation_table_print_state(const tsk_mutation_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param site The array of tsk_id_t site values to be copied. -@param node The array of tsk_id_t node values to be copied. -@param parent The array of tsk_id_t parent values to be copied. -@param time The array of double time values to be copied. -@param derived_state The array of char derived_state values to be copied. -@param derived_state_offset The array of tsk_size_t derived state offset values to be -copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, - const double *time, const char *derived_state, - const tsk_size_t *derived_state_offset, const char *metadata, - const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param site The array of tsk_id_t site values to be copied. -@param node The array of tsk_id_t node values to be copied. -@param parent The array of tsk_id_t parent values to be copied. -@param time The array of double time values to be copied. -@param derived_state The array of char derived_state values to be copied. -@param derived_state_offset The array of tsk_size_t derived state offset values to be - copied. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, - const double *time, const char *derived_state, - const tsk_size_t *derived_state_offset, const char *metadata, - const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_set_max_rows_increment( - tsk_mutation_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_set_max_metadata_length_increment( - tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment); - -/** -@brief Controls the pre-allocation strategy for the derived_state column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_mutation_table_t object. -@param max_derived_state_length_increment The number of bytes to pre-allocate, or zero -for the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_mutation_table_set_max_derived_state_length_increment( - tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment); - -/** @} */ - -/* Undocumented methods */ - -int tsk_mutation_table_dump_text(const tsk_mutation_table_t *self, FILE *out); - -/** -@defgroup POPULATION_TABLE_API_GROUP Population table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_population_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_init(tsk_population_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_population_table_t object. -@return Always returns 0. -*/ -int tsk_population_table_free(tsk_population_table_t *self); - -/** -@brief Adds a row to this population table. - -@rst -Add a new population with the specified ``metadata`` to the table. A copy of the -``metadata`` is immediately taken. See the :ref:`table definition -` for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param metadata The metadata to be associated with the new population. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return the ID of the newly added population on success, - or a negative value on failure. -*/ -tsk_id_t tsk_population_table_add_row( - tsk_population_table_t *self, const char *metadata, tsk_size_t metadata_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. A copy of the ``metadata`` parameter is taken immediately. See the -:ref:`table definition ` for details of the -columns in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param index The row to update. -@param metadata The metadata to be associated with the population. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. -@param metadata_length The size of the metadata array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_update_row(tsk_population_table_t *self, tsk_id_t index, - const char *metadata, tsk_size_t metadata_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_population_table_free` to free the table's internal resources. Note that the -metadata schema is not cleared. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_clear(tsk_population_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_population_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_truncate(tsk_population_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is -and is not checked for compatibility with any existing schema on this table. -@endrst - -@param self A pointer to a tsk_population_table_t object where rows are to be added. -@param other A pointer to a tsk_population_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_extend(tsk_population_table_t *self, - const tsk_population_table_t *other, tsk_size_t num_rows, - const tsk_id_t *row_indexes, tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_keep_rows(tsk_population_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns, -and their metadata schemas are byte-wise identical. - -- :c:macro:`TSK_CMP_IGNORE_METADATA` - Do not include metadata in the comparison. Note that as metadata is the - only column in the population table, two population tables are considered - equal if they have the same number of rows if this flag is specified. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param other A pointer to a tsk_population_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_population_table_equals(const tsk_population_table_t *self, - const tsk_population_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param dest A pointer to a tsk_population_table_t object. If the TSK_NO_INIT -option is specified, this must be an initialised population table. If not, it must be an -uninitialised population table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_copy(const tsk_population_table_t *self, - tsk_population_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified population struct to reflect the values in the specified row. -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_population_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_get_row( - const tsk_population_table_t *self, tsk_id_t index, tsk_population_t *row); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table, replacing any existing. -@endrst -@param self A pointer to a tsk_population_table_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_set_metadata_schema(tsk_population_table_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_population_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_population_table_print_state(const tsk_population_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_set_columns(tsk_population_table_t *self, tsk_size_t num_rows, - const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param metadata The array of char metadata values to be copied. -@param metadata_offset The array of tsk_size_t metadata offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_append_columns(tsk_population_table_t *self, - tsk_size_t num_rows, const char *metadata, const tsk_size_t *metadata_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_set_max_rows_increment( - tsk_population_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the metadata column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_population_table_t object. -@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_population_table_set_max_metadata_length_increment( - tsk_population_table_t *self, tsk_size_t max_metadata_length_increment); - -/** @} */ - -/* Undocumented methods */ - -int tsk_population_table_dump_text(const tsk_population_table_t *self, FILE *out); - -/** -@defgroup PROVENANCE_TABLE_API_GROUP Provenance table API. -@{ -*/ - -/** -@brief Initialises the table by allocating the internal memory. - -@rst -This must be called before any operations are performed on the table. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. -@endrst - -@param self A pointer to an uninitialised tsk_provenance_table_t object. -@param options Allocation time options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_init(tsk_provenance_table_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table. - -@param self A pointer to an initialised tsk_provenance_table_t object. -@return Always returns 0. -*/ -int tsk_provenance_table_free(tsk_provenance_table_t *self); - -/** -@brief Adds a row to this provenance table. - -@rst -Add a new provenance with the specified ``timestamp`` and ``record`` to the table. -Copies of the ``timestamp`` and ``record`` are immediately taken. -See the :ref:`table definition ` -for details of the columns in this table. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param timestamp The timestamp to be associated with the new provenance. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``timestamp_length`` is 0. -@param timestamp_length The size of the timestamp array in bytes. -@param record The record to be associated with the new provenance. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``record_length`` is 0. -@param record_length The size of the record array in bytes. -@return Return the ID of the newly added provenance on success, - or a negative value on failure. -*/ -tsk_id_t tsk_provenance_table_add_row(tsk_provenance_table_t *self, - const char *timestamp, tsk_size_t timestamp_length, const char *record, - tsk_size_t record_length); - -/** -@brief Updates the row at the specified index. - -@rst -Rewrite the row at the specified index in this table to use the specified -values. Copies of the ``timestamp`` and ``record`` parameters are taken -immediately. See the :ref:`table definition ` -for details of the columns in this table. - -.. warning:: - Because of the way that ragged columns are encoded, this method requires a - full rewrite of the internal column memory in worst case, and would - therefore be inefficient for bulk updates for such columns. However, if the - sizes of all ragged column values are unchanged in the updated row, this - method is guaranteed to only update the memory for the row in question. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param index The row to update. -@param timestamp The timestamp to be associated with new provenance. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``timestamp_length`` is 0. -@param timestamp_length The size of the timestamp array in bytes. -@param record The record to be associated with the provenance. This - is a pointer to arbitrary memory. Can be ``NULL`` if ``record_length`` is 0. -@param record_length The size of the record array in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_update_row(tsk_provenance_table_t *self, tsk_id_t index, - const char *timestamp, tsk_size_t timestamp_length, const char *record, - tsk_size_t record_length); - -/** -@brief Clears this table, setting the number of rows to zero. - -@rst -No memory is freed as a result of this operation; please use -:c:func:`tsk_provenance_table_free` to free the table's internal resources. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_clear(tsk_provenance_table_t *self); - -/** -@brief Truncates this table so that only the first num_rows are retained. - -@param self A pointer to a tsk_provenance_table_t object. -@param num_rows The number of rows to retain in the table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_truncate(tsk_provenance_table_t *self, tsk_size_t num_rows); - -/** -@brief Extends this table by appending rows copied from another table. - -@rst -Appends the rows at the specified indexes from the table ``other`` to the end of this -table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append -the first ``num_rows`` from ``other`` to this table. -@endrst - -@param self A pointer to a tsk_provenance_table_t object where rows are to be added. -@param other A pointer to a tsk_provenance_table_t object where rows are copied from. -@param num_rows The number of rows from ``other`` to append to this table. -@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the - first ``num_rows`` of ``other`` are used. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_extend(tsk_provenance_table_t *self, - const tsk_provenance_table_t *other, tsk_size_t num_rows, - const tsk_id_t *row_indexes, tsk_flags_t options); - -/** -@brief Subset this table by keeping rows according to a boolean mask. - -@rst -Deletes rows from this table and optionally return the mapping from IDs in -the current table to the updated table. Rows are kept or deleted according to -the specified boolean array ``keep`` such that for each row ``j`` if -``keep[j]`` is false (zero) the row is deleted, and otherwise the row is -retained. Thus, ``keep`` must be an array of at least ``num_rows`` -:c:type:`bool` values. - -If the ``id_map`` argument is non-null, this array will be updated to represent -the mapping between IDs before and after row deletion. For row ``j``, -``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or -:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an -array of at least ``num_rows`` :c:type:`tsk_id_t` values. - -.. warning:: - C++ users need to be careful to specify the correct type when - passing in values for the ``keep`` array, - using ``std::vector`` and not ``std::vector``, - as the latter may not be correct size. - -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param keep Array of boolean flags describing whether a particular - row should be kept or not. Must be at least ``num_rows`` long. -@param options Bitwise option flags. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@param id_map An array in which to store the mapping between new - and old IDs. If NULL, this will be ignored. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_keep_rows(tsk_provenance_table_t *self, const tsk_bool_t *keep, - tsk_flags_t options, tsk_id_t *id_map); - -/** -@brief Returns true if the data in the specified table is identical to the data - in this table. - -@rst - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) tables are -considered equal if they are byte-wise identical in all columns. - -- :c:macro:`TSK_CMP_IGNORE_TIMESTAMPS` -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param other A pointer to a tsk_provenance_table_t object. -@param options Bitwise comparison options. -@return Return true if the specified table is equal to this table. -*/ -bool tsk_provenance_table_equals(const tsk_provenance_table_t *self, - const tsk_provenance_table_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table into the specified destination. - -@rst -By default the method initialises the specified destination table. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param dest A pointer to a tsk_provenance_table_t object. If the TSK_NO_INIT -option is specified, this must be an initialised provenance table. If not, it must be an -uninitialised provenance table. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_copy(const tsk_provenance_table_t *self, - tsk_provenance_table_t *dest, tsk_flags_t options); - -/** -@brief Get the row at the specified index. - -@rst -Updates the specified provenance struct to reflect the values in the specified row. -Pointers to memory within this struct are handled by the table and should **not** -be freed by client code. These pointers are guaranteed to be valid until the -next operation that modifies the table (e.g., by adding a new row), but not afterwards. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param index The requested table row. -@param row A pointer to a tsk_provenance_t struct that is updated to reflect the - values in the specified row. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_get_row( - const tsk_provenance_table_t *self, tsk_id_t index, tsk_provenance_t *row); - -/** -@brief Print out the state of this table to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_provenance_table_t object. -@param out The stream to write the summary to. -*/ -void tsk_provenance_table_print_state(const tsk_provenance_table_t *self, FILE *out); - -/** -@brief Replace this table's data by copying from a set of column arrays - -@rst -Clears the data columns of this table and then copies column data from the specified -set of arrays. The supplied arrays should all contain data on the same number of rows. -The metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param timestamp The array of char timestamp values to be copied. -@param timestamp_offset The array of tsk_size_t timestamp offset values to be copied. -@param record The array of char record values to be copied. -@param record_offset The array of tsk_size_t record offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_set_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, - const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, - const tsk_size_t *record_offset); - -/** -@brief Extends this table by copying from a set of column arrays - -@rst -Copies column data from the specified set of arrays to create new rows at the end of the -table. The supplied arrays should all contain data on the same number of rows. The -metadata schema is not affected. -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param num_rows The number of rows to copy from the specifed arrays. -@param timestamp The array of char timestamp values to be copied. -@param timestamp_offset The array of tsk_size_t timestamp offset values to be copied. -@param record The array of char record values to be copied. -@param record_offset The array of tsk_size_t record offset values to be copied. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_append_columns(tsk_provenance_table_t *self, - tsk_size_t num_rows, const char *timestamp, const tsk_size_t *timestamp_offset, - const char *record, const tsk_size_t *record_offset); - -/** -@brief Controls the pre-allocation strategy for this table - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param max_rows_increment The number of rows to pre-allocate, or zero for the default - doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_set_max_rows_increment( - tsk_provenance_table_t *self, tsk_size_t max_rows_increment); - -/** -@brief Controls the pre-allocation strategy for the timestamp column - -@rst -Set a fixed pre-allocation size, or use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param max_timestamp_length_increment The number of bytes to pre-allocate, or zero for -the default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_set_max_timestamp_length_increment( - tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment); - -/** -@brief Controls the pre-allocation strategy for the record column - -@rst -Set a fixed pre-allocation size, use the default doubling strategy. -See :ref:`sec_c_api_memory_allocation_strategy` for details on the default -pre-allocation strategy, -@endrst - -@param self A pointer to a tsk_provenance_table_t object. -@param max_record_length_increment The number of bytes to pre-allocate, or zero for the -default doubling strategy. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_provenance_table_set_max_record_length_increment( - tsk_provenance_table_t *self, tsk_size_t max_record_length_increment); - -/** @} */ - -/* Undocumented methods */ -int tsk_provenance_table_dump_text(const tsk_provenance_table_t *self, FILE *out); - -/****************************************************************************/ -/* Table collection .*/ -/****************************************************************************/ - -/** -@defgroup TABLE_COLLECTION_API_GROUP Table collection API. -@{ -*/ - -/** -@brief Initialises the table collection by allocating the internal memory - and initialising all the constituent tables. - -@rst -This must be called before any operations are performed on the table -collection. See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. - -**Options** - -Options can be specified by providing bitwise flags: - -- :c:macro:`TSK_TC_NO_EDGE_METADATA` -@endrst - -@param self A pointer to an uninitialised tsk_table_collection_t object. -@param options Allocation time options as above. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified table collection. - -@param self A pointer to an initialised tsk_table_collection_t object. -@return Always returns 0. -*/ -int tsk_table_collection_free(tsk_table_collection_t *self); - -/** -@brief Clears data tables (and optionally provenances and metadata) in -this table collection. - -@rst -By default this operation clears all tables except the provenance table, retaining -table metadata schemas and the tree-sequence level metadata and schema. - -No memory is freed as a result of this operation; please use -:c:func:`tsk_table_collection_free` to free internal resources. - -**Options** - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_CLEAR_PROVENANCE` -- :c:macro:`TSK_CLEAR_METADATA_SCHEMAS` -- :c:macro:`TSK_CLEAR_TS_METADATA_AND_SCHEMA` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param options Bitwise clearing options. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Returns true if the data in the specified table collection is equal - to the data in this table collection. - -@rst - -Returns true if the two table collections are equal. The indexes are -not considered as these are derived from the tables. We also do not -consider the ``file_uuid``, since it is a property of the file that set -of tables is stored in. - -**Options** - -Options to control the comparison can be specified by providing one or -more of the following bitwise flags. By default (options=0) two table -collections are considered equal if all of the tables are byte-wise -identical, and the sequence lengths, metadata and metadata schemas -of the two table collections are identical. - -- :c:macro:`TSK_CMP_IGNORE_PROVENANCE` -- :c:macro:`TSK_CMP_IGNORE_METADATA` -- :c:macro:`TSK_CMP_IGNORE_TS_METADATA` -- :c:macro:`TSK_CMP_IGNORE_TIMESTAMPS` -- :c:macro:`TSK_CMP_IGNORE_TABLES` -- :c:macro:`TSK_CMP_IGNORE_REFERENCE_SEQUENCE` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param other A pointer to a tsk_table_collection_t object. -@param options Bitwise comparison options. -@return Return true if the specified table collection is equal to this table. -*/ -bool tsk_table_collection_equals(const tsk_table_collection_t *self, - const tsk_table_collection_t *other, tsk_flags_t options); - -/** -@brief Copies the state of this table collection into the specified destination. - -@rst -By default the method initialises the specified destination table collection. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. - -**Options** - -Options can be specified by providing bitwise flags: - -:c:macro:`TSK_COPY_FILE_UUID` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param dest A pointer to a tsk_table_collection_t object. If the TSK_NO_INIT -option is specified, this must be an initialised table collection. If not, it must be an -uninitialised table collection. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_copy(const tsk_table_collection_t *self, - tsk_table_collection_t *dest, tsk_flags_t options); - -/** -@brief Print out the state of this table collection to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_table_collection_t object. -@param out The stream to write the summary to. -*/ -void tsk_table_collection_print_state(const tsk_table_collection_t *self, FILE *out); - -/** -@brief Load a table collection from a file path. - -@rst -Loads the data from the specified file into this table collection. -By default, the table collection is also initialised. -The resources allocated must be freed using -:c:func:`tsk_table_collection_free` even in error conditions. - -If the :c:macro:`TSK_NO_INIT` option is set, the table collection is -not initialised, allowing an already initialised table collection to -be overwritten with the data from a file. - -If the file contains multiple table collections, this function will load -the first. Please see the :c:func:`tsk_table_collection_loadf` for details -on how to sequentially load table collections from a stream. - -If the :c:macro:`TSK_LOAD_SKIP_TABLES` option is set, only the non-table information from -the table collection will be read, leaving all tables with zero rows and no -metadata or schema. -If the :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` option is set, the table collection is -read without loading the reference sequence. - -**Options** - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_NO_INIT` -- :c:macro:`TSK_LOAD_SKIP_TABLES` -- :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` - -**Examples** - -.. code-block:: c - - int ret; - tsk_table_collection_t tables; - ret = tsk_table_collection_load(&tables, "data.trees", 0); - if (ret != 0) { - fprintf(stderr, "Load error:%s\n", tsk_strerror(ret)); - exit(EXIT_FAILURE); - } - -@endrst - -@param self A pointer to an uninitialised tsk_table_collection_t object - if the TSK_NO_INIT option is not set (default), or an initialised - tsk_table_collection_t otherwise. -@param filename A NULL terminated string containing the filename. -@param options Bitwise options. See above for details. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_load( - tsk_table_collection_t *self, const char *filename, tsk_flags_t options); - -/** -@brief Load a table collection from a stream. - -@rst -Loads a tables definition from the specified file stream to this table -collection. By default, the table collection is also initialised. -The resources allocated must be freed using -:c:func:`tsk_table_collection_free` even in error conditions. - -If the :c:macro:`TSK_NO_INIT` option is set, the table collection is -not initialised, allowing an already initialised table collection to -be overwritten with the data from a file. - -The stream can be an arbitrary file descriptor, for example a network socket. -No seek operations are performed. - -If the stream contains multiple table collection definitions, this function -will load the next table collection from the stream. If the stream contains no -more table collection definitions the error value :c:macro:`TSK_ERR_EOF` will -be returned. Note that EOF is only returned in the case where zero bytes are -read from the stream --- malformed files or other errors will result in -different error conditions. Please see the -:ref:`sec_c_api_examples_file_streaming` section for an example of how to -sequentially load tree sequences from a stream. - -Please note that this streaming behaviour is not supported if the -:c:macro:`TSK_LOAD_SKIP_TABLES` or :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` option is -set. If the :c:macro:`TSK_LOAD_SKIP_TABLES` option is set, only the non-table information -from the table collection will be read, leaving all tables with zero rows and no metadata -or schema. If the :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` option is set, the table -collection is read without loading the reference sequence. When attempting to read from a -stream with multiple table collection definitions and either of these two options set, -the requested information from the first table collection will be read on the first call -to :c:func:`tsk_table_collection_loadf`, with subsequent calls leading to errors. - -**Options** - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_NO_INIT` -- :c:macro:`TSK_LOAD_SKIP_TABLES` -- :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` -@endrst - -@param self A pointer to an uninitialised tsk_table_collection_t object - if the TSK_NO_INIT option is not set (default), or an initialised - tsk_table_collection_t otherwise. -@param file A FILE stream opened in an appropriate mode for reading (e.g. - "r", "r+" or "w+") positioned at the beginning of a table collection - definition. -@param options Bitwise options. See above for details. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_loadf( - tsk_table_collection_t *self, FILE *file, tsk_flags_t options); - -/** -@brief Write a table collection to file. - -@rst -Writes the data from this table collection to the specified file. - -If an error occurs the file path is deleted, ensuring that only complete -and well formed files will be written. - -**Examples** - -.. code-block:: c - - int ret; - tsk_table_collection_t tables; - - ret = tsk_table_collection_init(&tables, 0); - error_check(ret); - tables.sequence_length = 1.0; - // Write out the empty tree sequence - ret = tsk_table_collection_dump(&tables, "empty.trees", 0); - error_check(ret); - -@endrst - -@param self A pointer to an initialised tsk_table_collection_t object. -@param filename A NULL terminated string containing the filename. -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_dump( - const tsk_table_collection_t *self, const char *filename, tsk_flags_t options); - -/** -@brief Write a table collection to a stream. - -@rst -Writes the data from this table collection to the specified FILE stream. -Semantics are identical to :c:func:`tsk_table_collection_dump`. - -Please see the :ref:`sec_c_api_examples_file_streaming` section for an example -of how to sequentially dump and load tree sequences from a stream. - -@endrst - -@param self A pointer to an initialised tsk_table_collection_t object. -@param file A FILE stream opened in an appropriate mode for writing (e.g. - "w", "a", "r+" or "w+"). -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_dumpf( - const tsk_table_collection_t *self, FILE *file, tsk_flags_t options); - -/** -@brief Record the number of rows in each table in the specified tsk_bookmark_t object. - -@param self A pointer to an initialised tsk_table_collection_t object. -@param bookmark A pointer to a tsk_bookmark_t which is updated to contain the number of - rows in all tables. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_record_num_rows( - const tsk_table_collection_t *self, tsk_bookmark_t *bookmark); - -/** -@brief Truncates the tables in this table collection according to the specified bookmark. - -@rst -Truncate the tables in this collection so that each one has the number -of rows specified in the parameter :c:type:`tsk_bookmark_t`. Use the -:c:func:`tsk_table_collection_record_num_rows` function to record the -number rows for each table in a table collection at a particular time. -@endrst - -@param self A pointer to a tsk_individual_table_t object. -@param bookmark The number of rows to retain in each table. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_truncate( - tsk_table_collection_t *self, tsk_bookmark_t *bookmark); - -/** -@brief Sorts the tables in this collection. - -@rst -Some of the tables in a table collection must satisfy specific sortedness requirements -in order to define a :ref:`valid tree sequence `. -This method sorts the ``edge``, ``site``, ``mutation`` and ``individual`` tables such -that these requirements are guaranteed to be fulfilled. The ``node``, ``population`` -and ``provenance`` tables do not have any sortedness requirements, and are therefore -ignored by this method. - -.. note:: The current implementation **may** sort in such a way that exceeds - these requirements, but this behaviour should not be relied upon and later - versions may weaken the level of sortedness. However, the method does **guarantee** - that the resulting tables describes a valid tree sequence. - -.. warning:: Sorting migrations is currently not supported and an error will be raised - if a table collection containing a non-empty migration table is specified. - -The specified :c:type:`tsk_bookmark_t` allows us to specify a start position -for sorting in each of the tables; rows before this value are assumed to already be -in sorted order and this information is used to make sorting more efficient. -Positions in tables that are not sorted (``node``, ``population`` -and ``provenance``) are ignored and can be set to arbitrary values. - -.. warning:: The current implementation only supports specifying a start - position for the ``edge`` table and in a limited form for the - ``site``, ``mutation`` and ``individual`` tables. Specifying a non-zero - ``migration``, start position results in an error. The start positions for the - ``site``, ``mutation`` and ``individual`` tables can either be 0 or the length of the - respective tables, allowing these tables to either be fully sorted, or not sorted at - all. - -The table collection will always be unindexed after sort successfully completes. - -For more control over the sorting process, see the :ref:`sec_c_api_low_level_sorting` -section. - -**Options** - -Options can be specified by providing one or more of the following bitwise -flags: - -:c:macro:`TSK_NO_CHECK_INTEGRITY` - Do not run integrity checks using - :c:func:`tsk_table_collection_check_integrity` before sorting, - potentially leading to a small reduction in execution time. This - performance optimisation should not be used unless the calling code can - guarantee reference integrity within the table collection. References - to rows not in the table or bad offsets will result in undefined - behaviour. -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param start The position to begin sorting in each table; all rows less than this - position must fulfill the tree sequence sortedness requirements. If this is - NULL, sort all rows. -@param options Sort options. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_sort( - tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options); - -/** -@brief Sorts the individual table in this collection. - -@rst -Sorts the individual table in place, so that parents come before children, -and the parent column is remapped as required. Node references to individuals -are also updated. -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param options Sort options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_individual_topological_sort( - tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Puts the tables into canonical form. - -@rst -Put tables into canonical form such that randomly reshuffled tables -are guaranteed to always be sorted in the same order, and redundant -information is removed. The canonical sorting exceeds the usual -tree sequence sortedness requirements. - -**Options**: - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_SUBSET_KEEP_UNREFERENCED` - -@endrst - -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Simplify the tables to remove redundant information. - -@rst -Simplification transforms the tables to remove redundancy and canonicalise -tree sequence data. See the :ref:`simplification ` tutorial for -more details. - -A mapping from the node IDs in the table before simplification to their equivalent -values after simplification can be obtained via the ``node_map`` argument. If this -is non NULL, ``node_map[u]`` will contain the new ID for node ``u`` after simplification, -or :c:macro:`TSK_NULL` if the node has been removed. Thus, ``node_map`` must be an array -of at least ``self->nodes.num_rows`` :c:type:`tsk_id_t` values. - -If the `TSK_SIMPLIFY_NO_FILTER_NODES` option is specified, the node table will be -unaltered except for changing the sample status of nodes (but see the -`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option below) and to update references -to other tables that may have changed as a result of filtering (see below). -The ``node_map`` (if specified) will always be the identity mapping, such that -``node_map[u] == u`` for all nodes. Note also that the order of the list of -samples is not important in this case. - -When a table is not filtered (i.e., if the `TSK_SIMPLIFY_NO_FILTER_NODES` -option is provided or the `TSK_SIMPLIFY_FILTER_SITES`, -`TSK_SIMPLIFY_FILTER_POPULATIONS` or `TSK_SIMPLIFY_FILTER_INDIVIDUALS` -options are *not* provided) the corresponding table is modified as -little as possible, and all pointers are guaranteed to remain valid -after simplification. The only changes made to an unfiltered table are -to update any references to tables that may have changed (for example, -remapping population IDs in the node table if -`TSK_SIMPLIFY_FILTER_POPULATIONS` was specified) or altering the -sample status flag of nodes. - -.. note:: It is possible for populations and individuals to be filtered - even if `TSK_SIMPLIFY_NO_FILTER_NODES` is specified because there - may be entirely unreferenced entities in the input tables, which - are not affected by whether we filter nodes or not. - -By default, the node sample flags are updated by unsetting the -:c:macro:`TSK_NODE_IS_SAMPLE` flag for all nodes and subsequently setting it -for the nodes provided as input to this function. The -`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option will prevent this from occuring, -making it the responsibility of calling code to keep track of the ultimate -sample status of nodes. Using this option in conjunction with -`TSK_SIMPLIFY_NO_FILTER_NODES` (and without the -`TSK_SIMPLIFY_FILTER_POPULATIONS` and `TSK_SIMPLIFY_FILTER_INDIVIDUALS` -options) guarantees that the node table will not be written to during the -lifetime of this function. - -The table collection will always be unindexed after simplify successfully completes. - -.. note:: Migrations are currently not supported by simplify, and an error will - be raised if we attempt call simplify on a table collection with greater - than zero migrations. See ``_ - -**Options**: - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_SIMPLIFY_FILTER_SITES` -- :c:macro:`TSK_SIMPLIFY_FILTER_POPULATIONS` -- :c:macro:`TSK_SIMPLIFY_FILTER_INDIVIDUALS` -- :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES` -- :c:macro:`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` -- :c:macro:`TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY` -- :c:macro:`TSK_SIMPLIFY_KEEP_UNARY` -- :c:macro:`TSK_SIMPLIFY_KEEP_INPUT_ROOTS` -- :c:macro:`TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param samples Either NULL or an array of num_samples distinct and valid node IDs. - If non-null the nodes in this array will be marked as samples in the output. - If NULL, the num_samples parameter is ignored and the samples in the output - will be the same as the samples in the input. This is equivalent to populating - the samples array with all of the sample nodes in the input in increasing - order of ID. -@param num_samples The number of node IDs in the input samples array. Ignored - if the samples array is NULL. -@param options Simplify options; see above for the available bitwise flags. - For the default behaviour, a value of 0 should be provided. -@param node_map If not NULL, this array will be filled to define the mapping - between nodes IDs in the table collection before and after simplification. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samples, - tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map); - -/** -@brief Subsets and reorders a table collection according to an array of nodes. - -@rst -Reduces the table collection to contain only the entries referring to -the provided list of nodes, with nodes reordered according to the order -they appear in the ``nodes`` argument. Specifically, this subsets and reorders -each of the tables as follows (but see options, below): - -1. Nodes: if in the list of nodes, and in the order provided. -2. Individuals: if referred to by a retained node. -3. Populations: if referred to by a retained node, and in the order first seen - when traversing the list of retained nodes. -4. Edges: if both parent and child are retained nodes. -5. Mutations: if the mutation's node is a retained node. -6. Sites: if any mutations remain at the site after removing mutations. - -Retained individuals, edges, mutations, and sites appear in the same -order as in the original tables. Note that only the information *directly* -associated with the provided nodes is retained - for instance, -subsetting to nodes=[A, B] does not retain nodes ancestral to A and B, -and only retains the individuals A and B are in, and not their parents. - -This function does *not* require the tables to be sorted. - -.. note:: Migrations are currently not supported by subset, and an error will - be raised if we attempt call subset on a table collection with greater - than zero migrations. - -**Options**: - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_SUBSET_NO_CHANGE_POPULATIONS` -- :c:macro:`TSK_SUBSET_KEEP_UNREFERENCED` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param nodes An array of num_nodes valid node IDs. -@param num_nodes The number of node IDs in the input nodes array. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, - tsk_size_t num_nodes, tsk_flags_t options); - -/** -@brief Forms the node-wise union of two table collections. - -@rst -Expands this table collection by adding the non-shared portions of another table -collection to itself. The ``other_node_mapping`` encodes which nodes in ``other`` are -equivalent to a node in ``self``. The positions in the ``other_node_mapping`` array -correspond to node ids in ``other``, and the elements encode the equivalent -node id in ``self`` or :c:macro:`TSK_NULL` if the node is exclusive to ``other``. Nodes -that are exclusive ``other`` are added to ``self``, along with: - -1. Individuals which are new to ``self``. -2. Edges whose parent or child are new to ``self``. -3. Sites which were not present in ``self``. -4. Mutations whose nodes are new to ``self``. - -By default, populations of newly added nodes are assumed to be new populations, -and added to the population table as well. - -This operation will also sort the resulting tables, so the tables may change -even if nothing new is added, if the original tables were not sorted. - -.. note:: Migrations are currently not supported by union, and an error will - be raised if we attempt call union on a table collection with migrations. - -**Options**: - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_UNION_NO_CHECK_SHARED` -- :c:macro:`TSK_UNION_NO_ADD_POP` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param other A pointer to a tsk_table_collection_t object. -@param other_node_mapping An array of node IDs that relate nodes in other to nodes in -self: the k-th element of other_node_mapping should be the index of the equivalent -node in self, or TSK_NULL if the node is not present in self (in which case it -will be added to self). -@param options Union options; see above for the available bitwise flags. - For the default behaviour, a value of 0 should be provided. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_union(tsk_table_collection_t *self, - const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, - tsk_flags_t options); - -/** -@brief Set the time_units -@rst -Copies the time_units string to this table collection, replacing any existing. -@endrst -@param self A pointer to a tsk_table_collection_t object. -@param time_units A pointer to a char array. -@param time_units_length The size of the time units string in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_set_time_units( - tsk_table_collection_t *self, const char *time_units, tsk_size_t time_units_length); - -/** -@brief Set the metadata -@rst -Copies the metadata string to this table collection, replacing any existing. -@endrst -@param self A pointer to a tsk_table_collection_t object. -@param metadata A pointer to a char array. -@param metadata_length The size of the metadata in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_set_metadata( - tsk_table_collection_t *self, const char *metadata, tsk_size_t metadata_length); - -/** -@brief Set the metadata schema -@rst -Copies the metadata schema string to this table collection, replacing any existing. -@endrst -@param self A pointer to a tsk_table_collection_t object. -@param metadata_schema A pointer to a char array. -@param metadata_schema_length The size of the metadata schema in bytes. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_set_metadata_schema(tsk_table_collection_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); - -/** -@brief Returns true if this table collection is indexed. - -@rst -This method returns true if the table collection has an index -for the edge table. It guarantees that the index exists, and that -it is for the same number of edges that are in the edge table. It -does *not* guarantee that the index is valid (i.e., if the rows -in the edge have been permuted in some way since the index was built). - -See the :ref:`sec_c_api_table_indexes` section for details on the index -life-cycle. -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return true if there is an index present for this table collection. -*/ -bool tsk_table_collection_has_index( - const tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Deletes the indexes for this table collection. - -@rst -Unconditionally drop the indexes that may be present for this table collection. It -is not an error to call this method on an unindexed table collection. -See the :ref:`sec_c_api_table_indexes` section for details on the index -life-cycle. -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Always returns 0. -*/ -int tsk_table_collection_drop_index(tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Builds indexes for this table collection. - -@rst -Builds the tree traversal :ref:`indexes ` for this table -collection. Any existing index is first dropped using -:c:func:`tsk_table_collection_drop_index`. See the -:ref:`sec_c_api_table_indexes` section for details on the index life-cycle. -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_collection_build_index(tsk_table_collection_t *self, tsk_flags_t options); - -/** -@brief Runs integrity checks on this table collection. - -@rst - -Checks the integrity of this table collection. The default checks (i.e., with -options = 0) guarantee the integrity of memory and entity references within the -table collection. All positions along the genome are checked -to see if they are finite values and within the required bounds. Time values -are checked to see if they are finite or marked as unknown. -Consistency of the direction of inheritance is also checked: whether -parents are more recent than children, mutations are not more recent -than their nodes or their mutation parents, etcetera. - -To check if a set of tables fulfills the :ref:`requirements -` needed for a valid tree sequence, use -the :c:macro:`TSK_CHECK_TREES` option. When this method is called with -:c:macro:`TSK_CHECK_TREES`, the number of trees in the tree sequence is returned. Thus, -to check for errors client code should verify that the return value is less than zero. -All other options will return zero on success and a negative value on failure. - -More fine-grained checks can be achieved using bitwise combinations of the -other options. - -**Options**: - -Options can be specified by providing one or more of the following bitwise -flags: - -- :c:macro:`TSK_CHECK_EDGE_ORDERING` -- :c:macro:`TSK_CHECK_SITE_ORDERING` -- :c:macro:`TSK_CHECK_SITE_DUPLICATES` -- :c:macro:`TSK_CHECK_MUTATION_ORDERING` -- :c:macro:`TSK_CHECK_INDIVIDUAL_ORDERING` -- :c:macro:`TSK_CHECK_MIGRATION_ORDERING` -- :c:macro:`TSK_CHECK_INDEXES` -- :c:macro:`TSK_CHECK_TREES` -- :c:macro:`TSK_NO_CHECK_POPULATION_REFS` -@endrst - -@param self A pointer to a tsk_table_collection_t object. -@param options Bitwise options. -@return Return a negative error value on if any problems are detected - in the tree sequence. If the TSK_CHECK_TREES option is provided, - the number of trees in the tree sequence will be returned, on - success. -*/ -tsk_id_t tsk_table_collection_check_integrity( - const tsk_table_collection_t *self, tsk_flags_t options); - -/** @} */ - -/* Undocumented methods */ - -/* Flags for ibd_segments */ -#define TSK_IBD_STORE_PAIRS (1 << 0) -#define TSK_IBD_STORE_SEGMENTS (1 << 1) - -/* TODO be systematic about where "result" should be in the params - * list, different here and in link_ancestors. */ -/* FIXME the order of num_samples and samples needs to be reversed in within. - * This should be done as part of documenting, I guess. */ -int tsk_table_collection_ibd_within(const tsk_table_collection_t *self, - tsk_identity_segments_t *result, const tsk_id_t *samples, tsk_size_t num_samples, - double min_span, double max_time, tsk_flags_t options); - -int tsk_table_collection_ibd_between(const tsk_table_collection_t *self, - tsk_identity_segments_t *result, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, double min_span, - double max_time, tsk_flags_t options); - -int tsk_table_collection_link_ancestors(tsk_table_collection_t *self, tsk_id_t *samples, - tsk_size_t num_samples, tsk_id_t *ancestors, tsk_size_t num_ancestors, - tsk_flags_t options, tsk_edge_table_t *result); -int tsk_table_collection_deduplicate_sites( - tsk_table_collection_t *tables, tsk_flags_t options); -int tsk_table_collection_compute_mutation_parents( - tsk_table_collection_t *self, tsk_flags_t options); -int tsk_table_collection_compute_mutation_times( - tsk_table_collection_t *self, double *random, tsk_flags_t options); -int tsk_table_collection_delete_older( - tsk_table_collection_t *self, double time, tsk_flags_t options); - -int tsk_table_collection_set_indexes(tsk_table_collection_t *self, - tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order); - -int tsk_table_collection_takeset_metadata( - tsk_table_collection_t *self, char *metadata, tsk_size_t metadata_length); -int tsk_table_collection_takeset_indexes(tsk_table_collection_t *self, - tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order); -int tsk_individual_table_takeset_columns(tsk_individual_table_t *self, - tsk_size_t num_rows, tsk_flags_t *flags, double *location, - tsk_size_t *location_offset, tsk_id_t *parents, tsk_size_t *parents_offset, - char *metadata, tsk_size_t *metadata_offset); -int tsk_node_table_takeset_columns(tsk_node_table_t *self, tsk_size_t num_rows, - tsk_flags_t *flags, double *time, tsk_id_t *population, tsk_id_t *individual, - char *metadata, tsk_size_t *metadata_offset); -int tsk_edge_table_takeset_columns(tsk_edge_table_t *self, tsk_size_t num_rows, - double *left, double *right, tsk_id_t *parent, tsk_id_t *child, char *metadata, - tsk_size_t *metadata_offset); -int tsk_migration_table_takeset_columns(tsk_migration_table_t *self, tsk_size_t num_rows, - double *left, double *right, tsk_id_t *node, tsk_id_t *source, tsk_id_t *dest, - double *time, char *metadata, tsk_size_t *metadata_offset); -int tsk_site_table_takeset_columns(tsk_site_table_t *self, tsk_size_t num_rows, - double *position, char *ancestral_state, tsk_size_t *ancestral_state_offset, - char *metadata, tsk_size_t *metadata_offset); -int tsk_mutation_table_takeset_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, char *derived_state, - tsk_size_t *derived_state_offset, char *metadata, tsk_size_t *metadata_offset); -int tsk_population_table_takeset_columns(tsk_population_table_t *self, - tsk_size_t num_rows, char *metadata, tsk_size_t *metadata_offset); -int tsk_provenance_table_takeset_columns(tsk_provenance_table_t *self, - tsk_size_t num_rows, char *timestamp, tsk_size_t *timestamp_offset, char *record, - tsk_size_t *record_offset); - -bool tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self); - -int tsk_reference_sequence_init(tsk_reference_sequence_t *self, tsk_flags_t options); -int tsk_reference_sequence_free(tsk_reference_sequence_t *self); -bool tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self); -bool tsk_reference_sequence_equals(const tsk_reference_sequence_t *self, - const tsk_reference_sequence_t *other, tsk_flags_t options); -int tsk_reference_sequence_copy(const tsk_reference_sequence_t *self, - tsk_reference_sequence_t *dest, tsk_flags_t options); -int tsk_reference_sequence_set_data( - tsk_reference_sequence_t *self, const char *data, tsk_size_t data_length); -int tsk_reference_sequence_set_url( - tsk_reference_sequence_t *self, const char *url, tsk_size_t url_length); -int tsk_reference_sequence_set_metadata( - tsk_reference_sequence_t *self, const char *metadata, tsk_size_t metadata_length); -int tsk_reference_sequence_set_metadata_schema(tsk_reference_sequence_t *self, - const char *metadata_schema, tsk_size_t metadata_schema_length); -int tsk_reference_sequence_takeset_data( - tsk_reference_sequence_t *self, char *data, tsk_size_t data_length); -int tsk_reference_sequence_takeset_metadata( - tsk_reference_sequence_t *self, char *metadata, tsk_size_t metadata_length); - -/** -@defgroup TABLE_SORTER_API_GROUP Low-level table sorter API. -@{ -*/ - -/* NOTE: We use the "struct _tsk_table_sorter_t" form here - * rather then the usual tsk_table_sorter_t alias because - * of problems with Doxygen. This was the only way I could - * get it to work - ideally, we'd use the usual typedefs - * to avoid confusing people. - */ - -/** -@brief Initialises the memory for the sorter object. - -@rst -This must be called before any operations are performed on the -table sorter and initialises all fields. The ``edge_sort`` function -is set to the default method using qsort. The ``user_data`` -field is set to NULL. -This method supports the same options as -:c:func:`tsk_table_collection_sort`. - -@endrst - -@param self A pointer to an uninitialised tsk_table_sorter_t object. -@param tables The table collection to sort. -@param options Sorting options. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_sorter_init(struct _tsk_table_sorter_t *self, - tsk_table_collection_t *tables, tsk_flags_t options); - -/** -@brief Runs the sort using the configured functions. - -@rst -Runs the sorting process: - -1. Drop the table indexes. -2. If the ``sort_edges`` function pointer is not NULL, run it. The - first parameter to the called function will be a pointer to this - table_sorter_t object. The second parameter will be the value - ``start.edges``. This specifies the offset at which sorting should - start in the edge table. This offset is guaranteed to be within the - bounds of the edge table. -3. Sort the site table, building the mapping between site IDs in the - current and sorted tables. -4. Sort the mutation table, using the ``sort_mutations`` pointer. - -If an error occurs during the execution of a user-supplied -sorting function a non-zero value must be returned. This value -will then be returned by ``tsk_table_sorter_run``. The error -return value should be chosen to avoid conflicts with tskit error -codes. - -See :c:func:`tsk_table_collection_sort` for details on the ``start`` parameter. - -@endrst - -@param self A pointer to a tsk_table_sorter_t object. -@param start The position in the tables at which sorting starts. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_table_sorter_run(struct _tsk_table_sorter_t *self, const tsk_bookmark_t *start); - -/** -@brief Free the internal memory for the specified table sorter. - -@param self A pointer to an initialised tsk_table_sorter_t object. -@return Always returns 0. -*/ -int tsk_table_sorter_free(struct _tsk_table_sorter_t *self); - -/** @} */ - -int tsk_squash_edges( - tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output_edges); - -/* IBD segments API. This is experimental and the interface may change. */ - -tsk_size_t tsk_identity_segments_get_num_segments(const tsk_identity_segments_t *self); -double tsk_identity_segments_get_total_span(const tsk_identity_segments_t *self); -tsk_size_t tsk_identity_segments_get_num_pairs(const tsk_identity_segments_t *self); -int tsk_identity_segments_get_keys( - const tsk_identity_segments_t *result, tsk_id_t *pairs); -int tsk_identity_segments_get_items(const tsk_identity_segments_t *self, tsk_id_t *pairs, - tsk_identity_segment_list_t **lists); -int tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t a, - tsk_id_t b, tsk_identity_segment_list_t **ret_list); -void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out); -int tsk_identity_segments_free(tsk_identity_segments_t *self); - -/* Edge differences */ - -/* Internal API - currently used in a few places, but a better API is envisaged - * at some point. - * IMPORTANT: tskit-rust uses this API, so don't break without discussing! - */ -int tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, - tsk_id_t num_trees, tsk_flags_t options); -int tsk_diff_iter_free(tsk_diff_iter_t *self); -int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, - tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); -void tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/subprojects/tskit/tskit/trees.c b/subprojects/tskit/tskit/trees.c deleted file mode 100644 index 4604579e0..000000000 --- a/subprojects/tskit/tskit/trees.c +++ /dev/null @@ -1,6011 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * Copyright (c) 2015-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -#include - -static inline bool -is_discrete(double x) -{ - return trunc(x) == x; -} - -/* ======================================================== * - * tree sequence - * ======================================================== */ - -static void -tsk_treeseq_check_state(const tsk_treeseq_t *self) -{ - tsk_size_t j; - tsk_size_t k, l; - tsk_site_t site; - tsk_id_t site_id = 0; - - for (j = 0; j < self->num_trees; j++) { - for (k = 0; k < self->tree_sites_length[j]; k++) { - site = self->tree_sites[j][k]; - tsk_bug_assert(site.id == site_id); - site_id++; - for (l = 0; l < site.mutations_length; l++) { - tsk_bug_assert(site.mutations[l].site == site.id); - } - } - } -} - -void -tsk_treeseq_print_state(const tsk_treeseq_t *self, FILE *out) -{ - tsk_size_t j; - tsk_size_t k, l, m; - tsk_site_t site; - - fprintf(out, "tree_sequence state\n"); - fprintf(out, "num_trees = %lld\n", (long long) self->num_trees); - fprintf(out, "samples = (%lld)\n", (long long) self->num_samples); - for (j = 0; j < self->num_samples; j++) { - fprintf(out, "\t%lld\n", (long long) self->samples[j]); - } - tsk_table_collection_print_state(self->tables, out); - fprintf(out, "tree_sites = \n"); - for (j = 0; j < self->num_trees; j++) { - fprintf(out, "tree %lld\t%lld sites\n", (long long) j, - (long long) self->tree_sites_length[j]); - for (k = 0; k < self->tree_sites_length[j]; k++) { - site = self->tree_sites[j][k]; - fprintf(out, "\tsite %lld pos = %f ancestral state = ", (long long) site.id, - site.position); - for (l = 0; l < site.ancestral_state_length; l++) { - fprintf(out, "%c", site.ancestral_state[l]); - } - fprintf(out, " %lld mutations\n", (long long) site.mutations_length); - for (l = 0; l < site.mutations_length; l++) { - fprintf(out, "\t\tmutation %lld node = %lld derived_state = ", - (long long) site.mutations[l].id, - (long long) site.mutations[l].node); - for (m = 0; m < site.mutations[l].derived_state_length; m++) { - fprintf(out, "%c", site.mutations[l].derived_state[m]); - } - fprintf(out, "\n"); - } - } - } - tsk_treeseq_check_state(self); -} - -int -tsk_treeseq_free(tsk_treeseq_t *self) -{ - if (self->tables != NULL) { - tsk_table_collection_free(self->tables); - } - tsk_safe_free(self->tables); - tsk_safe_free(self->samples); - tsk_safe_free(self->sample_index_map); - tsk_safe_free(self->breakpoints); - tsk_safe_free(self->tree_sites); - tsk_safe_free(self->tree_sites_length); - tsk_safe_free(self->tree_sites_mem); - tsk_safe_free(self->site_mutations_mem); - tsk_safe_free(self->site_mutations_length); - tsk_safe_free(self->site_mutations); - tsk_safe_free(self->individual_nodes_mem); - tsk_safe_free(self->individual_nodes_length); - tsk_safe_free(self->individual_nodes); - return 0; -} - -static int -tsk_treeseq_init_sites(tsk_treeseq_t *self) -{ - tsk_id_t j, k; - int ret = 0; - tsk_size_t offset = 0; - const tsk_size_t num_mutations = self->tables->mutations.num_rows; - const tsk_size_t num_sites = self->tables->sites.num_rows; - const tsk_id_t *restrict mutation_site = self->tables->mutations.site; - const double *restrict site_position = self->tables->sites.position; - bool discrete_sites = true; - tsk_mutation_t *mutation; - - self->site_mutations_mem - = tsk_malloc(num_mutations * sizeof(*self->site_mutations_mem)); - self->site_mutations_length - = tsk_malloc(num_sites * sizeof(*self->site_mutations_length)); - self->site_mutations = tsk_malloc(num_sites * sizeof(*self->site_mutations)); - self->tree_sites_mem = tsk_malloc(num_sites * sizeof(*self->tree_sites_mem)); - if (self->site_mutations_mem == NULL || self->site_mutations_length == NULL - || self->site_mutations == NULL || self->tree_sites_mem == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (k = 0; k < (tsk_id_t) num_mutations; k++) { - mutation = self->site_mutations_mem + k; - ret = tsk_treeseq_get_mutation(self, k, mutation); - if (ret != 0) { - goto out; - } - } - k = 0; - for (j = 0; j < (tsk_id_t) num_sites; j++) { - discrete_sites = discrete_sites && is_discrete(site_position[j]); - self->site_mutations[j] = self->site_mutations_mem + offset; - self->site_mutations_length[j] = 0; - /* Go through all mutations for this site */ - while (k < (tsk_id_t) num_mutations && mutation_site[k] == j) { - self->site_mutations_length[j]++; - offset++; - k++; - } - ret = tsk_treeseq_get_site(self, j, self->tree_sites_mem + j); - if (ret != 0) { - goto out; - } - } - self->discrete_genome = self->discrete_genome && discrete_sites; -out: - return ret; -} - -static int -tsk_treeseq_init_individuals(tsk_treeseq_t *self) -{ - int ret = 0; - tsk_id_t node; - tsk_id_t ind; - tsk_size_t offset = 0; - tsk_size_t total_node_refs = 0; - tsk_size_t *node_count = NULL; - tsk_id_t *node_array; - const tsk_size_t num_inds = self->tables->individuals.num_rows; - const tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t *restrict node_individual = self->tables->nodes.individual; - - // First find number of nodes per individual - self->individual_nodes_length - = tsk_calloc(TSK_MAX(1, num_inds), sizeof(*self->individual_nodes_length)); - node_count = tsk_calloc(TSK_MAX(1, num_inds), sizeof(*node_count)); - - if (self->individual_nodes_length == NULL || node_count == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (node = 0; node < (tsk_id_t) num_nodes; node++) { - ind = node_individual[node]; - if (ind != TSK_NULL) { - self->individual_nodes_length[ind]++; - total_node_refs++; - } - } - - self->individual_nodes_mem - = tsk_malloc(TSK_MAX(1, total_node_refs) * sizeof(tsk_node_t)); - self->individual_nodes = tsk_malloc(TSK_MAX(1, num_inds) * sizeof(tsk_node_t *)); - if (self->individual_nodes_mem == NULL || self->individual_nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* Now fill in the node IDs */ - for (ind = 0; ind < (tsk_id_t) num_inds; ind++) { - self->individual_nodes[ind] = self->individual_nodes_mem + offset; - offset += self->individual_nodes_length[ind]; - } - for (node = 0; node < (tsk_id_t) num_nodes; node++) { - ind = node_individual[node]; - if (ind != TSK_NULL) { - node_array = self->individual_nodes[ind]; - tsk_bug_assert(node_array - self->individual_nodes_mem - < (tsk_id_t)(total_node_refs - node_count[ind])); - node_array[node_count[ind]] = node; - node_count[ind] += 1; - } - } -out: - tsk_safe_free(node_count); - return ret; -} - -/* Initialises memory associated with the trees. - */ -static int -tsk_treeseq_init_trees(tsk_treeseq_t *self) -{ - int ret = TSK_ERR_GENERIC; - tsk_size_t j, k, tree_index; - tsk_id_t site_id, edge_id, mutation_id; - double tree_left, tree_right; - const double sequence_length = self->tables->sequence_length; - const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows; - const tsk_id_t num_mutations = (tsk_id_t) self->tables->mutations.num_rows; - const tsk_size_t num_edges = self->tables->edges.num_rows; - const tsk_size_t num_nodes = self->tables->nodes.num_rows; - const double *restrict site_position = self->tables->sites.position; - const tsk_id_t *restrict mutation_site = self->tables->mutations.site; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_right = self->tables->edges.right; - const double *restrict edge_left = self->tables->edges.left; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - tsk_size_t num_trees_alloc = self->num_trees + 1; - bool discrete_breakpoints = true; - tsk_id_t *node_edge_map = tsk_malloc(num_nodes * sizeof(*node_edge_map)); - tsk_mutation_t *mutation; - - self->tree_sites_length - = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length)); - self->tree_sites = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites)); - self->breakpoints = tsk_malloc(num_trees_alloc * sizeof(*self->breakpoints)); - if (node_edge_map == NULL || self->tree_sites == NULL - || self->tree_sites_length == NULL || self->breakpoints == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset( - self->tree_sites_length, 0, self->num_trees * sizeof(*self->tree_sites_length)); - tsk_memset(self->tree_sites, 0, self->num_trees * sizeof(*self->tree_sites)); - tsk_memset(node_edge_map, TSK_NULL, num_nodes * sizeof(*node_edge_map)); - - tree_left = 0; - tree_right = sequence_length; - tree_index = 0; - site_id = 0; - mutation_id = 0; - j = 0; - k = 0; - while (j < num_edges || tree_left < sequence_length) { - discrete_breakpoints = discrete_breakpoints && is_discrete(tree_left); - self->breakpoints[tree_index] = tree_left; - while (k < num_edges && edge_right[O[k]] == tree_left) { - edge_id = O[k]; - node_edge_map[edge_child[edge_id]] = TSK_NULL; - k++; - } - while (j < num_edges && edge_left[I[j]] == tree_left) { - edge_id = I[j]; - node_edge_map[edge_child[edge_id]] = edge_id; - j++; - } - tree_right = sequence_length; - if (j < num_edges) { - tree_right = TSK_MIN(tree_right, edge_left[I[j]]); - } - if (k < num_edges) { - tree_right = TSK_MIN(tree_right, edge_right[O[k]]); - } - self->tree_sites[tree_index] = self->tree_sites_mem + site_id; - while (site_id < num_sites && site_position[site_id] < tree_right) { - self->tree_sites_length[tree_index]++; - while ( - mutation_id < num_mutations && mutation_site[mutation_id] == site_id) { - mutation = self->site_mutations_mem + mutation_id; - mutation->edge = node_edge_map[mutation->node]; - mutation_id++; - } - site_id++; - } - tree_left = tree_right; - tree_index++; - } - tsk_bug_assert(site_id == num_sites); - tsk_bug_assert(tree_index == self->num_trees); - self->breakpoints[tree_index] = tree_right; - discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right); - self->discrete_genome = self->discrete_genome && discrete_breakpoints; - ret = 0; -out: - tsk_safe_free(node_edge_map); - return ret; -} - -static void -tsk_treeseq_init_migrations(tsk_treeseq_t *self) -{ - tsk_size_t j; - tsk_size_t num_migrations = self->tables->migrations.num_rows; - const double *restrict left = self->tables->migrations.left; - const double *restrict right = self->tables->migrations.right; - const double *restrict time = self->tables->migrations.time; - bool discrete_breakpoints = true; - bool discrete_times = true; - - for (j = 0; j < num_migrations; j++) { - discrete_breakpoints - = discrete_breakpoints && is_discrete(left[j]) && is_discrete(right[j]); - discrete_times - = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); - } - self->discrete_genome = self->discrete_genome && discrete_breakpoints; - self->discrete_time = self->discrete_time && discrete_times; -} - -static void -tsk_treeseq_init_mutations(tsk_treeseq_t *self) -{ - tsk_size_t j; - tsk_size_t num_mutations = self->tables->mutations.num_rows; - const double *restrict time = self->tables->mutations.time; - bool discrete_times = true; - - for (j = 0; j < num_mutations; j++) { - discrete_times - = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); - } - self->discrete_time = self->discrete_time && discrete_times; - - for (j = 0; j < num_mutations; j++) { - if (!tsk_is_unknown_time(time[j])) { - self->min_time = TSK_MIN(self->min_time, time[j]); - self->max_time = TSK_MAX(self->max_time, time[j]); - } - } -} - -static int -tsk_treeseq_init_nodes(tsk_treeseq_t *self) -{ - tsk_size_t j, k; - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_flags_t *restrict node_flags = self->tables->nodes.flags; - const double *restrict time = self->tables->nodes.time; - int ret = 0; - bool discrete_times = true; - - /* Determine the sample size */ - self->num_samples = 0; - for (j = 0; j < num_nodes; j++) { - if (!!(node_flags[j] & TSK_NODE_IS_SAMPLE)) { - self->num_samples++; - } - } - /* TODO raise an error if < 2 samples?? */ - self->samples = tsk_malloc(self->num_samples * sizeof(tsk_id_t)); - self->sample_index_map = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - if (self->samples == NULL || self->sample_index_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - k = 0; - for (j = 0; j < num_nodes; j++) { - self->sample_index_map[j] = -1; - if (!!(node_flags[j] & TSK_NODE_IS_SAMPLE)) { - self->samples[k] = (tsk_id_t) j; - self->sample_index_map[j] = (tsk_id_t) k; - k++; - } - } - tsk_bug_assert(k == self->num_samples); - - for (j = 0; j < num_nodes; j++) { - discrete_times - = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); - } - self->discrete_time = self->discrete_time && discrete_times; - - for (j = 0; j < num_nodes; j++) { - if (!tsk_is_unknown_time(time[j])) { - self->min_time = TSK_MIN(self->min_time, time[j]); - self->max_time = TSK_MAX(self->max_time, time[j]); - } - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_init( - tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options) -{ - int ret = 0; - tsk_id_t num_trees; - - tsk_memset(self, 0, sizeof(*self)); - if (options & TSK_TAKE_OWNERSHIP) { - self->tables = tables; - if (tables->edges.options & TSK_TABLE_NO_METADATA) { - ret = TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA; - goto out; - } - } else { - self->tables = tsk_malloc(sizeof(*self->tables)); - if (self->tables == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* Note that this copy reinstates metadata for a table collection with - * TSK_TC_NO_EDGE_METADATA. Otherwise a table without metadata would - * crash tsk_diff_iter_next. */ - ret = tsk_table_collection_copy(tables, self->tables, TSK_COPY_FILE_UUID); - if (ret != 0) { - goto out; - } - } - if (options & TSK_TS_INIT_BUILD_INDEXES) { - ret = tsk_table_collection_build_index(self->tables, 0); - if (ret != 0) { - goto out; - } - } - num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - self->num_trees = (tsk_size_t) num_trees; - self->discrete_genome = true; - self->discrete_time = true; - self->min_time = INFINITY; - self->max_time = -INFINITY; - ret = tsk_treeseq_init_nodes(self); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_init_sites(self); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_init_individuals(self); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_init_trees(self); - if (ret != 0) { - goto out; - } - tsk_treeseq_init_migrations(self); - tsk_treeseq_init_mutations(self); - - if (tsk_treeseq_get_time_units_length(self) == strlen(TSK_TIME_UNITS_UNCALIBRATED) - && !strncmp(tsk_treeseq_get_time_units(self), TSK_TIME_UNITS_UNCALIBRATED, - strlen(TSK_TIME_UNITS_UNCALIBRATED))) { - self->time_uncalibrated = true; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_copy_tables( - const tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options) -{ - return tsk_table_collection_copy(self->tables, tables, options); -} - -int TSK_WARN_UNUSED -tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options) -{ - int ret = 0; - tsk_table_collection_t *tables = malloc(sizeof(*tables)); - - /* Need to make sure that we're zero'd out in case of error */ - tsk_memset(self, 0, sizeof(*self)); - - if (tables == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_table_collection_load(tables, filename, options); - if (ret != 0) { - tsk_table_collection_free(tables); - tsk_safe_free(tables); - goto out; - } - /* TSK_TAKE_OWNERSHIP takes immediate ownership of the tables, regardless - * of error conditions. */ - ret = tsk_treeseq_init(self, tables, TSK_TAKE_OWNERSHIP); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options) -{ - int ret = 0; - tsk_table_collection_t *tables = malloc(sizeof(*tables)); - - /* Need to make sure that we're zero'd out in case of error */ - tsk_memset(self, 0, sizeof(*self)); - - if (tables == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - ret = tsk_table_collection_loadf(tables, file, options); - if (ret != 0) { - tsk_table_collection_free(tables); - tsk_safe_free(tables); - goto out; - } - /* TSK_TAKE_OWNERSHIP takes immediate ownership of the tables, regardless - * of error conditions. */ - ret = tsk_treeseq_init(self, tables, TSK_TAKE_OWNERSHIP); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_dump(const tsk_treeseq_t *self, const char *filename, tsk_flags_t options) -{ - return tsk_table_collection_dump(self->tables, filename, options); -} - -int TSK_WARN_UNUSED -tsk_treeseq_dumpf(const tsk_treeseq_t *self, FILE *file, tsk_flags_t options) -{ - return tsk_table_collection_dumpf(self->tables, file, options); -} - -/* Simple attribute getters */ - -const char * -tsk_treeseq_get_metadata(const tsk_treeseq_t *self) -{ - return self->tables->metadata; -} - -tsk_size_t -tsk_treeseq_get_metadata_length(const tsk_treeseq_t *self) -{ - return self->tables->metadata_length; -} - -const char * -tsk_treeseq_get_metadata_schema(const tsk_treeseq_t *self) -{ - return self->tables->metadata_schema; -} - -tsk_size_t -tsk_treeseq_get_metadata_schema_length(const tsk_treeseq_t *self) -{ - return self->tables->metadata_schema_length; -} - -const char * -tsk_treeseq_get_time_units(const tsk_treeseq_t *self) -{ - return self->tables->time_units; -} - -tsk_size_t -tsk_treeseq_get_time_units_length(const tsk_treeseq_t *self) -{ - return self->tables->time_units_length; -} - -double -tsk_treeseq_get_sequence_length(const tsk_treeseq_t *self) -{ - return self->tables->sequence_length; -} - -const char * -tsk_treeseq_get_file_uuid(const tsk_treeseq_t *self) -{ - return self->tables->file_uuid; -} - -tsk_size_t -tsk_treeseq_get_num_samples(const tsk_treeseq_t *self) -{ - return self->num_samples; -} - -tsk_size_t -tsk_treeseq_get_num_nodes(const tsk_treeseq_t *self) -{ - return self->tables->nodes.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_edges(const tsk_treeseq_t *self) -{ - return self->tables->edges.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_migrations(const tsk_treeseq_t *self) -{ - return self->tables->migrations.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_sites(const tsk_treeseq_t *self) -{ - return self->tables->sites.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_mutations(const tsk_treeseq_t *self) -{ - return self->tables->mutations.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_populations(const tsk_treeseq_t *self) -{ - return self->tables->populations.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_individuals(const tsk_treeseq_t *self) -{ - return self->tables->individuals.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_provenances(const tsk_treeseq_t *self) -{ - return self->tables->provenances.num_rows; -} - -tsk_size_t -tsk_treeseq_get_num_trees(const tsk_treeseq_t *self) -{ - return self->num_trees; -} - -const double * -tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self) -{ - return self->breakpoints; -} - -const tsk_id_t * -tsk_treeseq_get_samples(const tsk_treeseq_t *self) -{ - return self->samples; -} - -const tsk_id_t * -tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self) -{ - return self->sample_index_map; -} - -bool -tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u) -{ - bool ret = false; - - if (u >= 0 && u < (tsk_id_t) self->tables->nodes.num_rows) { - ret = !!(self->tables->nodes.flags[u] & TSK_NODE_IS_SAMPLE); - } - return ret; -} - -bool -tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self) -{ - return self->discrete_genome; -} - -bool -tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self) -{ - return self->discrete_time; -} - -double -tsk_treeseq_get_min_time(const tsk_treeseq_t *self) -{ - return self->min_time; -} - -double -tsk_treeseq_get_max_time(const tsk_treeseq_t *self) -{ - return self->max_time; -} - -bool -tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self) -{ - return tsk_table_collection_has_reference_sequence(self->tables); -} - -int -tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output) -{ - int ret = 0; - tsk_size_t i, j; - tsk_individual_t ind; - tsk_id_t ind_pop; - const tsk_id_t *node_population = self->tables->nodes.population; - const tsk_size_t num_individuals = self->tables->individuals.num_rows; - - tsk_memset(output, TSK_NULL, num_individuals * sizeof(*output)); - - for (i = 0; i < num_individuals; i++) { - ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); - tsk_bug_assert(ret == 0); - if (ind.nodes_length > 0) { - ind_pop = -2; - for (j = 0; j < ind.nodes_length; j++) { - if (ind_pop == -2) { - ind_pop = node_population[ind.nodes[j]]; - } else if (ind_pop != node_population[ind.nodes[j]]) { - ret = TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH; - goto out; - } - } - output[ind.id] = ind_pop; - } - } -out: - return ret; -} - -int -tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output) -{ - int ret = 0; - tsk_size_t i, j; - tsk_individual_t ind; - double ind_time; - const double *node_time = self->tables->nodes.time; - const tsk_size_t num_individuals = self->tables->individuals.num_rows; - - for (i = 0; i < num_individuals; i++) { - ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); - tsk_bug_assert(ret == 0); - /* the default is UNKNOWN_TIME, but nodes cannot have - * UNKNOWN _TIME so this is safe. */ - ind_time = TSK_UNKNOWN_TIME; - for (j = 0; j < ind.nodes_length; j++) { - if (j == 0) { - ind_time = node_time[ind.nodes[j]]; - } else if (ind_time != node_time[ind.nodes[j]]) { - ret = TSK_ERR_INDIVIDUAL_TIME_MISMATCH; - goto out; - } - } - output[ind.id] = ind_time; - } -out: - return ret; -} - -/* Stats functions */ - -#define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row)) - -static inline double * -GET_3D_ROW(double *base, tsk_size_t num_nodes, tsk_size_t output_dim, - tsk_size_t window_index, tsk_id_t u) -{ - tsk_size_t offset - = window_index * num_nodes * output_dim + ((tsk_size_t) u) * output_dim; - return base + offset; -} - -/* Increments the n-dimensional array with the specified shape by the specified value at - * the specified coordinate. */ -static inline void -increment_nd_array_value(double *array, tsk_size_t n, const tsk_size_t *shape, - const tsk_size_t *coordinate, double value) -{ - tsk_size_t offset = 0; - tsk_size_t product = 1; - int k; - - for (k = (int) n - 1; k >= 0; k--) { - tsk_bug_assert(coordinate[k] < shape[k]); - offset += coordinate[k] * product; - product *= shape[k]; - } - array[offset] += value; -} - -/* TODO flatten the reference sets input here and follow the same pattern used - * in diversity, divergence, etc. */ -int TSK_WARN_UNUSED -tsk_treeseq_genealogical_nearest_neighbours(const tsk_treeseq_t *self, - const tsk_id_t *focal, tsk_size_t num_focal, const tsk_id_t *const *reference_sets, - const tsk_size_t *reference_set_size, tsk_size_t num_reference_sets, - tsk_flags_t TSK_UNUSED(options), double *ret_array) -{ - int ret = 0; - tsk_id_t u, v, p; - tsk_size_t j; - /* TODO It's probably not worth bothering with the int16_t here. */ - int16_t k, focal_reference_set; - /* We use the K'th element of the array for the total. */ - const int16_t K = (int16_t)(num_reference_sets + 1); - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double sequence_length = self->tables->sequence_length; - tsk_id_t tj, tk, h; - double left, right, *A_row, scale, tree_length; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - double *restrict length = tsk_calloc(num_focal, sizeof(*length)); - uint32_t *restrict ref_count - = tsk_calloc(((tsk_size_t) K) * num_nodes, sizeof(*ref_count)); - int16_t *restrict reference_set_map - = tsk_malloc(num_nodes * sizeof(*reference_set_map)); - uint32_t *restrict row = NULL; - uint32_t *restrict child_row = NULL; - uint32_t total, delta; - - /* We support a max of 8K focal sets */ - if (num_reference_sets == 0 || num_reference_sets > (INT16_MAX - 1)) { - /* TODO: more specific error */ - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if (parent == NULL || ref_count == NULL || reference_set_map == NULL - || length == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - tsk_memset(reference_set_map, 0xff, num_nodes * sizeof(*reference_set_map)); - tsk_memset(ret_array, 0, num_focal * num_reference_sets * sizeof(*ret_array)); - - total = 0; /* keep the compiler happy */ - - /* Set the initial conditions and check the input. */ - for (k = 0; k < (int16_t) num_reference_sets; k++) { - for (j = 0; j < reference_set_size[k]; j++) { - u = reference_sets[k][j]; - if (u < 0 || u >= (tsk_id_t) num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (reference_set_map[u] != TSK_NULL) { - /* FIXME Technically inaccurate here: duplicate focal not sample */ - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - reference_set_map[u] = k; - row = GET_2D_ROW(ref_count, K, u); - row[k] = 1; - /* Also set the count for the total among all sets */ - row[K - 1] = 1; - } - } - for (j = 0; j < num_focal; j++) { - u = focal[j]; - if (u < 0 || u >= (tsk_id_t) num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - } - - /* Iterate over the trees */ - tj = 0; - tk = 0; - left = 0; - while (tj < num_edges || left < sequence_length) { - while (tk < num_edges && edge_right[O[tk]] == left) { - h = O[tk]; - tk++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = TSK_NULL; - child_row = GET_2D_ROW(ref_count, K, u); - while (v != TSK_NULL) { - row = GET_2D_ROW(ref_count, K, v); - for (k = 0; k < K; k++) { - row[k] -= child_row[k]; - } - v = parent[v]; - } - } - while (tj < num_edges && edge_left[I[tj]] == left) { - h = I[tj]; - tj++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - child_row = GET_2D_ROW(ref_count, K, u); - while (v != TSK_NULL) { - row = GET_2D_ROW(ref_count, K, v); - for (k = 0; k < K; k++) { - row[k] += child_row[k]; - } - v = parent[v]; - } - } - right = sequence_length; - if (tj < num_edges) { - right = TSK_MIN(right, edge_left[I[tj]]); - } - if (tk < num_edges) { - right = TSK_MIN(right, edge_right[O[tk]]); - } - - tree_length = right - left; - /* Process this tree */ - for (j = 0; j < num_focal; j++) { - u = focal[j]; - focal_reference_set = reference_set_map[u]; - delta = focal_reference_set != -1; - p = u; - while (p != TSK_NULL) { - row = GET_2D_ROW(ref_count, K, p); - total = row[K - 1]; - if (total > delta) { - break; - } - p = parent[p]; - } - if (p != TSK_NULL) { - length[j] += tree_length; - scale = tree_length / (total - delta); - A_row = GET_2D_ROW(ret_array, num_reference_sets, j); - for (k = 0; k < K - 1; k++) { - A_row[k] += row[k] * scale; - } - if (focal_reference_set != -1) { - /* Remove the contribution for the reference set u belongs to and - * insert the correct value. The long-hand version is - * A_row[k] = A_row[k] - row[k] * scale + (row[k] - 1) * scale; - * which cancels to give: */ - A_row[focal_reference_set] -= scale; - } - } - } - - /* Move on to the next tree */ - left = right; - } - - /* Divide by the accumulated length for each node to normalise */ - for (j = 0; j < num_focal; j++) { - A_row = GET_2D_ROW(ret_array, num_reference_sets, j); - if (length[j] > 0) { - for (k = 0; k < K - 1; k++) { - A_row[k] /= length[j]; - } - } - } -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - if (ref_count != NULL) { - free(ref_count); - } - if (reference_set_map != NULL) { - free(reference_set_map); - } - if (length != NULL) { - free(length); - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_mean_descendants(const tsk_treeseq_t *self, - const tsk_id_t *const *reference_sets, const tsk_size_t *reference_set_size, - tsk_size_t num_reference_sets, tsk_flags_t TSK_UNUSED(options), double *ret_array) -{ - int ret = 0; - tsk_id_t u, v; - tsk_size_t j; - int32_t k; - /* We use the K'th element of the array for the total. */ - const int32_t K = (int32_t)(num_reference_sets + 1); - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double sequence_length = self->tables->sequence_length; - tsk_id_t tj, tk, h; - double left, right, length, *restrict C_row; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - uint32_t *restrict ref_count - = tsk_calloc(num_nodes * ((size_t) K), sizeof(*ref_count)); - double *restrict last_update = tsk_calloc(num_nodes, sizeof(*last_update)); - double *restrict total_length = tsk_calloc(num_nodes, sizeof(*total_length)); - uint32_t *restrict row, *restrict child_row; - - if (num_reference_sets == 0 || num_reference_sets > (INT32_MAX - 1)) { - /* TODO: more specific error */ - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - if (parent == NULL || ref_count == NULL || last_update == NULL - || total_length == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - /* TODO add check for duplicate values in the reference sets */ - - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - tsk_memset(ret_array, 0, num_nodes * num_reference_sets * sizeof(*ret_array)); - - /* Set the initial conditions and check the input. */ - for (k = 0; k < (int32_t) num_reference_sets; k++) { - for (j = 0; j < reference_set_size[k]; j++) { - u = reference_sets[k][j]; - if (u < 0 || u >= (tsk_id_t) num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - row = GET_2D_ROW(ref_count, K, u); - row[k] = 1; - /* Also set the count for the total among all sets */ - row[K - 1] = 1; - } - } - - /* Iterate over the trees */ - tj = 0; - tk = 0; - left = 0; - while (tj < num_edges || left < sequence_length) { - while (tk < num_edges && edge_right[O[tk]] == left) { - h = O[tk]; - tk++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = TSK_NULL; - child_row = GET_2D_ROW(ref_count, K, u); - while (v != TSK_NULL) { - row = GET_2D_ROW(ref_count, K, v); - if (last_update[v] != left) { - if (row[K - 1] > 0) { - length = left - last_update[v]; - C_row = GET_2D_ROW(ret_array, num_reference_sets, v); - for (k = 0; k < (int32_t) num_reference_sets; k++) { - C_row[k] += length * row[k]; - } - total_length[v] += length; - } - last_update[v] = left; - } - for (k = 0; k < K; k++) { - row[k] -= child_row[k]; - } - v = parent[v]; - } - } - while (tj < num_edges && edge_left[I[tj]] == left) { - h = I[tj]; - tj++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - child_row = GET_2D_ROW(ref_count, K, u); - while (v != TSK_NULL) { - row = GET_2D_ROW(ref_count, K, v); - if (last_update[v] != left) { - if (row[K - 1] > 0) { - length = left - last_update[v]; - C_row = GET_2D_ROW(ret_array, num_reference_sets, v); - for (k = 0; k < (int32_t) num_reference_sets; k++) { - C_row[k] += length * row[k]; - } - total_length[v] += length; - } - last_update[v] = left; - } - for (k = 0; k < K; k++) { - row[k] += child_row[k]; - } - v = parent[v]; - } - } - right = sequence_length; - if (tj < num_edges) { - right = TSK_MIN(right, edge_left[I[tj]]); - } - if (tk < num_edges) { - right = TSK_MIN(right, edge_right[O[tk]]); - } - left = right; - } - - /* Add the stats for the last tree and divide by the total length that - * each node was an ancestor to > 0 of the reference nodes. */ - for (v = 0; v < (tsk_id_t) num_nodes; v++) { - row = GET_2D_ROW(ref_count, K, v); - C_row = GET_2D_ROW(ret_array, num_reference_sets, v); - if (row[K - 1] > 0) { - length = sequence_length - last_update[v]; - total_length[v] += length; - for (k = 0; k < (int32_t) num_reference_sets; k++) { - C_row[k] += length * row[k]; - } - } - if (total_length[v] > 0) { - length = total_length[v]; - for (k = 0; k < (int32_t) num_reference_sets; k++) { - C_row[k] /= length; - } - } - } - -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - if (ref_count != NULL) { - free(ref_count); - } - if (last_update != NULL) { - free(last_update); - } - if (total_length != NULL) { - free(total_length); - } - return ret; -} - -/*********************************** - * General stats framework - ***********************************/ - -static int -tsk_treeseq_check_windows( - const tsk_treeseq_t *self, tsk_size_t num_windows, const double *windows) -{ - int ret = TSK_ERR_BAD_WINDOWS; - tsk_size_t j; - - if (num_windows < 1) { - ret = TSK_ERR_BAD_NUM_WINDOWS; - goto out; - } - /* TODO these restrictions can be lifted later if we want a specific interval. */ - if (windows[0] != 0) { - goto out; - } - if (windows[num_windows] != self->tables->sequence_length) { - goto out; - } - for (j = 0; j < num_windows; j++) { - if (windows[j] >= windows[j + 1]) { - goto out; - } - } - ret = 0; -out: - return ret; -} - -/* TODO make these functions more consistent in how the arguments are ordered */ - -static inline void -update_state(double *X, tsk_size_t state_dim, tsk_id_t dest, tsk_id_t source, int sign) -{ - tsk_size_t k; - double *X_dest = GET_2D_ROW(X, state_dim, dest); - double *X_source = GET_2D_ROW(X, state_dim, source); - - for (k = 0; k < state_dim; k++) { - X_dest[k] += sign * X_source[k]; - } -} - -static inline int -update_node_summary(tsk_id_t u, tsk_size_t result_dim, double *node_summary, double *X, - tsk_size_t state_dim, general_stat_func_t *f, void *f_params) -{ - double *X_u = GET_2D_ROW(X, state_dim, u); - double *summary_u = GET_2D_ROW(node_summary, result_dim, u); - - return f(state_dim, X_u, result_dim, summary_u, f_params); -} - -static inline void -update_running_sum(tsk_id_t u, double sign, const double *restrict branch_length, - const double *summary, tsk_size_t result_dim, double *running_sum) -{ - const double *summary_u = GET_2D_ROW(summary, result_dim, u); - const double x = sign * branch_length[u]; - tsk_size_t m; - - for (m = 0; m < result_dim; m++) { - running_sum[m] += x * summary_u[m]; - } -} - -static int -tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, - void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, - double *result) -{ - int ret = 0; - tsk_id_t u, v; - tsk_size_t j, k, window_index; - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double *restrict time = self->tables->nodes.time; - const double sequence_length = self->tables->sequence_length; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - double *restrict branch_length = tsk_calloc(num_nodes, sizeof(*branch_length)); - tsk_id_t tj, tk, h; - double t_left, t_right, w_left, w_right, left, right, scale; - const double *weight_u; - double *state_u, *result_row, *summary_u; - double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state)); - double *summary = tsk_calloc(num_nodes * result_dim, sizeof(*summary)); - double *running_sum = tsk_calloc(result_dim, sizeof(*running_sum)); - - if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { - ret = TSK_ERR_TIME_UNCALIBRATED; - goto out; - } - - if (parent == NULL || branch_length == NULL || state == NULL || running_sum == NULL - || summary == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - - /* Set the initial conditions */ - for (j = 0; j < self->num_samples; j++) { - u = self->samples[j]; - state_u = GET_2D_ROW(state, state_dim, u); - weight_u = GET_2D_ROW(sample_weights, state_dim, j); - tsk_memcpy(state_u, weight_u, state_dim * sizeof(*state_u)); - summary_u = GET_2D_ROW(summary, result_dim, u); - ret = f(state_dim, state_u, result_dim, summary_u, f_params); - if (ret != 0) { - goto out; - } - } - tsk_memset(result, 0, num_windows * result_dim * sizeof(*result)); - - /* Iterate over the trees */ - tj = 0; - tk = 0; - t_left = 0; - window_index = 0; - while (tj < num_edges || t_left < sequence_length) { - while (tk < num_edges && edge_right[O[tk]] == t_left) { - h = O[tk]; - tk++; - - u = edge_child[h]; - update_running_sum(u, -1, branch_length, summary, result_dim, running_sum); - parent[u] = TSK_NULL; - branch_length[u] = 0; - - u = edge_parent[h]; - while (u != TSK_NULL) { - update_running_sum( - u, -1, branch_length, summary, result_dim, running_sum); - update_state(state, state_dim, u, edge_child[h], -1); - ret = update_node_summary( - u, result_dim, summary, state, state_dim, f, f_params); - if (ret != 0) { - goto out; - } - update_running_sum( - u, +1, branch_length, summary, result_dim, running_sum); - u = parent[u]; - } - } - - while (tj < num_edges && edge_left[I[tj]] == t_left) { - h = I[tj]; - tj++; - - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - branch_length[u] = time[v] - time[u]; - update_running_sum(u, +1, branch_length, summary, result_dim, running_sum); - - u = v; - while (u != TSK_NULL) { - update_running_sum( - u, -1, branch_length, summary, result_dim, running_sum); - update_state(state, state_dim, u, edge_child[h], +1); - ret = update_node_summary( - u, result_dim, summary, state, state_dim, f, f_params); - if (ret != 0) { - goto out; - } - update_running_sum( - u, +1, branch_length, summary, result_dim, running_sum); - u = parent[u]; - } - } - - t_right = sequence_length; - if (tj < num_edges) { - t_right = TSK_MIN(t_right, edge_left[I[tj]]); - } - if (tk < num_edges) { - t_right = TSK_MIN(t_right, edge_right[O[tk]]); - } - - while (windows[window_index] < t_right) { - tsk_bug_assert(window_index < num_windows); - w_left = windows[window_index]; - w_right = windows[window_index + 1]; - left = TSK_MAX(t_left, w_left); - right = TSK_MIN(t_right, w_right); - scale = (right - left); - tsk_bug_assert(scale > 0); - result_row = GET_2D_ROW(result, result_dim, window_index); - for (k = 0; k < result_dim; k++) { - result_row[k] += running_sum[k] * scale; - } - - if (w_right <= t_right) { - window_index++; - } else { - /* This interval crosses a tree boundary, so we update it again in the */ - /* for the next tree */ - break; - } - } - /* Move to the next tree */ - t_left = t_right; - } - tsk_bug_assert(window_index == num_windows); -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - if (branch_length != NULL) { - free(branch_length); - } - tsk_safe_free(state); - tsk_safe_free(summary); - tsk_safe_free(running_sum); - return ret; -} - -static int -get_allele_weights(const tsk_site_t *site, const double *state, tsk_size_t state_dim, - const double *total_weight, tsk_size_t *ret_num_alleles, double **ret_allele_states) -{ - int ret = 0; - tsk_size_t k; - tsk_mutation_t mutation, parent_mut; - tsk_size_t mutation_index, allele, num_alleles, alt_allele_length; - /* The allele table */ - tsk_size_t max_alleles = site->mutations_length + 1; - const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); - tsk_size_t *allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); - double *allele_states = tsk_calloc(max_alleles * state_dim, sizeof(*allele_states)); - double *allele_row; - const double *state_row; - const char *alt_allele; - - if (alleles == NULL || allele_lengths == NULL || allele_states == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - tsk_bug_assert(state != NULL); - alleles[0] = site->ancestral_state; - allele_lengths[0] = site->ancestral_state_length; - tsk_memcpy(allele_states, total_weight, state_dim * sizeof(*allele_states)); - num_alleles = 1; - - for (mutation_index = 0; mutation_index < site->mutations_length; mutation_index++) { - mutation = site->mutations[mutation_index]; - /* Compute the allele index for this derived state value. */ - allele = 0; - while (allele < num_alleles) { - if (mutation.derived_state_length == allele_lengths[allele] - && tsk_memcmp( - mutation.derived_state, alleles[allele], allele_lengths[allele]) - == 0) { - break; - } - allele++; - } - if (allele == num_alleles) { - tsk_bug_assert(allele < max_alleles); - alleles[allele] = mutation.derived_state; - allele_lengths[allele] = mutation.derived_state_length; - num_alleles++; - } - - /* Add the state for the the mutation's node to this allele */ - state_row = GET_2D_ROW(state, state_dim, mutation.node); - allele_row = GET_2D_ROW(allele_states, state_dim, allele); - for (k = 0; k < state_dim; k++) { - allele_row[k] += state_row[k]; - } - - /* Get the index for the alternate allele that we must substract from */ - alt_allele = site->ancestral_state; - alt_allele_length = site->ancestral_state_length; - if (mutation.parent != TSK_NULL) { - parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; - alt_allele = parent_mut.derived_state; - alt_allele_length = parent_mut.derived_state_length; - } - allele = 0; - while (allele < num_alleles) { - if (alt_allele_length == allele_lengths[allele] - && tsk_memcmp(alt_allele, alleles[allele], allele_lengths[allele]) - == 0) { - break; - } - allele++; - } - tsk_bug_assert(allele < num_alleles); - - allele_row = GET_2D_ROW(allele_states, state_dim, allele); - for (k = 0; k < state_dim; k++) { - allele_row[k] -= state_row[k]; - } - } - *ret_num_alleles = num_alleles; - *ret_allele_states = allele_states; - allele_states = NULL; -out: - tsk_safe_free(alleles); - tsk_safe_free(allele_lengths); - tsk_safe_free(allele_states); - return ret; -} - -static int -compute_general_stat_site_result(tsk_site_t *site, double *state, tsk_size_t state_dim, - tsk_size_t result_dim, general_stat_func_t *f, void *f_params, double *total_weight, - bool polarised, double *result) -{ - int ret = 0; - tsk_size_t k; - tsk_size_t allele, num_alleles; - double *allele_states; - double *result_tmp = tsk_calloc(result_dim, sizeof(*result_tmp)); - - if (result_tmp == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(result, 0, result_dim * sizeof(*result)); - - ret = get_allele_weights( - site, state, state_dim, total_weight, &num_alleles, &allele_states); - if (ret != 0) { - goto out; - } - /* Sum over the allele weights. Skip the ancestral state if this is a polarised stat - */ - for (allele = polarised ? 1 : 0; allele < num_alleles; allele++) { - ret = f(state_dim, GET_2D_ROW(allele_states, state_dim, allele), result_dim, - result_tmp, f_params); - if (ret != 0) { - goto out; - } - for (k = 0; k < result_dim; k++) { - result[k] += result_tmp[k]; - } - } -out: - tsk_safe_free(result_tmp); - tsk_safe_free(allele_states); - return ret; -} - -static int -tsk_treeseq_site_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, - void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, - double *result) -{ - int ret = 0; - tsk_id_t u, v; - tsk_size_t j, k, tree_site, tree_index, window_index; - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double sequence_length = self->tables->sequence_length; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - tsk_site_t *site; - tsk_id_t tj, tk, h; - double t_left, t_right; - const double *weight_u; - double *state_u, *result_row; - double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state)); - double *total_weight = tsk_calloc(state_dim, sizeof(*total_weight)); - double *site_result = tsk_calloc(result_dim, sizeof(*site_result)); - bool polarised = false; - - if (parent == NULL || state == NULL || total_weight == NULL || site_result == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - - if (options & TSK_STAT_POLARISED) { - polarised = true; - } - - /* Set the initial conditions */ - for (j = 0; j < self->num_samples; j++) { - u = self->samples[j]; - state_u = GET_2D_ROW(state, state_dim, u); - weight_u = GET_2D_ROW(sample_weights, state_dim, j); - tsk_memcpy(state_u, weight_u, state_dim * sizeof(*state_u)); - for (k = 0; k < state_dim; k++) { - total_weight[k] += weight_u[k]; - } - } - tsk_memset(result, 0, num_windows * result_dim * sizeof(*result)); - - /* Iterate over the trees */ - tj = 0; - tk = 0; - t_left = 0; - tree_index = 0; - window_index = 0; - while (tj < num_edges || t_left < sequence_length) { - while (tk < num_edges && edge_right[O[tk]] == t_left) { - h = O[tk]; - tk++; - u = edge_child[h]; - v = edge_parent[h]; - while (v != TSK_NULL) { - update_state(state, state_dim, v, u, -1); - v = parent[v]; - } - parent[u] = TSK_NULL; - } - - while (tj < num_edges && edge_left[I[tj]] == t_left) { - h = I[tj]; - tj++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - while (v != TSK_NULL) { - update_state(state, state_dim, v, u, +1); - v = parent[v]; - } - } - t_right = sequence_length; - if (tj < num_edges) { - t_right = TSK_MIN(t_right, edge_left[I[tj]]); - } - if (tk < num_edges) { - t_right = TSK_MIN(t_right, edge_right[O[tk]]); - } - - /* Update the sites */ - for (tree_site = 0; tree_site < self->tree_sites_length[tree_index]; - tree_site++) { - site = self->tree_sites[tree_index] + tree_site; - ret = compute_general_stat_site_result(site, state, state_dim, result_dim, f, - f_params, total_weight, polarised, site_result); - if (ret != 0) { - goto out; - } - - while (windows[window_index + 1] <= site->position) { - window_index++; - tsk_bug_assert(window_index < num_windows); - } - tsk_bug_assert(windows[window_index] <= site->position); - tsk_bug_assert(site->position < windows[window_index + 1]); - result_row = GET_2D_ROW(result, result_dim, window_index); - for (k = 0; k < result_dim; k++) { - result_row[k] += site_result[k]; - } - } - tree_index++; - t_left = t_right; - } -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - tsk_safe_free(state); - tsk_safe_free(total_weight); - tsk_safe_free(site_result); - return ret; -} - -static inline void -increment_row(tsk_size_t length, double multiplier, double *source, double *dest) -{ - tsk_size_t j; - - for (j = 0; j < length; j++) { - dest[j] += multiplier * source[j]; - } -} - -static int -tsk_treeseq_node_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, - void *f_params, tsk_size_t num_windows, const double *windows, - tsk_flags_t TSK_UNUSED(options), double *result) -{ - int ret = 0; - tsk_id_t u, v; - tsk_size_t j, window_index; - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double sequence_length = self->tables->sequence_length; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - tsk_id_t tj, tk, h; - const double *weight_u; - double *state_u; - double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state)); - double *node_summary = tsk_calloc(num_nodes * result_dim, sizeof(*node_summary)); - double *last_update = tsk_calloc(num_nodes, sizeof(*last_update)); - double t_left, t_right, w_right; - - if (parent == NULL || state == NULL || node_summary == NULL || last_update == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - tsk_memset(result, 0, num_windows * num_nodes * result_dim * sizeof(*result)); - - /* Set the initial conditions */ - for (j = 0; j < self->num_samples; j++) { - u = self->samples[j]; - state_u = GET_2D_ROW(state, state_dim, u); - weight_u = GET_2D_ROW(sample_weights, state_dim, j); - tsk_memcpy(state_u, weight_u, state_dim * sizeof(*state_u)); - } - for (u = 0; u < (tsk_id_t) num_nodes; u++) { - ret = update_node_summary( - u, result_dim, node_summary, state, state_dim, f, f_params); - if (ret != 0) { - goto out; - } - } - - /* Iterate over the trees */ - tj = 0; - tk = 0; - t_left = 0; - window_index = 0; - while (tj < num_edges || t_left < sequence_length) { - tsk_bug_assert(window_index < num_windows); - while (tk < num_edges && edge_right[O[tk]] == t_left) { - h = O[tk]; - tk++; - u = edge_child[h]; - v = edge_parent[h]; - while (v != TSK_NULL) { - increment_row(result_dim, t_left - last_update[v], - GET_2D_ROW(node_summary, result_dim, v), - GET_3D_ROW(result, num_nodes, result_dim, window_index, v)); - last_update[v] = t_left; - update_state(state, state_dim, v, u, -1); - ret = update_node_summary( - v, result_dim, node_summary, state, state_dim, f, f_params); - if (ret != 0) { - goto out; - } - v = parent[v]; - } - parent[u] = TSK_NULL; - } - - while (tj < num_edges && edge_left[I[tj]] == t_left) { - h = I[tj]; - tj++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - while (v != TSK_NULL) { - increment_row(result_dim, t_left - last_update[v], - GET_2D_ROW(node_summary, result_dim, v), - GET_3D_ROW(result, num_nodes, result_dim, window_index, v)); - last_update[v] = t_left; - update_state(state, state_dim, v, u, +1); - ret = update_node_summary( - v, result_dim, node_summary, state, state_dim, f, f_params); - if (ret != 0) { - goto out; - } - v = parent[v]; - } - } - - t_right = sequence_length; - if (tj < num_edges) { - t_right = TSK_MIN(t_right, edge_left[I[tj]]); - } - if (tk < num_edges) { - t_right = TSK_MIN(t_right, edge_right[O[tk]]); - } - - while (window_index < num_windows && windows[window_index + 1] <= t_right) { - w_right = windows[window_index + 1]; - /* Flush the contributions of all nodes to the current window */ - for (u = 0; u < (tsk_id_t) num_nodes; u++) { - tsk_bug_assert(last_update[u] < w_right); - increment_row(result_dim, w_right - last_update[u], - GET_2D_ROW(node_summary, result_dim, u), - GET_3D_ROW(result, num_nodes, result_dim, window_index, u)); - last_update[u] = w_right; - } - window_index++; - } - - t_left = t_right; - } -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - tsk_safe_free(state); - tsk_safe_free(node_summary); - tsk_safe_free(last_update); - return ret; -} - -static void -span_normalise( - tsk_size_t num_windows, const double *windows, tsk_size_t row_size, double *array) -{ - tsk_size_t window_index, k; - double span, *row; - - for (window_index = 0; window_index < num_windows; window_index++) { - span = windows[window_index + 1] - windows[window_index]; - row = GET_2D_ROW(array, row_size, window_index); - for (k = 0; k < row_size; k++) { - row[k] /= span; - } - } -} - -typedef struct { - general_stat_func_t *f; - void *f_params; - double *total_weight; - double *total_minus_state; - double *result_tmp; -} unpolarised_summary_func_args; - -static int -unpolarised_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - int ret = 0; - unpolarised_summary_func_args *upargs = (unpolarised_summary_func_args *) params; - const double *total_weight = upargs->total_weight; - double *total_minus_state = upargs->total_minus_state; - double *result_tmp = upargs->result_tmp; - tsk_size_t k, m; - - ret = upargs->f(state_dim, state, result_dim, result, upargs->f_params); - if (ret != 0) { - goto out; - } - for (k = 0; k < state_dim; k++) { - total_minus_state[k] = total_weight[k] - state[k]; - } - ret = upargs->f( - state_dim, total_minus_state, result_dim, result_tmp, upargs->f_params); - if (ret != 0) { - goto out; - } - for (m = 0; m < result_dim; m++) { - result[m] += result_tmp[m]; - } -out: - return ret; -} - -/* Abstracts the running of node and branch stats where the summary function - * is run twice when non-polarised. We replace the call to the input summary - * function with a call of the required form when non-polarised, simplifying - * the implementation and memory management for the node and branch stats. - */ -static int -tsk_polarisable_func_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, - void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, - double *result) -{ - int ret = 0; - bool stat_branch = !!(options & TSK_STAT_BRANCH); - bool polarised = options & TSK_STAT_POLARISED; - general_stat_func_t *wrapped_f = f; - void *wrapped_f_params = f_params; - const double *weight_u; - unpolarised_summary_func_args upargs; - tsk_size_t j, k; - - tsk_memset(&upargs, 0, sizeof(upargs)); - if (!polarised) { - upargs.f = f; - upargs.f_params = f_params; - upargs.total_weight = tsk_calloc(state_dim, sizeof(double)); - upargs.total_minus_state = tsk_calloc(state_dim, sizeof(double)); - upargs.result_tmp = tsk_calloc(result_dim, sizeof(double)); - - if (upargs.total_weight == NULL || upargs.total_minus_state == NULL - || upargs.result_tmp == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* Compute the total weight */ - for (j = 0; j < self->num_samples; j++) { - weight_u = GET_2D_ROW(sample_weights, state_dim, j); - for (k = 0; k < state_dim; k++) { - upargs.total_weight[k] += weight_u[k]; - } - } - - wrapped_f = unpolarised_summary_func; - wrapped_f_params = &upargs; - } - - if (stat_branch) { - ret = tsk_treeseq_branch_general_stat(self, state_dim, sample_weights, - result_dim, wrapped_f, wrapped_f_params, num_windows, windows, options, - result); - } else { - ret = tsk_treeseq_node_general_stat(self, state_dim, sample_weights, result_dim, - wrapped_f, wrapped_f_params, num_windows, windows, options, result); - } -out: - tsk_safe_free(upargs.total_weight); - tsk_safe_free(upargs.total_minus_state); - tsk_safe_free(upargs.result_tmp); - return ret; -} - -int -tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, - void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, - double *result) -{ - int ret = 0; - bool stat_site = !!(options & TSK_STAT_SITE); - bool stat_branch = !!(options & TSK_STAT_BRANCH); - bool stat_node = !!(options & TSK_STAT_NODE); - double default_windows[] = { 0, self->tables->sequence_length }; - tsk_size_t row_size; - - /* If no mode is specified, we default to site mode */ - if (!(stat_site || stat_branch || stat_node)) { - stat_site = true; - } - /* It's an error to specify more than one mode */ - if (stat_site + stat_branch + stat_node > 1) { - ret = TSK_ERR_MULTIPLE_STAT_MODES; - goto out; - } - - if (state_dim < 1) { - ret = TSK_ERR_BAD_STATE_DIMS; - goto out; - } - if (result_dim < 1) { - ret = TSK_ERR_BAD_RESULT_DIMS; - goto out; - } - if (windows == NULL) { - num_windows = 1; - windows = default_windows; - } else { - ret = tsk_treeseq_check_windows(self, num_windows, windows); - if (ret != 0) { - goto out; - } - } - - if (stat_site) { - ret = tsk_treeseq_site_general_stat(self, state_dim, sample_weights, result_dim, - f, f_params, num_windows, windows, options, result); - } else { - ret = tsk_polarisable_func_general_stat(self, state_dim, sample_weights, - result_dim, f, f_params, num_windows, windows, options, result); - } - - if (options & TSK_STAT_SPAN_NORMALISE) { - row_size = result_dim; - if (stat_node) { - row_size = result_dim * tsk_treeseq_get_num_nodes(self); - } - span_normalise(num_windows, windows, row_size, result); - } - -out: - return ret; -} - -static int -check_set_indexes( - tsk_size_t num_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes) -{ - int ret = 0; - tsk_size_t j; - - for (j = 0; j < num_set_indexes; j++) { - if (set_indexes[j] < 0 || set_indexes[j] >= (tsk_id_t) num_sets) { - ret = TSK_ERR_BAD_SAMPLE_SET_INDEX; - goto out; - } - } -out: - return ret; -} - -static int -tsk_treeseq_check_sample_sets(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets) -{ - int ret = 0; - tsk_size_t j, k, l; - const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; - tsk_id_t u, sample_index; - - if (num_sample_sets == 0) { - ret = TSK_ERR_INSUFFICIENT_SAMPLE_SETS; - goto out; - } - j = 0; - for (k = 0; k < num_sample_sets; k++) { - if (sample_set_sizes[k] == 0) { - ret = TSK_ERR_EMPTY_SAMPLE_SET; - goto out; - } - for (l = 0; l < sample_set_sizes[k]; l++) { - u = sample_sets[j]; - if (u < 0 || u >= num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - sample_index = self->sample_index_map[u]; - if (sample_index == TSK_NULL) { - ret = TSK_ERR_BAD_SAMPLES; - goto out; - } - j++; - } - } -out: - return ret; -} - -typedef struct { - tsk_size_t num_samples; -} weight_stat_params_t; - -typedef struct { - tsk_size_t num_samples; - tsk_size_t num_covariates; - double *V; -} covariates_stat_params_t; - -typedef struct { - const tsk_id_t *sample_sets; - tsk_size_t num_sample_sets; - const tsk_size_t *sample_set_sizes; - const tsk_id_t *set_indexes; -} sample_count_stat_params_t; - -static int -tsk_treeseq_sample_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - const tsk_size_t num_samples = self->num_samples; - tsk_size_t j, k, l; - tsk_id_t u, sample_index; - double *weights = NULL; - double *weight_row; - sample_count_stat_params_t args = { .sample_sets = sample_sets, - .num_sample_sets = num_sample_sets, - .sample_set_sizes = sample_set_sizes, - .set_indexes = set_indexes }; - - ret = tsk_treeseq_check_sample_sets( - self, num_sample_sets, sample_set_sizes, sample_sets); - if (ret != 0) { - goto out; - } - weights = tsk_calloc(num_samples * num_sample_sets, sizeof(*weights)); - if (weights == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - j = 0; - for (k = 0; k < num_sample_sets; k++) { - for (l = 0; l < sample_set_sizes[k]; l++) { - u = sample_sets[j]; - sample_index = self->sample_index_map[u]; - weight_row = GET_2D_ROW(weights, num_sample_sets, sample_index); - if (weight_row[k] != 0) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - weight_row[k] = 1; - j++; - } - } - ret = tsk_treeseq_general_stat(self, num_sample_sets, weights, result_dim, f, &args, - num_windows, windows, options, result); -out: - tsk_safe_free(weights); - return ret; -} - -/*********************************** - * Allele frequency spectrum - ***********************************/ - -static inline void -fold(tsk_size_t *restrict coordinate, const tsk_size_t *restrict dims, - tsk_size_t num_dims) -{ - tsk_size_t k; - double n = 0; - int s = 0; - - for (k = 0; k < num_dims; k++) { - tsk_bug_assert(coordinate[k] < dims[k]); - n += (double) dims[k] - 1; - s += (int) coordinate[k]; - } - n /= 2; - k = num_dims; - while (s == n && k > 0) { - k--; - n -= ((double) (dims[k] - 1)) / 2; - s -= (int) coordinate[k]; - } - if (s > n) { - for (k = 0; k < num_dims; k++) { - s = (int) (dims[k] - 1 - coordinate[k]); - tsk_bug_assert(s >= 0); - coordinate[k] = (tsk_size_t) s; - } - } -} - -static int -tsk_treeseq_update_site_afs(const tsk_treeseq_t *self, const tsk_site_t *site, - const double *total_counts, const double *counts, tsk_size_t num_sample_sets, - tsk_size_t window_index, tsk_size_t *result_dims, tsk_flags_t options, - double *result) -{ - int ret = 0; - tsk_size_t afs_size; - tsk_size_t k, allele, num_alleles, all_samples; - double increment, *afs, *allele_counts, *allele_count; - tsk_size_t *coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); - bool polarised = !!(options & TSK_STAT_POLARISED); - const tsk_size_t K = num_sample_sets + 1; - - if (coordinate == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = get_allele_weights( - site, counts, K, total_counts, &num_alleles, &allele_counts); - if (ret != 0) { - goto out; - } - - afs_size = result_dims[num_sample_sets]; - afs = result + afs_size * window_index; - - increment = polarised ? 1 : 0.5; - /* Sum over the allele weights. Skip the ancestral state if polarised. */ - for (allele = polarised ? 1 : 0; allele < num_alleles; allele++) { - allele_count = GET_2D_ROW(allele_counts, K, allele); - all_samples = (tsk_size_t) allele_count[num_sample_sets]; - if (all_samples > 0 && all_samples < self->num_samples) { - for (k = 0; k < num_sample_sets; k++) { - coordinate[k] = (tsk_size_t) allele_count[k]; - } - if (!polarised) { - fold(coordinate, result_dims, num_sample_sets); - } - increment_nd_array_value( - afs, num_sample_sets, result_dims, coordinate, increment); - } - } -out: - tsk_safe_free(coordinate); - tsk_safe_free(allele_counts); - return ret; -} - -static int -tsk_treeseq_site_allele_frequency_spectrum(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, double *counts, - tsk_size_t num_windows, const double *windows, tsk_size_t *result_dims, - tsk_flags_t options, double *result) -{ - int ret = 0; - tsk_id_t u, v; - tsk_size_t tree_site, tree_index, window_index; - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double sequence_length = self->tables->sequence_length; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - tsk_site_t *site; - tsk_id_t tj, tk, h; - tsk_size_t j; - const tsk_size_t K = num_sample_sets + 1; - double t_left, t_right; - double *total_counts = tsk_malloc((1 + num_sample_sets) * sizeof(*total_counts)); - - if (parent == NULL || total_counts == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - - for (j = 0; j < num_sample_sets; j++) { - total_counts[j] = (double) sample_set_sizes[j]; - } - total_counts[num_sample_sets] = (double) self->num_samples; - - /* Iterate over the trees */ - tj = 0; - tk = 0; - t_left = 0; - tree_index = 0; - window_index = 0; - while (tj < num_edges || t_left < sequence_length) { - while (tk < num_edges && edge_right[O[tk]] == t_left) { - h = O[tk]; - tk++; - u = edge_child[h]; - v = edge_parent[h]; - while (v != TSK_NULL) { - update_state(counts, K, v, u, -1); - v = parent[v]; - } - parent[u] = TSK_NULL; - } - - while (tj < num_edges && edge_left[I[tj]] == t_left) { - h = I[tj]; - tj++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - while (v != TSK_NULL) { - update_state(counts, K, v, u, +1); - v = parent[v]; - } - } - t_right = sequence_length; - if (tj < num_edges) { - t_right = TSK_MIN(t_right, edge_left[I[tj]]); - } - if (tk < num_edges) { - t_right = TSK_MIN(t_right, edge_right[O[tk]]); - } - - /* Update the sites */ - for (tree_site = 0; tree_site < self->tree_sites_length[tree_index]; - tree_site++) { - site = self->tree_sites[tree_index] + tree_site; - while (windows[window_index + 1] <= site->position) { - window_index++; - tsk_bug_assert(window_index < num_windows); - } - ret = tsk_treeseq_update_site_afs(self, site, total_counts, counts, - num_sample_sets, window_index, result_dims, options, result); - if (ret != 0) { - goto out; - } - tsk_bug_assert(windows[window_index] <= site->position); - tsk_bug_assert(site->position < windows[window_index + 1]); - } - tree_index++; - t_left = t_right; - } -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - tsk_safe_free(total_counts); - return ret; -} - -static int TSK_WARN_UNUSED -tsk_treeseq_update_branch_afs(const tsk_treeseq_t *self, tsk_id_t u, double right, - const double *restrict branch_length, double *restrict last_update, - const double *counts, tsk_size_t num_sample_sets, tsk_size_t window_index, - const tsk_size_t *result_dims, tsk_flags_t options, double *result) -{ - int ret = 0; - tsk_size_t afs_size; - tsk_size_t k; - double *afs; - tsk_size_t *coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); - bool polarised = !!(options & TSK_STAT_POLARISED); - const double *count_row = GET_2D_ROW(counts, num_sample_sets + 1, u); - double x = (right - last_update[u]) * branch_length[u]; - const tsk_size_t all_samples = (tsk_size_t) count_row[num_sample_sets]; - - if (coordinate == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - if (0 < all_samples && all_samples < self->num_samples) { - if (!polarised) { - x *= 0.5; - } - afs_size = result_dims[num_sample_sets]; - afs = result + afs_size * window_index; - for (k = 0; k < num_sample_sets; k++) { - coordinate[k] = (tsk_size_t) count_row[k]; - } - if (!polarised) { - fold(coordinate, result_dims, num_sample_sets); - } - increment_nd_array_value(afs, num_sample_sets, result_dims, coordinate, x); - } - last_update[u] = right; -out: - tsk_safe_free(coordinate); - return ret; -} - -static int -tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, double *counts, tsk_size_t num_windows, - const double *windows, const tsk_size_t *result_dims, tsk_flags_t options, - double *result) -{ - int ret = 0; - tsk_id_t u, v; - tsk_size_t window_index; - tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; - const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; - const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; - const double *restrict edge_left = self->tables->edges.left; - const double *restrict edge_right = self->tables->edges.right; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const double *restrict node_time = self->tables->nodes.time; - const double sequence_length = self->tables->sequence_length; - tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); - double *restrict last_update = tsk_calloc(num_nodes, sizeof(*last_update)); - double *restrict branch_length = tsk_calloc(num_nodes, sizeof(*branch_length)); - tsk_id_t tj, tk, h; - double t_left, t_right, w_right; - const tsk_size_t K = num_sample_sets + 1; - - if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { - ret = TSK_ERR_TIME_UNCALIBRATED; - goto out; - } - - if (parent == NULL || last_update == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); - - /* Iterate over the trees */ - tj = 0; - tk = 0; - t_left = 0; - window_index = 0; - while (tj < num_edges || t_left < sequence_length) { - tsk_bug_assert(window_index < num_windows); - while (tk < num_edges && edge_right[O[tk]] == t_left) { - h = O[tk]; - tk++; - u = edge_child[h]; - v = edge_parent[h]; - ret = tsk_treeseq_update_branch_afs(self, u, t_left, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, options, - result); - if (ret != 0) { - goto out; - } - while (v != TSK_NULL) { - ret = tsk_treeseq_update_branch_afs(self, v, t_left, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, - options, result); - if (ret != 0) { - goto out; - } - update_state(counts, K, v, u, -1); - v = parent[v]; - } - parent[u] = TSK_NULL; - branch_length[u] = 0; - } - - while (tj < num_edges && edge_left[I[tj]] == t_left) { - h = I[tj]; - tj++; - u = edge_child[h]; - v = edge_parent[h]; - parent[u] = v; - branch_length[u] = node_time[v] - node_time[u]; - while (v != TSK_NULL) { - ret = tsk_treeseq_update_branch_afs(self, v, t_left, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, - options, result); - if (ret != 0) { - goto out; - } - update_state(counts, K, v, u, +1); - v = parent[v]; - } - } - - t_right = sequence_length; - if (tj < num_edges) { - t_right = TSK_MIN(t_right, edge_left[I[tj]]); - } - if (tk < num_edges) { - t_right = TSK_MIN(t_right, edge_right[O[tk]]); - } - - while (window_index < num_windows && windows[window_index + 1] <= t_right) { - w_right = windows[window_index + 1]; - /* Flush the contributions of all nodes to the current window */ - for (u = 0; u < (tsk_id_t) num_nodes; u++) { - tsk_bug_assert(last_update[u] < w_right); - ret = tsk_treeseq_update_branch_afs(self, u, w_right, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, - options, result); - if (ret != 0) { - goto out; - } - } - window_index++; - } - - t_left = t_right; - } -out: - /* Can't use msp_safe_free here because of restrict */ - if (parent != NULL) { - free(parent); - } - if (last_update != NULL) { - free(last_update); - } - if (branch_length != NULL) { - free(branch_length); - } - return ret; -} - -int -tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result) -{ - int ret = 0; - bool stat_site = !!(options & TSK_STAT_SITE); - bool stat_branch = !!(options & TSK_STAT_BRANCH); - bool stat_node = !!(options & TSK_STAT_NODE); - double default_windows[] = { 0, self->tables->sequence_length }; - const tsk_size_t num_nodes = self->tables->nodes.num_rows; - const tsk_size_t K = num_sample_sets + 1; - tsk_size_t j, k, l, afs_size; - tsk_id_t u; - tsk_size_t *result_dims = NULL; - /* These counts should really be ints, but we use doubles so that we can - * reuse code from the general_stats code paths. */ - double *counts = NULL; - double *count_row; - - if (stat_node) { - ret = TSK_ERR_UNSUPPORTED_STAT_MODE; - goto out; - } - /* If no mode is specified, we default to site mode */ - if (!(stat_site || stat_branch)) { - stat_site = true; - } - /* It's an error to specify more than one mode */ - if (stat_site + stat_branch > 1) { - ret = TSK_ERR_MULTIPLE_STAT_MODES; - goto out; - } - if (windows == NULL) { - num_windows = 1; - windows = default_windows; - } else { - ret = tsk_treeseq_check_windows(self, num_windows, windows); - if (ret != 0) { - goto out; - } - } - ret = tsk_treeseq_check_sample_sets( - self, num_sample_sets, sample_set_sizes, sample_sets); - if (ret != 0) { - goto out; - } - - /* the last element of result_dims stores the total size of the dimenensions */ - result_dims = tsk_malloc((num_sample_sets + 1) * sizeof(*result_dims)); - counts = tsk_calloc(num_nodes * K, sizeof(*counts)); - if (counts == NULL || result_dims == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - afs_size = 1; - j = 0; - for (k = 0; k < num_sample_sets; k++) { - result_dims[k] = 1 + sample_set_sizes[k]; - afs_size *= result_dims[k]; - for (l = 0; l < sample_set_sizes[k]; l++) { - u = sample_sets[j]; - count_row = GET_2D_ROW(counts, K, u); - if (count_row[k] != 0) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - count_row[k] = 1; - j++; - } - } - for (j = 0; j < self->num_samples; j++) { - u = self->samples[j]; - count_row = GET_2D_ROW(counts, K, u); - count_row[num_sample_sets] = 1; - } - result_dims[num_sample_sets] = (tsk_size_t) afs_size; - - tsk_memset(result, 0, num_windows * afs_size * sizeof(*result)); - if (stat_site) { - ret = tsk_treeseq_site_allele_frequency_spectrum(self, num_sample_sets, - sample_set_sizes, counts, num_windows, windows, result_dims, options, - result); - } else { - ret = tsk_treeseq_branch_allele_frequency_spectrum(self, num_sample_sets, counts, - num_windows, windows, result_dims, options, result); - } - - if (options & TSK_STAT_SPAN_NORMALISE) { - span_normalise(num_windows, windows, afs_size, result); - } -out: - tsk_safe_free(counts); - tsk_safe_free(result_dims); - return ret; -} - -/*********************************** - * One way stats - ***********************************/ - -static int -diversity_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double n; - tsk_size_t j; - - for (j = 0; j < state_dim; j++) { - n = (double) args.sample_set_sizes[j]; - result[j] = x[j] * (n - x[j]) / (n * (n - 1)); - } - return 0; -} - -int -tsk_treeseq_diversity(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) -{ - return tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_sample_sets, NULL, diversity_summary_func, num_windows, windows, - options, result); -} - -static int -trait_covariance_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) -{ - weight_stat_params_t args = *(weight_stat_params_t *) params; - const double n = (double) args.num_samples; - const double *x = state; - tsk_size_t j; - - for (j = 0; j < state_dim; j++) { - result[j] = (x[j] * x[j]) / (2 * (n - 1) * (n - 1)); - } - return 0; -} - -int -tsk_treeseq_trait_covariance(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result) -{ - tsk_size_t num_samples = self->num_samples; - tsk_size_t j, k; - int ret; - const double *row; - double *new_row; - double *means = tsk_calloc(num_weights, sizeof(double)); - double *new_weights = tsk_malloc((num_weights + 1) * num_samples * sizeof(double)); - weight_stat_params_t args = { num_samples = self->num_samples }; - - if (new_weights == NULL || means == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - // center weights - for (j = 0; j < num_samples; j++) { - row = GET_2D_ROW(weights, num_weights, j); - for (k = 0; k < num_weights; k++) { - means[k] += row[k]; - } - } - for (k = 0; k < num_weights; k++) { - means[k] /= (double) num_samples; - } - for (j = 0; j < num_samples; j++) { - row = GET_2D_ROW(weights, num_weights, j); - new_row = GET_2D_ROW(new_weights, num_weights, j); - for (k = 0; k < num_weights; k++) { - new_row[k] = row[k] - means[k]; - } - } - - ret = tsk_treeseq_general_stat(self, num_weights, new_weights, num_weights, - trait_covariance_summary_func, &args, num_windows, windows, options, result); - -out: - tsk_safe_free(means); - tsk_safe_free(new_weights); - return ret; -} - -static int -trait_correlation_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) -{ - weight_stat_params_t args = *(weight_stat_params_t *) params; - const double n = (double) args.num_samples; - const double *x = state; - double p; - tsk_size_t j; - - p = x[state_dim - 1]; - for (j = 0; j < state_dim - 1; j++) { - if ((p > 0.0) && (p < 1.0)) { - result[j] = (x[j] * x[j]) / (2 * (p * (1 - p)) * n * (n - 1)); - } else { - result[j] = 0.0; - } - } - return 0; -} - -int -tsk_treeseq_trait_correlation(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result) -{ - tsk_size_t num_samples = self->num_samples; - tsk_size_t j, k; - int ret; - double *means = tsk_calloc(num_weights, sizeof(double)); - double *meansqs = tsk_calloc(num_weights, sizeof(double)); - double *sds = tsk_calloc(num_weights, sizeof(double)); - const double *row; - double *new_row; - double *new_weights = tsk_malloc((num_weights + 1) * num_samples * sizeof(double)); - weight_stat_params_t args = { num_samples = self->num_samples }; - - if (new_weights == NULL || means == NULL || meansqs == NULL || sds == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - if (num_weights < 1) { - ret = TSK_ERR_BAD_STATE_DIMS; - goto out; - } - - // center and scale weights - for (j = 0; j < num_samples; j++) { - row = GET_2D_ROW(weights, num_weights, j); - for (k = 0; k < num_weights; k++) { - means[k] += row[k]; - meansqs[k] += row[k] * row[k]; - } - } - for (k = 0; k < num_weights; k++) { - means[k] /= (double) num_samples; - meansqs[k] -= means[k] * means[k] * (double) num_samples; - meansqs[k] /= (double) (num_samples - 1); - sds[k] = sqrt(meansqs[k]); - } - for (j = 0; j < num_samples; j++) { - row = GET_2D_ROW(weights, num_weights, j); - new_row = GET_2D_ROW(new_weights, num_weights + 1, j); - for (k = 0; k < num_weights; k++) { - new_row[k] = (row[k] - means[k]) / sds[k]; - } - // set final row to 1/n to compute frequency - new_row[num_weights] = 1.0 / (double) num_samples; - } - - ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights, num_weights, - trait_correlation_summary_func, &args, num_windows, windows, options, result); - -out: - tsk_safe_free(means); - tsk_safe_free(meansqs); - tsk_safe_free(sds); - tsk_safe_free(new_weights); - return ret; -} - -static int -trait_linear_model_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - covariates_stat_params_t args = *(covariates_stat_params_t *) params; - const double num_samples = (double) args.num_samples; - const tsk_size_t k = args.num_covariates; - const double *V = args.V; - ; - const double *x = state; - const double *v; - double m, a, denom, z; - tsk_size_t i, j; - // x[0], ..., x[result_dim - 1] contains the traits, W - // x[result_dim], ..., x[state_dim - 2] contains the covariates, Z - // x[state_dim - 1] has the number of samples below the node - - m = x[state_dim - 1]; - for (i = 0; i < result_dim; i++) { - if ((m > 0.0) && (m < num_samples)) { - v = GET_2D_ROW(V, k, i); - a = x[i]; - denom = m; - for (j = 0; j < k; j++) { - z = x[result_dim + j]; - a -= z * v[j]; - denom -= z * z; - } - // denom is the length of projection of the trait onto the subspace - // spanned by the covariates, so if it is zero then the system is - // singular and the solution is nonunique. This numerical tolerance - // could be smaller without hitting floating-point error, but being - // a tiny bit conservative about when the trait is almost in the - // span of the covariates is probably good. - if (denom < 1e-8) { - result[i] = 0.0; - } else { - result[i] = (a * a) / (2 * denom * denom); - } - } else { - result[i] = 0.0; - } - } - return 0; -} - -int -tsk_treeseq_trait_linear_model(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_covariates, const double *covariates, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) -{ - tsk_size_t num_samples = self->num_samples; - tsk_size_t i, j, k; - int ret; - const double *w, *z; - double *v, *new_row; - double *V = tsk_calloc(num_covariates * num_weights, sizeof(double)); - double *new_weights - = tsk_malloc((num_weights + num_covariates + 1) * num_samples * sizeof(double)); - - covariates_stat_params_t args - = { .num_samples = self->num_samples, .num_covariates = num_covariates, .V = V }; - - // We assume that the covariates have been *already standardised*, - // so that (a) 1 is in the span of the columns, and - // (b) their crossproduct is the identity. - // We could do this instead here with gsl linalg. - - if (new_weights == NULL || V == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - if (num_weights < 1) { - ret = TSK_ERR_BAD_STATE_DIMS; - goto out; - } - - // V = weights^T (matrix mult) covariates - for (k = 0; k < num_samples; k++) { - w = GET_2D_ROW(weights, num_weights, k); - z = GET_2D_ROW(covariates, num_covariates, k); - for (i = 0; i < num_weights; i++) { - v = GET_2D_ROW(V, num_covariates, i); - for (j = 0; j < num_covariates; j++) { - v[j] += w[i] * z[j]; - } - } - } - - for (k = 0; k < num_samples; k++) { - w = GET_2D_ROW(weights, num_weights, k); - z = GET_2D_ROW(covariates, num_covariates, k); - new_row = GET_2D_ROW(new_weights, num_covariates + num_weights + 1, k); - for (i = 0; i < num_weights; i++) { - new_row[i] = w[i]; - } - for (i = 0; i < num_covariates; i++) { - new_row[i + num_weights] = z[i]; - } - // set final row to 1 to count alleles - new_row[num_weights + num_covariates] = 1.0; - } - - ret = tsk_treeseq_general_stat(self, num_weights + num_covariates + 1, new_weights, - num_weights, trait_linear_model_summary_func, &args, num_windows, windows, - options, result); - -out: - tsk_safe_free(V); - tsk_safe_free(new_weights); - return ret; -} - -static int -segregating_sites_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double n; - tsk_size_t j; - - // this works because sum_{i=1}^k (1-p_i) = k-1 - for (j = 0; j < state_dim; j++) { - n = (double) args.sample_set_sizes[j]; - result[j] = (x[j] > 0) * (1 - x[j] / n); - } - return 0; -} - -int -tsk_treeseq_segregating_sites(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) -{ - return tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_sample_sets, NULL, segregating_sites_summary_func, num_windows, - windows, options, result); -} - -static int -Y1_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, denom, numer; - tsk_size_t i; - - for (i = 0; i < result_dim; i++) { - ni = (double) args.sample_set_sizes[i]; - denom = ni * (ni - 1) * (ni - 2); - numer = x[i] * (ni - x[i]) * (ni - x[i] - 1); - result[i] = numer / denom; - } - return 0; -} - -int -tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) -{ - return tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_sample_sets, NULL, Y1_summary_func, num_windows, windows, - options, result); -} - -/*********************************** - * Two way stats - ***********************************/ - -static int -check_sample_stat_inputs(tsk_size_t num_sample_sets, tsk_size_t tuple_size, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples) -{ - int ret = 0; - - if (num_sample_sets < tuple_size) { - ret = TSK_ERR_INSUFFICIENT_SAMPLE_SETS; - goto out; - } - if (num_index_tuples < 1) { - ret = TSK_ERR_INSUFFICIENT_INDEX_TUPLES; - goto out; - } - ret = check_set_indexes( - num_sample_sets, tuple_size * num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } -out: - return ret; -} - -static int -divergence_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, nj, denom; - tsk_id_t i, j; - tsk_size_t k; - - for (k = 0; k < result_dim; k++) { - i = args.set_indexes[2 * k]; - j = args.set_indexes[2 * k + 1]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - denom = ni * (nj - (i == j)); - result[k] = x[i] * (nj - x[j]) / denom; - } - return 0; -} - -int -tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, divergence_summary_func, - num_windows, windows, options, result); -out: - return ret; -} - -static int -genetic_relatedness_summary_func(tsk_size_t state_dim, const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - tsk_id_t i, j; - tsk_size_t k; - double sumx = 0; - double sumn = 0; - double meanx, ni, nj; - - for (k = 0; k < state_dim; k++) { - sumx += x[k]; - sumn += (double) args.sample_set_sizes[k]; - } - - meanx = sumx / sumn; - for (k = 0; k < result_dim; k++) { - i = args.set_indexes[2 * k]; - j = args.set_indexes[2 * k + 1]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - result[k] = (x[i] - ni * meanx) * (x[j] - nj * meanx) / 2; - } - return 0; -} - -int -tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, genetic_relatedness_summary_func, - num_windows, windows, options, result); -out: - return ret; -} - -static int -Y2_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, nj, denom; - tsk_id_t i, j; - tsk_size_t k; - - for (k = 0; k < result_dim; k++) { - i = args.set_indexes[2 * k]; - j = args.set_indexes[2 * k + 1]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - denom = ni * nj * (nj - 1); - result[k] = x[i] * (nj - x[j]) * (nj - x[j] - 1) / denom; - } - return 0; -} - -int -tsk_treeseq_Y2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, Y2_summary_func, num_windows, - windows, options, result); -out: - return ret; -} - -static int -f2_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, nj, denom, numer; - tsk_id_t i, j; - tsk_size_t k; - - for (k = 0; k < result_dim; k++) { - i = args.set_indexes[2 * k]; - j = args.set_indexes[2 * k + 1]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - denom = ni * (ni - 1) * nj * (nj - 1); - numer = x[i] * (x[i] - 1) * (nj - x[j]) * (nj - x[j] - 1) - - x[i] * (ni - x[i]) * (nj - x[j]) * x[j]; - result[k] = numer / denom; - } - return 0; -} - -int -tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, f2_summary_func, num_windows, - windows, options, result); -out: - return ret; -} - -/*********************************** - * Three way stats - ***********************************/ - -static int -Y3_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, nj, nk, denom, numer; - tsk_id_t i, j, k; - tsk_size_t tuple_index; - - for (tuple_index = 0; tuple_index < result_dim; tuple_index++) { - i = args.set_indexes[3 * tuple_index]; - j = args.set_indexes[3 * tuple_index + 1]; - k = args.set_indexes[3 * tuple_index + 2]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - nk = (double) args.sample_set_sizes[k]; - denom = ni * nj * nk; - numer = x[i] * (nj - x[j]) * (nk - x[k]); - result[tuple_index] = numer / denom; - } - return 0; -} - -int -tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 3, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, Y3_summary_func, num_windows, - windows, options, result); -out: - return ret; -} - -static int -f3_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, nj, nk, denom, numer; - tsk_id_t i, j, k; - tsk_size_t tuple_index; - - for (tuple_index = 0; tuple_index < result_dim; tuple_index++) { - i = args.set_indexes[3 * tuple_index]; - j = args.set_indexes[3 * tuple_index + 1]; - k = args.set_indexes[3 * tuple_index + 2]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - nk = (double) args.sample_set_sizes[k]; - denom = ni * (ni - 1) * nj * nk; - numer = x[i] * (x[i] - 1) * (nj - x[j]) * (nk - x[k]) - - x[i] * (ni - x[i]) * (nj - x[j]) * x[k]; - result[tuple_index] = numer / denom; - } - return 0; -} - -int -tsk_treeseq_f3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 3, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, f3_summary_func, num_windows, - windows, options, result); -out: - return ret; -} - -/*********************************** - * Four way stats - ***********************************/ - -static int -f4_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, - tsk_size_t result_dim, double *result, void *params) -{ - sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; - const double *x = state; - double ni, nj, nk, nl, denom, numer; - tsk_id_t i, j, k, l; - tsk_size_t tuple_index; - - for (tuple_index = 0; tuple_index < result_dim; tuple_index++) { - i = args.set_indexes[4 * tuple_index]; - j = args.set_indexes[4 * tuple_index + 1]; - k = args.set_indexes[4 * tuple_index + 2]; - l = args.set_indexes[4 * tuple_index + 3]; - ni = (double) args.sample_set_sizes[i]; - nj = (double) args.sample_set_sizes[j]; - nk = (double) args.sample_set_sizes[k]; - nl = (double) args.sample_set_sizes[l]; - denom = ni * nj * nk * nl; - numer = x[i] * x[k] * (nj - x[j]) * (nl - x[l]) - - x[i] * x[l] * (nj - x[j]) * (nk - x[k]); - result[tuple_index] = numer / denom; - } - return 0; -} - -int -tsk_treeseq_f4(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result) -{ - int ret = 0; - ret = check_sample_stat_inputs(num_sample_sets, 4, num_index_tuples, index_tuples); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_index_tuples, index_tuples, f4_summary_func, num_windows, - windows, options, result); -out: - return ret; -} - -/* Error-raising getter functions */ - -int TSK_WARN_UNUSED -tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node) -{ - return tsk_node_table_get_row(&self->tables->nodes, index, node); -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge) -{ - return tsk_edge_table_get_row(&self->tables->edges, index, edge); -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_migration( - const tsk_treeseq_t *self, tsk_id_t index, tsk_migration_t *migration) -{ - return tsk_migration_table_get_row(&self->tables->migrations, index, migration); -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_mutation( - const tsk_treeseq_t *self, tsk_id_t index, tsk_mutation_t *mutation) -{ - int ret = 0; - - ret = tsk_mutation_table_get_row(&self->tables->mutations, index, mutation); - if (ret != 0) { - goto out; - } - mutation->edge = self->site_mutations_mem[index].edge; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_site(const tsk_treeseq_t *self, tsk_id_t index, tsk_site_t *site) -{ - int ret = 0; - - ret = tsk_site_table_get_row(&self->tables->sites, index, site); - if (ret != 0) { - goto out; - } - site->mutations = self->site_mutations[index]; - site->mutations_length = self->site_mutations_length[index]; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_individual( - const tsk_treeseq_t *self, tsk_id_t index, tsk_individual_t *individual) -{ - int ret = 0; - - ret = tsk_individual_table_get_row(&self->tables->individuals, index, individual); - if (ret != 0) { - goto out; - } - individual->nodes = self->individual_nodes[index]; - individual->nodes_length = self->individual_nodes_length[index]; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_population( - const tsk_treeseq_t *self, tsk_id_t index, tsk_population_t *population) -{ - return tsk_population_table_get_row(&self->tables->populations, index, population); -} - -int TSK_WARN_UNUSED -tsk_treeseq_get_provenance( - const tsk_treeseq_t *self, tsk_id_t index, tsk_provenance_t *provenance) -{ - return tsk_provenance_table_get_row(&self->tables->provenances, index, provenance); -} - -int TSK_WARN_UNUSED -tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, - tsk_size_t num_samples, tsk_flags_t options, tsk_treeseq_t *output, - tsk_id_t *node_map) -{ - int ret = 0; - tsk_table_collection_t *tables = tsk_malloc(sizeof(*tables)); - - if (tables == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_treeseq_copy_tables(self, tables, 0); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_simplify(tables, samples, num_samples, options, node_map); - if (ret != 0) { - goto out; - } - ret = tsk_treeseq_init( - output, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); - /* Once tsk_tree_init has returned ownership of tables is transferred */ - tables = NULL; -out: - if (tables != NULL) { - tsk_table_collection_free(tables); - tsk_safe_free(tables); - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t flags, - tsk_id_t population, const char *metadata, tsk_size_t metadata_length, - tsk_flags_t TSK_UNUSED(options), tsk_treeseq_t *output) -{ - int ret = 0; - tsk_table_collection_t *tables = tsk_malloc(sizeof(*tables)); - const double *restrict node_time = self->tables->nodes.time; - const tsk_size_t num_edges = self->tables->edges.num_rows; - const tsk_size_t num_mutations = self->tables->mutations.num_rows; - tsk_id_t *split_edge = tsk_malloc(num_edges * sizeof(*split_edge)); - tsk_id_t j, u, mapped_node, ret_id; - double mutation_time; - tsk_edge_t edge; - tsk_mutation_t mutation; - tsk_bookmark_t sort_start; - - memset(output, 0, sizeof(*output)); - if (split_edge == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_treeseq_copy_tables(self, tables, 0); - if (ret != 0) { - goto out; - } - if (tables->migrations.num_rows > 0) { - ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; - goto out; - } - /* We could catch this below in add_row, but it's simpler to guarantee - * that we always catch the error in corner cases where the values - * aren't used. */ - if (population < -1 || population >= (tsk_id_t) self->tables->populations.num_rows) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; - } - if (!tsk_isfinite(time)) { - ret = TSK_ERR_TIME_NONFINITE; - goto out; - } - - tsk_edge_table_clear(&tables->edges); - tsk_memset(split_edge, TSK_NULL, num_edges * sizeof(*split_edge)); - - for (j = 0; j < (tsk_id_t) num_edges; j++) { - /* Would prefer to use tsk_edge_table_get_row_unsafe, but it's - * currently static to tables.c */ - ret = tsk_edge_table_get_row(&self->tables->edges, j, &edge); - tsk_bug_assert(ret == 0); - if (node_time[edge.child] < time && time < node_time[edge.parent]) { - u = tsk_node_table_add_row(&tables->nodes, flags, time, population, TSK_NULL, - metadata, metadata_length); - if (u < 0) { - ret = (int) u; - goto out; - } - ret_id = tsk_edge_table_add_row(&tables->edges, edge.left, edge.right, u, - edge.child, edge.metadata, edge.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - edge.child = u; - split_edge[j] = u; - } - ret_id = tsk_edge_table_add_row(&tables->edges, edge.left, edge.right, - edge.parent, edge.child, edge.metadata, edge.metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - - for (j = 0; j < (tsk_id_t) num_mutations; j++) { - /* Note: we could speed this up a bit by accessing the local - * memory for mutations directly. */ - ret = tsk_treeseq_get_mutation(self, j, &mutation); - tsk_bug_assert(ret == 0); - mapped_node = TSK_NULL; - if (mutation.edge != TSK_NULL) { - mapped_node = split_edge[mutation.edge]; - } - mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node] - : mutation.time; - if (mapped_node != TSK_NULL && mutation_time >= time) { - /* Update the column in-place to save a bit of time. */ - tables->mutations.node[j] = mapped_node; - } - } - - /* Skip mutations and sites as they haven't been altered */ - /* Note we can probably optimise the edge sort a bit here also by - * reasoning about when the first edge gets altered in the table. - */ - memset(&sort_start, 0, sizeof(sort_start)); - sort_start.sites = tables->sites.num_rows; - sort_start.mutations = tables->mutations.num_rows; - ret = tsk_table_collection_sort(tables, &sort_start, 0); - if (ret != 0) { - goto out; - } - - ret = tsk_treeseq_init( - output, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); - tables = NULL; -out: - if (tables != NULL) { - tsk_table_collection_free(tables); - tsk_safe_free(tables); - } - tsk_safe_free(split_edge); - return ret; -} - -/* ======================================================== * - * Tree - * ======================================================== */ - -int TSK_WARN_UNUSED -tsk_tree_init(tsk_tree_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options) -{ - int ret = TSK_ERR_NO_MEMORY; - tsk_size_t num_samples, num_nodes, N; - - tsk_memset(self, 0, sizeof(tsk_tree_t)); - if (tree_sequence == NULL) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - num_nodes = tree_sequence->tables->nodes.num_rows; - num_samples = tree_sequence->num_samples; - self->num_nodes = num_nodes; - self->virtual_root = (tsk_id_t) num_nodes; - self->tree_sequence = tree_sequence; - self->samples = tree_sequence->samples; - self->options = options; - self->root_threshold = 1; - - /* Allocate space in the quintuply linked tree for the virtual root */ - N = num_nodes + 1; - self->parent = tsk_malloc(N * sizeof(*self->parent)); - self->left_child = tsk_malloc(N * sizeof(*self->left_child)); - self->right_child = tsk_malloc(N * sizeof(*self->right_child)); - self->left_sib = tsk_malloc(N * sizeof(*self->left_sib)); - self->right_sib = tsk_malloc(N * sizeof(*self->right_sib)); - self->num_children = tsk_calloc(N, sizeof(*self->num_children)); - self->edge = tsk_malloc(N * sizeof(*self->edge)); - if (self->parent == NULL || self->left_child == NULL || self->right_child == NULL - || self->left_sib == NULL || self->right_sib == NULL - || self->num_children == NULL || self->edge == NULL) { - goto out; - } - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - self->num_samples = tsk_calloc(N, sizeof(*self->num_samples)); - self->num_tracked_samples = tsk_calloc(N, sizeof(*self->num_tracked_samples)); - if (self->num_samples == NULL || self->num_tracked_samples == NULL) { - goto out; - } - } - if (self->options & TSK_SAMPLE_LISTS) { - self->left_sample = tsk_malloc(N * sizeof(*self->left_sample)); - self->right_sample = tsk_malloc(N * sizeof(*self->right_sample)); - self->next_sample = tsk_malloc(num_samples * sizeof(*self->next_sample)); - if (self->left_sample == NULL || self->right_sample == NULL - || self->next_sample == NULL) { - goto out; - } - } - ret = tsk_tree_clear(self); -out: - return ret; -} - -int -tsk_tree_set_root_threshold(tsk_tree_t *self, tsk_size_t root_threshold) -{ - int ret = 0; - - if (root_threshold == 0) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - /* Don't allow the value to be set when the tree is out of the null - * state */ - if (self->index != -1) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - self->root_threshold = root_threshold; - /* Reset the roots */ - ret = tsk_tree_clear(self); -out: - return ret; -} - -tsk_size_t -tsk_tree_get_root_threshold(const tsk_tree_t *self) -{ - return self->root_threshold; -} - -int -tsk_tree_free(tsk_tree_t *self) -{ - tsk_safe_free(self->parent); - tsk_safe_free(self->left_child); - tsk_safe_free(self->right_child); - tsk_safe_free(self->left_sib); - tsk_safe_free(self->right_sib); - tsk_safe_free(self->num_samples); - tsk_safe_free(self->num_tracked_samples); - tsk_safe_free(self->left_sample); - tsk_safe_free(self->right_sample); - tsk_safe_free(self->next_sample); - tsk_safe_free(self->num_children); - tsk_safe_free(self->edge); - return 0; -} - -bool -tsk_tree_has_sample_lists(const tsk_tree_t *self) -{ - return !!(self->options & TSK_SAMPLE_LISTS); -} - -bool -tsk_tree_has_sample_counts(const tsk_tree_t *self) -{ - return !(self->options & TSK_NO_SAMPLE_COUNTS); -} - -static int TSK_WARN_UNUSED -tsk_tree_reset_tracked_samples(tsk_tree_t *self) -{ - int ret = 0; - - if (!tsk_tree_has_sample_counts(self)) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - tsk_memset(self->num_tracked_samples, 0, - (self->num_nodes + 1) * sizeof(*self->num_tracked_samples)); -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_set_tracked_samples( - tsk_tree_t *self, tsk_size_t num_tracked_samples, const tsk_id_t *tracked_samples) -{ - int ret = TSK_ERR_GENERIC; - tsk_size_t *tree_num_tracked_samples = self->num_tracked_samples; - const tsk_id_t *parent = self->parent; - tsk_size_t j; - tsk_id_t u; - - /* TODO This is not needed when the tree is new. We should use the - * state machine to check and only reset the tracked samples when needed. - */ - ret = tsk_tree_reset_tracked_samples(self); - if (ret != 0) { - goto out; - } - self->num_tracked_samples[self->virtual_root] = num_tracked_samples; - for (j = 0; j < num_tracked_samples; j++) { - u = tracked_samples[j]; - if (u < 0 || u >= (tsk_id_t) self->num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - goto out; - } - if (!tsk_treeseq_is_sample(self->tree_sequence, u)) { - ret = TSK_ERR_BAD_SAMPLES; - goto out; - } - if (self->num_tracked_samples[u] != 0) { - ret = TSK_ERR_DUPLICATE_SAMPLE; - goto out; - } - /* Propagate this upwards */ - while (u != TSK_NULL) { - tree_num_tracked_samples[u]++; - u = parent[u]; - } - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_track_descendant_samples(tsk_tree_t *self, tsk_id_t node) -{ - int ret = 0; - tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); - const tsk_id_t *restrict parent = self->parent; - const tsk_id_t *restrict left_child = self->left_child; - const tsk_id_t *restrict right_sib = self->right_sib; - const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; - tsk_size_t *num_tracked_samples = self->num_tracked_samples; - tsk_size_t n, j, num_nodes; - tsk_id_t u, v; - - if (nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_tree_postorder_from(self, node, nodes, &num_nodes); - if (ret != 0) { - goto out; - } - ret = tsk_tree_reset_tracked_samples(self); - if (ret != 0) { - goto out; - } - u = 0; /* keep the compiler happy */ - for (j = 0; j < num_nodes; j++) { - u = nodes[j]; - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - num_tracked_samples[u] += num_tracked_samples[v]; - } - num_tracked_samples[u] += flags[u] & TSK_NODE_IS_SAMPLE ? 1 : 0; - } - n = num_tracked_samples[u]; - u = parent[u]; - while (u != TSK_NULL) { - num_tracked_samples[u] = n; - u = parent[u]; - } - num_tracked_samples[self->virtual_root] = n; -out: - tsk_safe_free(nodes); - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) -{ - int ret = TSK_ERR_GENERIC; - tsk_size_t N = self->num_nodes + 1; - - if (!(options & TSK_NO_INIT)) { - ret = tsk_tree_init(dest, self->tree_sequence, options); - if (ret != 0) { - goto out; - } - } - if (self->tree_sequence != dest->tree_sequence) { - ret = TSK_ERR_BAD_PARAM_VALUE; - goto out; - } - dest->interval = self->interval; - dest->left_index = self->left_index; - dest->right_index = self->right_index; - dest->direction = self->direction; - dest->index = self->index; - dest->sites = self->sites; - dest->sites_length = self->sites_length; - dest->root_threshold = self->root_threshold; - dest->num_edges = self->num_edges; - - tsk_memcpy(dest->parent, self->parent, N * sizeof(*self->parent)); - tsk_memcpy(dest->left_child, self->left_child, N * sizeof(*self->left_child)); - tsk_memcpy(dest->right_child, self->right_child, N * sizeof(*self->right_child)); - tsk_memcpy(dest->left_sib, self->left_sib, N * sizeof(*self->left_sib)); - tsk_memcpy(dest->right_sib, self->right_sib, N * sizeof(*self->right_sib)); - tsk_memcpy(dest->num_children, self->num_children, N * sizeof(*self->num_children)); - tsk_memcpy(dest->edge, self->edge, N * sizeof(*self->edge)); - if (!(dest->options & TSK_NO_SAMPLE_COUNTS)) { - if (self->options & TSK_NO_SAMPLE_COUNTS) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - tsk_memcpy(dest->num_samples, self->num_samples, N * sizeof(*self->num_samples)); - tsk_memcpy(dest->num_tracked_samples, self->num_tracked_samples, - N * sizeof(*self->num_tracked_samples)); - } - if (dest->options & TSK_SAMPLE_LISTS) { - if (!(self->options & TSK_SAMPLE_LISTS)) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - tsk_memcpy(dest->left_sample, self->left_sample, N * sizeof(*self->left_sample)); - tsk_memcpy( - dest->right_sample, self->right_sample, N * sizeof(*self->right_sample)); - tsk_memcpy(dest->next_sample, self->next_sample, - self->tree_sequence->num_samples * sizeof(*self->next_sample)); - } - ret = 0; -out: - return ret; -} - -bool TSK_WARN_UNUSED -tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other) -{ - bool ret = false; - - if (self->tree_sequence == other->tree_sequence) { - ret = self->index == other->index; - } - return ret; -} - -static int -tsk_tree_check_node(const tsk_tree_t *self, tsk_id_t u) -{ - int ret = 0; - if (u < 0 || u > (tsk_id_t) self->num_nodes) { - ret = TSK_ERR_NODE_OUT_OF_BOUNDS; - } - return ret; -} - -bool -tsk_tree_is_descendant(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v) -{ - bool ret = false; - tsk_id_t w = u; - tsk_id_t *restrict parent = self->parent; - - if (tsk_tree_check_node(self, u) == 0 && tsk_tree_check_node(self, v) == 0) { - while (w != v && w != TSK_NULL) { - w = parent[w]; - } - ret = w == v; - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_get_mrca(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v, tsk_id_t *mrca) -{ - int ret = 0; - double tu, tv; - const tsk_id_t *restrict parent = self->parent; - const double *restrict time = self->tree_sequence->tables->nodes.time; - - ret = tsk_tree_check_node(self, u); - if (ret != 0) { - goto out; - } - ret = tsk_tree_check_node(self, v); - if (ret != 0) { - goto out; - } - - /* Simplest to make the virtual_root a special case here to avoid - * doing the time lookup. */ - if (u == self->virtual_root || v == self->virtual_root) { - *mrca = self->virtual_root; - return 0; - } - - tu = time[u]; - tv = time[v]; - while (u != v) { - if (tu < tv) { - u = parent[u]; - if (u == TSK_NULL) { - break; - } - tu = time[u]; - } else { - v = parent[v]; - if (v == TSK_NULL) { - break; - } - tv = time[v]; - } - } - *mrca = u == v ? u : TSK_NULL; -out: - return ret; -} - -static int -tsk_tree_get_num_samples_by_traversal( - const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples) -{ - int ret = 0; - tsk_size_t num_nodes, j; - tsk_size_t count = 0; - const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; - tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); - tsk_id_t v; - - if (nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_tree_preorder_from(self, u, nodes, &num_nodes); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_nodes; j++) { - v = nodes[j]; - if (flags[v] & TSK_NODE_IS_SAMPLE) { - count++; - } - } - *num_samples = count; -out: - tsk_safe_free(nodes); - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_get_num_samples(const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples) -{ - int ret = 0; - - ret = tsk_tree_check_node(self, u); - if (ret != 0) { - goto out; - } - - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - *num_samples = (tsk_size_t) self->num_samples[u]; - } else { - ret = tsk_tree_get_num_samples_by_traversal(self, u, num_samples); - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_get_num_tracked_samples( - const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_tracked_samples) -{ - int ret = 0; - - ret = tsk_tree_check_node(self, u); - if (ret != 0) { - goto out; - } - if (self->options & TSK_NO_SAMPLE_COUNTS) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - *num_tracked_samples = self->num_tracked_samples[u]; -out: - return ret; -} - -bool -tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u) -{ - return tsk_treeseq_is_sample(self->tree_sequence, u); -} - -tsk_id_t -tsk_tree_get_left_root(const tsk_tree_t *self) -{ - return self->left_child[self->virtual_root]; -} - -tsk_id_t -tsk_tree_get_right_root(const tsk_tree_t *self) -{ - return self->right_child[self->virtual_root]; -} - -tsk_size_t -tsk_tree_get_num_roots(const tsk_tree_t *self) -{ - return (tsk_size_t) self->num_children[self->virtual_root]; -} - -int TSK_WARN_UNUSED -tsk_tree_get_parent(const tsk_tree_t *self, tsk_id_t u, tsk_id_t *parent) -{ - int ret = 0; - - ret = tsk_tree_check_node(self, u); - if (ret != 0) { - goto out; - } - *parent = self->parent[u]; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_get_time(const tsk_tree_t *self, tsk_id_t u, double *t) -{ - int ret = 0; - tsk_node_t node; - - if (u == self->virtual_root) { - *t = INFINITY; - } else { - ret = tsk_treeseq_get_node(self->tree_sequence, u, &node); - if (ret != 0) { - goto out; - } - *t = node.time; - } -out: - return ret; -} - -static inline double -tsk_tree_get_branch_length_unsafe(const tsk_tree_t *self, tsk_id_t u) -{ - const double *times = self->tree_sequence->tables->nodes.time; - const tsk_id_t parent = self->parent[u]; - - return parent == TSK_NULL ? 0 : times[parent] - times[u]; -} - -int TSK_WARN_UNUSED -tsk_tree_get_branch_length(const tsk_tree_t *self, tsk_id_t u, double *ret_branch_length) -{ - int ret = 0; - - ret = tsk_tree_check_node(self, u); - if (ret != 0) { - goto out; - } - *ret_branch_length = tsk_tree_get_branch_length_unsafe(self, u); -out: - return ret; -} - -int -tsk_tree_get_total_branch_length(const tsk_tree_t *self, tsk_id_t node, double *ret_tbl) -{ - int ret = 0; - tsk_size_t j, num_nodes; - tsk_id_t u, v; - const tsk_id_t *restrict parent = self->parent; - const double *restrict time = self->tree_sequence->tables->nodes.time; - tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); - double sum = 0; - - if (nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_tree_preorder_from(self, node, nodes, &num_nodes); - if (ret != 0) { - goto out; - } - /* We always skip the first node because we don't return the branch length - * over the input node. */ - for (j = 1; j < num_nodes; j++) { - u = nodes[j]; - v = parent[u]; - if (v != TSK_NULL) { - sum += time[v] - time[u]; - } - } - *ret_tbl = sum; -out: - tsk_safe_free(nodes); - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_get_sites( - const tsk_tree_t *self, const tsk_site_t **sites, tsk_size_t *sites_length) -{ - *sites = self->sites; - *sites_length = self->sites_length; - return 0; -} - -/* u must be a valid node in the tree. For internal use */ -static int -tsk_tree_get_depth_unsafe(const tsk_tree_t *self, tsk_id_t u) -{ - tsk_id_t v; - const tsk_id_t *restrict parent = self->parent; - int depth = 0; - - if (u == self->virtual_root) { - return -1; - } - for (v = parent[u]; v != TSK_NULL; v = parent[v]) { - depth++; - } - return depth; -} - -int TSK_WARN_UNUSED -tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u, int *depth_ret) -{ - int ret = 0; - - ret = tsk_tree_check_node(self, u); - if (ret != 0) { - goto out; - } - - *depth_ret = tsk_tree_get_depth_unsafe(self, u); -out: - return ret; -} - -static tsk_id_t -tsk_tree_node_root(tsk_tree_t *self, tsk_id_t u) -{ - tsk_id_t v = u; - while (self->parent[v] != TSK_NULL) { - v = self->parent[v]; - } - - return v; -} - -static void -tsk_tree_check_state(const tsk_tree_t *self) -{ - tsk_id_t u, v; - tsk_size_t j, num_samples; - int err, c; - tsk_site_t site; - tsk_id_t *children = tsk_malloc(self->num_nodes * sizeof(tsk_id_t)); - bool *is_root = tsk_calloc(self->num_nodes, sizeof(bool)); - - tsk_bug_assert(children != NULL); - - /* Check the virtual root properties */ - tsk_bug_assert(self->parent[self->virtual_root] == TSK_NULL); - tsk_bug_assert(self->left_sib[self->virtual_root] == TSK_NULL); - tsk_bug_assert(self->right_sib[self->virtual_root] == TSK_NULL); - - for (j = 0; j < self->tree_sequence->num_samples; j++) { - u = self->samples[j]; - while (self->parent[u] != TSK_NULL) { - u = self->parent[u]; - } - is_root[u] = true; - } - if (self->tree_sequence->num_samples == 0) { - tsk_bug_assert(self->left_child[self->virtual_root] == TSK_NULL); - } - - /* Iterate over the roots and make sure they are set */ - for (u = tsk_tree_get_left_root(self); u != TSK_NULL; u = self->right_sib[u]) { - tsk_bug_assert(is_root[u]); - is_root[u] = false; - } - for (u = 0; u < (tsk_id_t) self->num_nodes; u++) { - tsk_bug_assert(!is_root[u]); - c = 0; - for (v = self->left_child[u]; v != TSK_NULL; v = self->right_sib[v]) { - tsk_bug_assert(self->parent[v] == u); - children[c] = v; - c++; - } - for (v = self->right_child[u]; v != TSK_NULL; v = self->left_sib[v]) { - tsk_bug_assert(c > 0); - c--; - tsk_bug_assert(v == children[c]); - } - } - for (j = 0; j < self->sites_length; j++) { - site = self->sites[j]; - tsk_bug_assert(self->interval.left <= site.position); - tsk_bug_assert(site.position < self->interval.right); - } - - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - tsk_bug_assert(self->num_samples != NULL); - tsk_bug_assert(self->num_tracked_samples != NULL); - for (u = 0; u < (tsk_id_t) self->num_nodes; u++) { - err = tsk_tree_get_num_samples_by_traversal(self, u, &num_samples); - tsk_bug_assert(err == 0); - tsk_bug_assert(num_samples == (tsk_size_t) self->num_samples[u]); - } - } else { - tsk_bug_assert(self->num_samples == NULL); - tsk_bug_assert(self->num_tracked_samples == NULL); - } - if (self->options & TSK_SAMPLE_LISTS) { - tsk_bug_assert(self->right_sample != NULL); - tsk_bug_assert(self->left_sample != NULL); - tsk_bug_assert(self->next_sample != NULL); - } else { - tsk_bug_assert(self->right_sample == NULL); - tsk_bug_assert(self->left_sample == NULL); - tsk_bug_assert(self->next_sample == NULL); - } - - free(children); - free(is_root); -} - -void -tsk_tree_print_state(const tsk_tree_t *self, FILE *out) -{ - tsk_size_t j; - tsk_site_t site; - - fprintf(out, "Tree state:\n"); - fprintf(out, "options = %d\n", self->options); - fprintf(out, "root_threshold = %lld\n", (long long) self->root_threshold); - fprintf(out, "left = %f\n", self->interval.left); - fprintf(out, "right = %f\n", self->interval.right); - fprintf(out, "index = %lld\n", (long long) self->index); - fprintf(out, "node\tparent\tlchild\trchild\tlsib\trsib"); - if (self->options & TSK_SAMPLE_LISTS) { - fprintf(out, "\thead\ttail"); - } - fprintf(out, "\n"); - - for (j = 0; j < self->num_nodes + 1; j++) { - fprintf(out, "%lld\t%lld\t%lld\t%lld\t%lld\t%lld", (long long) j, - (long long) self->parent[j], (long long) self->left_child[j], - (long long) self->right_child[j], (long long) self->left_sib[j], - (long long) self->right_sib[j]); - if (self->options & TSK_SAMPLE_LISTS) { - fprintf(out, "\t%lld\t%lld\t", (long long) self->left_sample[j], - (long long) self->right_sample[j]); - } - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - fprintf(out, "\t%lld\t%lld", (long long) self->num_samples[j], - (long long) self->num_tracked_samples[j]); - } - fprintf(out, "\n"); - } - fprintf(out, "sites = \n"); - for (j = 0; j < self->sites_length; j++) { - site = self->sites[j]; - fprintf(out, "\t%lld\t%f\n", (long long) site.id, site.position); - } - tsk_tree_check_state(self); -} - -/* Methods for positioning the tree along the sequence */ - -/* The following methods are performance sensitive and so we use a - * lot of restrict pointers. Because we are saying that we don't have - * any aliases to these pointers, we pass around the reference to parent - * since it's used in all the functions. */ -static inline void -tsk_tree_update_sample_lists( - tsk_tree_t *self, tsk_id_t node, const tsk_id_t *restrict parent) -{ - tsk_id_t u, v, sample_index; - tsk_id_t *restrict left_child = self->left_child; - tsk_id_t *restrict right_sib = self->right_sib; - tsk_id_t *restrict left = self->left_sample; - tsk_id_t *restrict right = self->right_sample; - tsk_id_t *restrict next = self->next_sample; - const tsk_id_t *restrict sample_index_map = self->tree_sequence->sample_index_map; - - for (u = node; u != TSK_NULL; u = parent[u]) { - sample_index = sample_index_map[u]; - if (sample_index != TSK_NULL) { - right[u] = left[u]; - } else { - left[u] = TSK_NULL; - right[u] = TSK_NULL; - } - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - if (left[v] != TSK_NULL) { - tsk_bug_assert(right[v] != TSK_NULL); - if (left[u] == TSK_NULL) { - left[u] = left[v]; - right[u] = right[v]; - } else { - next[right[u]] = left[v]; - right[u] = right[v]; - } - } - } - } -} - -static inline void -tsk_tree_remove_branch( - tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t *restrict parent) -{ - tsk_id_t *restrict left_child = self->left_child; - tsk_id_t *restrict right_child = self->right_child; - tsk_id_t *restrict left_sib = self->left_sib; - tsk_id_t *restrict right_sib = self->right_sib; - tsk_id_t *restrict num_children = self->num_children; - tsk_id_t lsib = left_sib[c]; - tsk_id_t rsib = right_sib[c]; - - if (lsib == TSK_NULL) { - left_child[p] = rsib; - } else { - right_sib[lsib] = rsib; - } - if (rsib == TSK_NULL) { - right_child[p] = lsib; - } else { - left_sib[rsib] = lsib; - } - parent[c] = TSK_NULL; - left_sib[c] = TSK_NULL; - right_sib[c] = TSK_NULL; - num_children[p]--; -} - -static inline void -tsk_tree_insert_branch( - tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t *restrict parent) -{ - tsk_id_t *restrict left_child = self->left_child; - tsk_id_t *restrict right_child = self->right_child; - tsk_id_t *restrict left_sib = self->left_sib; - tsk_id_t *restrict right_sib = self->right_sib; - tsk_id_t *restrict num_children = self->num_children; - tsk_id_t u; - - parent[c] = p; - u = right_child[p]; - if (u == TSK_NULL) { - left_child[p] = c; - left_sib[c] = TSK_NULL; - right_sib[c] = TSK_NULL; - } else { - right_sib[u] = c; - left_sib[c] = u; - right_sib[c] = TSK_NULL; - } - right_child[p] = c; - num_children[p]++; -} - -static inline void -tsk_tree_insert_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent) -{ - tsk_tree_insert_branch(self, self->virtual_root, root, parent); - parent[root] = TSK_NULL; -} - -static inline void -tsk_tree_remove_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent) -{ - tsk_tree_remove_branch(self, self->virtual_root, root, parent); -} - -static void -tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) -{ - tsk_id_t *restrict parent = self->parent; - tsk_size_t *restrict num_samples = self->num_samples; - tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples; - tsk_id_t *restrict edge = self->edge; - const tsk_size_t root_threshold = self->root_threshold; - tsk_id_t u; - tsk_id_t path_end = TSK_NULL; - bool path_end_was_root = false; - -#define POTENTIAL_ROOT(U) (num_samples[U] >= root_threshold) - - tsk_tree_remove_branch(self, p, c, parent); - self->num_edges--; - edge[c] = TSK_NULL; - - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - u = p; - while (u != TSK_NULL) { - path_end = u; - path_end_was_root = POTENTIAL_ROOT(u); - num_samples[u] -= num_samples[c]; - num_tracked_samples[u] -= num_tracked_samples[c]; - u = parent[u]; - } - - if (path_end_was_root && !POTENTIAL_ROOT(path_end)) { - tsk_tree_remove_root(self, path_end, parent); - } - if (POTENTIAL_ROOT(c)) { - tsk_tree_insert_root(self, c, parent); - } - } - - if (self->options & TSK_SAMPLE_LISTS) { - tsk_tree_update_sample_lists(self, p, parent); - } -} - -static void -tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t edge_id) -{ - tsk_id_t *restrict parent = self->parent; - tsk_size_t *restrict num_samples = self->num_samples; - tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples; - tsk_id_t *restrict edge = self->edge; - const tsk_size_t root_threshold = self->root_threshold; - tsk_id_t u; - tsk_id_t path_end = TSK_NULL; - bool path_end_was_root = false; - -#define POTENTIAL_ROOT(U) (num_samples[U] >= root_threshold) - - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - u = p; - while (u != TSK_NULL) { - path_end = u; - path_end_was_root = POTENTIAL_ROOT(u); - num_samples[u] += num_samples[c]; - num_tracked_samples[u] += num_tracked_samples[c]; - u = parent[u]; - } - - if (POTENTIAL_ROOT(c)) { - tsk_tree_remove_root(self, c, parent); - } - if (POTENTIAL_ROOT(path_end) && !path_end_was_root) { - tsk_tree_insert_root(self, path_end, parent); - } - } - - tsk_tree_insert_branch(self, p, c, parent); - self->num_edges++; - edge[c] = edge_id; - - if (self->options & TSK_SAMPLE_LISTS) { - tsk_tree_update_sample_lists(self, p, parent); - } -} - -static int -tsk_tree_advance(tsk_tree_t *self, int direction, const double *restrict out_breakpoints, - const tsk_id_t *restrict out_order, tsk_id_t *out_index, - const double *restrict in_breakpoints, const tsk_id_t *restrict in_order, - tsk_id_t *in_index) -{ - int ret = 0; - const int direction_change = direction * (direction != self->direction); - tsk_id_t in = *in_index + direction_change; - tsk_id_t out = *out_index + direction_change; - tsk_id_t k; - const tsk_table_collection_t *tables = self->tree_sequence->tables; - const double sequence_length = tables->sequence_length; - const tsk_id_t num_edges = (tsk_id_t) tables->edges.num_rows; - const tsk_id_t *restrict edge_parent = tables->edges.parent; - const tsk_id_t *restrict edge_child = tables->edges.child; - double x; - - if (direction == TSK_DIR_FORWARD) { - x = self->interval.right; - } else { - x = self->interval.left; - } - while (out >= 0 && out < num_edges && out_breakpoints[out_order[out]] == x) { - tsk_bug_assert(out < num_edges); - k = out_order[out]; - out += direction; - tsk_tree_remove_edge(self, edge_parent[k], edge_child[k]); - } - - while (in >= 0 && in < num_edges && in_breakpoints[in_order[in]] == x) { - k = in_order[in]; - in += direction; - tsk_tree_insert_edge(self, edge_parent[k], edge_child[k], k); - } - - self->direction = direction; - self->index = self->index + direction; - if (direction == TSK_DIR_FORWARD) { - self->interval.left = x; - self->interval.right = sequence_length; - if (out >= 0 && out < num_edges) { - self->interval.right - = TSK_MIN(self->interval.right, out_breakpoints[out_order[out]]); - } - if (in >= 0 && in < num_edges) { - self->interval.right - = TSK_MIN(self->interval.right, in_breakpoints[in_order[in]]); - } - } else { - self->interval.right = x; - self->interval.left = 0; - if (out >= 0 && out < num_edges) { - self->interval.left - = TSK_MAX(self->interval.left, out_breakpoints[out_order[out]]); - } - if (in >= 0 && in < num_edges) { - self->interval.left - = TSK_MAX(self->interval.left, in_breakpoints[in_order[in]]); - } - } - tsk_bug_assert(self->interval.left < self->interval.right); - *out_index = out; - *in_index = in; - if (tables->sites.num_rows > 0) { - self->sites = self->tree_sequence->tree_sites[self->index]; - self->sites_length = self->tree_sequence->tree_sites_length[self->index]; - } - ret = TSK_TREE_OK; - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_first(tsk_tree_t *self) -{ - int ret = TSK_TREE_OK; - tsk_table_collection_t *tables = self->tree_sequence->tables; - - self->interval.left = 0; - self->index = 0; - self->interval.right = tables->sequence_length; - self->sites = self->tree_sequence->tree_sites[0]; - self->sites_length = self->tree_sequence->tree_sites_length[0]; - - if (tables->edges.num_rows > 0) { - /* TODO this is redundant if this is the first usage of the tree. We - * should add a state machine here so we know what state the tree is - * in and can take the appropriate actions. - */ - ret = tsk_tree_clear(self); - if (ret != 0) { - goto out; - } - self->index = -1; - self->left_index = 0; - self->right_index = 0; - self->direction = TSK_DIR_FORWARD; - self->interval.right = 0; - - ret = tsk_tree_advance(self, TSK_DIR_FORWARD, tables->edges.right, - tables->indexes.edge_removal_order, &self->right_index, tables->edges.left, - tables->indexes.edge_insertion_order, &self->left_index); - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_last(tsk_tree_t *self) -{ - int ret = TSK_TREE_OK; - const tsk_treeseq_t *ts = self->tree_sequence; - const tsk_table_collection_t *tables = ts->tables; - - self->interval.left = 0; - self->interval.right = tables->sequence_length; - self->index = 0; - self->sites = ts->tree_sites[0]; - self->sites_length = ts->tree_sites_length[0]; - - if (tables->edges.num_rows > 0) { - /* TODO this is redundant if this is the first usage of the tree. We - * should add a state machine here so we know what state the tree is - * in and can take the appropriate actions. - */ - ret = tsk_tree_clear(self); - if (ret != 0) { - goto out; - } - self->index = (tsk_id_t) tsk_treeseq_get_num_trees(ts); - self->left_index = (tsk_id_t) tables->edges.num_rows - 1; - self->right_index = (tsk_id_t) tables->edges.num_rows - 1; - self->direction = TSK_DIR_REVERSE; - self->interval.left = tables->sequence_length; - self->interval.right = 0; - - ret = tsk_tree_advance(self, TSK_DIR_REVERSE, tables->edges.left, - tables->indexes.edge_insertion_order, &self->left_index, tables->edges.right, - tables->indexes.edge_removal_order, &self->right_index); - } -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_next(tsk_tree_t *self) -{ - int ret = 0; - const tsk_treeseq_t *ts = self->tree_sequence; - const tsk_table_collection_t *tables = ts->tables; - tsk_id_t num_trees = (tsk_id_t) tsk_treeseq_get_num_trees(ts); - - if (self->index == -1) { - ret = tsk_tree_first(self); - } else if (self->index < num_trees - 1) { - ret = tsk_tree_advance(self, TSK_DIR_FORWARD, tables->edges.right, - tables->indexes.edge_removal_order, &self->right_index, tables->edges.left, - tables->indexes.edge_insertion_order, &self->left_index); - } else { - ret = tsk_tree_clear(self); - } - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_prev(tsk_tree_t *self) -{ - int ret = 0; - const tsk_table_collection_t *tables = self->tree_sequence->tables; - - if (self->index == -1) { - ret = tsk_tree_last(self); - } else if (self->index > 0) { - ret = tsk_tree_advance(self, TSK_DIR_REVERSE, tables->edges.left, - tables->indexes.edge_insertion_order, &self->left_index, tables->edges.right, - tables->indexes.edge_removal_order, &self->right_index); - } else { - ret = tsk_tree_clear(self); - } - return ret; -} - -static inline bool -tsk_tree_position_in_interval(const tsk_tree_t *self, double x) -{ - return self->interval.left <= x && x < self->interval.right; -} - -/* NOTE: - * - * Notes from Kevin Thornton: - * - * This method inserts the edges for an arbitrary tree - * in linear time and requires no additional memory. - * - * During design, the following alternatives were tested - * (in a combination of rust + C): - * 1. Indexing edge insertion/removal locations by tree. - * The indexing can be done in O(n) time, giving O(1) - * access to the first edge in a tree. We can then add - * edges to the tree in O(e) time, where e is the number - * of edges. This apparoach requires O(n) additional memory - * and is only marginally faster than the implementation below. - * 2. Building an interval tree mapping edge id -> span. - * This approach adds a lot of complexity and wasn't any faster - * than the indexing described above. - */ -static int -tsk_tree_seek_from_null(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - tsk_size_t edge; - tsk_id_t p, c, e, j, k, tree_index; - const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); - const tsk_treeseq_t *treeseq = self->tree_sequence; - const tsk_table_collection_t *tables = treeseq->tables; - const tsk_id_t *restrict edge_parent = tables->edges.parent; - const tsk_id_t *restrict edge_child = tables->edges.child; - const tsk_size_t num_edges = tables->edges.num_rows; - const tsk_size_t num_trees = self->tree_sequence->num_trees; - const double *restrict edge_left = tables->edges.left; - const double *restrict edge_right = tables->edges.right; - const double *restrict breakpoints = treeseq->breakpoints; - const tsk_id_t *restrict insertion = tables->indexes.edge_insertion_order; - const tsk_id_t *restrict removal = tables->indexes.edge_removal_order; - - // NOTE: it may be better to get the - // index first and then ask if we are - // searching in the first or last 1/2 - // of trees. - j = -1; - if (x <= L / 2.0) { - for (edge = 0; edge < num_edges; edge++) { - e = insertion[edge]; - if (edge_left[e] > x) { - j = (tsk_id_t) edge; - break; - } - if (x >= edge_left[e] && x < edge_right[e]) { - p = edge_parent[e]; - c = edge_child[e]; - tsk_tree_insert_edge(self, p, c, e); - } - } - } else { - for (edge = 0; edge < num_edges; edge++) { - e = removal[num_edges - edge - 1]; - if (edge_right[e] < x) { - j = (tsk_id_t)(num_edges - edge - 1); - while (j < (tsk_id_t) num_edges && edge_left[insertion[j]] <= x) { - j++; - } - break; - } - if (x >= edge_left[e] && x < edge_right[e]) { - p = edge_parent[e]; - c = edge_child[e]; - tsk_tree_insert_edge(self, p, c, e); - } - } - } - - if (j == -1) { - j = 0; - while (j < (tsk_id_t) num_edges && edge_left[insertion[j]] <= x) { - j++; - } - } - k = 0; - while (k < (tsk_id_t) num_edges && edge_right[removal[k]] <= x) { - k++; - } - - /* NOTE: tsk_search_sorted finds the first the first - * insertion locatiom >= the query point, which - * finds a RIGHT value for queries not at the left edge. - */ - tree_index = (tsk_id_t) tsk_search_sorted(breakpoints, num_trees + 1, x); - if (breakpoints[tree_index] > x) { - tree_index--; - } - self->index = tree_index; - self->interval.left = breakpoints[tree_index]; - self->interval.right = breakpoints[tree_index + 1]; - self->left_index = j; - self->right_index = k; - self->direction = TSK_DIR_FORWARD; - self->num_nodes = tables->nodes.num_rows; - if (tables->sites.num_rows > 0) { - self->sites = treeseq->tree_sites[self->index]; - self->sites_length = treeseq->tree_sites_length[self->index]; - } - - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options) -{ - int ret = 0; - double x; - - if (tree < 0 || tree >= (tsk_id_t) self->tree_sequence->num_trees) { - ret = TSK_ERR_SEEK_OUT_OF_BOUNDS; - goto out; - } - x = self->tree_sequence->breakpoints[tree]; - ret = tsk_tree_seek(self, x, options); -out: - return ret; -} - -static int TSK_WARN_UNUSED -tsk_tree_seek_linear(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) -{ - const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); - const double t_l = self->interval.left; - const double t_r = self->interval.right; - int ret = 0; - double distance_left, distance_right; - - if (x < t_l) { - /* |-----|-----|========|---------| */ - /* 0 x t_l t_r L */ - distance_left = t_l - x; - distance_right = L - t_r + x; - } else { - /* |------|========|------|-------| */ - /* 0 t_l t_r x L */ - distance_right = x - t_r; - distance_left = t_l + L - x; - } - if (distance_right <= distance_left) { - while (!tsk_tree_position_in_interval(self, x)) { - ret = tsk_tree_next(self); - if (ret < 0) { - goto out; - } - } - } else { - while (!tsk_tree_position_in_interval(self, x)) { - ret = tsk_tree_prev(self); - if (ret < 0) { - goto out; - } - } - } - ret = 0; -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t options) -{ - int ret = 0; - const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); - - if (x < 0 || x >= L) { - ret = TSK_ERR_SEEK_OUT_OF_BOUNDS; - goto out; - } - - if (self->index == -1) { - ret = tsk_tree_seek_from_null(self, x, options); - } else { - ret = tsk_tree_seek_linear(self, x, options); - } - -out: - return ret; -} - -int TSK_WARN_UNUSED -tsk_tree_clear(tsk_tree_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t u; - const tsk_size_t N = self->num_nodes + 1; - const tsk_size_t num_samples = self->tree_sequence->num_samples; - const bool sample_counts = !(self->options & TSK_NO_SAMPLE_COUNTS); - const bool sample_lists = !!(self->options & TSK_SAMPLE_LISTS); - const tsk_flags_t *flags = self->tree_sequence->tables->nodes.flags; - - self->interval.left = 0; - self->interval.right = 0; - self->num_edges = 0; - self->index = -1; - /* TODO we should profile this method to see if just doing a single loop over - * the nodes would be more efficient than multiple memsets. - */ - tsk_memset(self->parent, 0xff, N * sizeof(*self->parent)); - tsk_memset(self->left_child, 0xff, N * sizeof(*self->left_child)); - tsk_memset(self->right_child, 0xff, N * sizeof(*self->right_child)); - tsk_memset(self->left_sib, 0xff, N * sizeof(*self->left_sib)); - tsk_memset(self->right_sib, 0xff, N * sizeof(*self->right_sib)); - tsk_memset(self->num_children, 0, N * sizeof(*self->num_children)); - tsk_memset(self->edge, 0xff, N * sizeof(*self->edge)); - - if (sample_counts) { - tsk_memset(self->num_samples, 0, N * sizeof(*self->num_samples)); - /* We can't reset the tracked samples via memset because we don't - * know where the tracked samples are. - */ - for (j = 0; j < self->num_nodes; j++) { - if (!(flags[j] & TSK_NODE_IS_SAMPLE)) { - self->num_tracked_samples[j] = 0; - } - } - /* The total tracked_samples gets set in set_tracked_samples */ - self->num_samples[self->virtual_root] = num_samples; - } - if (sample_lists) { - tsk_memset(self->left_sample, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->right_sample, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->next_sample, 0xff, num_samples * sizeof(tsk_id_t)); - } - /* Set the sample attributes */ - for (j = 0; j < num_samples; j++) { - u = self->samples[j]; - if (sample_counts) { - self->num_samples[u] = 1; - } - if (sample_lists) { - /* We are mapping to *indexes* into the list of samples here */ - self->left_sample[u] = (tsk_id_t) j; - self->right_sample[u] = (tsk_id_t) j; - } - } - if (sample_counts && self->root_threshold == 1 && num_samples > 0) { - for (j = 0; j < num_samples; j++) { - /* Set initial roots */ - if (self->root_threshold == 1) { - tsk_tree_insert_root(self, self->samples[j], self->parent); - } - } - } - return ret; -} - -tsk_size_t -tsk_tree_get_size_bound(const tsk_tree_t *self) -{ - tsk_size_t bound = 0; - - if (self->tree_sequence != NULL) { - /* This is a safe upper bound which can be computed cheaply. - * We have at most n roots and each edge adds at most one new - * node to the tree. We also allow space for the virtual root, - * to simplify client code. - * - * In the common case of a binary tree with a single root, we have - * 2n - 1 nodes in total, and 2n - 2 edges. Therefore, we return - * 3n - 1, which is an over-estimate of 1/2 and we allocate - * 1.5 times as much memory as we need. - * - * Since tracking the exact number of nodes in the tree would require - * storing the number of nodes beneath every node and complicate - * the tree transition method, this seems like a good compromise - * and will result in less memory usage overall in nearly all cases. - */ - bound = 1 + self->tree_sequence->num_samples + self->num_edges; - } - return bound; -} - -/* Traversal orders */ -static tsk_id_t * -tsk_tree_alloc_node_stack(const tsk_tree_t *self) -{ - return tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(tsk_id_t)); -} - -int -tsk_tree_preorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) -{ - return tsk_tree_preorder_from(self, -1, nodes, num_nodes_ret); -} - -int -tsk_tree_preorder_from( - const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) -{ - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - tsk_id_t *stack = tsk_tree_alloc_node_stack(self); - tsk_size_t num_nodes = 0; - tsk_id_t u, v; - int stack_top; - - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - if ((root == -1 || root == self->virtual_root) - && !tsk_tree_has_sample_counts(self)) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - if (root == -1) { - stack_top = -1; - for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { - stack_top++; - stack[stack_top] = u; - } - } else { - ret = tsk_tree_check_node(self, root); - if (ret != 0) { - goto out; - } - stack_top = 0; - stack[stack_top] = root; - } - - while (stack_top >= 0) { - u = stack[stack_top]; - stack_top--; - nodes[num_nodes] = u; - num_nodes++; - for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { - stack_top++; - stack[stack_top] = v; - } - } - *num_nodes_ret = num_nodes; -out: - tsk_safe_free(stack); - return ret; -} - -/* We could implement this using the preorder function, but since it's - * going to be performance critical we want to avoid the overhead - * of mallocing the intermediate node list (which will be bigger than - * the number of samples). */ -int -tsk_tree_preorder_samples_from( - const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) -{ - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; - tsk_id_t *stack = tsk_tree_alloc_node_stack(self); - tsk_size_t num_nodes = 0; - tsk_id_t u, v; - int stack_top; - - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* We could push the virtual_root onto the stack directly to simplify - * the code a little, but then we'd have to check put an extra check - * when looking up the flags array (which isn't defined for virtual_root). - */ - if (root == -1 || root == self->virtual_root) { - if (!tsk_tree_has_sample_counts(self)) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - stack_top = -1; - for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { - stack_top++; - stack[stack_top] = u; - } - } else { - ret = tsk_tree_check_node(self, root); - if (ret != 0) { - goto out; - } - stack_top = 0; - stack[stack_top] = root; - } - - while (stack_top >= 0) { - u = stack[stack_top]; - stack_top--; - if (flags[u] & TSK_NODE_IS_SAMPLE) { - nodes[num_nodes] = u; - num_nodes++; - } - for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { - stack_top++; - stack[stack_top] = v; - } - } - *num_nodes_ret = num_nodes; -out: - tsk_safe_free(stack); - return ret; -} - -int -tsk_tree_postorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) -{ - return tsk_tree_postorder_from(self, -1, nodes, num_nodes_ret); -} -int -tsk_tree_postorder_from( - const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) -{ - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - const tsk_id_t *restrict parent = self->parent; - tsk_id_t *stack = tsk_tree_alloc_node_stack(self); - tsk_size_t num_nodes = 0; - tsk_id_t u, v, postorder_parent; - int stack_top; - bool is_virtual_root = root == self->virtual_root; - - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - if (root == -1 || is_virtual_root) { - if (!tsk_tree_has_sample_counts(self)) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; - goto out; - } - stack_top = -1; - for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { - stack_top++; - stack[stack_top] = u; - } - } else { - ret = tsk_tree_check_node(self, root); - if (ret != 0) { - goto out; - } - stack_top = 0; - stack[stack_top] = root; - } - - postorder_parent = TSK_NULL; - while (stack_top >= 0) { - u = stack[stack_top]; - if (right_child[u] != TSK_NULL && u != postorder_parent) { - for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { - stack_top++; - stack[stack_top] = v; - } - } else { - stack_top--; - postorder_parent = parent[u]; - nodes[num_nodes] = u; - num_nodes++; - } - } - if (is_virtual_root) { - nodes[num_nodes] = root; - num_nodes++; - } - *num_nodes_ret = num_nodes; -out: - tsk_safe_free(stack); - return ret; -} - -/* Balance/imbalance metrics */ - -/* Result is a tsk_size_t value here because we could imagine the total - * depth overflowing a 32bit integer for a large tree. */ -int -tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result) -{ - /* Keep the size of the stack elements to 8 bytes in total in the - * standard case. A tsk_id_t depth value is always safe, since - * depth counts the number of nodes encountered on a path. - */ - struct stack_elem { - tsk_id_t node; - tsk_id_t depth; - }; - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - struct stack_elem *stack - = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*stack)); - int stack_top; - tsk_size_t total_depth; - tsk_id_t u; - struct stack_elem s = { .node = TSK_NULL, .depth = 0 }; - - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - stack_top = -1; - for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { - stack_top++; - s.node = u; - stack[stack_top] = s; - } - total_depth = 0; - while (stack_top >= 0) { - s = stack[stack_top]; - stack_top--; - u = right_child[s.node]; - if (u == TSK_NULL) { - total_depth += (tsk_size_t) s.depth; - } else { - s.depth++; - while (u != TSK_NULL) { - stack_top++; - s.node = u; - stack[stack_top] = s; - u = left_sib[u]; - } - } - } - *result = total_depth; -out: - tsk_safe_free(stack); - return ret; -} - -int -tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) -{ - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); - tsk_id_t *num_leaves = tsk_calloc(self->num_nodes, sizeof(*num_leaves)); - tsk_size_t j, num_nodes, total; - tsk_id_t num_children, u, v; - - if (nodes == NULL || num_leaves == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (tsk_tree_get_num_roots(self) != 1) { - ret = TSK_ERR_UNDEFINED_MULTIROOT; - goto out; - } - ret = tsk_tree_postorder(self, nodes, &num_nodes); - if (ret != 0) { - goto out; - } - - total = 0; - for (j = 0; j < num_nodes; j++) { - u = nodes[j]; - /* Cheaper to compute this on the fly than to access the num_children array. - * since we're already iterating over the children. */ - num_children = 0; - for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { - num_children++; - num_leaves[u] += num_leaves[v]; - } - if (num_children == 0) { - num_leaves[u] = 1; - } else if (num_children == 2) { - v = right_child[u]; - total += (tsk_size_t) llabs(num_leaves[v] - num_leaves[left_sib[v]]); - } else { - ret = TSK_ERR_UNDEFINED_NONBINARY; - goto out; - } - } - *result = total; -out: - tsk_safe_free(nodes); - tsk_safe_free(num_leaves); - return ret; -} - -int -tsk_tree_b1_index(const tsk_tree_t *self, double *result) -{ - int ret = 0; - const tsk_id_t *restrict parent = self->parent; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); - tsk_size_t *max_path_length = tsk_calloc(self->num_nodes, sizeof(*max_path_length)); - tsk_size_t j, num_nodes, mpl; - double total = 0.0; - tsk_id_t u, v; - - if (nodes == NULL || max_path_length == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_tree_postorder(self, nodes, &num_nodes); - if (ret != 0) { - goto out; - } - - for (j = 0; j < num_nodes; j++) { - u = nodes[j]; - if (parent[u] != TSK_NULL && right_child[u] != TSK_NULL) { - mpl = 0; - for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { - mpl = TSK_MAX(mpl, max_path_length[v]); - } - max_path_length[u] = mpl + 1; - total += 1 / (double) max_path_length[u]; - } - } - *result = total; -out: - tsk_safe_free(nodes); - tsk_safe_free(max_path_length); - return ret; -} - -static double -general_log(double x, double base) -{ - return log(x) / log(base); -} - -int -tsk_tree_b2_index(const tsk_tree_t *self, double base, double *result) -{ - struct stack_elem { - tsk_id_t node; - double path_product; - }; - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - struct stack_elem *stack - = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*stack)); - int stack_top; - double total_proba = 0; - double num_children; - tsk_id_t u; - struct stack_elem s = { .node = TSK_NULL, .path_product = 1 }; - - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (tsk_tree_get_num_roots(self) != 1) { - ret = TSK_ERR_UNDEFINED_MULTIROOT; - goto out; - } - - stack_top = 0; - s.node = tsk_tree_get_left_root(self); - stack[stack_top] = s; - - while (stack_top >= 0) { - s = stack[stack_top]; - stack_top--; - u = right_child[s.node]; - if (u == TSK_NULL) { - total_proba -= s.path_product * general_log(s.path_product, base); - } else { - num_children = 0; - for (; u != TSK_NULL; u = left_sib[u]) { - num_children++; - } - s.path_product *= 1 / num_children; - for (u = right_child[s.node]; u != TSK_NULL; u = left_sib[u]) { - stack_top++; - s.node = u; - stack[stack_top] = s; - } - } - } - *result = total_proba; -out: - tsk_safe_free(stack); - return ret; -} - -int -tsk_tree_num_lineages(const tsk_tree_t *self, double t, tsk_size_t *result) -{ - int ret = 0; - const tsk_id_t *restrict right_child = self->right_child; - const tsk_id_t *restrict left_sib = self->left_sib; - const double *restrict time = self->tree_sequence->tables->nodes.time; - tsk_id_t *stack = tsk_tree_alloc_node_stack(self); - tsk_size_t num_lineages = 0; - int stack_top; - tsk_id_t u, v; - double child_time, parent_time; - - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - if (!tsk_isfinite(t)) { - ret = TSK_ERR_TIME_NONFINITE; - goto out; - } - /* Push the roots onto the stack */ - stack_top = -1; - for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { - stack_top++; - stack[stack_top] = u; - } - - while (stack_top >= 0) { - u = stack[stack_top]; - parent_time = time[u]; - stack_top--; - for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { - child_time = time[v]; - /* Only traverse down the tree as far as we need to */ - if (child_time > t) { - stack_top++; - stack[stack_top] = v; - } else if (t < parent_time) { - num_lineages++; - } - } - } - *result = num_lineages; -out: - tsk_safe_free(stack); - return ret; -} - -/* Parsimony methods */ - -static inline uint64_t -set_bit(uint64_t value, int32_t bit) -{ - return value | (1ULL << bit); -} - -static inline bool -bit_is_set(uint64_t value, int32_t bit) -{ - return (value & (1ULL << bit)) != 0; -} - -static inline int8_t -get_smallest_set_bit(uint64_t v) -{ - /* This is an inefficient implementation, there are several better - * approaches. On GCC we can use - * return (uint8_t) (__builtin_ffsll((long long) v) - 1); - */ - uint64_t t = 1; - int8_t r = 0; - - assert(v != 0); - while ((v & t) == 0) { - t <<= 1; - r++; - } - return r; -} - -#define HARTIGAN_MAX_ALLELES 64 - -/* This interface is experimental. In the future, we should provide the option to - * use a general cost matrix, in which case we'll use the Sankoff algorithm. For - * now this is unused. - * - * We should also vectorise the function so that several sites can be processed - * at once. - * - * The algorithm used here is Hartigan parsimony, "Minimum Mutation Fits to a - * Given Tree", Biometrics 1973. - */ -int TSK_WARN_UNUSED -tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, - double *TSK_UNUSED(cost_matrix), tsk_flags_t options, int32_t *r_ancestral_state, - tsk_size_t *r_num_transitions, tsk_state_transition_t **r_transitions) -{ - int ret = 0; - struct stack_elem { - tsk_id_t node; - tsk_id_t transition_parent; - int32_t state; - }; - const tsk_size_t num_samples = self->tree_sequence->num_samples; - const tsk_id_t *restrict left_child = self->left_child; - const tsk_id_t *restrict right_sib = self->right_sib; - const tsk_size_t N = tsk_treeseq_get_num_nodes(self->tree_sequence); - const tsk_flags_t *restrict node_flags = self->tree_sequence->tables->nodes.flags; - tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); - /* Note: to use less memory here and to improve cache performance we should - * probably change to allocating exactly the number of nodes returned by - * a preorder traversal, and then lay the memory out in this order. So, we'd - * need a map from node ID to its index in the preorder traversal, but this - * is trivial to compute. Probably doesn't matter so much at the moment - * when we're doing a single site, but it would make a big difference if - * we were vectorising over lots of sites. */ - uint64_t *restrict optimal_set = tsk_calloc(N + 1, sizeof(*optimal_set)); - struct stack_elem *restrict preorder_stack - = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*preorder_stack)); - tsk_id_t u, v; - /* The largest possible number of transitions is one over every sample */ - tsk_state_transition_t *transitions = tsk_malloc(num_samples * sizeof(*transitions)); - int32_t allele, ancestral_state; - int stack_top; - struct stack_elem s; - tsk_size_t j, num_transitions, max_allele_count, num_nodes; - tsk_size_t allele_count[HARTIGAN_MAX_ALLELES]; - tsk_size_t non_missing = 0; - int32_t num_alleles = 0; - - if (optimal_set == NULL || preorder_stack == NULL || transitions == NULL - || nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (j = 0; j < num_samples; j++) { - if (genotypes[j] >= HARTIGAN_MAX_ALLELES || genotypes[j] < TSK_MISSING_DATA) { - ret = TSK_ERR_BAD_GENOTYPE; - goto out; - } - u = self->tree_sequence->samples[j]; - if (genotypes[j] == TSK_MISSING_DATA) { - /* All bits set */ - optimal_set[u] = UINT64_MAX; - } else { - optimal_set[u] = set_bit(optimal_set[u], genotypes[j]); - num_alleles = TSK_MAX(genotypes[j], num_alleles); - non_missing++; - } - } - - if (non_missing == 0) { - ret = TSK_ERR_GENOTYPES_ALL_MISSING; - goto out; - } - num_alleles++; - - ancestral_state = 0; /* keep compiler happy */ - if (options & TSK_MM_FIXED_ANCESTRAL_STATE) { - ancestral_state = *r_ancestral_state; - if ((ancestral_state < 0) || (ancestral_state >= HARTIGAN_MAX_ALLELES)) { - ret = TSK_ERR_BAD_ANCESTRAL_STATE; - goto out; - } else if (ancestral_state >= num_alleles) { - num_alleles = (int32_t)(ancestral_state + 1); - } - } - - ret = tsk_tree_postorder_from(self, self->virtual_root, nodes, &num_nodes); - if (ret != 0) { - goto out; - } - for (j = 0; j < num_nodes; j++) { - u = nodes[j]; - tsk_memset(allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count)); - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - for (allele = 0; allele < num_alleles; allele++) { - allele_count[allele] += bit_is_set(optimal_set[v], allele); - } - } - /* the virtual root has no flags defined */ - if (u == (tsk_id_t) N || !(node_flags[u] & TSK_NODE_IS_SAMPLE)) { - max_allele_count = 0; - for (allele = 0; allele < num_alleles; allele++) { - max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]); - } - for (allele = 0; allele < num_alleles; allele++) { - if (allele_count[allele] == max_allele_count) { - optimal_set[u] = set_bit(optimal_set[u], allele); - } - } - } - } - if (!(options & TSK_MM_FIXED_ANCESTRAL_STATE)) { - ancestral_state = get_smallest_set_bit(optimal_set[self->virtual_root]); - } else { - optimal_set[self->virtual_root] = UINT64_MAX; - } - - num_transitions = 0; - - /* Do a preorder traversal */ - preorder_stack[0].node = self->virtual_root; - preorder_stack[0].state = ancestral_state; - preorder_stack[0].transition_parent = TSK_NULL; - stack_top = 0; - while (stack_top >= 0) { - s = preorder_stack[stack_top]; - stack_top--; - - if (!bit_is_set(optimal_set[s.node], s.state)) { - s.state = get_smallest_set_bit(optimal_set[s.node]); - transitions[num_transitions].node = s.node; - transitions[num_transitions].parent = s.transition_parent; - transitions[num_transitions].state = s.state; - s.transition_parent = (tsk_id_t) num_transitions; - num_transitions++; - } - for (v = left_child[s.node]; v != TSK_NULL; v = right_sib[v]) { - stack_top++; - s.node = v; - preorder_stack[stack_top] = s; - } - } - - *r_transitions = transitions; - *r_num_transitions = num_transitions; - *r_ancestral_state = ancestral_state; - transitions = NULL; -out: - tsk_safe_free(transitions); - /* Cannot safe_free because of 'restrict' */ - if (optimal_set != NULL) { - free(optimal_set); - } - if (preorder_stack != NULL) { - free(preorder_stack); - } - if (nodes != NULL) { - free(nodes); - } - return ret; -} - -/* Compatibility shim for initialising the diff iterator from a tree sequence. We are - * using this function in a small number of places internally, so simplest to keep it - * until a more satisfactory "diff" API comes along. - */ -int TSK_WARN_UNUSED -tsk_diff_iter_init_from_ts( - tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options) -{ - return tsk_diff_iter_init( - self, tree_sequence->tables, (tsk_id_t) tree_sequence->num_trees, options); -} - -/* ======================================================== * - * KC Distance - * ======================================================== */ - -typedef struct { - tsk_size_t *m; - double *M; - tsk_id_t n; - tsk_id_t N; -} kc_vectors; - -static int -kc_vectors_alloc(kc_vectors *self, tsk_id_t n) -{ - int ret = 0; - - self->n = n; - self->N = (n * (n - 1)) / 2; - self->m = tsk_calloc((size_t)(self->N + self->n), sizeof(*self->m)); - self->M = tsk_calloc((size_t)(self->N + self->n), sizeof(*self->M)); - if (self->m == NULL || self->M == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - -out: - return ret; -} - -static void -kc_vectors_free(kc_vectors *self) -{ - tsk_safe_free(self->m); - tsk_safe_free(self->M); -} - -static inline void -update_kc_vectors_single_sample( - const tsk_treeseq_t *ts, kc_vectors *kc_vecs, tsk_id_t u, double time) -{ - const tsk_id_t *sample_index_map = ts->sample_index_map; - tsk_id_t u_index = sample_index_map[u]; - - kc_vecs->m[kc_vecs->N + u_index] = 1; - kc_vecs->M[kc_vecs->N + u_index] = time; -} - -static inline void -update_kc_vectors_all_pairs(const tsk_tree_t *tree, kc_vectors *kc_vecs, tsk_id_t u, - tsk_id_t v, tsk_size_t depth, double time) -{ - tsk_id_t sample1_index, sample2_index, n1, n2, tmp, pair_index; - const tsk_id_t *restrict left_sample = tree->left_sample; - const tsk_id_t *restrict right_sample = tree->right_sample; - const tsk_id_t *restrict next_sample = tree->next_sample; - tsk_size_t *restrict kc_m = kc_vecs->m; - double *restrict kc_M = kc_vecs->M; - - sample1_index = left_sample[u]; - while (sample1_index != TSK_NULL) { - sample2_index = left_sample[v]; - while (sample2_index != TSK_NULL) { - n1 = sample1_index; - n2 = sample2_index; - if (n1 > n2) { - tmp = n1; - n1 = n2; - n2 = tmp; - } - - /* We spend ~40% of our time here because these accesses - * are not in order and gets very poor cache behavior */ - pair_index = n2 - n1 - 1 + (-1 * n1 * (n1 - 2 * kc_vecs->n + 1)) / 2; - kc_m[pair_index] = depth; - kc_M[pair_index] = time; - - if (sample2_index == right_sample[v]) { - break; - } - sample2_index = next_sample[sample2_index]; - } - if (sample1_index == right_sample[u]) { - break; - } - sample1_index = next_sample[sample1_index]; - } -} - -struct kc_stack_elmt { - tsk_id_t node; - tsk_size_t depth; -}; - -static int -fill_kc_vectors(const tsk_tree_t *t, kc_vectors *kc_vecs) -{ - int stack_top; - tsk_size_t depth; - double time; - const double *times; - struct kc_stack_elmt *stack; - tsk_id_t root, u, c1, c2; - int ret = 0; - const tsk_treeseq_t *ts = t->tree_sequence; - - stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack)); - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - times = t->tree_sequence->tables->nodes.time; - - for (root = tsk_tree_get_left_root(t); root != TSK_NULL; root = t->right_sib[root]) { - stack_top = 0; - stack[stack_top].node = root; - stack[stack_top].depth = 0; - while (stack_top >= 0) { - u = stack[stack_top].node; - depth = stack[stack_top].depth; - stack_top--; - - if (tsk_tree_is_sample(t, u)) { - time = tsk_tree_get_branch_length_unsafe(t, u); - update_kc_vectors_single_sample(ts, kc_vecs, u, time); - } - - /* Don't bother going deeper if there are no samples under this node */ - if (t->left_sample[u] != TSK_NULL) { - for (c1 = t->left_child[u]; c1 != TSK_NULL; c1 = t->right_sib[c1]) { - stack_top++; - stack[stack_top].node = c1; - stack[stack_top].depth = depth + 1; - - for (c2 = t->right_sib[c1]; c2 != TSK_NULL; c2 = t->right_sib[c2]) { - time = times[root] - times[u]; - update_kc_vectors_all_pairs(t, kc_vecs, c1, c2, depth, time); - } - } - } - } - } - -out: - tsk_safe_free(stack); - return ret; -} - -static double -norm_kc_vectors(kc_vectors *self, kc_vectors *other, double lambda) -{ - double vT1, vT2, distance_sum; - tsk_id_t i; - - distance_sum = 0; - for (i = 0; i < self->n + self->N; i++) { - vT1 = ((double) self->m[i] * (1 - lambda)) + (lambda * self->M[i]); - vT2 = ((double) other->m[i] * (1 - lambda)) + (lambda * other->M[i]); - distance_sum += (vT1 - vT2) * (vT1 - vT2); - } - - return sqrt(distance_sum); -} - -static int -check_kc_distance_tree_inputs(const tsk_tree_t *self) -{ - tsk_id_t u, num_nodes, left_child; - int ret = 0; - - if (tsk_tree_get_num_roots(self) != 1) { - ret = TSK_ERR_MULTIPLE_ROOTS; - goto out; - } - if (!tsk_tree_has_sample_lists(self)) { - ret = TSK_ERR_NO_SAMPLE_LISTS; - goto out; - } - - num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(self->tree_sequence); - for (u = 0; u < num_nodes; u++) { - left_child = self->left_child[u]; - if (left_child != TSK_NULL && left_child == self->right_child[u]) { - ret = TSK_ERR_UNARY_NODES; - goto out; - } - } -out: - return ret; -} - -static int -check_kc_distance_samples_inputs(const tsk_treeseq_t *self, const tsk_treeseq_t *other) -{ - const tsk_id_t *samples, *other_samples; - tsk_id_t i, n; - int ret = 0; - - if (self->num_samples != other->num_samples) { - ret = TSK_ERR_SAMPLE_SIZE_MISMATCH; - goto out; - } - - samples = self->samples; - other_samples = other->samples; - n = (tsk_id_t) self->num_samples; - for (i = 0; i < n; i++) { - if (samples[i] != other_samples[i]) { - ret = TSK_ERR_SAMPLES_NOT_EQUAL; - goto out; - } - } -out: - return ret; -} - -int -tsk_tree_kc_distance( - const tsk_tree_t *self, const tsk_tree_t *other, double lambda, double *result) -{ - tsk_id_t n, i; - kc_vectors vecs[2]; - const tsk_tree_t *trees[2] = { self, other }; - int ret = 0; - - for (i = 0; i < 2; i++) { - tsk_memset(&vecs[i], 0, sizeof(kc_vectors)); - } - - ret = check_kc_distance_samples_inputs(self->tree_sequence, other->tree_sequence); - if (ret != 0) { - goto out; - } - for (i = 0; i < 2; i++) { - ret = check_kc_distance_tree_inputs(trees[i]); - if (ret != 0) { - goto out; - } - } - - n = (tsk_id_t) self->tree_sequence->num_samples; - for (i = 0; i < 2; i++) { - ret = kc_vectors_alloc(&vecs[i], n); - if (ret != 0) { - goto out; - } - ret = fill_kc_vectors(trees[i], &vecs[i]); - if (ret != 0) { - goto out; - } - } - - *result = norm_kc_vectors(&vecs[0], &vecs[1], lambda); -out: - for (i = 0; i < 2; i++) { - kc_vectors_free(&vecs[i]); - } - return ret; -} - -static int -check_kc_distance_tree_sequence_inputs( - const tsk_treeseq_t *self, const tsk_treeseq_t *other) -{ - int ret = 0; - - if (self->tables->sequence_length != other->tables->sequence_length) { - ret = TSK_ERR_SEQUENCE_LENGTH_MISMATCH; - goto out; - } - - ret = check_kc_distance_samples_inputs(self, other); - if (ret != 0) { - goto out; - } - -out: - return ret; -} - -static void -update_kc_pair_with_sample(const tsk_tree_t *self, kc_vectors *kc, tsk_id_t sample, - tsk_size_t *depths, double root_time) -{ - tsk_id_t c, p, sib; - double time; - tsk_size_t depth; - double *times = self->tree_sequence->tables->nodes.time; - - c = sample; - for (p = self->parent[sample]; p != TSK_NULL; p = self->parent[p]) { - time = root_time - times[p]; - depth = depths[p]; - for (sib = self->left_child[p]; sib != TSK_NULL; sib = self->right_sib[sib]) { - if (sib != c) { - update_kc_vectors_all_pairs(self, kc, sample, sib, depth, time); - } - } - c = p; - } -} - -static int -update_kc_subtree_state( - tsk_tree_t *t, kc_vectors *kc, tsk_id_t u, tsk_size_t *depths, double root_time) -{ - int stack_top; - tsk_id_t v, c; - tsk_id_t *stack = NULL; - int ret = 0; - - stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack)); - if (stack == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - stack_top = 0; - stack[stack_top] = u; - while (stack_top >= 0) { - v = stack[stack_top]; - stack_top--; - - if (tsk_tree_is_sample(t, v)) { - update_kc_pair_with_sample(t, kc, v, depths, root_time); - } - for (c = t->left_child[v]; c != TSK_NULL; c = t->right_sib[c]) { - if (depths[c] != 0) { - depths[c] = depths[v] + 1; - stack_top++; - stack[stack_top] = c; - } - } - } - -out: - tsk_safe_free(stack); - return ret; -} - -static int -update_kc_incremental(tsk_tree_t *self, kc_vectors *kc, tsk_edge_list_t *edges_out, - tsk_edge_list_t *edges_in, tsk_size_t *depths) -{ - int ret = 0; - tsk_edge_list_node_t *record; - tsk_edge_t *e; - tsk_id_t u; - double root_time, time; - const double *times = self->tree_sequence->tables->nodes.time; - - /* Update state of detached subtrees */ - for (record = edges_out->tail; record != NULL; record = record->prev) { - e = &record->edge; - u = e->child; - depths[u] = 0; - - if (self->parent[u] == TSK_NULL) { - root_time = times[tsk_tree_node_root(self, u)]; - ret = update_kc_subtree_state(self, kc, u, depths, root_time); - if (ret != 0) { - goto out; - } - } - } - - /* Propagate state change down into reattached subtrees. */ - for (record = edges_in->tail; record != NULL; record = record->prev) { - e = &record->edge; - u = e->child; - - tsk_bug_assert(depths[e->child] == 0); - depths[u] = depths[e->parent] + 1; - - root_time = times[tsk_tree_node_root(self, u)]; - ret = update_kc_subtree_state(self, kc, u, depths, root_time); - if (ret != 0) { - goto out; - } - - if (tsk_tree_is_sample(self, u)) { - time = tsk_tree_get_branch_length_unsafe(self, u); - update_kc_vectors_single_sample(self->tree_sequence, kc, u, time); - } - } - -out: - return ret; -} - -int -tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, - double lambda_, double *result) -{ - int i; - tsk_id_t n; - tsk_size_t num_nodes; - double left, span, total; - const tsk_treeseq_t *treeseqs[2] = { self, other }; - tsk_tree_t trees[2]; - kc_vectors kcs[2]; - tsk_diff_iter_t diff_iters[2]; - tsk_edge_list_t edges_out[2]; - tsk_edge_list_t edges_in[2]; - tsk_size_t *depths[2]; - double t0_left, t0_right, t1_left, t1_right; - int ret = 0; - - for (i = 0; i < 2; i++) { - tsk_memset(&trees[i], 0, sizeof(trees[i])); - tsk_memset(&diff_iters[i], 0, sizeof(diff_iters[i])); - tsk_memset(&kcs[i], 0, sizeof(kcs[i])); - tsk_memset(&edges_out[i], 0, sizeof(edges_out[i])); - tsk_memset(&edges_in[i], 0, sizeof(edges_in[i])); - depths[i] = NULL; - } - - ret = check_kc_distance_tree_sequence_inputs(self, other); - if (ret != 0) { - goto out; - } - - n = (tsk_id_t) self->num_samples; - for (i = 0; i < 2; i++) { - ret = tsk_tree_init(&trees[i], treeseqs[i], TSK_SAMPLE_LISTS); - if (ret != 0) { - goto out; - } - ret = tsk_diff_iter_init_from_ts(&diff_iters[i], treeseqs[i], false); - if (ret != 0) { - goto out; - } - ret = kc_vectors_alloc(&kcs[i], n); - if (ret != 0) { - goto out; - } - num_nodes = tsk_treeseq_get_num_nodes(treeseqs[i]); - depths[i] = tsk_calloc(num_nodes, sizeof(*depths[i])); - if (depths[i] == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - } - - total = 0; - left = 0; - - ret = tsk_tree_first(&trees[0]); - if (ret != TSK_TREE_OK) { - goto out; - } - ret = check_kc_distance_tree_inputs(&trees[0]); - if (ret != 0) { - goto out; - } - ret = tsk_diff_iter_next( - &diff_iters[0], &t0_left, &t0_right, &edges_out[0], &edges_in[0]); - tsk_bug_assert(ret == TSK_TREE_OK); - ret = update_kc_incremental( - &trees[0], &kcs[0], &edges_out[0], &edges_in[0], depths[0]); - if (ret != 0) { - goto out; - } - while ((ret = tsk_tree_next(&trees[1])) == TSK_TREE_OK) { - ret = check_kc_distance_tree_inputs(&trees[1]); - if (ret != 0) { - goto out; - } - ret = tsk_diff_iter_next( - &diff_iters[1], &t1_left, &t1_right, &edges_out[1], &edges_in[1]); - tsk_bug_assert(ret == TSK_TREE_OK); - - ret = update_kc_incremental( - &trees[1], &kcs[1], &edges_out[1], &edges_in[1], depths[1]); - if (ret != 0) { - goto out; - } - while (t0_right < t1_right) { - span = t0_right - left; - total += norm_kc_vectors(&kcs[0], &kcs[1], lambda_) * span; - - left = t0_right; - ret = tsk_tree_next(&trees[0]); - tsk_bug_assert(ret == TSK_TREE_OK); - ret = check_kc_distance_tree_inputs(&trees[0]); - if (ret != 0) { - goto out; - } - ret = tsk_diff_iter_next( - &diff_iters[0], &t0_left, &t0_right, &edges_out[0], &edges_in[0]); - tsk_bug_assert(ret == TSK_TREE_OK); - ret = update_kc_incremental( - &trees[0], &kcs[0], &edges_out[0], &edges_in[0], depths[0]); - if (ret != 0) { - goto out; - } - } - span = t1_right - left; - left = t1_right; - total += norm_kc_vectors(&kcs[0], &kcs[1], lambda_) * span; - } - if (ret != 0) { - goto out; - } - - *result = total / self->tables->sequence_length; -out: - for (i = 0; i < 2; i++) { - tsk_tree_free(&trees[i]); - tsk_diff_iter_free(&diff_iters[i]); - kc_vectors_free(&kcs[i]); - tsk_safe_free(depths[i]); - } - return ret; -} diff --git a/subprojects/tskit/tskit/trees.h b/subprojects/tskit/tskit/trees.h deleted file mode 100644 index efe998007..000000000 --- a/subprojects/tskit/tskit/trees.h +++ /dev/null @@ -1,1730 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2019-2023 Tskit Developers - * Copyright (c) 2015-2018 University of Oxford - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/** - * @file trees.h - * @brief Tskit core tree sequence operations. - */ -#ifndef TSK_TREES_H -#define TSK_TREES_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -// clang-format off - -/* - * These are both undocumented options for tsk_tree_init - */ -#define TSK_SAMPLE_LISTS (1 << 1) -#define TSK_NO_SAMPLE_COUNTS (1 << 2) - -#define TSK_STAT_SITE (1 << 0) -#define TSK_STAT_BRANCH (1 << 1) -#define TSK_STAT_NODE (1 << 2) - -/* Leave room for other stat types */ -#define TSK_STAT_POLARISED (1 << 10) -#define TSK_STAT_SPAN_NORMALISE (1 << 11) -#define TSK_STAT_ALLOW_TIME_UNCALIBRATED (1 << 12) - -/* Options for map_mutations */ -#define TSK_MM_FIXED_ANCESTRAL_STATE (1 << 0) - -#define TSK_DIR_FORWARD 1 -#define TSK_DIR_REVERSE -1 - -/** -@defgroup API_FLAGS_TS_INIT_GROUP :c:func:`tsk_treeseq_init` specific flags. -@{ -*/ -/** -If specified edge indexes will be built and stored in the table collection -when the tree sequence is initialised. Indexes are required for a valid -tree sequence, and are not built by default for performance reasons. -*/ -#define TSK_TS_INIT_BUILD_INDEXES (1 << 0) -/** @} */ - -// clang-format on - -/** -@brief The tree sequence object. -*/ -typedef struct { - tsk_size_t num_trees; - tsk_size_t num_samples; - tsk_id_t *samples; - /* Does this tree sequence have time_units == "uncalibrated" */ - bool time_uncalibrated; - /* Are all genome coordinates discrete? */ - bool discrete_genome; - /* Are all time values discrete? */ - bool discrete_time; - /* Min and max time in node table and mutation table */ - double min_time; - double max_time; - /* Breakpoints along the sequence, including 0 and L. */ - double *breakpoints; - /* If a node is a sample, map to its index in the samples list */ - tsk_id_t *sample_index_map; - /* Map individuals to the list of nodes that reference them */ - tsk_id_t *individual_nodes_mem; - tsk_id_t **individual_nodes; - tsk_size_t *individual_nodes_length; - /* For each tree, a list of sites on that tree */ - tsk_site_t *tree_sites_mem; - tsk_site_t **tree_sites; - tsk_size_t *tree_sites_length; - /* For each site, a list of mutations at that site */ - tsk_mutation_t *site_mutations_mem; - tsk_mutation_t **site_mutations; - tsk_size_t *site_mutations_length; - /** @brief The table collection underlying this tree sequence, This table - * collection must be treated as read-only, and any changes to it will - * lead to undefined behaviour. */ - tsk_table_collection_t *tables; -} tsk_treeseq_t; - -/** -@brief A single tree in a tree sequence. - -@rst -A ``tsk_tree_t`` object has two basic functions: - -1. Represent the state of a single tree in a tree sequence; -2. Provide methods to transform this state into different trees in the sequence. - -The state of a single tree in the tree sequence is represented using the -quintuply linked encoding: please see the -:ref:`data model ` section for details on -how this works. The left-to-right ordering of nodes in this encoding -is arbitrary, and may change depending on the order in which trees are -accessed within the sequence. Please see the -:ref:`sec_c_api_examples_tree_traversals` examples for recommended -usage. - -On initialisation, a tree is in the :ref:`null state` and -we must call one of the :ref:`seeking` methods to make -the state of the tree object correspond to a particular tree in the sequence. -Please see the :ref:`sec_c_api_examples_tree_iteration` examples for -recommended usage. - -@endrst - */ -typedef struct { - /** - * @brief The parent tree sequence. - */ - const tsk_treeseq_t *tree_sequence; - /** - @brief The ID of the "virtual root" whose children are the roots of the - tree. - */ - tsk_id_t virtual_root; - /** - @brief The parent of node u is parent[u]. Equal to ``TSK_NULL`` if node u is - a root or is not a node in the current tree. - */ - tsk_id_t *parent; - /** - @brief The leftmost child of node u is left_child[u]. Equal to ``TSK_NULL`` - if node u is a leaf or is not a node in the current tree. - */ - tsk_id_t *left_child; - /** - @brief The rightmost child of node u is right_child[u]. Equal to ``TSK_NULL`` - if node u is a leaf or is not a node in the current tree. - */ - tsk_id_t *right_child; - /** - @brief The sibling to the left of node u is left_sib[u]. Equal to - ``TSK_NULL`` if node u has no siblings to its left. - */ - tsk_id_t *left_sib; - /** - @brief The sibling to the right of node u is right_sib[u]. Equal to - ``TSK_NULL`` if node u has no siblings to its right. - */ - tsk_id_t *right_sib; - /** - @brief The number of children of node u is num_children[u]. - */ - tsk_id_t *num_children; - /** - @brief Array of edge ids where ``edge[u]`` is the edge that encodes the - relationship between the child node ``u`` and its parent. Equal to - ``TSK_NULL`` if node ``u`` is a root, virtual root or is not a node in the - current tree. - */ - tsk_id_t *edge; - /** - @brief The total number of edges defining the topology of this tree. - This is equal to the number of tree sequence edges that intersect with - the tree's genomic interval. - */ - tsk_size_t num_edges; - /** - @brief Left and right coordinates of the genomic interval that this - tree covers. The left coordinate is inclusive and the right coordinate - exclusive. - - @rst - - Example: - - .. code-block:: c - - tsk_tree_t tree; - int ret; - // initialise etc - ret = tsk_tree_first(&tree); - // Check for error - assert(ret == TSK_TREE_OK); - printf("Coordinates covered by first tree are left=%f, right=%f\n", - tree.interval.left, tree.interval.right); - - @endrst - - */ - struct { - double left; - double right; - } interval; - /** - @brief The index of this tree in the tree sequence. - - @rst - This attribute provides the zero-based index of the tree represented by the - current state of the struct within the parent tree sequence. For example, - immediately after we call ``tsk_tree_first(&tree)``, ``tree.index`` will - be zero, and after we call ``tsk_tree_last(&tree)``, ``tree.index`` will - be the number of trees - 1 (see :c:func:`tsk_treeseq_get_num_trees`) - When the tree is in the null state (immediately after initialisation, - or after, e.g., calling :c:func:`tsk_tree_prev` on the first tree) - the value of the ``index`` is -1. - @endrst - */ - tsk_id_t index; - /* Attributes below are private and should not be used in client code. */ - tsk_size_t num_nodes; - tsk_flags_t options; - tsk_size_t root_threshold; - const tsk_id_t *samples; - /* - These are involved in the optional sample tracking; num_samples counts - all samples below a give node, and num_tracked_samples counts those - from a specific subset. By default sample counts are tracked and roots - maintained. If ``TSK_NO_SAMPLE_COUNTS`` is specified, then neither sample - counts or roots are available. - */ - tsk_size_t *num_samples; - tsk_size_t *num_tracked_samples; - /* These are for the optional sample list tracking. */ - tsk_id_t *left_sample; - tsk_id_t *right_sample; - tsk_id_t *next_sample; - /* The sites on this tree */ - const tsk_site_t *sites; - tsk_size_t sites_length; - /* Counters needed for next() and prev() transformations. */ - int direction; - tsk_id_t left_index; - tsk_id_t right_index; -} tsk_tree_t; - -/****************************************************************************/ -/* Tree sequence.*/ -/****************************************************************************/ - -/** -@defgroup TREESEQ_API_GROUP Tree sequence API -@{ -*/ - -/** -@brief Initialises the tree sequence based on the specified table collection. - -@rst -This method will copy the supplied table collection unless :c:macro:`TSK_TAKE_OWNERSHIP` -is specified. The table collection will be checked for integrity and index maps built. - -This must be called before any operations are performed on the tree sequence. -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. - -If specified, TSK_TAKE_OWNERSHIP takes immediate ownership of the tables, regardless -of error conditions. - -**Options** - -- :c:macro:`TSK_TS_INIT_BUILD_INDEXES` -- :c:macro:`TSK_TAKE_OWNERSHIP` (applies to the table collection). -@endrst - -@param self A pointer to an uninitialised tsk_table_collection_t object. -@param tables A pointer to a tsk_table_collection_t object. -@param options Allocation time options. See above for details. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_init( - tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); - -/** -@brief Load a tree sequence from a file path. - -@rst -Loads the data from the specified file into this tree sequence. -The tree sequence is also initialised. -The resources allocated must be freed using -:c:func:`tsk_treeseq_free` even in error conditions. - -Works similarly to :c:func:`tsk_table_collection_load` please see -that function's documentation for details and options. - -**Examples** - -.. code-block:: c - - int ret; - tsk_treeseq_t ts; - ret = tsk_treeseq_load(&ts, "data.trees", 0); - if (ret != 0) { - fprintf(stderr, "Load error:%s\n", tsk_strerror(ret)); - exit(EXIT_FAILURE); - } - -@endrst - -@param self A pointer to an uninitialised tsk_treeseq_t object -@param filename A NULL terminated string containing the filename. -@param options Bitwise options. See above for details. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options); - -/** -@brief Load a tree sequence from a stream. - -@rst -Loads a tree sequence from the specified file stream. The tree sequence -is also initialised. The resources allocated must be freed using -:c:func:`tsk_treeseq_free` even in error conditions. - -Works similarly to :c:func:`tsk_table_collection_loadf` please -see that function's documentation for details and options. - -@endrst - -@param self A pointer to an uninitialised tsk_treeseq_t object. -@param file A FILE stream opened in an appropriate mode for reading (e.g. - "r", "r+" or "w+") positioned at the beginning of a tree sequence - definition. -@param options Bitwise options. See above for details. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options); - -/** -@brief Write a tree sequence to file. - -@rst -Writes the data from this tree sequence to the specified file. - -If an error occurs the file path is deleted, ensuring that only complete -and well formed files will be written. -@endrst - -@param self A pointer to an initialised tsk_treeseq_t object. -@param filename A NULL terminated string containing the filename. -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_dump( - const tsk_treeseq_t *self, const char *filename, tsk_flags_t options); - -/** -@brief Write a tree sequence to a stream. - -@rst -Writes the data from this tree sequence to the specified FILE stream. -Semantics are identical to :c:func:`tsk_treeseq_dump`. - -Please see the :ref:`sec_c_api_examples_file_streaming` section for an example -of how to sequentially dump and load tree sequences from a stream. -@endrst - -@param self A pointer to an initialised tsk_treeseq_t object. -@param file A FILE stream opened in an appropriate mode for writing (e.g. - "w", "a", "r+" or "w+"). -@param options Bitwise options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_dumpf(const tsk_treeseq_t *self, FILE *file, tsk_flags_t options); - -/** -@brief Copies the state of the table collection underlying this tree sequence -into the specified destination table collection. - -@rst -By default the method initialises the specified destination table collection. If the -destination is already initialised, the :c:macro:`TSK_NO_INIT` option should -be supplied to avoid leaking memory. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@param tables A pointer to a tsk_table_collection_t object. If the TSK_NO_INIT -option is specified, this must be an initialised table collection. If not, it must be an -uninitialised table collection. -@param options Bitwise option flags. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_copy_tables( - const tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified tree sequence. - -@param self A pointer to an initialised tsk_treeseq_t object. -@return Always returns 0. -*/ -int tsk_treeseq_free(tsk_treeseq_t *self); - -/** -@brief Print out the state of this tree sequence to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_treeseq_t object. -@param out The stream to write the summary to. -*/ -void tsk_treeseq_print_state(const tsk_treeseq_t *self, FILE *out); - -/** -@brief Get the number of nodes - -@rst -Returns the number of nodes in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of nodes. -*/ -tsk_size_t tsk_treeseq_get_num_nodes(const tsk_treeseq_t *self); - -/** -@brief Get the number of edges - -@rst -Returns the number of edges in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of edges. -*/ - -tsk_size_t tsk_treeseq_get_num_edges(const tsk_treeseq_t *self); - -/** -@brief Get the number of migrations - -@rst -Returns the number of migrations in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of migrations. -*/ -tsk_size_t tsk_treeseq_get_num_migrations(const tsk_treeseq_t *self); - -/** -@brief Get the number of sites - -@rst -Returns the number of sites in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of sites. -*/ -tsk_size_t tsk_treeseq_get_num_sites(const tsk_treeseq_t *self); - -/** -@brief Get the number of mutations - -@rst -Returns the number of mutations in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of mutations. -*/ -tsk_size_t tsk_treeseq_get_num_mutations(const tsk_treeseq_t *self); - -/** -@brief Get the number of provenances - -@rst -Returns the number of provenances in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of provenances. -*/ -tsk_size_t tsk_treeseq_get_num_provenances(const tsk_treeseq_t *self); - -/** -@brief Get the number of populations - -@rst -Returns the number of populations in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of populations. -*/ -tsk_size_t tsk_treeseq_get_num_populations(const tsk_treeseq_t *self); - -/** -@brief Get the number of individuals - -@rst -Returns the number of individuals in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of individuals. -*/ -tsk_size_t tsk_treeseq_get_num_individuals(const tsk_treeseq_t *self); - -/** -@brief Return the number of trees in this tree sequence. - -@rst -This is a constant time operation. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return The number of trees in the tree sequence. -*/ -tsk_size_t tsk_treeseq_get_num_trees(const tsk_treeseq_t *self); - -/** -@brief Get the number of samples - -@rst -Returns the number of nodes marked as samples in this tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the number of samples. -*/ -tsk_size_t tsk_treeseq_get_num_samples(const tsk_treeseq_t *self); - -/** -@brief Get the top-level tree sequence metadata. - -@rst -Returns a pointer to the metadata string, which is owned by the tree sequence and -not null-terminated. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns a pointer to the metadata. -*/ -const char *tsk_treeseq_get_metadata(const tsk_treeseq_t *self); - -/** -@brief Get the length of top-level tree sequence metadata - -@rst -Returns the length of the metadata string. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the length of the metadata. -*/ -tsk_size_t tsk_treeseq_get_metadata_length(const tsk_treeseq_t *self); - -/** -@brief Get the top-level tree sequence metadata schema. - -@rst -Returns a pointer to the metadata schema string, which is owned by the tree sequence and -not null-terminated. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns a pointer to the metadata schema. -*/ -const char *tsk_treeseq_get_metadata_schema(const tsk_treeseq_t *self); - -/** -@brief Get the length of the top-level tree sequence metadata schema. - -@rst -Returns the length of the metadata schema string. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the length of the metadata schema. -*/ -tsk_size_t tsk_treeseq_get_metadata_schema_length(const tsk_treeseq_t *self); - -/** -@brief Get the time units string - -@rst -Returns a pointer to the time units string, which is owned by the tree sequence and -not null-terminated. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns a pointer to the time units. -*/ -const char *tsk_treeseq_get_time_units(const tsk_treeseq_t *self); - -/** -@brief Get the length of time units string -@rst -Returns the length of the time units string. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the length of the time units. -*/ -tsk_size_t tsk_treeseq_get_time_units_length(const tsk_treeseq_t *self); - -/** -@brief Get the file uuid - -@rst -Returns a pointer to the null-terminated file uuid string, which is owned by the tree -sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns a pointer to the time units. -*/ -const char *tsk_treeseq_get_file_uuid(const tsk_treeseq_t *self); - -/** -@brief Get the sequence length - -@rst -Returns the sequence length of this tree sequence -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the sequence length. -*/ -double tsk_treeseq_get_sequence_length(const tsk_treeseq_t *self); - -/** -@brief Get the breakpoints - -@rst -Returns an array of breakpoint locations, the array is owned by the tree sequence. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the pointer to the breakpoint array. -*/ -const double *tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self); - -/** -@brief Get the samples - -@rst -Returns an array of ids of sample nodes in this tree sequence. -I.e. nodes that have the :c:macro:`TSK_NODE_IS_SAMPLE` flag set. -The array is owned by the tree sequence and should not be modified or free'd. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the pointer to the sample node id array. -*/ -const tsk_id_t *tsk_treeseq_get_samples(const tsk_treeseq_t *self); - -/** -@brief Get the map of node id to sample index - -@rst -Returns the location of each node in the list of samples or -:c:macro:`TSK_NULL` for nodes that are not samples. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the pointer to the array of sample indexes. -*/ -const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self); - -/** -@brief Check if a node is a sample - -@rst -Returns the sample status of a given node id. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param u The id of the node to be checked. -@return Returns true if the node is a sample. -*/ -bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u); - -/** -@brief Get the discrete genome status - -@rst -If all the genomic locations in the tree sequence are discrete integer values -then this flag will be true. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@return Returns true if all genomic locations are discrete. -*/ -bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self); - -/** -@brief Get the discrete time status - -@rst -If all times in the tree sequence are discrete integer values -then this flag will be true -@endrst -@param self A pointer to a tsk_treeseq_t object. -@return Returns true if all times are discrete. -*/ -bool tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self); - -/** -@brief Get the min time in node table and mutation table - -@rst -The times stored in both the node and mutation tables are considered. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the min time of all nodes and mutations. -*/ -double tsk_treeseq_get_min_time(const tsk_treeseq_t *self); - -/** -@brief Get the max time in node table and mutation table - -@rst -The times stored in both the node and mutation tables are considered. -@endrst - -@param self A pointer to a tsk_treeseq_t object. -@return Returns the max time of all nodes and mutations. -*/ -double tsk_treeseq_get_max_time(const tsk_treeseq_t *self); - -/** -@brief Get a node by its index - -@rst -Copies a node from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The node index to copy -@param node A pointer to a tsk_node_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node); - -/** -@brief Get a edge by its index - -@rst -Copies a edge from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The edge index to copy -@param edge A pointer to a tsk_edge_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge); - -/** -@brief Get a edge by its index - -@rst -Copies a migration from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The migration index to copy -@param migration A pointer to a tsk_migration_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_migration( - const tsk_treeseq_t *self, tsk_id_t index, tsk_migration_t *migration); - -/** -@brief Get a site by its index - -@rst -Copies a site from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The site index to copy -@param site A pointer to a tsk_site_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_site(const tsk_treeseq_t *self, tsk_id_t index, tsk_site_t *site); - -/** -@brief Get a mutation by its index - -@rst -Copies a mutation from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The mutation index to copy -@param mutation A pointer to a tsk_mutation_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_mutation( - const tsk_treeseq_t *self, tsk_id_t index, tsk_mutation_t *mutation); - -/** -@brief Get a provenance by its index - -@rst -Copies a provenance from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The provenance index to copy -@param provenance A pointer to a tsk_provenance_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_provenance( - const tsk_treeseq_t *self, tsk_id_t index, tsk_provenance_t *provenance); - -/** -@brief Get a population by its index - -@rst -Copies a population from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The population index to copy -@param population A pointer to a tsk_population_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_population( - const tsk_treeseq_t *self, tsk_id_t index, tsk_population_t *population); - -/** -@brief Get a individual by its index - -@rst -Copies a individual from this tree sequence to the specified destination. -@endrst -@param self A pointer to a tsk_treeseq_t object. -@param index The individual index to copy -@param individual A pointer to a tsk_individual_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_get_individual( - const tsk_treeseq_t *self, tsk_id_t index, tsk_individual_t *individual); - -/** -@brief Create a simplified instance of this tree sequence - -@rst -Copies this tree sequence to the specified destination and performs simplification. -The destination tree sequence should be uninitialised. -Simplification transforms the tables to remove redundancy and canonicalise -tree sequence data. See the :ref:`simplification ` tutorial for -more details. - -For full details and flags see :c:func:`tsk_table_collection_simplify` which performs -the same operation in place. - -@endrst -@param self A pointer to a uninitialised tsk_treeseq_t object. -@param samples Either NULL or an array of num_samples distinct and valid node IDs. - If non-null the nodes in this array will be marked as samples in the output. - If NULL, the num_samples parameter is ignored and the samples in the output - will be the same as the samples in the input. This is equivalent to populating - the samples array with all of the sample nodes in the input in increasing - order of ID. -@param num_samples The number of node IDs in the input samples array. Ignored - if the samples array is NULL. -@param options Simplify options; see above for the available bitwise flags. - For the default behaviour, a value of 0 should be provided. -@param output A pointer to an uninitialised tsk_treeseq_t object. -@param node_map If not NULL, this array will be filled to define the mapping - between nodes IDs in the table collection before and after simplification. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, - tsk_size_t num_samples, tsk_flags_t options, tsk_treeseq_t *output, - tsk_id_t *node_map); - -/** @} */ - -int tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t flags, - tsk_id_t population, const char *metadata, tsk_size_t metadata_length, - tsk_flags_t options, tsk_treeseq_t *output); - -bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); - -int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output); -int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output); - -int tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, - double lambda_, double *result); - -int tsk_treeseq_genealogical_nearest_neighbours(const tsk_treeseq_t *self, - const tsk_id_t *focal, tsk_size_t num_focal, const tsk_id_t *const *reference_sets, - const tsk_size_t *reference_set_size, tsk_size_t num_reference_sets, - tsk_flags_t options, double *ret_array); -int tsk_treeseq_mean_descendants(const tsk_treeseq_t *self, - const tsk_id_t *const *reference_sets, const tsk_size_t *reference_set_size, - tsk_size_t num_reference_sets, tsk_flags_t options, double *ret_array); - -typedef int general_stat_func_t(tsk_size_t state_dim, const double *state, - tsk_size_t result_dim, double *result, void *params); - -int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const double *W, - tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); - -/* One way weighted stats */ - -typedef int one_way_weighted_method(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); - -int tsk_treeseq_trait_covariance(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); -int tsk_treeseq_trait_correlation(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); - -/* One way weighted stats with covariates */ - -typedef int one_way_covariates_method(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_covariates, const double *covariates, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); - -int tsk_treeseq_trait_linear_model(const tsk_treeseq_t *self, tsk_size_t num_weights, - const double *weights, tsk_size_t num_covariates, const double *covariates, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); - -/* One way sample set stats */ - -typedef int one_way_sample_stat_method(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); - -int tsk_treeseq_diversity(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_segregating_sites(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); - -typedef int general_sample_stat_method(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes, - tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); - -int tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_Y2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, - const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); - -/* Three way sample set stats */ -int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_f3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); - -/* Four way sample set stats */ -int tsk_treeseq_f4(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); - -/****************************************************************************/ -/* Tree */ -/****************************************************************************/ - -/** -@defgroup TREE_API_LIFECYCLE_GROUP Tree lifecycle -@{ -*/ - -/** -@brief Initialises the tree by allocating internal memory and associating - with the specified tree sequence. - -@rst -This must be called before any operations are performed on the tree. - -The specified tree sequence object must be initialised, and must be -valid for the full lifetime of this tree. - -See the :ref:`sec_c_api_overview_structure` for details on how objects -are initialised and freed. - -The ``options`` parameter is provided to support future expansions -of the API. A number of undocumented internal features are controlled -via this parameter, and it **must** be set to 0 to ensure that operations -work as expected and for compatibility with future versions of tskit. -@endrst - -@param self A pointer to an uninitialised tsk_tree_t object. -@param tree_sequence A pointer to an initialised tsk_treeseq_t object. -@param options Allocation time options. Must be 0, or behaviour is undefined. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_tree_init( - tsk_tree_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); - -/** -@brief Free the internal memory for the specified tree. - -@param self A pointer to an initialised tsk_tree_t object. -@return Always returns 0. -*/ -int tsk_tree_free(tsk_tree_t *self); - -/** -@brief Copies the state of this tree into the specified destination. - -@rst -By default (``options`` = 0) the method initialises the specified destination -tree by calling :c:func:`tsk_tree_init`. If the destination is already -initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid -leaking memory. If :c:macro:`TSK_NO_INIT` is supplied and the tree sequence associated -with the ``dest`` tree is not equal to the tree sequence associated -with ``self``, an error is raised. - -The destination tree will keep a reference to the tree sequence object -associated with the source tree, and this tree sequence must be -valid for the full lifetime of the destination tree. - -**Options** - -- :c:macro:`TSK_NO_INIT` - -If :c:macro:`TSK_NO_INIT` is not specified, options for :c:func:`tsk_tree_init` -can be provided and will be passed on. - -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@param dest A pointer to a tsk_tree_t object. If the TSK_NO_INIT option - is specified, this must be an initialised tree. If not, it must - be an uninitialised tree. -@param options Copy and allocation time options. See the notes above for details. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options); - -/** @} */ - -/** -@defgroup TREE_API_SEEKING_GROUP Seeking along the sequence -@{ -*/ - -/** -@brief Seek to the first tree in the sequence. - -@rst -Set the state of this tree to reflect the first tree in parent -tree sequence. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Return TSK_TREE_OK on success; or a negative value if an error occurs. -*/ -int tsk_tree_first(tsk_tree_t *self); - -/** -@brief Seek to the last tree in the sequence. - -@rst -Set the state of this tree to reflect the last tree in parent -tree sequence. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Return TSK_TREE_OK on success; or a negative value if an error occurs. -*/ -int tsk_tree_last(tsk_tree_t *self); - -/** -@brief Seek to the next tree in the sequence. - -@rst -Set the state of this tree to reflect the next tree in parent -tree sequence. If the index of the current tree is ``j``, -then the after this operation the index will be ``j + 1``. - -Calling :c:func:`tsk_tree_next` a tree in the -:ref:`null state` is equivalent to calling -:c:func:`tsk_tree_first`. - -Calling :c:func:`tsk_tree_next` on the last tree in the -sequence will transform it into the -:ref:`null state` (equivalent to -calling :c:func:`tsk_tree_clear`). - -Please see the :ref:`sec_c_api_examples_tree_iteration` examples for -recommended usage. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Return TSK_TREE_OK on successfully transforming to a -non-null tree; 0 on successfully transforming into the null -tree; or a negative value if an error occurs. -*/ -int tsk_tree_next(tsk_tree_t *self); - -/** -@brief Seek to the previous tree in the sequence. - -@rst -Set the state of this tree to reflect the previous tree in parent -tree sequence. If the index of the current tree is ``j``, -then the after this operation the index will be ``j - 1``. - -Calling :c:func:`tsk_tree_prev` a tree in the -:ref:`null state` is equivalent to calling -:c:func:`tsk_tree_last`. - -Calling :c:func:`tsk_tree_prev` on the first tree in the -sequence will transform it into the -:ref:`null state` (equivalent to -calling :c:func:`tsk_tree_clear`). - -Please see the :ref:`sec_c_api_examples_tree_iteration` examples for -recommended usage. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Return TSK_TREE_OK on successfully transforming to a -non-null tree; 0 on successfully transforming into the null -tree; or a negative value if an error occurs. -*/ -int tsk_tree_prev(tsk_tree_t *self); - -/** -@brief Set the tree into the null state. - -@rst -Transform this tree into the :ref:`null state`. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_tree_clear(tsk_tree_t *self); - -/** -@brief Seek to a particular position on the genome. - -@rst -Set the state of this tree to reflect the tree in parent -tree sequence covering the specified ``position``. That is, on success -we will have ``tree.interval.left <= position`` and -we will have ``position < tree.interval.right``. - -Seeking to a position currently covered by the tree is -a constant time operation. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@param position The position in genome coordinates -@param options Seek options. Currently unused. Set to 0 for compatibility - with future versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_tree_seek(tsk_tree_t *self, double position, tsk_flags_t options); - -/** -@brief Seek to a specific tree in a tree sequence. - -@rst -Set the state of this tree to reflect the tree in parent -tree sequence whose index is ``0 <= tree < num_trees``. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@param tree The target tree index. -@param options Seek options. Currently unused. Set to 0 for compatibility - with future versions of tskit. -@return Return 0 on success or a negative value on failure. -*/ -int tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options); - -/** @} */ - -/** -@defgroup TREE_API_TREE_QUERY_GROUP Tree Queries -@{ -*/ - -/** -@brief Returns the number of roots in this tree. - -@rst -See the :ref:`sec_data_model_tree_roots` section for more information -on how the roots of a tree are defined. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Returns the number roots in this tree. -*/ -tsk_size_t tsk_tree_get_num_roots(const tsk_tree_t *self); - -/** -@brief Returns the leftmost root in this tree. - -@rst -See the :ref:`sec_data_model_tree_roots` section for more information -on how the roots of a tree are defined. - -This function is equivalent to ``tree.left_child[tree.virtual_root]``. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Returns the leftmost root in the tree. -*/ -tsk_id_t tsk_tree_get_left_root(const tsk_tree_t *self); - -/** -@brief Returns the rightmost root in this tree. - -@rst -See the :ref:`sec_data_model_tree_roots` section for more information -on how the roots of a tree are defined. - -This function is equivalent to ``tree.right_child[tree.virtual_root]``. -@endrst - -@param self A pointer to an initialised tsk_tree_t object. -@return Returns the rightmost root in the tree. -*/ -tsk_id_t tsk_tree_get_right_root(const tsk_tree_t *self); - -/** -@brief Get the list of sites for this tree. - -@rst -Gets the list of :c:data:`tsk_site_t` objects in the parent tree sequence -for which the position lies within this tree's genomic interval. - -The memory pointed to by the ``sites`` parameter is managed by the -``tsk_tree_t`` object and must not be altered or freed by client code. - -.. code-block:: c - - static void - print_sites(const tsk_tree_t *tree) - { - int ret; - tsk_size_t j, num_sites; - const tsk_site_t *sites; - - ret = tsk_tree_get_sites(tree, &sites, &num_sites); - check_tsk_error(ret); - for (j = 0; j < num_sites; j++) { - printf("position = %f\n", sites[j].position); - } - } - -This is a constant time operation. - -@endrst - -@param self A pointer to a tsk_tree_t object. -@param sites The destination pointer for the list of sites. -@param sites_length A pointer to a tsk_size_t value in which the number - of sites is stored. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_sites( - const tsk_tree_t *self, const tsk_site_t **sites, tsk_size_t *sites_length); - -/** -@brief Return an upper bound on the number of nodes reachable - from the roots of this tree. - -@rst -This function provides an upper bound on the number of nodes that -can be reached in tree traversals, and is intended to be used -for memory allocation purposes. If ``num_nodes`` is the number -of nodes visited in a tree traversal from the -:ref:`virtual root` -(e.g., ``tsk_tree_preorder_from(tree, tree->virtual_root, nodes, -&num_nodes)``), the bound ``N`` returned here is guaranteed to -be greater than or equal to ``num_nodes``. - -.. warning:: The precise value returned is not defined and should - not be depended on, as it may change from version-to-version. - -@endrst - -@param self A pointer to a tsk_tree_t object. -@return An upper bound on the number nodes reachable from the roots - of this tree, or zero if this tree has not been initialised. -*/ -tsk_size_t tsk_tree_get_size_bound(const tsk_tree_t *self); - -/** -@brief Print out the state of this tree to the specified stream. - -This method is intended for debugging purposes and should not be used -in production code. The format of the output should **not** be depended -on and may change arbitrarily between versions. - -@param self A pointer to a tsk_tree_t object. -@param out The stream to write the summary to. -*/ -void tsk_tree_print_state(const tsk_tree_t *self, FILE *out); - -/** @} */ - -/** -@defgroup TREE_API_NODE_QUERY_GROUP Node Queries -@{ -*/ - -/** -@brief Returns the parent of the specified node. - -@rst -Equivalent to ``tree.parent[u]`` with bounds checking for the node u. -Performance sensitive code which can guarantee that the node u is -valid should use the direct array access in preference to this method. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The tree node. -@param parent A tsk_id_t pointer to store the returned parent node. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_parent(const tsk_tree_t *self, tsk_id_t u, tsk_id_t *parent); - -/** -@brief Returns the time of the specified node. - -@rst -Equivalent to ``tables->nodes.time[u]`` with bounds checking for the node u. -Performance sensitive code which can guarantee that the node u is -valid should use the direct array access in preference to this method, -for example: - -.. code-block:: c - - static void - print_times(const tsk_tree_t *tree) - { - int ret; - tsk_size_t num_nodes, j; - const double *node_time = tree->tree_sequence->tables->nodes.time; - tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); - - if (nodes == NULL) { - errx(EXIT_FAILURE, "Out of memory"); - } - ret = tsk_tree_preorder(tree, nodes, &num_nodes); - check_tsk_error(ret); - for (j = 0; j < num_nodes; j++) { - printf("time = %f\n", node_time[nodes[j]]); - } - free(nodes); - } - -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The tree node. -@param ret_time A double pointer to store the returned node time. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_time(const tsk_tree_t *self, tsk_id_t u, double *ret_time); - -/** -@brief Return number of nodes on the path from the specified node to root. - -@rst -Return the number of nodes on the path from u to root, not including u. -The depth of a root is therefore zero. - -As a special case, the depth of the -:ref:`virtual root ` is defined as -1. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The tree node. -@param ret_depth An int pointer to store the returned node depth. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u, int *ret_depth); - -/** -@brief Return the length of the branch ancestral to the specified node. - -@rst -Return the length of the branch ancestral to the specified node. -Branch length is defined as difference between the time -of a node and its parent. The branch length of a root is zero. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The tree node. -@param ret_branch_length A double pointer to store the returned branch length. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_branch_length( - const tsk_tree_t *self, tsk_id_t u, double *ret_branch_length); - -/** -@brief Computes the sum of the lengths of all branches reachable from - the specified node, or from all roots if ``u=TSK_NULL``. - -@rst -Return the total branch length in a particular subtree or of the -entire tree. If the specified node is :c:macro:`TSK_NULL` (or the -:ref:`virtual root`) -the sum of the lengths of all branches reachable from roots -is returned. Branch length is defined as difference between the time -of a node and its parent. The branch length of a root is zero. - -Note that if the specified node is internal its branch length is -*not* included, so that, e.g., the total branch length of a -leaf node is zero. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The root of the subtree of interest, or ``TSK_NULL`` to return the - total branch length of the tree. -@param ret_tbl A double pointer to store the returned total branch length. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_total_branch_length( - const tsk_tree_t *self, tsk_id_t u, double *ret_tbl); - -/** -@brief Counts the number of samples in the subtree rooted at a node. - -@rst -Returns the number of samples descending from a particular node, -including the node itself. - -This is a constant time operation. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The tree node. -@param ret_num_samples A tsk_size_t pointer to store the returned - number of samples. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_num_samples( - const tsk_tree_t *self, tsk_id_t u, tsk_size_t *ret_num_samples); - -/** -@brief Compute the most recent common ancestor of two nodes. - -@rst -If two nodes do not share a common ancestor in the current tree, the MRCA -node is :c:macro:`TSK_NULL`. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u A tree node. -@param v A tree node. -@param mrca A tsk_id_t pointer to store the returned most recent common ancestor node. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_get_mrca(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v, tsk_id_t *mrca); - -/** -@brief Returns true if u is a descendant of v. - -@rst -Returns true if u and v are both valid nodes in the tree sequence -and v lies on the path from u to root, and false otherwise. - -Any node is a descendant of itself. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param u The descendant node. -@param v The ancestral node. -@return true if u is a descendant of v, and false otherwise. -*/ -bool tsk_tree_is_descendant(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v); - -/** @} */ - -/** -@defgroup TREE_API_TRAVERSAL_GROUP Traversal orders. -@{ -*/ - -/** -@brief Fill an array with the nodes of this tree in preorder. - -@rst -Populate an array with the nodes in this tree in preorder. The array -must be pre-allocated and be sufficiently large to hold the array -of nodes visited. The recommended approach is to use the -:c:func:`tsk_tree_get_size_bound` function, as in the following example: - -.. code-block:: c - - static void - print_preorder(tsk_tree_t *tree) - { - int ret; - tsk_size_t num_nodes, j; - tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); - - if (nodes == NULL) { - errx(EXIT_FAILURE, "Out of memory"); - } - ret = tsk_tree_preorder(tree, nodes, &num_nodes); - check_tsk_error(ret); - for (j = 0; j < num_nodes; j++) { - printf("Visit preorder %lld\n", (long long) nodes[j]); - } - free(nodes); - } - -.. seealso:: - See the :ref:`sec_c_api_examples_tree_traversals` section for - more examples. - -@endrst - -@param self A pointer to a tsk_tree_t object. -@param nodes The tsk_id_t array to store nodes in. See notes above for - details. -@param num_nodes A pointer to a tsk_size_t value where we store the number - of nodes in the traversal. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_preorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes); - -/** -@brief Fill an array with the nodes of this tree starting from a particular node. - -@rst -As for :c:func:`tsk_tree_preorder` but starting the traversal at a particular node -(which will be the first node in the traversal list). The -:ref:`virtual root` is a valid input for this function -and will be treated like any other tree node. The value ``-1`` is a special case, -in which we visit all nodes reachable from the roots, and equivalent to -calling :c:func:`tsk_tree_preorder`. - -See :c:func:`tsk_tree_preorder` for details the requirements for the ``nodes`` -array. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param root The root of the subtree to traverse, or -1 to visit all nodes. -@param nodes The tsk_id_t array to store nodes in. -@param num_nodes A pointer to a tsk_size_t value where we store the number - of nodes in the traversal. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_preorder_from( - const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); - -/** -@brief Fill an array with the nodes of this tree in postorder. - -@rst -Populate an array with the nodes in this tree in postorder. The array -must be pre-allocated and be sufficiently large to hold the array -of nodes visited. The recommended approach is to use the -:c:func:`tsk_tree_get_size_bound` function, as in the following example: - -.. code-block:: c - - static void - print_postorder(tsk_tree_t *tree) - { - int ret; - tsk_size_t num_nodes, j; - tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); - - if (nodes == NULL) { - errx(EXIT_FAILURE, "Out of memory"); - } - ret = tsk_tree_postorder(tree, nodes, &num_nodes); - check_tsk_error(ret); - for (j = 0; j < num_nodes; j++) { - printf("Visit postorder %lld\n", (long long) nodes[j]); - } - free(nodes); - } - -.. seealso:: - See the :ref:`sec_c_api_examples_tree_traversals` section for - more examples. - -@endrst - -@param self A pointer to a tsk_tree_t object. -@param nodes The tsk_id_t array to store nodes in. See notes above for - details. -@param num_nodes A pointer to a tsk_size_t value where we store the number - of nodes in the traversal. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_postorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes); - -/** -@brief Fill an array with the nodes of this tree starting from a particular node. - -@rst -As for :c:func:`tsk_tree_postorder` but starting the traversal at a particular node -(which will be the last node in the traversal list). The -:ref:`virtual root` is a valid input for this function -and will be treated like any other tree node. The value ``-1`` is a special case, -in which we visit all nodes reachable from the roots, and equivalent to -calling :c:func:`tsk_tree_postorder`. - -See :c:func:`tsk_tree_postorder` for details the requirements for the ``nodes`` -array. -@endrst - -@param self A pointer to a tsk_tree_t object. -@param root The root of the subtree to traverse, or -1 to visit all nodes. -@param nodes The tsk_id_t array to store nodes in. See - :c:func:`tsk_tree_postorder` for more details. -@param num_nodes A pointer to a tsk_size_t value where we store the number - of nodes in the traversal. -@return 0 on success or a negative value on failure. -*/ -int tsk_tree_postorder_from( - const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); - -/** @} */ - -/* Undocumented for now */ - -int tsk_tree_preorder_samples_from( - const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); - -int tsk_tree_set_root_threshold(tsk_tree_t *self, tsk_size_t root_threshold); -tsk_size_t tsk_tree_get_root_threshold(const tsk_tree_t *self); - -bool tsk_tree_has_sample_counts(const tsk_tree_t *self); -bool tsk_tree_has_sample_lists(const tsk_tree_t *self); - -int tsk_tree_get_num_tracked_samples( - const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_tracked_samples); -int tsk_tree_set_tracked_samples( - tsk_tree_t *self, tsk_size_t num_tracked_samples, const tsk_id_t *tracked_samples); -int tsk_tree_track_descendant_samples(tsk_tree_t *self, tsk_id_t node); - -typedef struct { - tsk_id_t node; - tsk_id_t parent; - int32_t state; -} tsk_state_transition_t; - -int tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, double *cost_matrix, - tsk_flags_t options, int32_t *ancestral_state, tsk_size_t *num_transitions, - tsk_state_transition_t **transitions); - -int tsk_tree_kc_distance( - const tsk_tree_t *self, const tsk_tree_t *other, double lambda, double *result); - -/* Don't document these balance metrics for now so it doesn't get in the way of - * C API 1.0, but should be straightforward to document based on Python docs. */ -int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result); -int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result); -int tsk_tree_b1_index(const tsk_tree_t *self, double *result); -/* NOTE: if we document this as part of the C API we'll have to be more careful - * about the error behaviour on bad log bases. At the moment we're just returning - * the resulting value which can be nan, inf etc, but some surprising results - * happen like a base 0 seems to return a finite value. */ -int tsk_tree_b2_index(const tsk_tree_t *self, double base, double *result); - -int tsk_tree_num_lineages(const tsk_tree_t *self, double t, tsk_size_t *result); - -/* Things to consider removing: */ - -/* This is redundant, really */ -bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u); - -/* Not terribly useful, since the definition is - * return (self->tree_sequence == other->tree_sequence) && (self->index == other->index) - * Remove? - */ -bool tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other); - -int tsk_diff_iter_init_from_ts( - tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); - -#ifdef __cplusplus -} -#endif -#endif