From f14ab2015624a0d04b522d2f9f63ceb3c349528e Mon Sep 17 00:00:00 2001 From: Vito Zanotelli Date: Fri, 1 Sep 2023 12:32:06 +0200 Subject: [PATCH] Add support for categorical columns with NA Anndata 0.7.8 exports categories with NA with a value -1, causing the reading of mudata/anndata to fail. This adds support for NAs. The convert_categoricals function is added as an internal function in order to facilitate testing of this logic. Fixes: #6 --- R/read_h5mu.R | 30 +++++++++++++++++++++++++----- tests/testthat/test_readh5mu.R | 22 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/R/read_h5mu.R b/R/read_h5mu.R index 73cfc81..66e0bde 100644 --- a/R/read_h5mu.R +++ b/R/read_h5mu.R @@ -25,11 +25,7 @@ read_dataframe <- function(group) { } else { labels <- H5Rdereference(labels, h5loc=col) labels_items <- H5Dread(labels) - n_labels <- length(unique(values)) - if (length(labels_items) > n_labels) { - labels_items <- labels_items[seq_len(n_labels)] - } - values <- factor(as.integer(values), labels=labels_items) + values <- convert_categoricals(values, labels_items) H5Dclose(labels) } H5Aclose(attr) @@ -44,6 +40,30 @@ read_dataframe <- function(group) { do.call(data.frame, args=col_list) } +#' Helper function to convert values + labels into factors +#' +#' @description A helper function to convert categories into factors. +#' Assumptions: +#' - values correspond to the zero indexed categories +#' (i.e. value 0 is the first category) +#' - NA are encoded with a value -1 +#' Categories not uses will be dropped. +#' +#' @param values Vector of integer level numbers (zero indexed). -1 indicate NA +#' @param categories Labels for level numbers (zero indexed). +#' +#' @returns factor with categorical values +#' +#' @keywords internal +#' @noRd +convert_categoricals <- function(values, categories) { + # The levels are 0 indexed integers + levels <- seq_len(length(categories))-1 + value_factor <- factor(as.integer(values), levels, labels=categories) + # Drop unused levels + droplevels(value_factor) +} + #' @importFrom rhdf5 H5Dread H5Aexists H5Aopen H5Aread H5Aclose read_dataframe_legacy <- function(dataset) { table <- H5Dread(dataset) diff --git a/tests/testthat/test_readh5mu.R b/tests/testthat/test_readh5mu.R index 015d2fc..e0ddda5 100644 --- a/tests/testthat/test_readh5mu.R +++ b/tests/testthat/test_readh5mu.R @@ -74,3 +74,25 @@ test_that("a SE object with a sparse matrix written to H5AD can be read", { expect_true(inherits(assay(se_b), "DelayedArray")) } }) + +test_that("Categoricals columns with NA are loaded correctly", { + values <- c(0, -1) + categories <- c('a') + res <- convert_categoricals(values, categories) + expect_equal(as.character(res[1]), 'a') + expect_true(is.na(res[2])) +}) + +test_that("Extra levels in categoricals are ignored", { + values <- c(0) + categories <- c('a', 'b') + res <- convert_categoricals(values, categories) + expect_equal(levels(res), c('a')) +}) + +test_that("Extra levels in categoricals are ignored when NA are present", { + values <- c(0, -1) + categories <- c('a', 'b') + res <- convert_categoricals(values, categories) + expect_equal(levels(res), c('a')) +}) \ No newline at end of file