From d6a5e05cb613981c0db49b9b6c8242d0d3139e3e Mon Sep 17 00:00:00 2001 From: randef1ned <46381867+randef1ned@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:02:35 +0200 Subject: [PATCH 1/5] Fixed the duplicated names --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ R/as_list.R | 85 +++++++++++++++++++++++++++++++++++++++++++++ R/as_xml_document.R | 35 ++++++++++++++++++- R/xml_missing.R | 1 + 5 files changed, 123 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c44b42ae..3efad521 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,7 +36,7 @@ VignetteBuilder: Config/Needs/website: tidyverse/tidytemplate Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 SystemRequirements: libxml2: libxml2-dev (deb), libxml2-devel (rpm) Collate: 'S4.R' diff --git a/NAMESPACE b/NAMESPACE index 8f74da43..6b460acb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -123,7 +123,9 @@ export("xml_attrs<-") export("xml_name<-") export("xml_text<-") export(as_list) +export(as_list2) export(as_xml_document) +export(deduplicate) export(download_html) export(download_xml) export(html_structure) diff --git a/R/as_list.R b/R/as_list.R index be9474fc..179266c8 100644 --- a/R/as_list.R +++ b/R/as_list.R @@ -25,6 +25,7 @@ #' as_list(read_xml(" ")) #' as_list(read_xml("")) #' as_list(read_xml("")) +#' as_list(read_xml("ab")) as_list <- function(x, ns = character(), ...) { UseMethod("as_list") } @@ -106,3 +107,87 @@ r_attrs_to_xml <- function(x) { names(x)[special] <- sub("^\\.", "", names(x)[special]) x } + +#' Coerce xml nodes to a list with better handling duplicate elements +#' +#' This turns an XML document (or node or nodeset) into the equivalent R +#' list. This functions like `as_list()` but ensures elements with duplicate +#' names are put into indexed lists. +#' +#' @inheritParams xml_name +#' @param ... Needed for compatibility with generic. Unused. +#' @export +#' +#' @examples +#' # With duplicate elements +#' xml <- read_xml("ab") +#' lst <- as_list(xml) +#' lst$content$x # Returns "a" solely +#' lst2 <- as_list2(xml) +#' lst2$content$x # Returns "a" and "b" +#' lst2$content$x[[1]] # Returns "a" +#' lst2$content$x[[2]] # Returns "b" +#' +#' # With attributes preserved +#' xml <- read_xml("34") +#' as_list2(xml) +as_list2 <- function(x, ns = character(), ...) { + result <- as_list(x, ns = ns, ...) + + deduplicate(result) +} + + +#' Deduplicate named elements in a list +#' @param lst A list potentially containing duplicate named elements +#' @return A list with duplicate elements consolidated +#' @export +deduplicate <- function(lst) { + if (!is.list(lst) || length(lst) == 0 || is.null(names(lst)) || all(names(lst) == "")) { + return(lst) + } + + attrs <- attributes(lst) + + nms <- names(lst) + + duplicated_names <- unique(nms[duplicated(nms[nms != ""])]) + + if (length(duplicated_names) == 0) { + # Recursively deal with duplications in the list + result <- lapply(lst, deduplicate) + attributes(result) <- attrs + return(result) + } + + for (name in duplicated_names) { + deduplicated_index <- which(nms == name) + + values <- lapply(deduplicated_index, function(i) { + if (is.list(lst[[i]]) && length(lst[[i]]) == 1 && is.character(lst[[i]][[1]])) { + return(lst[[i]][[1]]) + } else { + # Fallback option + return(lst[[i]]) + } + }) + + lst[[deduplicated_index[1]]] <- values + + if (length(deduplicated_index) > 1) { + lst <- lst[-deduplicated_index[-1]] + nms <- nms[-deduplicated_index[-1]] + } + names(lst) <- nms + } + + for (i in seq_along(lst)) { + if (is.list(lst[[i]]) && !(i %in% which(names(lst) %in% duplicated_names))) { + lst[[i]] <- deduplicate(lst[[i]]) + } + } + + attrs$names <- names(lst) + attributes(lst) <- attrs + return(lst) +} diff --git a/R/as_xml_document.R b/R/as_xml_document.R index c24f1226..225834d8 100644 --- a/R/as_xml_document.R +++ b/R/as_xml_document.R @@ -56,7 +56,31 @@ as_xml_document.list <- function(x, ...) { } } for (i in seq_along(x)) { - add_node(x[[i]], parent, names(x)[[i]]) + # Handle for duplicate-named elements + item <- x[[i]] + item_name <- names(x)[i] + + if (is_contain_duplicated(item)) { + for (j in seq_along(item)) { + sub_item <- item[[j]] + new_node <- xml_add_child(parent, item_name) + + if (is.character(sub_item) && length(sub_item) == 1) { + xml_text(new_node) <- sub_item + } else if (is.list(sub_item)) { + attr <- r_attrs_to_xml(attributes(sub_item)) + for (k in seq_along(attr)) { + xml_set_attr(new_node, names(attr)[[k]], attr[[k]]) + } + + for (k in seq_along(sub_item)) { + add_node(sub_item[[k]], new_node, names(sub_item)[k]) + } + } + } + } else { + add_node(item, parent, names(x)[[i]]) + } } } @@ -83,3 +107,12 @@ as_xml_document.xml_nodeset <- function(x, root, ...) { as_xml_document.xml_document <- function(x, ...) { x } + +is_contain_duplicated <- function(lst) { + if (is.null(names(lst)) || all(names(lst) == "")) { + if (length(lst) > 1 && all(sapply(lst, function(x) is.list(x) || is.character(x)))) { + return(TRUE) + } + } + return(FALSE) +} diff --git a/R/xml_missing.R b/R/xml_missing.R index 0600b270..5945731f 100644 --- a/R/xml_missing.R +++ b/R/xml_missing.R @@ -7,6 +7,7 @@ xml_missing <- function() { out } +#' @export format.xml_missing <- function(x, ...) { "" } From acba7ef37766e00b1955807b934d436390039a14 Mon Sep 17 00:00:00 2001 From: randef1ned <46381867+randef1ned@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:33:36 +0200 Subject: [PATCH 2/5] Fix issue #215 --- R/as_list.R | 9 ++++++++- R/as_xml_document.R | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/R/as_list.R b/R/as_list.R index 179266c8..61b20350 100644 --- a/R/as_list.R +++ b/R/as_list.R @@ -134,7 +134,14 @@ r_attrs_to_xml <- function(x) { as_list2 <- function(x, ns = character(), ...) { result <- as_list(x, ns = ns, ...) - deduplicate(result) + if (length(result) == 1 && length(unlist(result)) == 1) { + item <- unlist(result) + result <- list(unname(item)) + names(result) <- names(item) + } else { + result <- deduplicate(result) + } + return(result) } diff --git a/R/as_xml_document.R b/R/as_xml_document.R index 225834d8..a289b80f 100644 --- a/R/as_xml_document.R +++ b/R/as_xml_document.R @@ -42,7 +42,9 @@ as_xml_document.list <- function(x, ...) { if (length(x) > 1) { cli::cli_abort("Root nodes must be of length 1.") } - + if (length(x[[1]]) == 1 && is.vector(x[[1]])) { + x[[1]] <- list(x[[1]]) + } add_node <- function(x, parent, tag = NULL) { if (is.atomic(x)) { From a0544d7bfd7f53881e53bfec56de2f466be01623 Mon Sep 17 00:00:00 2001 From: randef1ned <46381867+randef1ned@users.noreply.github.com> Date: Tue, 17 Jun 2025 11:38:09 +0200 Subject: [PATCH 3/5] Add test cases --- tests/testthat/test-as_list.R | 8 ++++++++ tests/testthat/test-as_xml_document.R | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-as_list.R b/tests/testthat/test-as_list.R index a85dc4b8..6f7c9335 100644 --- a/tests/testthat/test-as_list.R +++ b/tests/testthat/test-as_list.R @@ -1,4 +1,5 @@ list_xml <- function(x) as_list(read_xml(x)) +list_xml2 <- function(x) as_list2(read_xml(x)) test_that("empty elements become empty lists", { expect_equal(list_xml(""), list(x = list())) @@ -39,3 +40,10 @@ test_that("attributes in child nodes", { list(w = structure(list(x = structure(list(y = list("3"), z = list("4")), a = "1", b = "2", .names = "esc")), aa = "0")) ) }) + +test_that("Duplicated items", { + expect_equal( + list_xml2("ab"), + list(content = list(x = list('a', 'b'))) + ) +}) diff --git a/tests/testthat/test-as_xml_document.R b/tests/testthat/test-as_xml_document.R index a902162a..1b53bb5c 100644 --- a/tests/testthat/test-as_xml_document.R +++ b/tests/testthat/test-as_xml_document.R @@ -1,6 +1,6 @@ -roundtrip_xml <- function(x) { +roundtrip_xml <- function(x, as_list_version = 1) { xml <- read_xml(x) - lst <- as_list(xml) + lst <- if (as_list_version > 1) as_list2(xml) else as_list(xml) xml2 <- as_xml_document(lst) expect_equal(as.character(xml), as.character(xml2)) } @@ -34,6 +34,11 @@ test_that("rountrips with special attributes", { roundtrip_xml("") }) +test_that("rountrips with only one element", { + roundtrip_xml("bar") + roundtrip_xml("bar", as_list_version = 2) +}) + test_that("more than one root node is an error", { expect_error(as_xml_document(list(a = list(), b = list())), "Root nodes must be of length 1") }) From 082e120ea26b9fa83b940ab13ccbe2486227725f Mon Sep 17 00:00:00 2001 From: randef1ned <46381867+randef1ned@users.noreply.github.com> Date: Tue, 17 Jun 2025 12:00:17 +0200 Subject: [PATCH 4/5] Documentation for two functions --- NAMESPACE | 1 + man/as_list.Rd | 1 + man/as_list2.Rd | 40 ++++++++++++++++++++++++++++++++++++++++ man/deduplicate.Rd | 17 +++++++++++++++++ 4 files changed, 59 insertions(+) create mode 100644 man/as_list2.Rd create mode 100644 man/deduplicate.Rd diff --git a/NAMESPACE b/NAMESPACE index 6b460acb..8d6ded73 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ S3method(as_xml_document,response) S3method(as_xml_document,xml_document) S3method(as_xml_document,xml_node) S3method(as_xml_document,xml_nodeset) +S3method(format,xml_missing) S3method(format,xml_node) S3method(is.na,xml_missing) S3method(is.na,xml_node) diff --git a/man/as_list.Rd b/man/as_list.Rd index ffb8d876..622c6730 100644 --- a/man/as_list.Rd +++ b/man/as_list.Rd @@ -43,4 +43,5 @@ as_list(read_xml(" a ]]>")) as_list(read_xml(" ")) as_list(read_xml("")) as_list(read_xml("")) +as_list(read_xml("ab")) } diff --git a/man/as_list2.Rd b/man/as_list2.Rd new file mode 100644 index 00000000..5a793da0 --- /dev/null +++ b/man/as_list2.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_list.R +\name{as_list2} +\alias{as_list2} +\title{Coerce xml nodes to a list with better handling duplicate elements} +\usage{ +as_list2(x, ns = character(), ...) +} +\arguments{ +\item{x}{A document, node, or node set.} + +\item{ns}{Optionally, a named vector giving prefix-url pairs, as produced +by \code{\link[=xml_ns]{xml_ns()}}. If provided, all names will be explicitly +qualified with the ns prefix, i.e. if the element \code{bar} is defined +in namespace \code{foo}, it will be called \code{foo:bar}. (And +similarly for attributes). Default namespaces must be given an explicit +name. The ns is ignored when using \code{\link[=xml_name<-]{xml_name<-()}} and +\code{\link[=xml_set_name]{xml_set_name()}}.} + +\item{...}{Needed for compatibility with generic. Unused.} +} +\description{ +This turns an XML document (or node or nodeset) into the equivalent R +list. This functions like \code{as_list()} but ensures elements with duplicate +names are put into indexed lists. +} +\examples{ +# With duplicate elements +xml <- read_xml("ab") +lst <- as_list(xml) +lst$content$x # Returns "a" solely +lst2 <- as_list2(xml) +lst2$content$x # Returns "a" and "b" +lst2$content$x[[1]] # Returns "a" +lst2$content$x[[2]] # Returns "b" + +# With attributes preserved +xml <- read_xml("34") +as_list2(xml) +} diff --git a/man/deduplicate.Rd b/man/deduplicate.Rd new file mode 100644 index 00000000..4c4a6863 --- /dev/null +++ b/man/deduplicate.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_list.R +\name{deduplicate} +\alias{deduplicate} +\title{Deduplicate named elements in a list} +\usage{ +deduplicate(lst) +} +\arguments{ +\item{lst}{A list potentially containing duplicate named elements} +} +\value{ +A list with duplicate elements consolidated +} +\description{ +Deduplicate named elements in a list +} From 48061618819175700c5dbcad619a17d01f97b86a Mon Sep 17 00:00:00 2001 From: randef1ned <46381867+randef1ned@users.noreply.github.com> Date: Tue, 17 Jun 2025 12:14:17 +0200 Subject: [PATCH 5/5] Add navigation in pkgdown configuration file --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 01e37265..db0535dc 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -57,6 +57,7 @@ reference: contents: - ends_with("serialize") - xml2_example + - deduplicate news: releases: