Skip to content

Fix for #436 and #215 #457

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ VignetteBuilder:
Config/Needs/website: tidyverse/tidytemplate
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
SystemRequirements: libxml2: libxml2-dev (deb), libxml2-devel (rpm)
Collate:
'S4.R'
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ S3method(as_xml_document,response)
S3method(as_xml_document,xml_document)
S3method(as_xml_document,xml_node)
S3method(as_xml_document,xml_nodeset)
S3method(format,xml_missing)
S3method(format,xml_node)
S3method(is.na,xml_missing)
S3method(is.na,xml_node)
Expand Down Expand Up @@ -123,7 +124,9 @@ export("xml_attrs<-")
export("xml_name<-")
export("xml_text<-")
export(as_list)
export(as_list2)
export(as_xml_document)
export(deduplicate)
export(download_html)
export(download_xml)
export(html_structure)
Expand Down
92 changes: 92 additions & 0 deletions R/as_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#' as_list(read_xml("<foo> <bar><baz /></bar> </foo>"))
#' as_list(read_xml("<foo id = 'a'></foo>"))
#' as_list(read_xml("<foo><bar id='a'/><bar id='b'/></foo>"))
#' as_list(read_xml("<foo><bar>a</bar><bar>b</bar></foo>"))
as_list <- function(x, ns = character(), ...) {
# S3 generic: hands off to the `as_list` method selected by the class of `x`.
UseMethod("as_list")
}
Expand Down Expand Up @@ -106,3 +107,94 @@
names(x)[special] <- sub("^\\.", "", names(x)[special])
x
}

#' Coerce xml nodes to a list, grouping elements with duplicate names
#'
#' This turns an XML document (or node or nodeset) into the equivalent R
#' list. It works like `as_list()`, but sibling elements that share a name
#' are collected into a single entry holding an indexed list of their values.
#'
#' As a special case, a result made of exactly one element whose only content
#' is a single text value (and which carries no XML attributes) is simplified
#' so that the text is stored directly under that element.
#'
#' @inheritParams xml_name
#' @param ... Passed on to `as_list()`.
#' @return A list; see `deduplicate()` for how duplicate names are merged.
#' @export
#'
#' @examples
#' # With duplicate elements
#' xml <- read_xml("<content><x>a</x><x>b</x></content>")
#' lst <- as_list(xml)
#' lst$content$x # Only the first <x> is reachable by name
#' lst2 <- as_list2(xml)
#' lst2$content$x # Holds both "a" and "b"
#' lst2$content$x[[1]] # Returns "a"
#' lst2$content$x[[2]] # Returns "b"
#'
#' # With attributes preserved
#' xml <- read_xml("<w aa='0'><x a='1' b='2'><y>3</y><z>4</z></x></w>")
#' as_list2(xml)
as_list2 <- function(x, ns = character(), ...) {
  result <- as_list(x, ns = ns, ...)

  if (length(result) == 1) {
    root <- result[[1]]
    # Only collapse a root that is *directly* a lone text value. Checking the
    # structure explicitly (instead of flattening with unlist()) keeps nested
    # single-child documents intact -- unlist() would fuse their names into
    # "parent.child" -- and keeps XML attributes stored on the root element.
    is_lone_text <- is.list(root) &&
      length(root) == 1 &&
      is.null(attributes(root)) &&
      is.character(root[[1]]) &&
      length(root[[1]]) == 1
    if (is_lone_text) {
      result[[1]] <- root[[1]]
      return(result)
    }
  }

  deduplicate(result)
}


#' Deduplicate named elements in a list
#'
#' Walks a (possibly nested) list and merges sibling elements that share the
#' same non-empty name. The merged entry keeps the position and name of the
#' first occurrence and holds an unnamed list with one value per original
#' element. An element that is a length-one list wrapping a character value
#' contributes that character value directly; any other element is kept as is.
#' Unnamed elements are never merged. Attributes of `lst` other than `names`
#' are carried over to the result.
#'
#' @param lst A list potentially containing duplicate named elements
#' @return A list with duplicate elements consolidated. Inputs that are not
#'   lists, are empty, or have no non-empty names are returned unchanged.
#' @export
deduplicate <- function(lst) {
  nms <- names(lst)
  if (!is.list(lst) || length(lst) == 0 || is.null(nms) || !any(nzchar(nms))) {
    return(lst)
  }

  # Subsetting below drops everything except names, so remember the rest.
  attrs <- attributes(lst)

  # Look for duplicates among the non-empty names only. The duplicated() mask
  # must index the same filtered vector it was computed from; indexing the
  # full `nms` with it misaligns as soon as unnamed elements are present.
  named <- nms[!is.na(nms) & nzchar(nms)]
  duplicated_names <- unique(named[duplicated(named)])

  if (length(duplicated_names) == 0) {
    # Nothing to merge at this level: only recurse into the children.
    result <- lapply(lst, deduplicate)
    attributes(result) <- attrs
    return(result)
  }

  for (name in duplicated_names) {
    # Recomputed on every pass because earlier merges shift positions.
    deduplicated_index <- which(nms == name)

    values <- lapply(deduplicated_index, function(i) {
      item <- lst[[i]]
      if (is.list(item) && length(item) == 1 && is.character(item[[1]])) {
        # list("text") -> "text"
        item[[1]]
      } else {
        # Fallback option: keep elements with richer content untouched.
        item
      }
    })

    # Store the merged values at the first occurrence, drop the others.
    lst[[deduplicated_index[1]]] <- values
    lst <- lst[-deduplicated_index[-1]]
    nms <- nms[-deduplicated_index[-1]]
    names(lst) <- nms
  }

  # Recurse into the remaining list children; merged groups are left alone.
  for (i in seq_along(lst)) {
    if (is.list(lst[[i]]) && !(nms[i] %in% duplicated_names)) {
      lst[[i]] <- deduplicate(lst[[i]])
    }
  }

  attrs$names <- names(lst)
  attributes(lst) <- attrs
  lst
}
39 changes: 37 additions & 2 deletions R/as_xml_document.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@
if (length(x) > 1) {
cli::cli_abort("Root nodes must be of length 1.")
}

if (length(x[[1]]) == 1 && is.vector(x[[1]])) {
x[[1]] <- list(x[[1]])
}

add_node <- function(x, parent, tag = NULL) {
if (is.atomic(x)) {
Expand All @@ -56,7 +58,31 @@
}
}
for (i in seq_along(x)) {
add_node(x[[i]], parent, names(x)[[i]])
# Handle for duplicate-named elements
item <- x[[i]]
item_name <- names(x)[i]

if (is_contain_duplicated(item)) {
for (j in seq_along(item)) {
sub_item <- item[[j]]
new_node <- xml_add_child(parent, item_name)

Check warning on line 68 in R/as_xml_document.R

View check run for this annotation

Codecov / codecov/patch

R/as_xml_document.R#L66-L68

Added lines #L66 - L68 were not covered by tests

if (is.character(sub_item) && length(sub_item) == 1) {
xml_text(new_node) <- sub_item
} else if (is.list(sub_item)) {
attr <- r_attrs_to_xml(attributes(sub_item))
for (k in seq_along(attr)) {
xml_set_attr(new_node, names(attr)[[k]], attr[[k]])

Check warning on line 75 in R/as_xml_document.R

View check run for this annotation

Codecov / codecov/patch

R/as_xml_document.R#L70-L75

Added lines #L70 - L75 were not covered by tests
}

for (k in seq_along(sub_item)) {
add_node(sub_item[[k]], new_node, names(sub_item)[k])

Check warning on line 79 in R/as_xml_document.R

View check run for this annotation

Codecov / codecov/patch

R/as_xml_document.R#L78-L79

Added lines #L78 - L79 were not covered by tests
}
}
}
} else {
add_node(item, parent, names(x)[[i]])
}
}
}

Expand All @@ -83,3 +109,12 @@
as_xml_document.xml_document <- function(x, ...) {
x
}

# Does `lst` look like a group of merged same-named siblings?
#
# Returns TRUE when `lst` has no non-empty names, holds more than one element,
# and every element is either a list or a character vector; FALSE otherwise.
is_contain_duplicated <- function(lst) {
  nms <- names(lst)
  is_unnamed <- is.null(nms) || all(nms == "")
  if (!is_unnamed || length(lst) <= 1) {
    return(FALSE)
  }

  # vapply() pins the result to one logical per element, unlike sapply(),
  # whose return type depends on its input.
  all(vapply(lst, function(x) is.list(x) || is.character(x), logical(1)))
}
1 change: 1 addition & 0 deletions R/xml_missing.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ xml_missing <- function() {
out
}

#' @export
format.xml_missing <- function(x, ...) {
  # A missing node always formats as the same fixed placeholder.
  placeholder <- "<NA>"
  placeholder
}
Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ reference:
contents:
- ends_with("serialize")
- xml2_example
- deduplicate

news:
releases:
Expand Down
1 change: 1 addition & 0 deletions man/as_list.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 40 additions & 0 deletions man/as_list2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/deduplicate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions tests/testthat/test-as_list.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Test helpers: parse an XML string and convert it with as_list() / as_list2().
list_xml <- function(x) {
  as_list(read_xml(x))
}
list_xml2 <- function(x) {
  as_list2(read_xml(x))
}

test_that("empty elements become empty lists", {
expect_equal(list_xml("<x></x>"), list(x = list()))
Expand Down Expand Up @@ -39,3 +40,10 @@ test_that("attributes in child nodes", {
list(w = structure(list(x = structure(list(y = list("3"), z = list("4")), a = "1", b = "2", .names = "esc")), aa = "0"))
)
})

test_that("Duplicated items", {
  # Sibling <x> elements must end up in a single indexed entry.
  parsed <- list_xml2("<content><x>a</x><x>b</x></content>")
  expected <- list(content = list(x = list("a", "b")))
  expect_equal(parsed, expected)
})
9 changes: 7 additions & 2 deletions tests/testthat/test-as_xml_document.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
roundtrip_xml <- function(x) {
roundtrip_xml <- function(x, as_list_version = 1) {
xml <- read_xml(x)
lst <- as_list(xml)
lst <- if (as_list_version > 1) as_list2(xml) else as_list(xml)
xml2 <- as_xml_document(lst)
expect_equal(as.character(xml), as.character(xml2))
}
Expand Down Expand Up @@ -34,6 +34,11 @@ test_that("rountrips with special attributes", {
roundtrip_xml("<a names = 'test'><b/></a>")
})

test_that("rountrips with only one element", {
  # A root holding a single text value must survive both list conversions.
  single_text <- "<foo>bar</foo>"
  roundtrip_xml(single_text)
  roundtrip_xml(single_text, as_list_version = 2)
})

test_that("more than one root node is an error", {
  # A list with two top-level entries cannot form a single XML root.
  two_roots <- list(a = list(), b = list())
  expect_error(as_xml_document(two_roots), "Root nodes must be of length 1")
})
Expand Down
Loading