Skip to content

Commit

Permalink
ARROW-10386: [R] List column class attributes not preserved in roundtrip
Browse files Browse the repository at this point in the history
Closes apache#9182 from jonkeane/ARROW-10386/List_metadata

Lead-authored-by: Jonathan Keane <[email protected]>
Co-authored-by: Romain Francois <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
  • Loading branch information
2 people authored and nealrichardson committed Jan 13, 2021
1 parent 2b6c71f commit 6deb892
Show file tree
Hide file tree
Showing 21 changed files with 223 additions and 16 deletions.
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ importFrom(tidyselect,vars_select)
importFrom(utils,head)
importFrom(utils,install.packages)
importFrom(utils,modifyList)
importFrom(utils,object.size)
importFrom(utils,packageVersion)
importFrom(utils,tail)
importFrom(vctrs,s3_register)
Expand Down
2 changes: 2 additions & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
* Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found.
* `arrow_info()` for an overview of various run-time and build-time Arrow configurations, useful for debugging
* Set environment variable `ARROW_DEFAULT_MEMORY_POOL` before loading the Arrow package to change memory allocators. Windows packages are built with `mimalloc`; most others have `jemalloc`. These are used by default if they were built, and they're generally much faster than the system malloc, but sometimes it is useful to turn them off for debugging purposes. To disable them, set `ARROW_DEFAULT_MEMORY_POOL=system`.
* List columns that have attributes on each element are now also included with the metadata that is saved when creating Arrow tables. This allows `sf` tibbles to faithfully preserved and roundtripped (ARROW-10386)[https://issues.apache.org/jira/browse/ARROW-10386].
* R metadata that exceeds 100Kb is now compressed before being written to a table; see `schema()` for more details.

## Bug fixes

Expand Down
4 changes: 4 additions & 0 deletions r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ print.arrow_info <- function(x, ...) {
invisible(x)
}

option_compress_metadata <- function() {
!is_false(getOption("arrow.compress_metadata"))
}

#' @include enums.R
ArrowObject <- R6Class("ArrowObject",
public = list(
Expand Down
1 change: 1 addition & 0 deletions r/R/feather.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#' the stream will be left open.
#' @export
#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
#' @seealso [Schema] for information about schemas and metadata handling.
#' @examples
#' \donttest{
#' tf <- tempfile()
Expand Down
1 change: 1 addition & 0 deletions r/R/parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ make_valid_version <- function(version, valid_versions = valid_parquet_version)
#' "snappy" for the `compression` argument.
#'
#' @seealso [write_parquet]
#' @seealso [Schema] for information about schemas and metadata handling.
#'
#' @export
ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject)
Expand Down
45 changes: 36 additions & 9 deletions r/R/record-batch.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
#' - `$schema`
#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
#' Modify or replace by assigning in (`batch$metadata <- new_metadata`).
#' All list elements are coerced to string.
#' All list elements are coerced to string. See `schema()` for more information.
#' - `$columns`: Returns a list of `Array`s
#' @rdname RecordBatch
#' @name RecordBatch
Expand Down Expand Up @@ -274,24 +274,57 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ...
df
}

#' @importFrom utils object.size
.serialize_arrow_r_metadata <- function(x) {
assert_is(x, "list")

# drop problems attributes (most likely from readr)
x[["attributes"]][["problems"]] <- NULL

rawToChar(serialize(x, NULL, ascii = TRUE))
out <- serialize(x, NULL, ascii = TRUE)

# if the metadata is over 100 kB, compress
if (option_compress_metadata() && object.size(out) > 100000) {
out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)

# but ensure that the compression+serialization is effective.
if (object.size(out) > object.size(out_comp)) out <- out_comp
}

rawToChar(out)
}

.unserialize_arrow_r_metadata <- function(x) {
tryCatch(unserialize(charToRaw(x)), error = function(e) {
tryCatch({
out <- unserialize(charToRaw(x))

# if this is still raw, try decompressing
if (is.raw(out)) {
out <- unserialize(memDecompress(out, type = "gzip"))
}
out
}, error = function(e) {
warning("Invalid metadata$r", call. = FALSE)
NULL
})
}

apply_arrow_r_metadata <- function(x, r_metadata) {
tryCatch({
columns_metadata <- r_metadata$columns
if (is.data.frame(x)) {
if (length(names(x)) && !is.null(columns_metadata)) {
for (name in intersect(names(columns_metadata), names(x))) {
x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
}
}
} else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
x <- map2(x, columns_metadata, function(.x, .y) {
apply_arrow_r_metadata(.x, .y)
})
x
}

if (!is.null(r_metadata$attributes)) {
attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
if (inherits(x, "POSIXlt")) {
Expand All @@ -303,12 +336,6 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
}
}

columns_metadata <- r_metadata$columns
if (length(names(x)) && !is.null(columns_metadata)) {
for (name in intersect(names(columns_metadata), names(x))) {
x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
}
}
}, error = function(e) {
warning("Invalid metadata$r", call. = FALSE)
})
Expand Down
25 changes: 25 additions & 0 deletions r/R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,31 @@
#' - `$metadata`: returns the key-value metadata as a named list.
#' Modify or replace by assigning in (`sch$metadata <- new_metadata`).
#' All list elements are coerced to string.
#'
#' @section R Metadata:
#'
#' When converting a data.frame to an Arrow Table or RecordBatch, attributes
#' from the `data.frame` are saved alongside tables so that the object can be
#' reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
#' can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or
#' at the column (e.g. `attributes(df$col_a)`) or for list columns only:
#' element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
#' for storing `haven` columns in a table and being able to faithfully
#' re-create them when pulled back into R. This metadata is separate from the
#' schema (column names and types) which is compatible with other Arrow
#' clients. The R metadata is only read by R and is ignored by other clients
#' (e.g. Pandas has its own custom metadata). This metadata is stored in
#' `$metadata$r`.
#'
#' Since Schema metadata keys and values must be strings, this metadata is
#' saved by serializing R's attribute list structure to a string. If the
#' serialized metadata exceeds 100Kb in size, by default it is compressed
#' starting in version 3.0.0. To disable this compression (e.g. for tables
#' that are compatible with Arrow versions before 3.0.0 and include large
#' amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
#' Files with compressed metadata are readable by older versions of arrow, but
#' the metadata is dropped.
#'
#' @rdname Schema
#' @name Schema
#' @examples
Expand Down
21 changes: 17 additions & 4 deletions r/R/table.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
#' - `$schema`
#' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
#' Modify or replace by assigning in (`tab$metadata <- new_metadata`).
#' All list elements are coerced to string.
#' All list elements are coerced to string. See `schema()` for more information.
#' - `$columns`: Returns a list of `ChunkedArray`s
#' @rdname Table
#' @name Table
Expand Down Expand Up @@ -211,11 +211,24 @@ arrow_attributes <- function(x, only_top_level = FALSE) {

if (is.data.frame(x)) {
columns <- map(x, arrow_attributes)
if (length(att) || !all(map_lgl(columns, is.null))) {
out <- if (length(att) || !all(map_lgl(columns, is.null))) {
list(attributes = att, columns = columns)
}
} else if (length(att)) {
list(attributes = att, columns = NULL)
return(out)
}

columns <- NULL
if (is.list(x) && !inherits(x, "POSIXlt")) {
# for list columns, we also keep attributes of each
# element in columns
columns <- map(x, arrow_attributes)
if (all(map_lgl(columns, is.null))) {
columns <- NULL
}
}

if (length(att) || !is.null(columns)) {
list(attributes = att, columns = columns)
} else {
NULL
}
Expand Down
4 changes: 4 additions & 0 deletions r/extra-tests/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ if_version <- function(version, op = `==`) {
op(packageVersion("arrow"), version)
}

if_version_less_than <- function(version) {
if_version(version, op = `<`)
}

skip_if_version_less_than <- function(version, msg) {
if(if_version(version, `<`)) {
skip(msg)
Expand Down
32 changes: 32 additions & 0 deletions r/extra-tests/test-read-files.R
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,36 @@ test_that("Can see the metadata (stream)", {
)
})

test_that("Can see the extra metadata (parquet)", {
pq_file <- "files/ex_data_extra_metadata.parquet"

if (if_version_less_than("3.0.0")) {
expect_warning(
df <- read_parquet(pq_file),
"Invalid metadata$r",
fixed = TRUE
)
expect_s3_class(df, "tbl")
} else {
# version 3.0.0 and greater
df <- read_parquet(pq_file)
expect_s3_class(df, "tbl")

expect_equal(
attributes(df),
list(
names = letters[1:4],
row.names = 1L,
class = c("tbl_df", "tbl", "data.frame"),
top_level = list(
field_one = 12,
field_two = "more stuff"
)
)
)

# column-level attributes for the large column.
expect_named(attributes(df$b), "lots")
expect_length(attributes(df$b)$lots, 100)
}
})
3 changes: 3 additions & 0 deletions r/extra-tests/write-files.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,6 @@ example_with_metadata_v1$c <- NULL
write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1)

write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream")

write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet")

2 changes: 2 additions & 0 deletions r/man/ParquetWriterProperties.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/RecordBatch.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions r/man/Schema.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/Table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions r/man/write_feather.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file not shown.
7 changes: 7 additions & 0 deletions r/tests/testthat/helper-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,10 @@ make_big_string <- function() {
# This creates a character vector that would exceed the capacity of BinaryArray
rep(purrr::map_chr(2047:2050, ~paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
}

make_string_of_size <- function(size = 1) {
purrr::map_chr(1000*size, ~paste(sample(letters, ., replace = TRUE), collapse = ""))
}

example_with_extra_metadata <- example_with_metadata
attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
9 changes: 9 additions & 0 deletions r/tests/testthat/test-backwards-compatibility.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ expect_identical_with_metadata <- function(object, expected, ..., top_level = TR
expect_identical(object, expected, ...)
}

test_that("reading a known Parquet file to dataframe with 3.0.0", {
skip_if_not_available("snappy")
pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet")

df <- read_parquet(pq_file)
# this is equivalent to `expect_identical()`
expect_identical_with_metadata(df, example_with_extra_metadata)
})

test_that("reading a known Parquet file to dataframe with 2.0.0", {
skip_if_not_available("snappy")
pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet")
Expand Down
Loading

0 comments on commit 6deb892

Please sign in to comment.