ARROW-10386: [R] List column class attributes not preserved in roundtrip

Closes apache#9182 from jonkeane/ARROW-10386/List_metadata Lead-authored-by: Jonathan Keane <[email protected]> Co-authored-by: Romain Francois <[email protected]> Signed-off-by: Neal Richardson <[email protected]>
Kyligence · Jan 13, 2021 · 6deb892 · 6deb892
1 parent 2b6c71f
commit 6deb892
Show file tree

Hide file tree

Showing 21 changed files with 223 additions and 16 deletions.
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -332,6 +332,7 @@ importFrom(tidyselect,vars_select)
 importFrom(utils,head)
 importFrom(utils,install.packages)
 importFrom(utils,modifyList)
+importFrom(utils,object.size)
 importFrom(utils,packageVersion)
 importFrom(utils,tail)
 importFrom(vctrs,s3_register)

diff --git a/r/NEWS.md b/r/NEWS.md
@@ -37,6 +37,8 @@
 * Option `arrow.skip_nul` (default `FALSE`, as in `base::scan()`) allows conversion of Arrow string (`utf8()`) type data containing embedded nul `\0` characters to R. If set to `TRUE`, nuls will be stripped and a warning is emitted if any are found.
 * `arrow_info()` for an overview of various run-time and build-time Arrow configurations, useful for debugging
 * Set environment variable `ARROW_DEFAULT_MEMORY_POOL` before loading the Arrow package to change memory allocators. Windows packages are built with `mimalloc`; most others have `jemalloc`. These are used by default if they were built, and they're generally much faster than the system malloc, but sometimes it is useful to turn them off for debugging purposes. To disable them, set `ARROW_DEFAULT_MEMORY_POOL=system`.
+* List columns that have attributes on each element are now also included with the metadata that is saved when creating Arrow tables. This allows `sf` tibbles to faithfully preserved and roundtripped (ARROW-10386)[https://issues.apache.org/jira/browse/ARROW-10386].
+* R metadata that exceeds 100Kb is now compressed before being written to a table; see `schema()` for more details.
 
 ## Bug fixes
 

diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -147,6 +147,10 @@ print.arrow_info <- function(x, ...) {
   invisible(x)
 }
 
+option_compress_metadata <- function() {
+  !is_false(getOption("arrow.compress_metadata"))
+}
+
 #' @include enums.R
 ArrowObject <- R6Class("ArrowObject",
   public = list(

diff --git a/r/R/feather.R b/r/R/feather.R
@@ -44,6 +44,7 @@
 #' the stream will be left open.
 #' @export
 #' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
+#' @seealso [Schema] for information about schemas and metadata handling.
 #' @examples
 #' \donttest{
 #' tf <- tempfile()

diff --git a/r/R/parquet.R b/r/R/parquet.R
@@ -275,6 +275,7 @@ make_valid_version <- function(version, valid_versions = valid_parquet_version)
 #' "snappy" for the `compression` argument.
 #'
 #' @seealso [write_parquet]
+#' @seealso [Schema] for information about schemas and metadata handling.
 #'
 #' @export
 ParquetWriterProperties <- R6Class("ParquetWriterProperties", inherit = ArrowObject)

diff --git a/r/R/record-batch.R b/r/R/record-batch.R
@@ -67,7 +67,7 @@
 #' - `$schema`
 #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
 #'    Modify or replace by assigning in (`batch$metadata <- new_metadata`).
-#'    All list elements are coerced to string.
+#'    All list elements are coerced to string. See `schema()` for more information.
 #' - `$columns`: Returns a list of `Array`s
 #' @rdname RecordBatch
 #' @name RecordBatch
@@ -274,24 +274,57 @@ as.data.frame.RecordBatch <- function(x, row.names = NULL, optional = FALSE, ...
   df
 }
 
+#' @importFrom utils object.size
 .serialize_arrow_r_metadata <- function(x) {
   assert_is(x, "list")
 
   # drop problems attributes (most likely from readr)
   x[["attributes"]][["problems"]] <- NULL
 
-  rawToChar(serialize(x, NULL, ascii = TRUE))
+  out <- serialize(x, NULL, ascii = TRUE)
+
+  # if the metadata is over 100 kB, compress
+  if (option_compress_metadata() && object.size(out) > 100000) {
+    out_comp <- serialize(memCompress(out, type = "gzip"), NULL, ascii = TRUE)
+
+    # but ensure that the compression+serialization is effective.
+    if (object.size(out) > object.size(out_comp)) out <- out_comp
+  }
+
+  rawToChar(out)
 }
 
 .unserialize_arrow_r_metadata <- function(x) {
-  tryCatch(unserialize(charToRaw(x)), error = function(e) {
+  tryCatch({
+    out <- unserialize(charToRaw(x))
+
+    # if this is still raw, try decompressing
+    if (is.raw(out)) {
+      out <- unserialize(memDecompress(out, type = "gzip"))
+    }
+    out
+  }, error = function(e) {
     warning("Invalid metadata$r", call. = FALSE)
     NULL
   })
 }
 
 apply_arrow_r_metadata <- function(x, r_metadata) {
   tryCatch({
+    columns_metadata <- r_metadata$columns
+    if (is.data.frame(x)) {
+      if (length(names(x)) && !is.null(columns_metadata)) {
+        for (name in intersect(names(columns_metadata), names(x))) {
+          x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
+        }
+      }
+    } else if(is.list(x) && !inherits(x, "POSIXlt") && !is.null(columns_metadata)) {
+      x <- map2(x, columns_metadata, function(.x, .y) {
+        apply_arrow_r_metadata(.x, .y)
+      })
+      x
+    }
+
     if (!is.null(r_metadata$attributes)) {
       attributes(x)[names(r_metadata$attributes)] <- r_metadata$attributes
       if (inherits(x, "POSIXlt")) {
@@ -303,12 +336,6 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
       }
     }
 
-    columns_metadata <- r_metadata$columns
-    if (length(names(x)) && !is.null(columns_metadata)) {
-      for (name in intersect(names(columns_metadata), names(x))) {
-        x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])
-      }
-    }
   }, error = function(e) {
     warning("Invalid metadata$r", call. = FALSE)
   })

diff --git a/r/R/schema.R b/r/R/schema.R
@@ -50,6 +50,31 @@
 #' - `$metadata`: returns the key-value metadata as a named list.
 #'    Modify or replace by assigning in (`sch$metadata <- new_metadata`).
 #'    All list elements are coerced to string.
+#'
+#' @section R Metadata:
+#'
+#'   When converting a data.frame to an Arrow Table or RecordBatch, attributes
+#'   from the `data.frame` are saved alongside tables so that the object can be
+#'   reconstructed faithfully in R (e.g. with `as.data.frame()`). This metadata
+#'   can be both at the top-level of the `data.frame` (e.g. `attributes(df)`) or
+#'   at the column (e.g. `attributes(df$col_a)`) or for list columns only:
+#'   element level (e.g. `attributes(df[1, "col_a"])`). For example, this allows
+#'   for storing `haven` columns in a table and being able to faithfully
+#'   re-create them when pulled back into R. This metadata is separate from the
+#'   schema (column names and types) which is compatible with other Arrow
+#'   clients. The R metadata is only read by R and is ignored by other clients
+#'   (e.g. Pandas has its own custom metadata). This metadata is stored in
+#'   `$metadata$r`.
+#'
+#'   Since Schema metadata keys and values must be strings, this metadata is
+#'   saved by serializing R's attribute list structure to a string. If the
+#'   serialized metadata exceeds 100Kb in size, by default it is compressed
+#'   starting in version 3.0.0. To disable this compression (e.g. for tables
+#'   that are compatible with Arrow versions before 3.0.0 and include large
+#'   amounts of metadata), set the option `arrow.compress_metadata` to `FALSE`.
+#'   Files with compressed metadata are readable by older versions of arrow, but
+#'   the metadata is dropped.
+#'
 #' @rdname Schema
 #' @name Schema
 #' @examples

diff --git a/r/R/table.R b/r/R/table.R
@@ -76,7 +76,7 @@
 #' - `$schema`
 #' - `$metadata`: Returns the key-value metadata of the `Schema` as a named list.
 #'    Modify or replace by assigning in (`tab$metadata <- new_metadata`).
-#'    All list elements are coerced to string.
+#'    All list elements are coerced to string. See `schema()` for more information.
 #' - `$columns`: Returns a list of `ChunkedArray`s
 #' @rdname Table
 #' @name Table
@@ -211,11 +211,24 @@ arrow_attributes <- function(x, only_top_level = FALSE) {
 
   if (is.data.frame(x)) {
     columns <- map(x, arrow_attributes)
-    if (length(att) || !all(map_lgl(columns, is.null))) {
+    out <- if (length(att) || !all(map_lgl(columns, is.null))) {
       list(attributes = att, columns = columns)
     }
-  } else if (length(att)) {
-    list(attributes = att, columns = NULL)
+    return(out)
+  }
+
+  columns <- NULL
+  if (is.list(x) && !inherits(x, "POSIXlt")) {
+    # for list columns, we also keep attributes of each
+    # element in columns
+    columns <- map(x, arrow_attributes)
+    if (all(map_lgl(columns, is.null))) {
+      columns <- NULL
+    }
+  }
+
+  if (length(att) || !is.null(columns)) {
+    list(attributes = att, columns = columns)
   } else {
     NULL
   }

diff --git a/r/extra-tests/helpers.R b/r/extra-tests/helpers.R
@@ -19,6 +19,10 @@ if_version <- function(version, op = `==`) {
   op(packageVersion("arrow"), version)
 }
 
+if_version_less_than <- function(version) {
+  if_version(version, op = `<`)
+}
+
 skip_if_version_less_than <- function(version, msg) {
   if(if_version(version, `<`)) {
     skip(msg)

diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R
@@ -162,4 +162,36 @@ test_that("Can see the metadata (stream)", {
   )
 })
 
+test_that("Can see the extra metadata (parquet)", {
+  pq_file <- "files/ex_data_extra_metadata.parquet"
+
+  if (if_version_less_than("3.0.0")) {
+    expect_warning(
+      df <- read_parquet(pq_file),
+      "Invalid metadata$r",
+      fixed = TRUE
+    )
+    expect_s3_class(df, "tbl")
+  } else {
+    # version 3.0.0 and greater
+    df <- read_parquet(pq_file)
+    expect_s3_class(df, "tbl")
 
+    expect_equal(
+      attributes(df),
+      list(
+        names = letters[1:4],
+        row.names = 1L,
+        class = c("tbl_df", "tbl", "data.frame"),
+        top_level = list(
+          field_one = 12,
+          field_two = "more stuff"
+        )
+      )
+    )
+
+    # column-level attributes for the large column.
+    expect_named(attributes(df$b), "lots")
+    expect_length(attributes(df$b)$lots, 100)
+  }
+})
diff --git a/r/extra-tests/write-files.R b/r/extra-tests/write-files.R
@@ -37,3 +37,6 @@ example_with_metadata_v1$c <- NULL
 write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1)
 
 write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream")
+
+write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet")
+
diff --git a/r/man/ParquetWriterProperties.Rd b/r/man/ParquetWriterProperties.Rd
diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd
diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd
diff --git a/r/man/Table.Rd b/r/man/Table.Rd
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd
diff --git a/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet b/r/tests/testthat/golden-files/data-arrow-extra-meta_3.0.0.parquet
diff --git a/r/tests/testthat/helper-data.R b/r/tests/testthat/helper-data.R
@@ -67,3 +67,10 @@ make_big_string <- function() {
   # This creates a character vector that would exceed the capacity of BinaryArray
   rep(purrr::map_chr(2047:2050, ~paste(sample(letters, ., replace = TRUE), collapse = "")), 2^18)
 }
+
+make_string_of_size <- function(size = 1) {
+  purrr::map_chr(1000*size, ~paste(sample(letters, ., replace = TRUE), collapse = ""))
+}
+
+example_with_extra_metadata <- example_with_metadata
+attributes(example_with_extra_metadata$b) <- list(lots = rep(make_string_of_size(1), 100))
diff --git a/r/tests/testthat/test-backwards-compatibility.R b/r/tests/testthat/test-backwards-compatibility.R
@@ -45,6 +45,15 @@ expect_identical_with_metadata <- function(object, expected, ..., top_level = TR
   expect_identical(object, expected, ...)
 }
 
+test_that("reading a known Parquet file to dataframe with 3.0.0", {
+  skip_if_not_available("snappy")
+  pq_file <- test_path("golden-files/data-arrow-extra-meta_3.0.0.parquet")
+
+  df <- read_parquet(pq_file)
+  # this is equivalent to `expect_identical()`
+  expect_identical_with_metadata(df, example_with_extra_metadata)
+})
+
 test_that("reading a known Parquet file to dataframe with 2.0.0", {
   skip_if_not_available("snappy")
   pq_file <- test_path("golden-files/data-arrow_2.0.0.parquet")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -37,3 +37,6 @@ example_with_metadata_v1$c <- NULL
		write_feather(example_with_metadata_v1, "extra-tests/files/ex_data_v1.feather", version = 1)

		write_ipc_stream(example_with_metadata, "extra-tests/files/ex_data.stream")

		write_parquet(example_with_extra_metadata, "extra-tests/files/ex_data_extra_metadata.parquet")