Skip to content

Commit

Permalink
ARROW-11468: [R] Allow user to pass schema to read_json_arrow()
Browse files Browse the repository at this point in the history
A couple of things I wanted to check are expected behaviour:

1. If I specify in the schema that a numeric column should be a string column, I get the error `Error: Invalid: JSON parse error: Column(/third_col) changed from string to number in row 0`
 (e.g. if I run the following)
```
tf <- tempfile()
writeLines('
    { "hello": 3.5, "world": 2, "third_col": 99}
    { "hello": 3.25, "world": 5, "third_col": 98}
    { "hello": 3.125, "world": 8, "third_col": 97 }
    { "hello": 0.0, "world": 10, "third_col": 96}
', tf)
read_json_arrow(tf, schema = schema(third_col = utf8(), world = float64()))
```
2. As can be seen in the tests output (will delete the `print` statements before this is merged), table columns are returned in the order specified in the schema and then the columns not mentioned in the schema.

Closes apache#9950 from thisisnic/ARROW-11468

Authored-by: Nic Crane <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
  • Loading branch information
thisisnic authored and nealrichardson committed Apr 14, 2021
1 parent b5045ed commit 894fab0
Show file tree
Hide file tree
Showing 6 changed files with 149 additions and 14 deletions.
8 changes: 6 additions & 2 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 12 additions & 4 deletions r/R/json.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' Using [JsonTableReader]
#'
#' @inheritParams read_delim_arrow
#' @param schema [Schema] that describes the table.
#' @param ... Additional options passed to `JsonTableReader$create()`
#'
#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
Expand All @@ -38,12 +39,13 @@
read_json_arrow <- function(file,
col_select = NULL,
as_data_frame = TRUE,
schema = NULL,
...) {
if (!inherits(file, "InputStream")) {
file <- make_readable_file(file)
on.exit(file$close())
}
tab <- JsonTableReader$create(file, ...)$Read()
tab <- JsonTableReader$create(file, schema = schema, ...)$Read()

col_select <- enquo(col_select)
if (!quo_is_null(col_select)) {
Expand All @@ -69,7 +71,8 @@ JsonTableReader <- R6Class("JsonTableReader", inherit = ArrowObject,
)
JsonTableReader$create <- function(file,
read_options = JsonReadOptions$create(),
parse_options = JsonParseOptions$create(),
parse_options = JsonParseOptions$create(schema = schema),
schema = NULL,
...) {
assert_is(file, "InputStream")
json___TableReader__Make(file, read_options, parse_options)
Expand All @@ -91,6 +94,11 @@ JsonReadOptions$create <- function(use_threads = option_use_threads(), block_siz
#' @docType class
#' @export
JsonParseOptions <- R6Class("JsonParseOptions", inherit = ArrowObject)
JsonParseOptions$create <- function(newlines_in_values = FALSE) {
json___ParseOptions__initialize(newlines_in_values)
JsonParseOptions$create <- function(newlines_in_values = FALSE, schema = NULL) {
if (is.null(schema)) {
json___ParseOptions__initialize1(newlines_in_values)
} else {
json___ParseOptions__initialize2(newlines_in_values, schema)
}

}
10 changes: 9 additions & 1 deletion r/man/read_json_arrow.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 23 additions & 6 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 11 additions & 1 deletion r/src/json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,24 @@ std::shared_ptr<arrow::json::ReadOptions> json___ReadOptions__initialize(bool us
}

// [[arrow::export]]
std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize(
std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize1(
bool newlines_in_values) {
auto res =
std::make_shared<arrow::json::ParseOptions>(arrow::json::ParseOptions::Defaults());
res->newlines_in_values = newlines_in_values;
return res;
}

// [[arrow::export]]
std::shared_ptr<arrow::json::ParseOptions> json___ParseOptions__initialize2(
bool newlines_in_values, const std::shared_ptr<arrow::Schema>& explicit_schema) {
auto res =
std::make_shared<arrow::json::ParseOptions>(arrow::json::ParseOptions::Defaults());
res->newlines_in_values = newlines_in_values;
res->explicit_schema = explicit_schema;
return res;
}

// [[arrow::export]]
std::shared_ptr<arrow::json::TableReader> json___TableReader__Make(
const std::shared_ptr<arrow::io::InputStream>& input,
Expand Down
88 changes: 88 additions & 0 deletions r/tests/testthat/test-json.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,94 @@ test_that("read_json_arrow() supports col_select=", {
expect_equal(names(tab2), c("hello", "world"))
})

test_that("read_json_arrow(schema=) with empty schema", {
tf <- tempfile()
writeLines('
{ "hello": 3.5, "world": 2, "third_col": 99}
{ "hello": 3.25, "world": 5, "third_col": 98}
{ "hello": 3.125, "world": 8, "third_col": 97 }
{ "hello": 0.0, "world": 10, "third_col": 96}
', tf)

tab1 <- read_json_arrow(tf, schema = schema())

expect_identical(
tab1,
tibble::tibble(
hello = c(3.5, 3.25, 3.125, 0),
world = c(2L, 5L, 8L, 10L),
third_col = c(99L,98L,97L,96L)
)
)
})

test_that("read_json_arrow(schema=) with partial schema", {
tf <- tempfile()
writeLines('
{ "hello": 3.5, "world": 2, "third_col": 99}
{ "hello": 3.25, "world": 5, "third_col": 98}
{ "hello": 3.125, "world": 8, "third_col": 97 }
{ "hello": 0.0, "world": 10, "third_col": 96}
', tf)

tab1 <- read_json_arrow(tf, schema = schema(third_col = float64(), world = float64()))

expect_identical(
tab1,
tibble::tibble(
third_col = c(99,98,97,96),
world = c(2, 5, 8, 10),
hello = c(3.5, 3.25, 3.125, 0)
)
)

tf2 <- tempfile()
writeLines('
{ "hello": 3.5, "world": 2, "third_col": "99"}
{ "hello": 3.25, "world": 5, "third_col": "98"}
{ "hello": 3.125, "world": 8, "third_col": "97"}
', tf2)

tab2 <- read_json_arrow(tf2, schema = schema(third_col = string(), world = float64()))

expect_identical(
tab2,
tibble::tibble(
third_col = c("99","98","97"),
world = c(2, 5, 8),
hello = c(3.5, 3.25, 3.125)
)
)
})

test_that("read_json_arrow(schema=) with full schema", {
tf <- tempfile()
writeLines('
{ "hello": 3.5, "world": 2, "third_col": 99}
{ "hello": 3.25, "world": 5, "third_col": 98}
{ "hello": 3.125, "world": 8, "third_col": 97}
{ "hello": 0.0, "world": 10, "third_col": 96}
', tf)

tab1 <- read_json_arrow(
tf,
schema = schema(
hello = float64(),
third_col = float64(),
world = float64()
)
)

expect_identical(
tab1,
tibble::tibble(
hello = c(3.5, 3.25, 3.125, 0),
third_col = c(99,98,97,96),
world = c(2, 5, 8, 10)
)
)
})

test_that("Can read json file with nested columns (ARROW-5503)", {
tf <- tempfile()
on.exit(unlink(tf))
Expand Down

0 comments on commit 894fab0

Please sign in to comment.