Skip to content

Commit

Permalink
feat(df-repr/bridge): upgrade datafusion to 43.0.0 (#260)
Browse files Browse the repository at this point in the history
Besides the upgrade itself:

* New `create_df_context` to be used across all crates to create a
datafusion context with optd. We had too much duplicate code before to
set up the context.
* The main refactor is about the aggregation expressions. Datafusion has
a new way of doing that.
* Datafusion removed cross join. We didn't. We can eventually remove it
but now it's blocked on two-stage cascades: if we simply treat cross
join the same as inner join, we would time out.
* Several other refactors to adapt to datafusion (e.g., limit node now
takes i64, empty relation / placeholder row executor)
* Keep as much of the original datafusion cli crate as possible. We now
only patch main.rs and exec.rs.
* There's one more breaking change that we might encounter later when
doing sort physical properties. Now datafusion logical plan will remove
duplicate sorts if there are no limits present. I feel this is a bad
move b/c it's not a direct mapping from the original SQL statement...

---------

Signed-off-by: Alex Chi <[email protected]>
  • Loading branch information
skyzh authored Dec 8, 2024
1 parent 6696706 commit 8f269c5
Show file tree
Hide file tree
Showing 67 changed files with 12,894 additions and 14,378 deletions.
1,639 changes: 969 additions & 670 deletions Cargo.lock

Large diffs are not rendered by default.

3,937 changes: 0 additions & 3,937 deletions datafusion-optd-cli/Cargo.lock

This file was deleted.

43 changes: 26 additions & 17 deletions datafusion-optd-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,52 +18,61 @@
[package]
name = "datafusion-optd-cli"
description = "Command Line Client for DataFusion query engine."
version = "32.0.0"
version = "43.0.0"
authors = ["Apache DataFusion <[email protected]>"]
edition = "2021"
keywords = ["arrow", "datafusion", "query", "sql"]
license = "Apache-2.0"
homepage = "https://github.com/cmu-db/optd"
repository = "https://github.com/cmu-db/optd"
rust-version = "1.70"
# Specify MSRV here as `cargo msrv` doesn't support workspace version
rust-version = "1.79"
readme = "README.md"

[dependencies]
arrow = "47.0.0"
async-trait = "0.1.41"
aws-config = "0.55"
aws-credential-types = "0.55"
clap = { version = "3", features = ["derive", "cargo"] }
datafusion = { version = "32.0.0", features = [
arrow = { version = "53.0.0" }
async-trait = "0.1.73"
aws-config = "1.5.5"
aws-sdk-sso = "1.43.0"
aws-sdk-ssooidc = "1.44.0"
aws-sdk-sts = "1.43.0"
# end pin aws-sdk crates
aws-credential-types = "1.2.0"
clap = { version = "4.5.16", features = ["derive", "cargo"] }
datafusion = { version = "43.0.0", features = [
"avro",
"crypto_expressions",
"datetime_expressions",
"encoding_expressions",
"parquet",
"regex_expressions",
"unicode_expressions",
"compression",
] }
dirs = "4.0.0"
env_logger = "0.9"
dirs = "5.0.1"
env_logger = "0.11"
futures = "0.3"
mimalloc = { version = "0.1", default-features = false }
object_store = { version = "0.7.0", features = ["aws", "gcp"] }
object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] }
parking_lot = { version = "0.12" }
parquet = { version = "53.0.0", default-features = false }
regex = "1.8"
rustyline = "11.0"
rustyline = "14.0"
tokio = { version = "1.24", features = [
"macros",
"rt",
"rt-multi-thread",
"sync",
"parking_lot",
"signal",
] }
url = "2.2"
# begin optd-cli patch
optd-datafusion-bridge = { path = "../optd-datafusion-bridge", version = "0.1" }
optd-datafusion-repr-adv-cost = { path = "../optd-datafusion-repr-adv-cost", version = "0.1" }
optd-datafusion-repr = { path = "../optd-datafusion-repr", version = "0.1" }
tracing-subscriber = "0.3"
tracing = "0.1"
# end optd-cli patch

[dev-dependencies]
assert_cmd = "2.0"
ctor = "0.2.0"
predicates = "3.0"
rstest = "0.17"
rstest = "0.22"
17 changes: 9 additions & 8 deletions datafusion-optd-cli/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,23 @@
# specific language governing permissions and limitations
# under the License.

FROM rust:1.70 as builder
FROM rust:1.79-bookworm AS builder

COPY . /usr/src/arrow-datafusion
COPY ./datafusion /usr/src/arrow-datafusion/datafusion
COPY . /usr/src/datafusion
COPY ./datafusion /usr/src/datafusion/datafusion
COPY ./datafusion-cli /usr/src/datafusion/datafusion-cli

COPY ./datafusion-cli /usr/src/arrow-datafusion/datafusion-cli

WORKDIR /usr/src/arrow-datafusion/datafusion-cli
WORKDIR /usr/src/datafusion/datafusion-cli

RUN rustup component add rustfmt

RUN cargo build --release

FROM debian:bullseye-slim
FROM debian:bookworm-slim

COPY --from=builder /usr/src/datafusion/datafusion-cli/target/release/datafusion-cli /usr/local/bin

COPY --from=builder /usr/src/arrow-datafusion/datafusion-cli/target/release/datafusion-cli /usr/local/bin
RUN mkdir /data

ENTRYPOINT ["datafusion-cli"]

Expand Down
24 changes: 21 additions & 3 deletions datafusion-optd-cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,26 @@

# DataFusion Command-line Interface

[DataFusion](https://arrow.apache.org/datafusion/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
[DataFusion](https://datafusion.apache.org/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.

The DataFusion CLI is a command line utility that runs SQL queries using the DataFusion engine.
DataFusion CLI (`datafusion-cli`) is a small command line utility that runs SQL queries using the DataFusion engine.

See the [`datafusion-cli` documentation](https://arrow.apache.org/datafusion/user-guide/cli.html) for further information.
# Frequently Asked Questions

## Where can I find more information?

See the [`datafusion-cli` documentation](https://datafusion.apache.org/user-guide/cli/index.html) for further information.

## How do I make my IDE work with `datafusion-cli`?

Open the `datafusion/datafusion-cli` project as its own top-level
project in your IDE (rather than opening `datafusion`).

The reason `datafusion-cli` is not part of the main workspace in the
[`datafusion Cargo.toml`] file is that `datafusion-cli` is a binary and has a
checked in `Cargo.lock` file to ensure reproducible builds.

However, the `datafusion` and sub crates are intended for use as libraries and
thus do not have a `Cargo.lock` file checked in.

[`datafusion cargo.toml`]: https://github.com/apache/datafusion/blob/main/Cargo.toml
92 changes: 92 additions & 0 deletions datafusion-optd-cli/examples/cli-session-context.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Shows an example of a custom session context that unions the input plan with itself.
//! To run this example, use `cargo run --example cli-session-context` from within the `datafusion-optd-cli` directory.
use std::sync::Arc;

use datafusion::{
dataframe::DataFrame,
error::DataFusionError,
execution::{context::SessionState, TaskContext},
logical_expr::{LogicalPlan, LogicalPlanBuilder},
prelude::SessionContext,
};
use datafusion_optd_cli::{
cli_context::CliSessionContext, exec::exec_from_repl, print_options::PrintOptions,
};
use object_store::ObjectStore;

/// Toy session context for the CLI example.
///
/// It wraps a regular [`SessionContext`]; its [`CliSessionContext`]
/// implementation unions every input plan with itself before executing it.
struct MyUnionerContext {
    /// Wrapped DataFusion session that the trait methods forward to.
    ctx: SessionContext,
}

impl Default for MyUnionerContext {
    /// Builds a unioner context around a freshly created [`SessionContext`].
    fn default() -> Self {
        let ctx = SessionContext::new();
        Self { ctx }
    }
}

/// Forwards the CLI session operations to the wrapped [`SessionContext`],
/// except that every executed plan is first unioned with itself.
#[async_trait::async_trait]
impl CliSessionContext for MyUnionerContext {
    /// Returns the task context of the wrapped session.
    fn task_ctx(&self) -> Arc<TaskContext> {
        self.ctx.task_ctx()
    }

    /// Returns the state of the wrapped session (via `SessionContext::state`).
    fn session_state(&self) -> SessionState {
        self.ctx.state()
    }

    /// Registers `object_store` for `url` on the wrapped session and passes
    /// through whatever that registration returns.
    fn register_object_store(
        &self,
        url: &url::Url,
        object_store: Arc<dyn ObjectStore>,
    ) -> Option<Arc<dyn ObjectStore + 'static>> {
        self.ctx.register_object_store(url, object_store)
    }

    /// Not supported by this toy example.
    ///
    /// # Panics
    ///
    /// Always panics via `unimplemented!()`.
    fn register_table_options_extension_from_scheme(&self, _scheme: &str) {
        unimplemented!()
    }

    /// Executes `plan` unioned with itself on the wrapped session.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while building the union plan or while
    /// executing it.
    async fn execute_logical_plan(
        &self,
        plan: LogicalPlan,
    ) -> Result<DataFrame, DataFusionError> {
        // The builder takes one copy of the plan; the owned original is then
        // moved into `union`, so a single clone is enough.
        let new_plan = LogicalPlanBuilder::from(plan.clone())
            .union(plan)?
            .build()?;

        self.ctx.execute_logical_plan(new_plan).await
    }
}

/// Entry point of the example: starts the CLI REPL on top of a
/// [`MyUnionerContext`].
///
/// # Panics
///
/// Panics if `exec_from_repl` returns an error.
#[tokio::main]
pub async fn main() {
    // Output settings for the REPL: automatic format, not quiet, no row
    // limit, colored output.
    let mut repl_print_options = PrintOptions {
        format: datafusion_optd_cli::print_format::PrintFormat::Automatic,
        quiet: false,
        maxrows: datafusion_optd_cli::print_options::MaxRows::Unlimited,
        color: true,
    };
    let unioner = MyUnionerContext::default();

    let repl_outcome = exec_from_repl(&unioner, &mut repl_print_options).await;
    repl_outcome.unwrap();
}
Loading

0 comments on commit 8f269c5

Please sign in to comment.