Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump polars version to <1.23 #17986

Open
wants to merge 19 commits into
base: branch-25.04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ dependencies:
- pandas
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.20,<1.22
- polars>=1.20,<1.23
- pre-commit
- ptxcompiler
- pyarrow>=14.0.0,<20.0.0a0
Expand Down
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-128_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ dependencies:
- pandas
- pandas>=2.0,<2.2.4dev0
- pandoc
- polars>=1.20,<1.22
- polars>=1.20,<1.23
- pre-commit
- pyarrow>=14.0.0,<20.0.0a0
- pydata-sphinx-theme>=0.15.4
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf-polars/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ requirements:
run:
- python
- pylibcudf ={{ version }}
- polars >=1.20,<1.22
- polars >=1.20,<1.23
- {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}

test:
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- polars>=1.20,<1.22
- polars>=1.20,<1.23
run_cudf_polars_experimental:
common:
- output_types: [conda, requirements, pyproject]
Expand Down
43 changes: 37 additions & 6 deletions python/cudf_polars/cudf_polars/dsl/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -1650,6 +1650,16 @@ def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame:
return DataFrame(columns)


class MergeSorted(IR):
    """
    IR node for a merge-sorted operation.

    Constructing this node always raises ``NotImplementedError``.
    """

    def __init__(self, schema: Schema, left: IR, right: IR, key: str):
        # Not supported yet: libcudf merge is not stable with respect
        # to the order of its inputs, because it uses a priority queue
        # to manage the tables it produces.
        # See: https://github.com/rapidsai/cudf/issues/16010
        message = "MergeSorted not yet implemented"
        raise NotImplementedError(message)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a note (this comment is still fine for now): I think polars no longer guarantees ordering for merge_sorted, so we can probably implement this now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xref #18026



class MapFunction(IR):
"""Apply some function to a dataframe."""

Expand All @@ -1663,13 +1673,10 @@ class MapFunction(IR):
_NAMES: ClassVar[frozenset[str]] = frozenset(
[
"rechunk",
# libcudf merge is not stable wrt order of inputs, since
# it uses a priority queue to manage the tables it produces.
# See: https://github.com/rapidsai/cudf/issues/16010
# "merge_sorted",
"rename",
"explode",
"unpivot",
"row_index",
]
)

Expand All @@ -1678,8 +1685,12 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
self.name = name
self.options = options
self.children = (df,)
if self.name not in MapFunction._NAMES:
raise NotImplementedError(f"Unhandled map function {self.name}")
if (
self.name not in MapFunction._NAMES
): # pragma: no cover; need more polars rust functions
raise NotImplementedError(
f"Unhandled map function {self.name}"
) # pragma: no cover
if self.name == "explode":
(to_explode,) = self.options
if len(to_explode) > 1:
Expand Down Expand Up @@ -1716,6 +1727,9 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
variable_name,
value_name,
)
elif self.name == "row_index":
col_name, offset = options
self.options = (col_name, offset)
self._non_child_args = (schema, name, self.options)

@classmethod
Expand Down Expand Up @@ -1781,6 +1795,23 @@ def do_evaluate(
Column(value_column, name=value_name),
]
)
elif name == "row_index":
col_name, offset = options
dtype = schema[col_name]
step = plc.interop.from_arrow(
pa.scalar(1, type=plc.interop.to_arrow(dtype))
)
init = plc.interop.from_arrow(
pa.scalar(offset, type=plc.interop.to_arrow(dtype))
)
index_col = Column(
plc.filling.sequence(df.num_rows, init, step),
is_sorted=plc.types.Sorted.YES,
order=plc.types.Order.ASCENDING,
null_order=plc.types.NullOrder.AFTER,
name=col_name,
)
return DataFrame([index_col, *df.columns])
else:
raise AssertionError("Should never be reached") # pragma: no cover

Expand Down
28 changes: 21 additions & 7 deletions python/cudf_polars/cudf_polars/dsl/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR:
# IR is versioned with major.minor, minor is bumped for backwards
# compatible changes (e.g. adding new nodes), major is bumped for
# incompatible changes (e.g. renaming nodes).
if (version := self.visitor.version()) >= (5, 1):
if (version := self.visitor.version()) >= (6, 1):
e = NotImplementedError(
f"No support for polars IR {version=}"
) # pragma: no cover; no such version for now.
Expand Down Expand Up @@ -299,7 +299,7 @@ def _(
# Join key dtypes are dependent on the schema of the left and
# right inputs, so these must be translated with the relevant
# input active.
def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:
def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: # pragma: no cover
if literal.dtype.id() == plc.types.TypeId.INT32:
plc_int64 = plc.types.DataType(plc.types.TypeId.INT64)
return expr.Literal(
Expand All @@ -308,7 +308,7 @@ def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:
)
return literal

def maybe_adjust_binop(e) -> expr.Expr:
def maybe_adjust_binop(e) -> expr.Expr: # pragma: no cover
if isinstance(e.value, expr.BinOp):
left, right = e.value.children
if isinstance(left, expr.Col) and isinstance(right, expr.Literal):
Expand All @@ -323,10 +323,10 @@ def translate_expr_and_maybe_fix_binop_args(translator, exprs):
]

with set_node(translator.visitor, node.input_left):
# TODO: There's bug in the polars type coercion phase.
# Use translate_named_expr directly once our minimum
# supported polars version is 1.22
inp_left = translator.translate_ir(n=None)
# TODO: There's a bug in the polars type coercion phase. Use
# translate_named_expr directly once it is resolved.
# Tracking issue: https://github.com/pola-rs/polars/issues/20935
left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on)
with set_node(translator.visitor, node.input_right):
inp_right = translator.translate_ir(n=None)
Expand Down Expand Up @@ -463,6 +463,21 @@ def _(
return ir.Projection(schema, translator.translate_ir(n=node.input))


@_translate_ir.register
def _(
    node: pl_ir.MergeSorted, translator: Translator, schema: dict[str, plc.DataType]
) -> ir.IR:
    """Build an ``ir.MergeSorted`` from a polars ``MergeSorted`` node."""
    # The left input is translated before the right one.
    left = translator.translate_ir(n=node.input_left)
    right = translator.translate_ir(n=node.input_right)
    return ir.MergeSorted(schema, left, right, node.key)


@_translate_ir.register
def _(
node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType]
Expand All @@ -472,7 +487,6 @@ def _(
schema,
name,
options,
# TODO: merge_sorted breaks this pattern
translator.translate_ir(n=node.input),
)

Expand Down
3 changes: 3 additions & 0 deletions python/cudf_polars/cudf_polars/testing/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ def pytest_configure(config: pytest.Config) -> None:
"tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
"tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
"tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
"tests/unit/io/test_multiscan.py::test_include_file_paths[scan_parquet-write_parquet]": "Need to expose include_file_paths xref: cudf#18012",
"tests/unit/io/test_multiscan.py::test_include_file_paths[scan_csv-write_csv]": "Need to expose include_file_paths xref: cudf#18012",
"tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics[False]": "Debug output on stderr doesn't match",
# Maybe flaky, order-dependent?
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
"tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf_polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ authors = [
license = { text = "Apache 2.0" }
requires-python = ">=3.10"
dependencies = [
"polars>=1.20,<1.22",
"polars>=1.20,<1.23",
"pylibcudf==25.4.*,>=0.0.0a0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [
Expand Down
13 changes: 12 additions & 1 deletion python/cudf_polars/tests/test_mapfunction.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

Expand Down Expand Up @@ -93,3 +93,14 @@ def test_unpivot_defaults():
)
q = df.unpivot(index="d")
assert_gpu_result_equal(q)


def test_with_row_index_defaults():
    """Check ``with_row_index()`` with default arguments via assert_gpu_result_equal."""
    data = {
        "a": [1, 3, 5],
        "b": [2, 4, 6],
    }
    query = pl.LazyFrame(data).with_row_index()
    assert_gpu_result_equal(query)
Loading