diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 09eb9949f1d..4ec6ef1883a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.22 +- polars>=1.20,<1.23 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 56cef28ac61..dcf96a02a36 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.22 +- polars>=1.20,<1.23 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index fb7ab9332d8..1d36ab2a3e4 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.20,<1.22 + - polars >=1.20,<1.23 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index 7188e10b058..c8893fc8b49 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -803,7 +803,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.20,<1.22 + - polars>=1.20,<1.23 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 8f12a4a7570..603f51e9d40 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1650,6 +1650,16 @@ def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame: return 
DataFrame(columns) +class MergeSorted(IR): + """Merge sorted operation.""" + + def __init__(self, schema: Schema, left: IR, right: IR, key: str): + # libcudf merge is not stable wrt order of inputs, since + # it uses a priority queue to manage the tables it produces. + # See: https://github.com/rapidsai/cudf/issues/16010 + raise NotImplementedError("MergeSorted not yet implemented") + + class MapFunction(IR): """Apply some function to a dataframe.""" @@ -1663,13 +1673,10 @@ class MapFunction(IR): _NAMES: ClassVar[frozenset[str]] = frozenset( [ "rechunk", - # libcudf merge is not stable wrt order of inputs, since - # it uses a priority queue to manage the tables it produces. - # See: https://github.com/rapidsai/cudf/issues/16010 - # "merge_sorted", "rename", "explode", "unpivot", + "row_index", ] ) @@ -1678,8 +1685,12 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): self.name = name self.options = options self.children = (df,) - if self.name not in MapFunction._NAMES: - raise NotImplementedError(f"Unhandled map function {self.name}") + if ( + self.name not in MapFunction._NAMES + ): # pragma: no cover; need more polars rust functions + raise NotImplementedError( + f"Unhandled map function {self.name}" + ) # pragma: no cover if self.name == "explode": (to_explode,) = self.options if len(to_explode) > 1: @@ -1716,6 +1727,9 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): variable_name, value_name, ) + elif self.name == "row_index": + col_name, offset = options + self.options = (col_name, offset) self._non_child_args = (schema, name, self.options) @classmethod @@ -1781,6 +1795,23 @@ def do_evaluate( Column(value_column, name=value_name), ] ) + elif name == "row_index": + col_name, offset = options + dtype = schema[col_name] + step = plc.interop.from_arrow( + pa.scalar(1, type=plc.interop.to_arrow(dtype)) + ) + init = plc.interop.from_arrow( + pa.scalar(offset, type=plc.interop.to_arrow(dtype)) + ) + index_col = Column( + 
plc.filling.sequence(df.num_rows, init, step), + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + name=col_name, + ) + return DataFrame([index_col, *df.columns]) else: raise AssertionError("Should never be reached") # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 4ed36e463f3..22f97f2bf52 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (5, 1): + if (version := self.visitor.version()) >= (6, 1): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. @@ -299,7 +299,7 @@ def _( # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. 
- def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: + def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: # pragma: no cover if literal.dtype.id() == plc.types.TypeId.INT32: plc_int64 = plc.types.DataType(plc.types.TypeId.INT64) return expr.Literal( @@ -308,7 +308,7 @@ def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: ) return literal - def maybe_adjust_binop(e) -> expr.Expr: + def maybe_adjust_binop(e) -> expr.Expr: # pragma: no cover if isinstance(e.value, expr.BinOp): left, right = e.value.children if isinstance(left, expr.Col) and isinstance(right, expr.Literal): @@ -323,10 +323,10 @@ def translate_expr_and_maybe_fix_binop_args(translator, exprs): ] with set_node(translator.visitor, node.input_left): + # TODO: There's a bug in the polars type coercion phase. + # Use translate_named_expr directly once our minimum + # supported polars version is 1.22 inp_left = translator.translate_ir(n=None) - # TODO: There's bug in the polars type coercion phase. Use - # translate_named_expr directly once it is resolved. 
- # Tracking issue: https://github.com/pola-rs/polars/issues/20935 left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on) with set_node(translator.visitor, node.input_right): inp_right = translator.translate_ir(n=None) @@ -463,6 +463,21 @@ def _( return ir.Projection(schema, translator.translate_ir(n=node.input)) +@_translate_ir.register +def _( + node: pl_ir.MergeSorted, translator: Translator, schema: dict[str, plc.DataType] +) -> ir.IR: + inp_left = translator.translate_ir(n=node.input_left) + inp_right = translator.translate_ir(n=node.input_right) + key = node.key + return ir.MergeSorted( + schema, + inp_left, + inp_right, + key, + ) + + @_translate_ir.register def _( node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType] @@ -472,7 +487,6 @@ def _( schema, name, options, - # TODO: merge_sorted breaks this pattern translator.translate_ir(n=node.input), ) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 48629af920d..cf1bfbe8a69 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -193,6 +193,9 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", + "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_parquet-write_parquet]": "Need to expose include_file_paths xref: cudf#18012", + "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_csv-write_csv]": "Need to expose include_file_paths xref: cudf#18012", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics[False]": "Debug output on stderr doesn't match", # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 805d7925bb4..872c08a66f9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.20,<1.22", + "polars>=1.20,<1.23", "pylibcudf==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 63aa1c573a9..7a9f4a56545 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -93,3 +93,14 @@ def test_unpivot_defaults(): ) q = df.unpivot(index="d") assert_gpu_result_equal(q) + + +def test_with_row_index_defaults(): + lf = pl.LazyFrame( + { + "a": [1, 3, 5], + "b": [2, 4, 6], + } + ) + q = lf.with_row_index() + assert_gpu_result_equal(q)