rapidsai · Matt711 · Feb 11, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
@@ -66,7 +66,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.20,<1.22
+- polars>=1.20,<1.23
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<20.0.0a0

@@ -64,7 +64,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.20,<1.22
+- polars>=1.20,<1.23
 - pre-commit
 - pyarrow>=14.0.0,<20.0.0a0
 - pydata-sphinx-theme>=0.15.4

@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.20,<1.22
+    - polars >=1.20,<1.23
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:

@@ -803,7 +803,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.20,<1.22
+          - polars>=1.20,<1.23
   run_cudf_polars_experimental:
     common:
       - output_types: [conda, requirements, pyproject]

@@ -1650,6 +1650,16 @@ def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame:
         return DataFrame(columns)
 
 
+class MergeSorted(IR):
+    """Merge sorted operation (placeholder: construction always raises)."""
+
+    def __init__(self, schema: Schema, left: IR, right: IR, key: str):
+        # Not supported: libcudf merge is not stable wrt order of inputs,
+        # since it uses a priority queue to manage the tables it produces.
+        # See: https://github.com/rapidsai/cudf/issues/16010
+        raise NotImplementedError("MergeSorted not yet implemented")
+
+
 class MapFunction(IR):
     """Apply some function to a dataframe."""
 
@@ -1663,13 +1673,10 @@ class MapFunction(IR):
     _NAMES: ClassVar[frozenset[str]] = frozenset(
         [
             "rechunk",
-            # libcudf merge is not stable wrt order of inputs, since
-            # it uses a priority queue to manage the tables it produces.
-            # See: https://github.com/rapidsai/cudf/issues/16010
-            # "merge_sorted",
             "rename",
             "explode",
             "unpivot",
+            "row_index",
         ]
     )
 
@@ -1678,8 +1685,12 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
         self.name = name
         self.options = options
         self.children = (df,)
-        if self.name not in MapFunction._NAMES:
-            raise NotImplementedError(f"Unhandled map function {self.name}")
+        if (
+            self.name not in MapFunction._NAMES
+        ):  # pragma: no cover; need more polars rust functions
+            raise NotImplementedError(
+                f"Unhandled map function {self.name}"
+            )  # pragma: no cover
         if self.name == "explode":
             (to_explode,) = self.options
             if len(to_explode) > 1:
@@ -1716,6 +1727,9 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
                 variable_name,
                 value_name,
             )
+        elif self.name == "row_index":
+            col_name, offset = options
+            self.options = (col_name, offset)
         self._non_child_args = (schema, name, self.options)
 
     @classmethod
@@ -1781,6 +1795,23 @@ def do_evaluate(
                     Column(value_column, name=value_name),
                 ]
             )
+        elif name == "row_index":
+            col_name, offset = options
+            dtype = schema[col_name]
+            step = plc.interop.from_arrow(
+                pa.scalar(1, type=plc.interop.to_arrow(dtype))
+            )
+            init = plc.interop.from_arrow(
+                pa.scalar(offset, type=plc.interop.to_arrow(dtype))
+            )
+            index_col = Column(
+                plc.filling.sequence(df.num_rows, init, step),
+                is_sorted=plc.types.Sorted.YES,
+                order=plc.types.Order.ASCENDING,
+                null_order=plc.types.NullOrder.AFTER,
+                name=col_name,
+            )
+            return DataFrame([index_col, *df.columns])
         else:
             raise AssertionError("Should never be reached")  # pragma: no cover
 

@@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR:
         # IR is versioned with major.minor, minor is bumped for backwards
         # compatible changes (e.g. adding new nodes), major is bumped for
         # incompatible changes (e.g. renaming nodes).
-        if (version := self.visitor.version()) >= (5, 1):
+        if (version := self.visitor.version()) >= (6, 1):
             e = NotImplementedError(
                 f"No support for polars IR {version=}"
             )  # pragma: no cover; no such version for now.
@@ -299,7 +299,7 @@ def _(
     # Join key dtypes are dependent on the schema of the left and
     # right inputs, so these must be translated with the relevant
     # input active.
-    def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:
+    def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:  # pragma: no cover
         if literal.dtype.id() == plc.types.TypeId.INT32:
             plc_int64 = plc.types.DataType(plc.types.TypeId.INT64)
             return expr.Literal(
@@ -308,7 +308,7 @@ def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:
             )
         return literal
 
-    def maybe_adjust_binop(e) -> expr.Expr:
+    def maybe_adjust_binop(e) -> expr.Expr:  # pragma: no cover
         if isinstance(e.value, expr.BinOp):
             left, right = e.value.children
             if isinstance(left, expr.Col) and isinstance(right, expr.Literal):
@@ -323,10 +323,10 @@ def translate_expr_and_maybe_fix_binop_args(translator, exprs):
         ]
 
     with set_node(translator.visitor, node.input_left):
+        # TODO: There's a bug in the polars type coercion phase.
+        # Use translate_named_expr directly once our minimum
+        # supported polars version is 1.22
         inp_left = translator.translate_ir(n=None)
-        # TODO: There's bug in the polars type coercion phase. Use
-        # translate_named_expr directly once it is resolved.
-        # Tracking issue: https://github.com/pola-rs/polars/issues/20935
         left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on)
     with set_node(translator.visitor, node.input_right):
         inp_right = translator.translate_ir(n=None)
@@ -463,6 +463,21 @@ def _(
     return ir.Projection(schema, translator.translate_ir(n=node.input))
 
 
+@_translate_ir.register
+def _(
+    node: pl_ir.MergeSorted, translator: Translator, schema: dict[str, plc.DataType]
+) -> ir.IR:
+    inp_left = translator.translate_ir(n=node.input_left)
+    inp_right = translator.translate_ir(n=node.input_right)
+    key = node.key
+    return ir.MergeSorted(  # NOTE: currently raises NotImplementedError
+        schema,
+        inp_left,
+        inp_right,
+        key,
+    )
+
+
 @_translate_ir.register
 def _(
     node: pl_ir.MapFunction, translator: Translator, schema: dict[str, plc.DataType]
@@ -472,7 +487,6 @@ def _(
         schema,
         name,
         options,
-        # TODO: merge_sorted breaks this pattern
         translator.translate_ir(n=node.input),
     )
 

@@ -193,6 +193,9 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
     "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
+    "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_parquet-write_parquet]": "Need to expose include_file_paths xref: cudf#18012",
+    "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_csv-write_csv]": "Need to expose include_file_paths xref: cudf#18012",
+    "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics[False]": "Debug output on stderr doesn't match",
     # Maybe flaky, order-dependent?
     "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
     "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",

@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.20,<1.22",
+    "polars>=1.20,<1.23",
     "pylibcudf==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
@@ -93,3 +93,14 @@ def test_unpivot_defaults():
     )
     q = df.unpivot(index="d")
     assert_gpu_result_equal(q)
+
+
+def test_with_row_index_defaults():
+    lf = pl.LazyFrame(
+        {
+            "a": [1, 3, 5],
+            "b": [2, 4, 6],
+        }
+    )
+    q = lf.with_row_index()  # no arguments, so the polars defaults apply
+    assert_gpu_result_equal(q)