Add catboost integration tests #17931

Open · wants to merge 10 commits into base: branch-25.04

Changes from 5 commits
15 changes: 15 additions & 0 deletions .github/workflows/pr.yaml
@@ -41,6 +41,7 @@ jobs:
      - pandas-tests
      - pandas-tests-diff
      - telemetry-setup
      - third-party-integration-tests-cudf-pandas
    secrets: inherit
    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@nvks-runners
    if: always()
@@ -356,3 +357,17 @@ jobs:
    steps:
      - name: Telemetry summarize
        uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main
  third-party-integration-tests-cudf-pandas:
    needs: wheel-build-cudf
    secrets: inherit
    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
    with:
      build_type: pull-request
      arch: "amd64"
      branch: ${{ inputs.branch }}
      date: ${{ inputs.date }}
      sha: ${{ inputs.sha }}
      node_type: "gpu-v100-latest-1"
      container_image: "rapidsai/ci-conda:latest"
      run_script: |
        ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml

@@ -76,6 +76,13 @@ files:
      - py_version
      - test_base
      - test_xgboost
  test_catboost:
    output: none
    includes:
      - cuda_version
      - py_version
      - test_base
      - test_catboost
  test_cuml:
    output: none
    includes:
@@ -244,6 +251,15 @@ dependencies:
          - pip
          - pip:
              - xgboost>=2.0.1
  test_catboost:
    common:
      - output_types: conda
        packages:
          # TODO: Remove numpy pinning once https://github.com/catboost/catboost/issues/2671 is resolved
Matt711 (Contributor, Author) commented:

See this paragraph from the NumPy 2 release notes:

Breaking changes to the NumPy ABI. As a result, binaries of packages
that use the NumPy C API and were built against a NumPy 1.xx release
will not work with NumPy 2.0. On import, such packages will see an
ImportError with a message about binary incompatibility.
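
A hypothetical illustration of that constraint (this guard is my own sketch, not part of the PR; the actual fix is the `numpy>=1.23,<2.0.0` pin below):

```python
# Sketch only: packages compiled against the NumPy 1.x C API fail to import
# under NumPy 2.0 with a binary-incompatibility ImportError, so environments
# that install such wheels pin numpy below 2.0.
import numpy as np

if int(np.__version__.split(".")[0]) >= 2:
    raise RuntimeError(
        "catboost wheels built against NumPy 1.x are ABI-incompatible with "
        "NumPy >= 2.0; keep numpy>=1.23,<2.0.0 until catboost/catboost#2671 "
        "is resolved."
    )
```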

          - numpy>=1.23,<2.0.0
          - scipy
          - scikit-learn
          - catboost
  test_cuml:
    common:
      - output_types: conda
@@ -262,7 +278,7 @@
        packages:
          - pip
          - pip:
-             - ibis-framework[pandas]
+             - ibis-framework[pandas]==9.5.*
  test_hvplot:
    common:
      - output_types: conda
python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py

@@ -0,0 +1,128 @@
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
Member commented:

Suggested change:
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+# Copyright (c) 2025, NVIDIA CORPORATION.

This is a brand new file, shouldn't the copyright date only be 2025? Or was it copied from somewhere else?

Matt711 (Contributor, Author) replied:

Copied and pasted from another test; this should be 2025.

import numpy as np
import pandas as pd
import pytest
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.datasets import make_classification, make_regression

rng = np.random.default_rng(seed=42)


def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0):
    if isinstance(expect, (tuple, list)):
        assert len(expect) == len(got)
        for e, g in zip(expect, got):
            assert_catboost_equal(e, g, rtol, atol)
    elif isinstance(expect, np.ndarray):
        np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol)
    elif isinstance(expect, pd.DataFrame):
        pd.testing.assert_frame_equal(expect, got)
    elif isinstance(expect, pd.Series):
        pd.testing.assert_series_equal(expect, got)
    else:
        assert expect == got


pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal)


@pytest.fixture
def regression_data():
    X, y = make_regression(n_samples=100, n_features=10, random_state=42)
    return pd.DataFrame(X), pd.Series(y)


@pytest.fixture
def classification_data():
    X, y = make_classification(
        n_samples=100, n_features=10, n_classes=2, random_state=42
Member commented:

Suggested change:
-        n_samples=100, n_features=10, n_classes=2, random_state=42
+        n_samples=1_000, n_features=10, n_classes=2, random_state=42
You may want to use slightly more data, here and in regression_data(). There are some types of encoding and data access bugs that will only show up in certain codepaths in CatBoost that are exercised when there are enough splits per tree.

I've seen this before in LightGBM and XGBoost... someone will write a test that fits on a very small dataset and it'll look like nothing went wrong, only to later find that actually the dataset was so small that the model was just a collection of decision stumps (no splits), and so the test could never catch issues like "this encoding doesn't preserve NAs" or "these outputs are different because of numerical precision issues".
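
One way to surface that failure mode in a test (a sketch; `get_tree_leaf_counts()` is my assumption about the CatBoost API, not something used in this PR):

```python
# Sketch: verify the fitted model grew real trees rather than stumps.
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1_000, n_features=10, n_classes=2, random_state=42
)
model = CatBoostClassifier(iterations=10, verbose=0)
model.fit(pd.DataFrame(X), pd.Series(y))

# get_tree_leaf_counts() reports the number of leaves per tree; a value of
# 2 (or 1) would mean a tree degenerated into a stump with a single split.
leaf_counts = model.get_tree_leaf_counts()
assert (leaf_counts > 2).all(), "model is just a collection of decision stumps"
```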

    )
jameslamb (Member) commented on Feb 7, 2025:
make_classification() returns a dataset that has only continuous features.

from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=100, n_features=10, n_classes=2, random_state=42
)
X
array([[-1.14052601,  1.35970566,  0.86199147,  0.84609208,  0.60600995,
        -1.55662917,  1.75479418,  1.69645637, -1.28042935, -2.08192941],
...

For catboost in particular, I strongly suspect you'll get better effective test coverage of this integration by including some categorical features.

Encoding and decoding categorical features is critical to how CatBoost works (docs), and there are lots of things that have to go exactly right when providing pandas-like categorical input. Basically, everything here: https://pandas.pydata.org/docs/user_guide/categorical.html

I really think you should provide an input dataset that has some categorical features, ideally in 2 forms:

  • integer-type columns
  • pandas.categorical type columns

And ideally with varying cardinality.

You could consider adapting this code used in xgboost's tests: https://github.com/dmlc/xgboost/blob/105aa4247abb3ce787be2cef2f9beb4c24b30049/demo/guide-python/categorical.py#L29

And here are some docs on how to tell CatBoost which features are categorical: https://catboost.ai/docs/en/concepts/python-usages-examples#class-with-array-like-data-with-numerical,-categorical-and-embedding-features
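
A minimal sketch of such a fixture (column names, cardinalities, and sizes below are illustrative assumptions, not part of this PR):

```python
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

rng = np.random.default_rng(seed=42)
n = 1_000

X = pd.DataFrame(
    {
        # continuous feature
        "num": rng.standard_normal(n),
        # integer-coded categorical, low cardinality
        "cat_int": rng.integers(0, 4, size=n),
        # pandas Categorical column, higher cardinality
        "cat_pd": pd.Categorical(
            rng.choice([f"level_{i}" for i in range(25)], size=n)
        ),
    }
)
y = pd.Series(rng.integers(0, 2, size=n))

# cat_features tells CatBoost to apply its own categorical encoding to
# these columns instead of treating them as numeric.
model = CatBoostClassifier(
    iterations=10, verbose=0, cat_features=["cat_int", "cat_pd"]
)
model.fit(X, y)
```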

    return pd.DataFrame(X), pd.Series(y)


def test_catboost_regressor_with_dataframe(regression_data):
    X, y = regression_data
    model = CatBoostRegressor(iterations=10, verbose=0)
    model.fit(X, y)
    predictions = model.predict(X)
    return predictions


def test_catboost_regressor_with_numpy(regression_data):
    X, y = regression_data
    model = CatBoostRegressor(iterations=10, verbose=0)
    model.fit(X.values, y.values)
    predictions = model.predict(X.values)
    return predictions
Member commented:
Sorry in advance, I'm not that familiar with these tests, but... I'm surprised to see pytest test cases with a return statement. What is the interaction between these test cases and this line a few lines up?

pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal)

Did you mean for there to be some kind of testing assertion here? Or does that custom marker somehow end up invoking that function and comparing the output of the test case with pandas inputs to its output with cudf inputs?

Matt711 (Contributor, Author) replied:
The assertion function is used to check that results from "cudf.pandas on" and "cudf.pandas off" are equal. The logic to handle that is in the conftest file.
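
For readers unfamiliar with that machinery, a rough sketch of the mechanism (illustrative only, not the actual cudf conftest): each test module runs twice, once with cudf.pandas enabled and once without; the return values are pickled, and the function from the assert_eq marker compares the two stored results.

```python
# Sketch of the comparison step; file paths and the helper name are
# hypothetical, not the real conftest implementation.
import pickle


def compare_stored_results(marker_fn, path_without_cudf, path_with_cudf):
    with open(path_without_cudf, "rb") as f:
        expect = pickle.load(f)  # test return value with cudf.pandas off
    with open(path_with_cudf, "rb") as f:
        got = pickle.load(f)  # test return value with cudf.pandas on
    marker_fn(expect, got)  # e.g. assert_catboost_equal(expect, got)
```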



def test_catboost_classifier_with_dataframe(classification_data):
    X, y = classification_data
    model = CatBoostClassifier(iterations=10, verbose=0)
    model.fit(X, y)
    predictions = model.predict(X)
    return predictions


def test_catboost_classifier_with_numpy(classification_data):
    X, y = classification_data
    model = CatBoostClassifier(iterations=10, verbose=0)
    model.fit(X.values, y.values)
    predictions = model.predict(X.values)
    return predictions


def test_catboost_with_pool_and_dataframe(regression_data):
    X, y = regression_data
    train_pool = Pool(X, y)
    model = CatBoostRegressor(iterations=10, verbose=0)
    model.fit(train_pool)
    predictions = model.predict(X)
    return predictions


def test_catboost_with_pool_and_numpy(regression_data):
    X, y = regression_data
    train_pool = Pool(X.values, y.values)
    model = CatBoostRegressor(iterations=10, verbose=0)
    model.fit(train_pool)
    predictions = model.predict(X.values)
    return predictions


def test_catboost_with_categorical_features():
    data = {
        "numerical_feature": rng.standard_normal(100),
        "categorical_feature": rng.choice(["A", "B", "C"], size=100),
        "target": rng.integers(0, 2, size=100),
    }
    df = pd.DataFrame(data)
    X = df[["numerical_feature", "categorical_feature"]]
    y = df["target"]
    cat_features = ["categorical_feature"]
    model = CatBoostClassifier(
        iterations=10, verbose=0, cat_features=cat_features
    )
    model.fit(X, y)
    predictions = model.predict(X)
    return predictions


@pytest.mark.parametrize(
    "X, y",
    [
        (
            pd.DataFrame(rng.standard_normal((100, 5))),
            pd.Series(rng.standard_normal(100)),
        ),
        (rng.standard_normal((100, 5)), rng.standard_normal(100)),
    ],
)
def test_catboost_train_test_split(X, y):
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    model = CatBoostRegressor(iterations=10, verbose=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return len(X_train), len(X_test), len(y_train), len(y_test), predictions
python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py

@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import cugraph
import cupy as cp
import networkx as nx
@@ -52,6 +52,10 @@ def adjacency_matrix():
    return df


# TODO: Tracking Issue https://github.com/rapidsai/cudf/issues/17934
@pytest.mark.skip(
    reason="TypeError: Could not construct DataFrame from <class 'pandas.core.frame.DataFrame'>"
)
@pytest.mark.parametrize("algo", cugraph_algos)
def test_cugraph_from_pandas_edgelist(df, algo):
    G = cugraph.Graph()