Skip to content

Commit

Permalink
Merge branch 'branch-25.04' into fea/polars/bump-polars-version
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt711 authored Feb 13, 2025
2 parents 128bffe + 9ead47b commit 7836795
Show file tree
Hide file tree
Showing 112 changed files with 1,979 additions and 1,857 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,14 @@ jobs:
third-party-integration-tests-cudf-pandas:
needs: conda-python-build
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners
uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04
with:
build_type: pull-request
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
node_type: "gpu-l4-latest-1"
continue-on-error: true
container_image: "rapidsai/ci-conda:latest"
run_script: |
ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
Expand Down
648 changes: 648 additions & 0 deletions CHANGELOG.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh
DEPENDENCIES=(
cudf
cudf_kafka
cudf-polars
cugraph
cuml
custreamz
Expand Down
8 changes: 4 additions & 4 deletions ci/test_python_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ set -euo pipefail

RAPIDS_VERSION="$(rapids-version)"

rapids-logger "Downloading artifacts from previous jobs"
CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)

rapids-logger "Generate Python testing dependencies"

ENV_YAML_DIR="$(mktemp -d)"
Expand All @@ -26,10 +30,6 @@ set +u
conda activate test
set -u

rapids-logger "Downloading artifacts from previous jobs"
CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)

RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${RESULTS_DIR}/coverage-results"}
Expand Down
12 changes: 1 addition & 11 deletions ci/test_python_other.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,9 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
# Common setup steps shared by Python test jobs
source ./ci/test_python_common.sh test_python_other

RAPIDS_VERSION="$(rapids-version)"

rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
"dask-cudf=${RAPIDS_VERSION}" \
"cudf_kafka=${RAPIDS_VERSION}" \
"custreamz=${RAPIDS_VERSION}" \
"cudf-polars=${RAPIDS_VERSION}"

rapids-logger "Check GPU usage"
nvidia-smi

rapids-print-env
EXITCODE=0
trap "EXITCODE=1" ERR
set +e
Expand Down
14 changes: 1 addition & 13 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ option(CUDA_ENABLE_LINEINFO
option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
option(CUDA_STATIC_CUFILE "Statically link cuFile" OFF)

set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON)
if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS)
Expand Down Expand Up @@ -464,7 +463,6 @@ add_library(
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/arrow_utilities.cpp
src/interop/decimal_conversion_utilities.cu
src/interop/to_arrow_device.cu
src/interop/to_arrow_host.cu
src/interop/from_arrow_device.cu
Expand Down Expand Up @@ -547,7 +545,6 @@ add_library(
src/io/utilities/data_casting.cu
src/io/utilities/data_sink.cpp
src/io/utilities/datasource.cpp
src/io/utilities/file_io_utilities.cpp
src/io/utilities/row_selection.cpp
src/io/utilities/type_inference.cu
src/io/utilities/trie.cu
Expand Down Expand Up @@ -923,15 +920,6 @@ target_compile_definitions(
# Enable remote IO through KvikIO
target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>)

# Enable cuFile support
set(_cufile_suffix)
if(CUDA_STATIC_CUFILE)
set(_cufile_suffix _static)
endif()
if(TARGET CUDA::cuFile${_cufile_suffix})
target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND)
endif()

# Remove this after upgrading to a CCCL that has a proper CMake option. See
# https://github.com/NVIDIA/cccl/pull/2844
target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1)
Expand All @@ -944,7 +932,7 @@ target_link_libraries(
cudf
PUBLIC CCCL::CCCL rapids_logger::rapids_logger rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
kvikio::kvikio $<TARGET_NAME_IF_EXISTS:CUDA::cuFile${_cufile_suffix}> nanoarrow
kvikio::kvikio nanoarrow
)

# Add Conda library, and include paths if specified
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/hashing/partition.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,7 @@
#include <cudf/partitioning.hpp>

#include <algorithm>
#include <numeric>

class Hashing : public cudf::benchmark {};

Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,6 +38,7 @@
#include <cstdio>
#include <fstream>
#include <memory>
#include <numeric>
#include <random>

temp_directory const temp_dir("cudf_nvbench");
Expand Down
3 changes: 2 additions & 1 deletion cpp/examples/billion_rows/brc_pipeline.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,6 +31,7 @@
#include <iostream>
#include <memory>
#include <string>
#include <thread>

using elapsed_t = std::chrono::duration<double>;
using byte_range = std::pair<std::size_t, std::size_t>;
Expand Down
21 changes: 3 additions & 18 deletions cpp/include/cudf/io/config_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,22 +19,7 @@

namespace CUDF_EXPORT cudf {
namespace io {
namespace cufile_integration {

/**
* @brief Returns true if cuFile and its compatibility mode are enabled.
*/
bool is_always_enabled();

/**
* @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled).
*/
bool is_gds_enabled();

/**
* @brief Returns true if KvikIO is enabled.
*/
bool is_kvikio_enabled();
namespace kvikio_integration {

/**
* @brief Set KvikIO parameters, including:
Expand All @@ -45,7 +30,7 @@ bool is_kvikio_enabled();
*/
void set_up_kvikio();

} // namespace cufile_integration
} // namespace kvikio_integration

namespace nvcomp_integration {

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/data_sink.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -122,7 +122,7 @@ class data_sink {
*
* In the case where the sink type is itself a memory buffered write, this ends up
* being effectively a second memcpy. So a useful optimization for a "smart"
* custom data_sink is to do it's own internal management of the movement
* custom data_sink is to do its own internal management of the movement
* of data between cpu and gpu; turning the internals of the writer into simply
*
* sink->device_write(device_buffer, size)
Expand Down
134 changes: 0 additions & 134 deletions cpp/include/cudf/rolling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,140 +321,6 @@ std::unique_ptr<column> grouped_rolling_window(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Applies a grouping-aware, timestamp-based rolling window function to the values in a
* column.
*
* @deprecated Since 25.02, to be removed in 25.04
*
* Like `rolling_window()`, this function aggregates values in a window around each
* element of a specified `input` column. It differs from `rolling_window()` in two respects:
* 1. The elements of the `input` column are grouped into distinct groups (e.g. the result of a
* groupby), determined by the corresponding values of the columns under `group_keys`. The
* window-aggregation cannot cross the group boundaries.
* 2. Within a group, the aggregation window is calculated based on a time interval (e.g. number
* of days preceding/following the current row). The timestamps for the input data are
* specified by the `timestamp_column` argument.
*
* Note: This method requires that the rows are presorted by the group keys and timestamp values.
*
* @code{.pseudo}
* Example: Consider a user-sales dataset, where the rows look as follows:
* { "user_id", sales_amt, date }
*
* This method enables windowing queries such as grouping a dataset by `user_id`, sorting by
 * increasing `date`, and summing up the `sales_amt` column over a window of 3 days (1 preceding
 * day, the current day, and 1 following day).
*
* In this example,
* 1. `group_keys == [ user_id ]`
* 2. `timestamp_column == date`
* 3. `input == sales_amt`
* The data are grouped by `user_id`, and ordered by `date`. The aggregation
* (SUM) is then calculated for a window of 3 days around (and including) each row.
*
* For the following input:
*
* [ // user, sales_amt, YYYYMMDD (date)
* { "user1", 10, 20200101 },
* { "user2", 20, 20200101 },
* { "user1", 20, 20200102 },
* { "user1", 10, 20200103 },
* { "user2", 30, 20200101 },
* { "user2", 80, 20200102 },
* { "user1", 50, 20200107 },
* { "user1", 60, 20200107 },
* { "user2", 40, 20200104 }
* ]
*
* Partitioning (grouping) by `user_id`, and ordering by `date` yields the following `sales_amt`
* vector (with 2 groups, one for each distinct `user_id`):
*
* Date :(202001-) [ 01, 02, 03, 07, 07, 01, 01, 02, 04 ]
* Input: [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ]
* <-------user1-------->|<---------user2--------->
*
* The SUM aggregation is applied, with 1 day preceding, and 1 day following, with a minimum of 1
* period. The aggregation window is thus 3 *days* wide, yielding the following output column:
*
* Results: [ 30, 40, 30, 110, 110, 130, 130, 130, 40 ]
*
* @endcode
*
* Note: The number of rows participating in each window might vary, based on the index within the
* group, datestamp, and `min_periods`. Apropos:
* 1. results[0] considers 2 values, because it is at the beginning of its group, and has no
* preceding values.
* 2. results[5] considers 3 values, despite being at the beginning of its group. It must include 2
* following values, based on its datestamp.
*
* Each aggregation operation cannot cross group boundaries.
*
* The returned column for `op == COUNT` always has `INT32` type. All other operators return a
* column of the same type as the input. Therefore it is suggested to convert integer column types
* (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`.
*
* @param[in] group_keys The (pre-sorted) grouping columns
* @param[in] timestamp_column The (pre-sorted) timestamps for each row
* @param[in] timestamp_order The order (ASCENDING/DESCENDING) in which the timestamps are sorted
* @param[in] input The input column (to be aggregated)
* @param[in] preceding_window_in_days The rolling window time-interval in the backward direction
* @param[in] following_window_in_days The rolling window time-interval in the forward direction
* @param[in] min_periods Minimum number of observations in window required to have a value,
* otherwise element `i` is null.
* @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.)
* @param[in] stream CUDA stream used for device memory operations and kernel launches
* @param[in] mr Device memory resource used to allocate the returned column's device memory
*
* @returns A nullable output column containing the rolling window results
*/
[[deprecated("Use cudf::grouped_range_rolling_window instead")]] std::unique_ptr<column>
grouped_time_range_rolling_window(
table_view const& group_keys,
column_view const& timestamp_column,
cudf::order const& timestamp_order,
column_view const& input,
size_type preceding_window_in_days,
size_type following_window_in_days,
size_type min_periods,
rolling_aggregation const& aggr,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
 * @brief Applies a grouping-aware, timestamp-based rolling window function to the values in a
 * column.
*
* @deprecated Since 25.02, to be removed in 25.04
*
* @details @copydetails grouped_time_range_rolling_window(
* table_view const& group_keys,
* column_view const& timestamp_column,
* cudf::order const& timestamp_order,
* column_view const& input,
* size_type preceding_window_in_days,
* size_type following_window_in_days,
* size_type min_periods,
* rolling_aggregation const& aggr,
* rmm::cuda_stream_view stream,
* rmm::device_async_resource_ref mr)
*
* The `preceding_window_in_days` and `following_window_in_days` are specified as a `window_bounds`
* and supports "unbounded" windows, if set to `window_bounds::unbounded()`.
*/
[[deprecated("Use cudf::grouped_range_rolling_window instead")]] std::unique_ptr<column>
grouped_time_range_rolling_window(
table_view const& group_keys,
column_view const& timestamp_column,
cudf::order const& timestamp_order,
column_view const& input,
window_bounds preceding_window_in_days,
window_bounds following_window_in_days,
size_type min_periods,
rolling_aggregation const& aggr,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Applies a grouping-aware, value range-based rolling window function to the values in a
* column.
Expand Down
Loading

0 comments on commit 7836795

Please sign in to comment.