Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gather performance-related static parameters into one place. #9694

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 13 additions & 37 deletions src/common/hist_util.cc
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
* Copyright 2017-2023, XGBoost Contributors
* \file hist_util.cc
*/
#include "hist_util.h"

#include <dmlc/timer.h>

#include <vector>

#include "../data/adapter.h" // for SparsePageAdapterBatch
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "quantile.h"
#include "xgboost/base.h"
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for SparsePage, SortedCSCPage
#include "quantile.h" // for HostSketchContainer, SortedSketchContainer, CalcColu...
#include "tuning.h" // for kHistAdHocL2Size
#include "xgboost/base.h" // for bst_row_t, GradientPair, bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for SparsePage, SortedCSCPage

#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
Expand Down Expand Up @@ -105,28 +104,7 @@ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2, siz
}
}

// Compile-time parameters for the prefetching done while building histograms.
// All members are static; the struct is used purely as a scope for the constants.
struct Prefetch {
public:
// Cache line size in bytes assumed by the prefetch logic.
static constexpr size_t kCacheLineSize = 64;
// Look-ahead distance: the histogram kernel prefetches data for `rid[i + kPrefetchOffset]`
// while it is processing `rid[i]`.
static constexpr size_t kPrefetchOffset = 10;

private:
// Look-ahead distance plus the number of `row_ptr` entries that fit in one cache line.
static constexpr size_t kNoPrefetchSize =
kPrefetchOffset + kCacheLineSize /
sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);

public:
// Returns `kNoPrefetchSize` clamped to `rows`.
// NOTE(review): presumably the number of trailing rows that must be processed without
// prefetching so that `rid[i + kPrefetchOffset]` stays in bounds -- confirm against caller.
static size_t NoPrefetchSize(size_t rows) {
return std::min(rows, kNoPrefetchSize);
}

// Number of elements of type `T` that fit in one cache line.
template <typename T>
static constexpr size_t GetPrefetchStep() {
return Prefetch::kCacheLineSize / sizeof(T);
}
};

constexpr size_t Prefetch::kNoPrefetchSize;
using Prefetch = HistPrefetch<decltype(GHistIndexMatrix::row_ptr)::value_type>;

struct RuntimeFlags {
const bool first_page;
Expand Down Expand Up @@ -233,12 +211,11 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,

if (do_prefetch) {
const size_t icol_start_prefetch =
kAnyMissing
? get_row_ptr(rid[i + Prefetch::kPrefetchOffset])
: get_rid(rid[i + Prefetch::kPrefetchOffset]) * n_features;
const size_t icol_end_prefetch =
kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;
kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset])
: get_rid(rid[i + Prefetch::kPrefetchOffset]) * n_features;
const size_t icol_end_prefetch = kAnyMissing
? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;

PREFETCH_READ_T0(p_gpair + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_end_prefetch;
Expand Down Expand Up @@ -345,8 +322,7 @@ void BuildHist(Span<GradientPair const> gpair, const RowSetCollection::Elem row_
/* force_read_by_column is used for testing the columnwise building of histograms.
* default force_read_by_column = false
*/
constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
const bool hist_fit_to_l2 = kAdhocL2Size > 2 * sizeof(float) * gmat.cut.Ptrs().back();
const bool hist_fit_to_l2 = kHistAdHocL2Size > 2 * sizeof(float) * gmat.cut.Ptrs().back();
bool first_page = gmat.base_rowid == 0;
bool read_by_column = !hist_fit_to_l2 && !any_missing;
auto bin_type_size = gmat.index.GetBinTypeSize();
Expand Down
2 changes: 1 addition & 1 deletion src/common/partition_builder.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by Contributors
* Copyright 2021-2023, XGBoost Contributors
* \file row_set.h
* \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen
Expand Down
48 changes: 48 additions & 0 deletions src/common/tuning.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/**
* Copyright 2023, XGBoost Contributors
*
* @brief Some performance-related compile time constants.
*/
#pragma once

#include <algorithm> // for min
#include <cstddef> // for size_t

namespace xgboost::common {
// An ad-hoc estimation of the usable CPU L2 cache size in bytes (80% of 1 MiB). The
// histogram builder compares it against the byte size of a histogram to decide whether the
// histogram fits in L2, which in turn selects between row-wise and column-wise building.
inline constexpr double kHistAdHocL2Size = 1024 * 1024 * 0.8;

/**
 * @brief Compile-time parameters controlling the explicit prefetching performed while
 *        building histograms.
 *
 * @tparam RowPtrT Element type of the row pointer array. Its size determines how many row
 *                 pointer entries share one cache line.
 */
template <typename RowPtrT>
struct HistPrefetch {
 public:
  // Cache line size in bytes assumed by the prefetch logic.
  static constexpr std::size_t kCacheLineSize = 64;
  // Look-ahead distance: data for row `i + kPrefetchOffset` is prefetched while row `i` is
  // being processed.
  static constexpr std::size_t kPrefetchOffset = 10;

 private:
  // Look-ahead distance plus the number of row pointer entries in one cache line. A static
  // constexpr data member is implicitly inline in C++17, so it can be bound to the
  // reference parameters of std::min without an out-of-class definition.
  static constexpr std::size_t kNoPrefetchSize = kPrefetchOffset + kCacheLineSize / sizeof(RowPtrT);

 public:
  /**
   * @brief Clamp the no-prefetch window to the number of available rows.
   *
   * @param rows Number of rows in the current partition.
   * @return min(rows, kPrefetchOffset + kCacheLineSize / sizeof(RowPtrT)).
   */
  static constexpr std::size_t NoPrefetchSize(std::size_t rows) noexcept {
    return std::min(rows, kNoPrefetchSize);
  }

  /**
   * @brief Number of elements of type T that fit in one cache line; the stride to use when
   *        issuing one prefetch per cache line over an array of T.
   */
  template <typename T>
  static constexpr std::size_t GetPrefetchStep() noexcept {
    return kCacheLineSize / sizeof(T);
  }
};

// Block size used when partitioning samples after a tree split. It is both the template
// argument of `PartitionBuilder` and the grain size of the blocked space each node's row
// set is divided into.
inline constexpr std::size_t kPartitionBlockSize = 2048;

// Block size used by the CPU predictor (its `kBlockOfRowsSize`).
// NOTE(review): presumably counted in rows, as the predictor's name suggests -- confirm.
inline constexpr std::size_t kPredictionBlockSize = 64;

// Block size for histogram synchronization, including subtraction and aggregation of the
// per-thread histograms. Used as the grain size of a blocked space over the total number
// of histogram bins.
inline constexpr std::size_t kSyncHistBlockSize = 1024;

// Block size for building histograms: the grain size of the blocked space over each
// node's partition size.
inline constexpr std::size_t kHistBlockSize = 256;
} // namespace xgboost::common
9 changes: 5 additions & 4 deletions src/predictor/cpu_predictor.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
* Copyright 2017-2023, XGBoost Contributors
*/
#include <algorithm> // for max, fill, min
#include <any> // for any, any_cast
Expand All @@ -19,6 +19,7 @@
#include "../common/error_msg.h" // for InplacePredictProxy
#include "../common/math.h" // for CheckNAN
#include "../common/threading_utils.h" // for ParallelFor
#include "../common/tuning.h" // for kPredictionBlockSize
#include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter
#include "../data/gradient_index.h" // for GHistIndexMatrix
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
Expand Down Expand Up @@ -546,7 +547,7 @@ class ColumnSplitHelper {
}
}

template <typename DataView, size_t block_of_rows_size, bool predict_leaf = false>
template <typename DataView, std::size_t block_of_rows_size, bool predict_leaf = false>
void PredictBatchKernel(DataView batch, std::vector<bst_float> *out_preds) {
auto const num_group = model_.learner_model_param->num_output_group;

Expand Down Expand Up @@ -582,7 +583,7 @@ class ColumnSplitHelper {
ClearBitVectors();
}

static std::size_t constexpr kBlockOfRowsSize = 64;
static std::size_t constexpr kBlockOfRowsSize = common::kPredictionBlockSize;

std::int32_t const n_threads_;
gbm::GBTreeModel const &model_;
Expand Down Expand Up @@ -984,7 +985,7 @@ class CPUPredictor : public Predictor {
}

private:
static size_t constexpr kBlockOfRowsSize = 64;
static size_t constexpr kBlockOfRowsSize = common::kPredictionBlockSize;
};

XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
Expand Down
39 changes: 19 additions & 20 deletions src/tree/common_row_partitioner.h
Original file line number Diff line number Diff line change
@@ -1,35 +1,33 @@
/**
* Copyright 2021-2023 XGBoost contributors
* Copyright 2021-2023, XGBoost contributors
* \file common_row_partitioner.h
* \brief Common partitioner logic for hist and approx methods.
*/
#ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
#define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_

#include <algorithm> // std::all_of
#include <cinttypes> // std::uint32_t
#include <limits> // std::numeric_limits
#include <vector>
#include <algorithm> // for all_of
#include <cstdint> // for uint32_t
#include <limits> // for numeric_limits
#include <vector> // for vector

#include "../collective/communicator-inl.h"
#include "../common/linalg_op.h" // cbegin
#include "../common/numeric.h" // Iota
#include "../common/partition_builder.h"
#include "hist/expand_entry.h" // CPUExpandEntry
#include "xgboost/base.h"
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h" // TensorView
#include "../common/linalg_op.h" // for cbegin
#include "../common/numeric.h" // for Iota
#include "../common/partition_builder.h" // for PartitionBuilder
#include "../common/tuning.h" // for kPartitionBlockSize
#include "xgboost/base.h" // for bst_row_t, bst_bin_t
#include "xgboost/context.h" // for Context
#include "xgboost/linalg.h" // for TensorView
#include "xgboost/span.h" // for Span

namespace xgboost::tree {

static constexpr size_t kPartitionBlockSize = 2048;

class ColumnSplitHelper {
public:
ColumnSplitHelper() = default;

ColumnSplitHelper(bst_row_t num_row,
common::PartitionBuilder<kPartitionBlockSize>* partition_builder,
common::PartitionBuilder<common::kPartitionBlockSize>* partition_builder,
common::RowSetCollection* row_set_collection)
: partition_builder_{partition_builder}, row_set_collection_{row_set_collection} {
decision_storage_.resize(num_row);
Expand Down Expand Up @@ -79,7 +77,7 @@ class ColumnSplitHelper {
BitVector decision_bits_{};
std::vector<BitVector::value_type> missing_storage_{};
BitVector missing_bits_{};
common::PartitionBuilder<kPartitionBlockSize>* partition_builder_;
common::PartitionBuilder<common::kPartitionBlockSize>* partition_builder_;
common::RowSetCollection* row_set_collection_;
};

Expand Down Expand Up @@ -204,14 +202,15 @@ class CommonRowPartitioner {
int32_t nid = nodes[node_in_set].nid;
return row_set_collection_[nid].Size();
},
kPartitionBlockSize);
common::kPartitionBlockSize);

// 2.2 Initialize the partition builder
// allocate buffers for storage intermediate results by each thread
partition_builder_.Init(space.Size(), n_nodes, [&](size_t node_in_set) {
const int32_t nid = nodes[node_in_set].nid;
const size_t size = row_set_collection_[nid].Size();
const size_t n_tasks = size / kPartitionBlockSize + !!(size % kPartitionBlockSize);
const size_t n_tasks =
size / common::kPartitionBlockSize + !!(size % common::kPartitionBlockSize);
return n_tasks;
});
CHECK_EQ(base_rowid, gmat.base_rowid);
Expand Down Expand Up @@ -291,7 +290,7 @@ class CommonRowPartitioner {
}

private:
common::PartitionBuilder<kPartitionBlockSize> partition_builder_;
common::PartitionBuilder<common::kPartitionBlockSize> partition_builder_;
common::RowSetCollection row_set_collection_;
bool is_col_split_;
ColumnSplitHelper column_split_helper_;
Expand Down
14 changes: 9 additions & 5 deletions src/tree/hist/histogram.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2021-2023 by XGBoost Contributors
* Copyright 2021-2023, XGBoost Contributors
*/
#ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
#define XGBOOST_TREE_HIST_HISTOGRAM_H_
Expand All @@ -16,6 +16,7 @@
#include "../../common/hist_util.h" // for GHistRow, ParallelGHi...
#include "../../common/row_set.h" // for RowSetCollection
#include "../../common/threading_utils.h" // for ParallelFor2d, Range1d, BlockedSpace2d
#include "../../common/tuning.h" // for kSyncHistBlockSize
#include "../../data/gradient_index.h" // for GHistIndexMatrix
#include "expand_entry.h" // for MultiExpandEntry, CPUExpandEntry
#include "hist_cache.h" // for BoundedHistCollection
Expand Down Expand Up @@ -175,7 +176,8 @@ class HistogramBuilder {
std::vector<bst_node_t> const &nodes_to_trick) {
auto n_total_bins = buffer_.TotalBins();
common::BlockedSpace2d space(
nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024);
nodes_to_build.size(), [&](std::size_t) { return n_total_bins; },
common::kSyncHistBlockSize);
common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
// Merging histograms from each thread.
this->buffer_.ReduceHist(node, r.begin(), r.end());
Expand All @@ -193,7 +195,8 @@ class HistogramBuilder {
nodes_to_trick.size() == nodes_to_build.size()
? space
: common::BlockedSpace2d{nodes_to_trick.size(),
[&](std::size_t) { return n_total_bins; }, 1024};
[&](std::size_t) { return n_total_bins; },
common::kSyncHistBlockSize};
common::ParallelFor2d(
subspace, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
auto subtraction_nidx = nodes_to_trick[nidx_in_set];
Expand Down Expand Up @@ -231,8 +234,9 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
k++;
}
}
common::BlockedSpace2d space{
nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256};
common::BlockedSpace2d space{nodes_to_build.size(),
[&](std::size_t nidx_in_set) { return partition_size[nidx_in_set]; },
common::kHistBlockSize};
return space;
}

Expand Down