Skip to content

Commit ee562fc

Browse files
authored
Add segsort from moderngpu (k2-fsa#181)
1 parent 960abbc commit ee562fc

16 files changed

+259
-42
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ if(USE_PYTORCH)
101101
include(torch)
102102
endif()
103103
include(cub)
104+
include(moderngpu)
104105
include(googletest)
105106

106107
add_subdirectory(k2)

cmake/cub.cmake

-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ function(download_cub)
2424
message(STATUS "cub is downloaded to ${cub_SOURCE_DIR}")
2525
add_library(cub INTERFACE)
2626
target_include_directories(cub INTERFACE ${cub_SOURCE_DIR})
27-
2827
endfunction()
2928

3029
download_cub()

cmake/moderngpu.cmake

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
2+
# See ../LICENSE for clarification regarding multiple authors
3+
4+
function(download_moderngpu)
5+
if(CMAKE_VERSION VERSION_LESS 3.11)
6+
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
7+
endif()
8+
9+
include(FetchContent)
10+
11+
# This is the latest commit of moderngpu as of 2020-09-26.
12+
set(moderngpu_URL "https://github.com/moderngpu/moderngpu/archive/2b3985541c8e88a133769598c406c33ddde9d0a5.zip")
13+
set(moderngpu_HASH "SHA256=191546af18cd5fb858ecb561316f3af67537ab16f610fc8f1a5febbffc27755a")
14+
15+
FetchContent_Declare(moderngpu
16+
URL ${moderngpu_URL}
17+
URL_HASH ${moderngpu_HASH}
18+
)
19+
20+
FetchContent_GetProperties(moderngpu)
21+
if(NOT moderngpu)
22+
message(STATUS "Downloading moderngpu")
23+
FetchContent_Populate(moderngpu)
24+
endif()
25+
message(STATUS "moderngpu is downloaded to ${moderngpu_SOURCE_DIR}")
26+
add_library(moderngpu INTERFACE)
27+
target_include_directories(moderngpu INTERFACE ${moderngpu_SOURCE_DIR}/src)
28+
target_compile_options(moderngpu INTERFACE -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w)
29+
endfunction()
30+
31+
download_moderngpu()

k2/csrc/CMakeLists.txt

+20-17
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ set(context_srcs
1515
fsa.cu
1616
fsa_algo.cu
1717
math.cu
18+
moderngpu_allocator.cu
1819
ragged.cu
1920
tensor.cu
2021
tensor_ops.cu
@@ -33,6 +34,8 @@ set_target_properties(context PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
3334

3435
# lib deps
3536
target_link_libraries(context PUBLIC cub)
37+
target_link_libraries(context PUBLIC fsa)
38+
target_link_libraries(context PUBLIC moderngpu)
3639
if(USE_PYTORCH)
3740
target_link_libraries(context PUBLIC ${TORCH_LIBRARIES})
3841
endif()
@@ -41,14 +44,14 @@ endif()
4144

4245
# please sort the source files alphabetically
4346
set(cuda_tests
44-
array_ops_test
45-
array_test
46-
log_test
47-
ragged_shape_test
48-
ragged_test
49-
tensor_test
50-
utils_test
51-
)
47+
array_ops_test
48+
array_test
49+
log_test
50+
ragged_shape_test
51+
ragged_test
52+
tensor_test
53+
utils_test
54+
)
5255

5356
# utility function to add gtest
5457
function(k2_add_cuda_test name)
@@ -58,16 +61,16 @@ function(k2_add_cuda_test name)
5861
add_executable(${target_name} "${name}.cu")
5962
set_target_properties(${target_name} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
6063
target_link_libraries(${target_name}
61-
PRIVATE
62-
context
63-
fsa # for code in k2/csrc/host
64-
gtest
65-
gtest_main
66-
)
64+
PRIVATE
65+
context
66+
fsa # for code in k2/csrc/host
67+
gtest
68+
gtest_main
69+
)
6770
add_test(NAME "Test.Cuda.${target_name}"
68-
COMMAND
69-
$<TARGET_FILE:${target_name}>
70-
)
71+
COMMAND
72+
$<TARGET_FILE:${target_name}>
73+
)
7174
endfunction()
7275

7376
foreach (name IN LISTS cuda_tests)

k2/csrc/array_ops_inl.h

+5-3
Original file line numberDiff line numberDiff line change
@@ -379,15 +379,17 @@ Array1<T> RandUniformArray1(ContextPtr &c, int32_t dim, T min_value,
379379
T *data = temp.Data();
380380
K2_CHECK(max_value >= min_value);
381381
if (max_value == min_value) {
382-
for (int32_t i = 0; i < dim; i++) data[i] = 0;
382+
for (int32_t i = 0; i < dim; ++i) data[i] = 0;
383383
} else if (std::is_floating_point<T>::value ||
384384
std::abs(min_value) > RAND_MAX || std::abs(max_value) > RAND_MAX) {
385385
for (int32_t i = 0; i < dim; i++)
386386
data[i] =
387387
min_value + (rand() * (max_value - min_value) / RAND_MAX); // NOLINT
388388
} else {
389-
for (int32_t i = 0; i < dim; i++)
390-
data[i] = min_value + (rand() % (max_value + 1 - min_value)); // NOLINT
389+
for (int32_t i = 0; i < dim; ++i)
390+
data[i] =
391+
min_value +
392+
(rand() % static_cast<int32_t>(max_value + 1 - min_value)); // NOLINT
391393
}
392394
return temp.To(c);
393395
}

k2/csrc/moderngpu_allocator.cu

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/**
2+
* @brief A better memory allocator for moderngpu.
3+
*
4+
*
5+
* @copyright
6+
* Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
7+
*
8+
* @copyright
9+
* See LICENSE for clarification regarding multiple authors
10+
*/
11+
12+
#include <utility>
13+
14+
#include "k2/csrc/context.h"
15+
#include "k2/csrc/moderngpu_allocator.h"
16+
#include "moderngpu/context.hxx"
17+
18+
namespace {
19+
20+
class ModernGpuAllocator : public mgpu::standard_context_t {
21+
public:
22+
explicit ModernGpuAllocator(k2::ContextPtr context)
23+
: mgpu::standard_context_t(false, context->GetCudaStream()),
24+
context_(std::move(context)) {}
25+
26+
void *alloc(size_t size, mgpu::memory_space_t space) override {
27+
K2_DCHECK_EQ(space, mgpu::memory_space_device);
28+
void *deleter_ = nullptr;
29+
void *p = context_->Allocate(size, &deleter_);
30+
K2_DCHECK(deleter_ == nullptr);
31+
return p;
32+
}
33+
34+
void free(void *p, mgpu::memory_space_t space) override {
35+
K2_DCHECK_EQ(space, mgpu::memory_space_device);
36+
context_->Deallocate(p, nullptr);
37+
}
38+
39+
private:
40+
k2::ContextPtr context_;
41+
};
42+
43+
} // namespace
44+
45+
namespace k2 {
46+
47+
std::unique_ptr<mgpu::context_t> GetModernGpuAllocator(
48+
int32_t device_id /*= -1*/) {
49+
return std::make_unique<ModernGpuAllocator>(GetCudaContext(device_id));
50+
}
51+
52+
} // namespace k2

k2/csrc/moderngpu_allocator.h

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/**
2+
* @brief This is an allocator for moderngpu only.
3+
*
4+
* Currently it is used by `SortSublists`.
5+
*
6+
* @copyright
7+
* Copyright (c) 2020 Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
8+
*
9+
* @copyright
10+
* See LICENSE for clarification regarding multiple authors
11+
*/
12+
13+
#ifndef K2_CSRC_MODERNGPU_ALLOCATOR_H_
14+
#define K2_CSRC_MODERNGPU_ALLOCATOR_H_
15+
16+
#include <memory>
17+
18+
#include "moderngpu/context.hxx"
19+
20+
namespace k2 {
21+
// Return a context for moderngpu that has a better memory allocator
22+
// than mgpu::standard_context_t
23+
std::unique_ptr<mgpu::context_t> GetModernGpuAllocator(int32_t device_id = -1);
24+
25+
} // namespace k2
26+
27+
#endif // K2_CSRC_MODERNGPU_ALLOCATOR_H_

k2/csrc/pytorch_context.cu

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111

1212
#include <memory>
1313

14+
#include "c10/cuda/CUDACachingAllocator.h"
1415
#include "c10/cuda/CUDAFunctions.h"
16+
#include "k2/csrc/context.h"
17+
#include "k2/csrc/log.h"
1518
#include "k2/csrc/pytorch_context.h"
1619

1720
namespace k2 {
@@ -23,9 +26,6 @@ class PytorchCpuContext : public Context {
2326
K2_CHECK(allocator_->raw_deleter() != nullptr);
2427
}
2528

26-
// since the constructor is private, the only way to create an instance
27-
// of PytorchCpuContext is via `Make`, which returns a `shared_ptr`.
28-
// Thus it is safe to call `shared_from_this`.
2929
ContextPtr GetCpuContext() override { return shared_from_this(); }
3030

3131
ContextPtr GetPinnedContext() override { return nullptr; }

k2/csrc/pytorch_context.h

-3
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@
1515

1616
#include <memory>
1717

18-
#include "c10/cuda/CUDACachingAllocator.h"
19-
#include "c10/cuda/CUDAFunctions.h"
2018
#include "k2/csrc/context.h"
21-
#include "k2/csrc/log.h"
2219
#include "torch/torch.h"
2320

2421
namespace k2 {

k2/csrc/ragged.cu

+2-2
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ int32_t RaggedShape::operator[](const std::vector<int32_t> &indexes) {
294294
void RaggedShape::Check() {
295295
ContextPtr c = Context();
296296
int32_t num_axes = axes_.size();
297-
for (int32_t axis = 0; axis < axes_.size(); axis++) {
297+
for (int32_t axis = 0; axis < num_axes; ++axis) {
298298
RaggedShapeDim &rsd = axes_[axis];
299299
K2_CHECK_GE(rsd.row_splits.Dim(), 0);
300300
if (rsd.cached_tot_size >= 0) {
@@ -343,7 +343,7 @@ void RaggedShape::Check() {
343343
<< " but cached_tot_size == " << rsd.cached_tot_size;
344344
}
345345
}
346-
if (axis + 1 < axes_.size()) {
346+
if (axis + 1 < num_axes) {
347347
int32_t next_num_rows = axes_[axis + 1].row_splits.Dim() - 1;
348348
if (num_elems != next_num_rows) {
349349
K2_LOG(FATAL) << "Ragged shape has num_elems for axes_[" << axis

k2/csrc/ragged.h

+16-4
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,7 @@ class RaggedShape {
102102
// row_splits on that axis.
103103
int32_t MaxSize(int32_t axis);
104104

105-
ContextPtr &Context() { return axes_[0].row_splits.Context(); }
106-
const ContextPtr &Context() const { return axes_[0].row_splits.Context(); }
105+
ContextPtr &Context() const { return axes_[0].row_splits.Context(); }
107106

108107
/*
109108
It is an error to call this if this.NumAxes() < 2. This will return
@@ -127,7 +126,8 @@ class RaggedShape {
127126

128127
RaggedShapeIndexIterator Iterator();
129128

130-
explicit RaggedShape(std::vector<RaggedShapeDim> &axes, bool check = true)
129+
explicit RaggedShape(const std::vector<RaggedShapeDim> &axes,
130+
bool check = true)
131131
: axes_(axes) {
132132
if (check) Check();
133133
}
@@ -486,7 +486,6 @@ Ragged<T> Stack(int32_t axis, int32_t num_srcs, Ragged<T> **src);
486486
template <typename T>
487487
Ragged<T> Stack(int32_t axis, int32_t num_srcs, Ragged<T> *src);
488488

489-
490489
/*
491490
Construct a RaggedShape with 2 axes.
492491
@param [in] row_splits row_splits, or NULL (at least one of this and
@@ -574,6 +573,19 @@ Ragged<T> RandomRagged(T min_value = static_cast<T>(0),
574573
int32_t min_num_elements = 0,
575574
int32_t max_num_elements = 2000);
576575

576+
/*
577+
Sort a ragged array in-place.
578+
579+
@param [inout] src    The input array to be sorted.
580+
CAUTION: it is sorted in-place.
581+
@param [out]   order  The indexes mapping from the sorted
582+
array to the input array. The caller
583+
has to pre-allocate memory for it
584+
on the same device as `src`.
585+
*/
586+
template <typename T, typename Op = LessThan<T>>
587+
void SortSublists(Ragged<T> *src, Array1<int32_t> *order);
588+
577589
} // namespace k2
578590

579591
// TODO(dan): include guard maybe.

k2/csrc/ragged_inl.h

+34
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*
88
* @copyright
99
* Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey)
10+
* Mobvoi AI Lab, Beijing, China (authors: Fangjun Kuang)
1011
*
1112
* @copyright
1213
* See LICENSE for clarification regarding multiple authors
@@ -15,8 +16,12 @@
1516
#ifndef K2_CSRC_RAGGED_INL_H_
1617
#define K2_CSRC_RAGGED_INL_H_
1718

19+
#include <memory>
1820
#include <vector>
1921

22+
#include "k2/csrc/moderngpu_allocator.h"
23+
#include "moderngpu/kernel_segsort.hxx"
24+
2025
namespace k2 {
2126

2227
template <typename T>
@@ -95,6 +100,35 @@ Ragged<T> RandomRagged(T min_value, T max_value, int32_t min_num_axes,
95100
return Ragged<T>(shape, values);
96101
}
97102

103+
template <typename T, typename Op /* = LessThan<T> */>
104+
void SortSublists(Ragged<T> *src, Array1<int32_t> *order) {
105+
K2_DCHECK(IsCompatible(src->values, *order));
106+
K2_DCHECK_EQ(src->values.Dim(), order->Dim());
107+
K2_DCHECK_EQ(src->Context()->GetDeviceType(), kCuda)
108+
<< "It supports only CUDA at present";
109+
110+
std::unique_ptr<mgpu::context_t> context =
111+
GetModernGpuAllocator(src->Context()->GetDeviceId());
112+
113+
Array1<int32_t> &segment = src->shape.RowSplits(src->NumAxes() - 1);
114+
mgpu::segmented_sort_indices(src->values.Data(), // keys
115+
order->Data(), // indices
116+
src->values.Dim(), // count
117+
segment.Data() + 1, // segments
118+
segment.Dim() - 1, // num_segments
119+
Op(), // cmp
120+
*context); // context
121+
auto err = cudaGetLastError();
122+
(void)err;
123+
// TODO(fangjun): err is not cudaSuccess, but why was the data sorted
124+
// correctly?
125+
//
126+
// Check failed: err == cudaSuccess (9 vs. 0) Error: invalid configuration
127+
// argument.
128+
//
129+
// K2_DCHECK_CUDA_ERROR(err);
130+
}
131+
98132
} // namespace k2
99133

100134
#endif // K2_CSRC_RAGGED_INL_H_

0 commit comments

Comments
 (0)