Skip to content

Commit 32cd026

Browse files
authored
add implementation of Eval(ctx, data, size, lambda) (k2-fsa#164)
1 parent c13d1b8 commit 32cd026

File tree

6 files changed

+60
-64
lines changed

6 files changed

+60
-64
lines changed

.clang-format

+1-1
Original file line number · Diff line number · Diff line change
@@ -3,7 +3,7 @@ BasedOnStyle: Google
33
---
44
Language: Cpp
55
Cpp11BracedListStyle: true
6-
Standard: c++11
6+
Standard: Cpp11
77
DerivePointerAlignment: false
88
PointerAlignment: Right
99
---

k2/csrc/array.h

+2-9
Original file line number · Diff line number · Diff line change
@@ -61,19 +61,12 @@ class Array1 {
6161
// with CUDA) and also on the CPU. We'll do src(i) to evaluate element i.
6262
// NOTE: we assume this thread is already set to use the device associated
6363
// with the context in 'ctx', if it's a CUDA context.
64-
// TODO(haowen): require Callable to be a function, the compiler may confuse
65-
// with Array1(ctx, size, elem)
66-
/*
6764
template <typename Callable>
6865
Array1(ContextPtr ctx, int32_t size, Callable &&callable) {
6966
Init(ctx, size);
70-
K2_LOG(FATAL) << "Not Implemented";
71-
72-
// TODO(haowen): there's no such definition
73-
// `Eval(ContextPtr, T*, int32_t, Callable&)` now
74-
// Eval(ctx, Data(), size, std::forward<Callable>(callable));
67+
T *data = Data();
68+
Eval(ctx, data, size, std::forward<Callable>(callable));
7569
}
76-
*/
7770

7871
Array1(ContextPtr ctx, int32_t size) { Init(ctx, size); }
7972

k2/csrc/array_test.cu

+19-53
Original file line number · Diff line number · Diff line change
@@ -62,14 +62,8 @@ void TestArray1() {
6262
MemoryCopy(static_cast<void *>(array_data),
6363
static_cast<void *>(data.data()),
6464
array.Dim() * array.ElementSize(), kind);
65-
// copy data from CPU/GPU to CPU
66-
kind = GetMemoryCopyKind(*array.Context(), *cpu);
67-
std::vector<T> cpu_data(array.Dim());
68-
MemoryCopy(static_cast<void *>(cpu_data.data()),
69-
static_cast<const void *>(array_data),
70-
array.Dim() * array.ElementSize(), kind);
7165
for (int32_t i = 0; i < array.Dim(); ++i) {
72-
EXPECT_EQ(cpu_data[i], i);
66+
EXPECT_EQ(array[i], i);
7367
}
7468
}
7569

@@ -79,32 +73,30 @@ void TestArray1() {
7973
ASSERT_EQ(array.Dim(), 5);
8074
// operator=(T t)
8175
array = 2;
82-
// copy data from CPU/GPU to CPU
83-
const T *array_data = array.Data();
84-
auto kind = GetMemoryCopyKind(*array.Context(), *cpu);
85-
std::vector<T> cpu_data(array.Dim());
86-
MemoryCopy(static_cast<void *>(cpu_data.data()),
87-
static_cast<const void *>(array_data),
88-
array.Dim() * array.ElementSize(), kind);
8976
for (int32_t i = 0; i < array.Dim(); ++i) {
90-
EXPECT_EQ(cpu_data[i], 2);
9177
EXPECT_EQ(array[i], 2);
9278
}
9379
}
9480

9581
{
9682
// created with Array1(ContextPtr, int32_t size, T elem)
97-
Array1<T> array(context, 5, 2);
83+
Array1<T> array(context, 5, T(2));
9884
ASSERT_EQ(array.Dim(), 5);
9985
// copy data from CPU/GPU to CPU
100-
const T *array_data = array.Data();
101-
auto kind = GetMemoryCopyKind(*array.Context(), *cpu);
102-
std::vector<T> cpu_data(array.Dim());
103-
MemoryCopy(static_cast<void *>(cpu_data.data()),
104-
static_cast<const void *>(array_data),
105-
array.Dim() * array.ElementSize(), kind);
10686
for (int32_t i = 0; i < array.Dim(); ++i) {
107-
EXPECT_EQ(cpu_data[i], 2);
87+
EXPECT_EQ(array[i], 2);
88+
}
89+
}
90+
91+
{
92+
// created with Array1(ContextPtr, int32_t size, Callable &&callable)
93+
auto lambda_set_values = [] __host__ __device__(int32_t i) -> T {
94+
return i * i;
95+
};
96+
Array1<T> array(context, 5, lambda_set_values);
97+
ASSERT_EQ(array.Dim(), 5);
98+
for (int32_t i = 0; i < array.Dim(); ++i) {
99+
EXPECT_EQ(array[i], i * i);
108100
}
109101
}
110102

@@ -114,15 +106,8 @@ void TestArray1() {
114106
std::iota(data.begin(), data.end(), 0);
115107
Array1<T> array(context, data);
116108
ASSERT_EQ(array.Dim(), 5);
117-
// copy data from CPU/GPU to CPU
118-
const T *array_data = array.Data();
119-
auto kind = GetMemoryCopyKind(*array.Context(), *cpu);
120-
std::vector<T> cpu_data(array.Dim());
121-
MemoryCopy(static_cast<void *>(cpu_data.data()),
122-
static_cast<const void *>(array_data),
123-
array.Dim() * array.ElementSize(), kind);
124109
for (int32_t i = 0; i < array.Dim(); ++i) {
125-
EXPECT_EQ(cpu_data[i], data[i]);
110+
EXPECT_EQ(array[i], data[i]);
126111
}
127112
}
128113

@@ -135,15 +120,8 @@ void TestArray1() {
135120
int32_t size = 6;
136121
Array1<T> sub_array = array.Range(start, size);
137122
ASSERT_EQ(sub_array.Dim(), size);
138-
// copy data from CPU/GPU to CPU
139-
const T *sub_array_data = sub_array.Data();
140-
auto kind = GetMemoryCopyKind(*sub_array.Context(), *cpu);
141-
std::vector<T> cpu_data(sub_array.Dim());
142-
MemoryCopy(static_cast<void *>(cpu_data.data()),
143-
static_cast<const void *>(sub_array_data),
144-
sub_array.Dim() * sub_array.ElementSize(), kind);
145123
for (int32_t i = 0; i < sub_array.Dim(); ++i) {
146-
EXPECT_EQ(cpu_data[i], data[i + start]);
124+
EXPECT_EQ(sub_array[i], data[i + start]);
147125
}
148126
}
149127

@@ -239,16 +217,9 @@ void TestArray1() {
239217
Array1<int32_t> indexes_array(context, indexes);
240218
std::vector<T> expected_data = {1, 2, 3, 6, 2, 7, 9, 10, 3, 5, 7, 4};
241219
Array1<T> ans_array = array[indexes_array];
242-
// copy data from CPU/GPU to CPU
243220
ASSERT_EQ(ans_array.Dim(), expected_data.size());
244-
const T *ans_array_data = ans_array.Data();
245-
auto kind = GetMemoryCopyKind(*ans_array.Context(), *cpu);
246-
std::vector<T> cpu_data(ans_array.Dim());
247-
MemoryCopy(static_cast<void *>(cpu_data.data()),
248-
static_cast<const void *>(ans_array_data),
249-
ans_array.Dim() * ans_array.ElementSize(), kind);
250221
for (int32_t i = 0; i < ans_array.Dim(); ++i) {
251-
EXPECT_EQ(cpu_data[i], expected_data[i]);
222+
EXPECT_EQ(ans_array[i], expected_data[i]);
252223
}
253224
}
254225

@@ -272,14 +243,9 @@ void TestArray1() {
272243
EXPECT_EQ(array.Dim(), new_size);
273244
// copy data from CPU/GPU to CPU
274245
const T *array_data = array.Data();
275-
auto kind = GetMemoryCopyKind(*array.Context(), *cpu);
276-
std::vector<T> cpu_data(array.Dim());
277-
MemoryCopy(static_cast<void *>(cpu_data.data()),
278-
static_cast<const void *>(array_data),
279-
array.Dim() * array.ElementSize(), kind);
280246
// data.size() == 5, array.Dim() == 8, there are 3 uninitialized elements.
281247
for (int32_t i = 0; i < data.size(); ++i) {
282-
EXPECT_EQ(cpu_data[i], data[i]);
248+
EXPECT_EQ(array[i], data[i]);
283249
}
284250
}
285251
}

k2/csrc/context.h

+36
Original file line number · Diff line number · Diff line change
@@ -337,6 +337,14 @@ __global__ void eval_lambda(int32_t n, LambdaT lambda) {
337337
}
338338
}
339339

340+
template <typename T, typename LambdaT>
341+
__global__ void eval_lambda(T *data, int32_t n, LambdaT lambda) {
342+
int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
343+
if (i < n) {
344+
data[i] = lambda(i);
345+
}
346+
}
347+
340348
template <typename LambdaT>
341349
__global__ void eval_lambda2(int32_t m, int32_t n, LambdaT lambda) {
342350
// actually threadIdx.y will always be 1 for now so we could drop that part of
@@ -380,6 +388,34 @@ void Eval(ContextPtrType c, int32_t n, LambdaT &lambda) {
380388
Eval(c->GetCudaStream(), n, lambda);
381389
}
382390

391+
/* Eval() will do `data[i] = lambda(i)` for 0 <= i < n, on the appropriate
392+
device (CPU or GPU) */
393+
template <typename T, typename LambdaT>
394+
void Eval(cudaStream_t stream, T *data, int32_t n, LambdaT &lambda) {
395+
if (n <= 0) return; // actually it would be an error if n < 0.
396+
if (stream == kCudaStreamInvalid) {
397+
// TODO: if n is very large, we'll eventually support running this with
398+
// multiple threads.
399+
for (int32_t i = 0; i < n; ++i) {
400+
data[i] = lambda(i);
401+
}
402+
} else {
403+
int32_t block_size = 256;
404+
int32_t grid_size = NumBlocks(n, block_size);
405+
eval_lambda<T, LambdaT>
406+
<<<grid_size, block_size, 0, stream>>>(data, n, lambda);
407+
auto err = cudaGetLastError();
408+
K2_DCHECK_CUDA_ERROR(err);
409+
}
410+
}
411+
412+
template <typename ContextPtrType, // Context* or ContextPtr ==
413+
// std::shared_ptr<Context>
414+
typename T, typename LambdaT>
415+
void Eval(ContextPtrType c, T *data, int32_t n, LambdaT &lambda) {
416+
Eval(c->GetCudaStream(), data, n, lambda);
417+
}
418+
383419
/*
384420
This is a form of Eval() where the lambda takes two arguments.
385421

k2/csrc/fsa.cu

+1-1
Original file line number · Diff line number · Diff line change
@@ -76,7 +76,7 @@ int32_t GetFsaVecBasicProperties(FsaVec &fsa_vec) {
7676
// the final-state of its FSA (i.e. last-numbered) or has at least one arc
7777
// leaving it, not counting self-loops. Again, it's a looser condition than
7878
// being 'co-accessible' in FSA terminology.
79-
Array1<char> reachable(c, num_states * 2 + 1, 0);
79+
Array1<char> reachable(c, num_states * 2 + 1, static_cast<char>(0));
8080
Array1<char> flag = reachable.Range(num_states * 1, 1);
8181
Array1<char> co_reachable = reachable.Range(num_states, num_states);
8282
reachable = reachable.Range(0, num_states);

k2/csrc/tensor.cu

+1
Original file line number · Diff line number · Diff line change
@@ -133,6 +133,7 @@ void Tensor::Init(ContextPtr c) {
133133
Tensor ToContiguous(const Tensor &tensor) {
134134
// TODO(haowen): implement
135135
K2_LOG(FATAL) << "Not implemented";
136+
return tensor;
136137
}
137138

138139
} // namespace k2

0 commit comments

Comments (0)