@@ -4,18 +4,24 @@
  *
  * @copyright
  * Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu)
+ *                      Mobvoi Inc.        (authors: Fangjun Kuang)
  *
  * @copyright
  * See LICENSE for clarification regarding multiple authors
  */
 
-#include <cub/cub.cuh>
+#include <algorithm>
+#include <memory>
 #include <vector>
 
+#include "cub/cub.cuh"
 #include "k2/csrc/array_ops.h"
 #include "k2/csrc/math.h"
+#include "k2/csrc/moderngpu_allocator.h"
 #include "k2/csrc/ragged.h"
 #include "k2/csrc/ragged_ops.h"
+#include "moderngpu/kernel_mergesort.hxx"
+
 namespace {
 
 /*
@@ -806,4 +812,69 @@ Ragged<int32_t> GetCountsPartitioned(Ragged<int32_t> &src,
   return Ragged<int32_t>(ans_ragged_shape, counts);
 }
 
+static Array1<int32_t> GetTransposeReorderingCpu(Ragged<int32_t> &src,
+                                                 int32_t num_cols) {
+  std::vector<std::vector<int32_t>> column_indexes(num_cols);  // [column][row]
+  const int32_t *values_data = src.values.Data();
+  int32_t n = src.values.Dim();
+
+  for (int32_t i = 0; i != n; ++i) {
+    int32_t bucket = values_data[i];
+    column_indexes[bucket].push_back(i);
+  }
+
+  Array1<int32_t> ans(src.Context(), n);
+  int32_t *ans_data = ans.Data();
+  for (int32_t i = 0; i != num_cols; ++i) {
+    std::copy(column_indexes[i].begin(), column_indexes[i].end(), ans_data);
+    ans_data += column_indexes[i].size();
+  }
+  return ans;
+}
+
+Array1<int32_t> GetTransposeReordering(Ragged<int32_t> &src, int32_t num_cols) {
+  ContextPtr &context = src.Context();
+  if (src.NumAxes() < 2) {
+    // src is empty
+    return Array1<int32_t>(context, 0);
+  }
+
+  DeviceType device_type = context->GetDeviceType();
+  if (device_type == kCpu) return GetTransposeReorderingCpu(src, num_cols);
+
+  K2_CHECK_EQ(device_type, kCuda);
+
+  const int32_t *row_splits1_data = src.RowSplits(src.NumAxes() - 1).Data();
+  const int32_t *row_ids1_data = src.RowIds(src.NumAxes() - 1).Data();
+  const int32_t *value_data = src.values.Data();
+  int32_t n = src.values.Dim();
+  Array1<int32_t> ans = Range(context, n, 0);
+
+  auto lambda_comp = [=] __device__(int32_t a_idx01, int32_t b_idx01) -> bool {
+    int32_t a_idx0 = row_ids1_data[a_idx01];
+    int32_t b_idx0 = row_ids1_data[b_idx01];
+
+    int32_t a_col_index = value_data[a_idx01];
+    int32_t b_col_index = value_data[b_idx01];
+
+    if (a_col_index < b_col_index) return true;  // sort by column indexes
+    if (a_col_index > b_col_index) return false;
+
+    // now we have a_col_index == b_col_index
+    if (a_idx0 < b_idx0) return true;  // sort by row indexes
+    if (a_idx0 > b_idx0) return false;
+
+    // now we have a_idx0 == b_idx0 && a_col_index == b_col_index,
+    // i.e. this entry is duplicated in the sparse matrix. Return false,
+    // not true, so that the comparator stays a strict weak ordering.
+    return false;
+  };
+  std::unique_ptr<mgpu::context_t> mgpu_context =
+      GetModernGpuAllocator(context->GetDeviceId());
+
+  K2_CUDA_SAFE_CALL(mgpu::mergesort(ans.Data(), n, lambda_comp, *mgpu_context));
+
+  return ans;
+}
+
 }  // namespace k2
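
To make the new function's contract concrete, here is a small self-contained sketch of the counting-sort idea behind the CPU path above. It deliberately avoids k2's Ragged and Array1 types (plain std::vector stands in for them), so TransposeReordering, values and num_cols are illustrative names for this sketch, not part of the k2 API:

#include <cstdint>
#include <iostream>
#include <vector>

// Given the flattened column indexes of a sparse matrix's nonzero entries,
// stored row by row (CSR order), return a permutation of entry indexes that
// visits the entries column by column (CSC order), using the same bucketing
// scheme as GetTransposeReorderingCpu.
static std::vector<int32_t> TransposeReordering(
    const std::vector<int32_t> &values, int32_t num_cols) {
  std::vector<std::vector<int32_t>> column_indexes(num_cols);
  for (int32_t i = 0; i != static_cast<int32_t>(values.size()); ++i)
    column_indexes[values[i]].push_back(i);  // bucket entry i by its column

  std::vector<int32_t> ans;
  ans.reserve(values.size());
  for (const auto &column : column_indexes)  // concatenate buckets in order
    ans.insert(ans.end(), column.begin(), column.end());
  return ans;
}

int main() {
  // Column indexes of a 3x4 sparse matrix, row by row:
  //   row 0: columns 1 3;  row 1: columns 0 2;  row 2: columns 1 2
  std::vector<int32_t> values = {1, 3, 0, 2, 1, 2};
  for (int32_t i : TransposeReordering(values, 4)) std::cout << i << ' ';
  std::cout << '\n';  // prints: 2 0 4 3 5 1
  return 0;
}

Because each column bucket is filled in increasing entry order, ties within a column come out sorted by row index, which is exactly the tie-break the GPU comparator above enforces with its second comparison.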