Implement ComposeArcMaps for 1-D arrays. #726

Merged: 10 commits, Apr 24, 2021
52 changes: 40 additions & 12 deletions k2/csrc/array_ops.cu
@@ -304,10 +304,39 @@ Array1<int32_t> GetCounts(ContextPtr c, const int32_t *src_data,
nullptr, temp_storage_bytes, src_data, ans_data, n + 1, 0, n, src_dim,
c->GetCudaStream())); // The first time is to determine temporary
// device storage requirements.
Array1<int8_t> d_temp_storage(c, temp_storage_bytes);
K2_CHECK_CUDA_ERROR(cub::DeviceHistogram::HistogramEven(
d_temp_storage.Data(), temp_storage_bytes, src_data, ans_data, n + 1, 0,
n, src_dim, c->GetCudaStream()));

constexpr std::size_t kThreshold = (static_cast<std::size_t>(1) << 33);

if (temp_storage_bytes < kThreshold) {
RegionPtr temp_storage = NewRegion(c, temp_storage_bytes);
K2_CHECK_CUDA_ERROR(cub::DeviceHistogram::HistogramEven(
temp_storage->data, temp_storage_bytes, src_data, ans_data, n + 1, 0,
n, src_dim, c->GetCudaStream()));
} else {
// split the array and do a recursive call
//
// See https://github.com/NVIDIA/cub/issues/288
// for why we split it

Review comment (Collaborator Author): Relates to NVIDIA/cub#288
Review comment (Collaborator): kThreshold.
Review comment (Collaborator Author): Fixed. Thanks
int32_t first_start = 0; // inclusive
int32_t first_end = src_dim / 2; // exclusive
int32_t first_dim = first_end - first_start;

int32_t second_start = first_end; // inclusive
int32_t second_end = src_dim; // exclusive
int32_t second_dim = second_end - second_start;

Array1<int32_t> first_subset =
GetCounts(c, src_data + first_start, first_dim, n);
Array1<int32_t> second_subset =
GetCounts(c, src_data + second_start, second_dim, n);

const int32_t *first_subset_data = first_subset.Data();
const int32_t *second_subset_data = second_subset.Data();
K2_EVAL(
c, n, set_ans, (int32_t i)->void {
ans_data[i] = first_subset_data[i] + second_subset_data[i];
});
}
}
return ans;
}
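
For illustration, here is a minimal CPU-side sketch of why the split is valid (not part of this PR; CountsCpu and CountsBySplitting are hypothetical helpers). Per-bin counts are additive over any partition of the input, so counting the two halves separately and summing the per-bin results reproduces the single-pass histogram, which is what the K2_EVAL merge above does.

#include <cstdint>
#include <vector>

// Count how many times each value in [0, n) occurs in src[0..dim).
std::vector<int32_t> CountsCpu(const int32_t *src, int32_t dim, int32_t n) {
  std::vector<int32_t> counts(n, 0);
  for (int32_t i = 0; i < dim; ++i) ++counts[src[i]];  // assumes 0 <= src[i] < n
  return counts;
}

// Same result via a two-way split, mirroring the recursion in GetCounts().
std::vector<int32_t> CountsBySplitting(const int32_t *src, int32_t dim,
                                       int32_t n) {
  int32_t half = dim / 2;
  std::vector<int32_t> first = CountsCpu(src, half, n);
  std::vector<int32_t> second = CountsCpu(src + half, dim - half, n);
  for (int32_t i = 0; i < n; ++i) first[i] += second[i];  // merge the histograms
  return first;
}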
@@ -341,14 +370,13 @@ Array1<int32_t> InvertMonotonicDecreasing(const Array1<int32_t> &src) {
MonotonicDecreasingUpperBound(ans, &ans);
#ifndef NDEBUG
K2_EVAL(
c, ans_dim, lambda_check_values, (int32_t i) -> void {
c, ans_dim, lambda_check_values, (int32_t i)->void {
int32_t j = ans_data[i];
K2_CHECK((j == src_dim || src_data[j] <= i) &&
(j == 0 || src_data[j-1] > i));
(j == 0 || src_data[j - 1] > i));
});
#endif


return ans;
}

@@ -482,17 +510,17 @@ bool IsPermutation(const Array1<int32_t> &a) {
return Equal(ones, 0);
}


void RowSplitsToRowIdsOffset(const Array1<int32_t> &row_splits_part,
Array1<int32_t> *row_ids_part) {
Array1<int32_t> *row_ids_part) {
NVTX_RANGE(K2_FUNC);
ContextPtr c = row_splits_part.Context();
Array1<int32_t> row_splits(c, row_splits_part.Dim());
int32_t *row_splits_data = row_splits.Data();
const int32_t *row_splits_part_data = row_splits_part.Data();
K2_EVAL(c, row_splits_part.Dim(), lambda_subtract_offset, (int32_t i) {
row_splits_data[i] = row_splits_part_data[i] - row_splits_part_data[0];
});
K2_EVAL(
c, row_splits_part.Dim(), lambda_subtract_offset, (int32_t i) {
row_splits_data[i] = row_splits_part_data[i] - row_splits_part_data[0];
});
RowSplitsToRowIds(row_splits, row_ids_part);
}
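
A small worked example (hypothetical values, not from the PR) of what this helper computes:

// row_splits_part = [3, 5, 8]   (a slice of a larger row_splits array)
// subtracting the leading offset 3 gives row_splits = [0, 2, 5]
// RowSplitsToRowIds([0, 2, 5]) then yields row_ids_part = [0, 0, 1, 1, 1],
// i.e. two elements belong to row 0 and three elements belong to row 1.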

6 changes: 3 additions & 3 deletions k2/csrc/array_ops.h
@@ -665,14 +665,14 @@ bool ApproxEqual(const Array2<T> &a, const Array2<T> &b, T tol = T(0.0001));
`0 <= indexes[i] < src.Dim()` if
`allow_minus_one == false`,
else -1 is also allowed and the corresponding
output element will be zero.
output element will be `default_value`.
@return Returns an `Array1<T>` of dimension indexes.Dim(),
with `ans[i] = src[indexes[i]]` (or zero if
with `ans[i] = src[indexes[i]]` (or `default_value` if
`allow_minus_one == true` and `indexes[i] == -1`).
*/
template <typename T>
Array1<T> Index(const Array1<T> &src, const Array1<int32_t> &indexes,
bool allow_minus_one);
bool allow_minus_one, T default_value);
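
For clarity, a plain-C++ sketch of the documented behaviour when allow_minus_one == true (IndexWithDefault is a hypothetical stand-in, not the k2 API):

#include <cstdint>
#include <vector>

std::vector<float> IndexWithDefault(const std::vector<float> &src,
                                    const std::vector<int32_t> &indexes,
                                    float default_value) {
  std::vector<float> ans(indexes.size());
  for (std::size_t i = 0; i != indexes.size(); ++i)
    ans[i] = (indexes[i] < 0) ? default_value : src[indexes[i]];  // -1 -> default_value
  return ans;
}

// Example: src = {10, 20, 30}, indexes = {2, -1, 0}, default_value = -1
// gives ans = {30, -1, 10}.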

/*
Index src's rows with `indexes` which contains the row indexes.
6 changes: 3 additions & 3 deletions k2/csrc/array_ops_inl.h
@@ -760,7 +760,7 @@ Array1<T> Plus(const Array1<T> &src, T t) {

template <typename T>
Array1<T> Index(const Array1<T> &src, const Array1<int32_t> &indexes,
bool allow_minus_one) {
bool allow_minus_one, T default_value) {
NVTX_RANGE(K2_FUNC);
ContextPtr &c = src.Context();
K2_CHECK(c->IsCompatible(*indexes.Context()));
@@ -775,13 +775,13 @@ Array1<T> Index(const Array1<T> &src, const Array1<int32_t> &indexes,
#pragma unroll(4)
for (int32_t i = 0; i < ans_dim; i++) {
int32_t index = index_data[i];
T value = (index < 0 ? T(0) : src_data[index]);
T value = (index < 0 ? default_value : src_data[index]);
ans_data[i] = value;
}
} else {
auto lambda_set_values = [=] __device__(int32_t i) -> void {
int32_t index = index_data[i];
T value = (index < 0 ? T(0) : src_data[index]);
T value = (index < 0 ? default_value : src_data[index]);
ans_data[i] = value;
};
EvalDevice(c, ans_dim, lambda_set_values);
18 changes: 11 additions & 7 deletions k2/csrc/array_ops_test.cu
@@ -1519,20 +1519,23 @@ TEST(OpsTest, Array1IndexTest) {
RandUniformArray1<int32_t>(c, ans_dim, 0, src_dim - 1),
indexes_minus_one =
RandUniformArray1<int32_t>(c, ans_dim, -1, src_dim - 1);
Array1<T> ans_no_minus_one = Index(src, indexes_no_minus_one, false),

T default_value = loop - 1;

Array1<T> ans_no_minus_one = Index(src, indexes_no_minus_one, false, default_value),
ans_no_minus_one_check = src[indexes_no_minus_one],
ans_no_minus_one_check2 = Index(src, indexes_no_minus_one, true);
ans_no_minus_one_check2 = Index(src, indexes_no_minus_one, true, default_value);
ASSERT_TRUE(Equal(ans_no_minus_one, ans_no_minus_one_check));
ASSERT_TRUE(Equal(ans_no_minus_one, ans_no_minus_one_check2));

Array1<T> ans_minus_one = Index(src, indexes_minus_one, true);
Array1<T> ans_minus_one = Index(src, indexes_minus_one, true, default_value);

ans_minus_one = ans_minus_one.To(cpu_context);
src = src.To(cpu_context);
indexes_minus_one = indexes_minus_one.To(cpu_context);
for (int32_t i = 0; i < indexes_minus_one.Dim(); i++) {
int32_t index = indexes_minus_one[i];
ASSERT_EQ(ans_minus_one[i], (index < 0 ? 0 : src[index]));
ASSERT_EQ(ans_minus_one[i], (index < 0 ? default_value : src[index]));
}
}
}
@@ -1717,7 +1720,8 @@ void Array2ContiguousTest() {
Array2<T> src_part = src.ColArange(slice_dim1_begin, slice_dim1_end),
src_part_contiguous1 = ToContiguous(src_part);
Array2<Any> src_part_contiguous_generic2 = ToContiguous(src_part.Generic());
Array2<T> src_part_contiguous2 = src_part_contiguous_generic2.Specialize<T>();
Array2<T> src_part_contiguous2 =
src_part_contiguous_generic2.Specialize<T>();

K2_CHECK_EQ(Equal(src_part_contiguous1, src_part_contiguous2),
true);
@@ -1729,8 +1733,8 @@
TEST(OpsTest, ApproxEqualTest) {
Array2<float> array1("[ [ 1 2 3 ] [4 5 6 ]]"),
array2("[ [ 1.1 2 3 ] [4 5 6 ]]");
K2_CHECK_EQ(ApproxEqual(array1, array2, float(0.2)), true);
K2_CHECK_EQ(ApproxEqual(array1, array2, float(0.01)), false);
K2_CHECK_EQ(ApproxEqual(array1, array2, 0.2f), true);
K2_CHECK_EQ(ApproxEqual(array1, array2, 0.01f), false);
}

TEST(OpsTest, Array2Contiguous) {
7 changes: 4 additions & 3 deletions k2/csrc/array_test.cu
@@ -67,7 +67,6 @@ void TestArray1() {
// it is necessary
ASSERT_THROW(array.Generic().template Specialize<unsigned char>(),
std::runtime_error); // assuming T != unsigned char

}

{
@@ -268,12 +267,14 @@ void TestArray2() {
EXPECT_EQ(array.Data(), array_copy.Data());

const Array2<T> &array_copy_const = array;
const Array2<T> &array_copy_const2 = array_copy_const.Generic().template Specialize<T>();
const Array2<T> &array_copy_const2 =
array_copy_const.Generic().template Specialize<T>();
EXPECT_EQ(array_copy_const.Data(), array_copy.Data());
EXPECT_EQ(array_copy_const2.Data(), array_copy.Data());

const Array1<T> &array1_copy_const = arr1;
const Array1<T> &array1_copy_const2 = array1_copy_const.Generic().template Specialize<T>();
const Array1<T> &array1_copy_const2 =
array1_copy_const.Generic().template Specialize<T>();
EXPECT_EQ(array1_copy_const.Data(), arr1.Data());
EXPECT_EQ(array1_copy_const2.Data(), arr1.Data());

6 changes: 4 additions & 2 deletions k2/csrc/dtype.cu
@@ -11,8 +11,10 @@ namespace k2 {
const DtypeTraits g_dtype_traits_array[] = {
{kUnknownBase, 0, "Any", 0},
{kFloatBase, 4, "half"}, {kFloatBase, 4, "float"}, {kFloatBase, 8, "double"},
{kIntBase, 1, "int8"}, {kIntBase, 2, "int16"}, {kIntBase, 4, "int32"}, {kIntBase, 8, "int64"},
{kUintBase, 1, "uint8"}, {kUintBase, 2, "uint16"}, {kUintBase, 4, "uint32"}, {kUintBase, 8, "uint64"},
{kIntBase, 1, "int8"}, {kIntBase, 2, "int16"},
{kIntBase, 4, "int32"}, {kIntBase, 8, "int64"},
{kUintBase, 1, "uint8"}, {kUintBase, 2, "uint16"},
{kUintBase, 4, "uint32"}, {kUintBase, 8, "uint64"},
{kUnknownBase, 16, "Arc", 4}, {kUnknownBase, 0, "Other", 0}
};

8 changes: 4 additions & 4 deletions k2/csrc/dtype.h
@@ -81,7 +81,7 @@ enum class Dtype {

// This is needed because the comma in std::is_same<T,Any>::value prevents it
// from appearing inside macro arguments.
#define K2_TYPE_IS_ANY(T) std::is_same<T,Any>::value
#define K2_TYPE_IS_ANY(T) (std::is_same<T, Any>::value)
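
A hypothetical illustration (not from the PR) of the problem the wrapper avoids: the preprocessor does not understand template angle brackets, so the bare expression is split at the comma when passed to a function-like macro such as K2_CHECK:

// K2_CHECK(std::is_same<T, Any>::value);  // preprocessor sees two macro arguments
// K2_CHECK(K2_TYPE_IS_ANY(T));            // fine: a single, parenthesised argument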

constexpr Dtype kAnyDtype = Dtype::kAnyDtype;
constexpr Dtype kHalfDtype = Dtype::kHalfDtype;
@@ -112,10 +112,10 @@ struct DtypeOf<Any> {
static const Dtype dtype = kAnyDtype;
};

//template <>
//struct DtypeOf<half> {
// template <>
// struct DtypeOf<half> {
// static const Dtype dtype = kHalfDtype;
//};
// };

template <>
struct DtypeOf<float> {
2 changes: 1 addition & 1 deletion k2/csrc/dtype_test.cu
@@ -39,7 +39,7 @@ template <typename T> void CheckDtypes() {
}

TEST(DtypeTest, CheckDtypes) {
//CheckDtypes<half>();
// CheckDtypes<half>();
CheckDtypes<float>();
CheckDtypes<double>();
CheckDtypes<int8_t>();
1 change: 0 additions & 1 deletion k2/csrc/fsa_utils.h
@@ -527,7 +527,6 @@ FsaVec FsaVecFromArcIndexes(FsaVec &fsas, Ragged<int32_t> &best_arc_indexes);
Ragged<int32_t> ComposeArcMaps(Ragged<int32_t> &step1_arc_map,
Ragged<int32_t> &step2_arc_map);
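
The PR title says ComposeArcMaps is also being implemented for 1-D arrays. As a hedged sketch only (the 1-D overload is not shown in this diff, so its exact form is an assumption), composing two 1-D arc maps is a gather that propagates -1, which is what the new default_value argument to Index() makes convenient:

#include <cstdint>
#include <vector>

// ans[i] is the original arc that step2's arc i traces back to, or -1 if
// either step introduced the arc (i.e. there is no source arc).
std::vector<int32_t> ComposeArcMaps1D(const std::vector<int32_t> &step1_arc_map,
                                      const std::vector<int32_t> &step2_arc_map) {
  std::vector<int32_t> ans(step2_arc_map.size());
  for (std::size_t i = 0; i != step2_arc_map.size(); ++i) {
    int32_t j = step2_arc_map[i];
    ans[i] = (j < 0) ? -1 : step1_arc_map[j];
  }
  return ans;
}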


/*
Return a ragged array that represents the cumulative distribution function
(cdf) of the probability of arcs leaving each state of `fsas`.
10 changes: 4 additions & 6 deletions k2/csrc/hash.cu
@@ -20,7 +20,7 @@ void Hash::CheckEmpty() {
if (~(hash_data[i]) != 0) error_data[0] = i;
});
int32_t i = error[0];
if (i >= 0) { // there was an error; i is the index into the hash where
if (i >= 0) { // there was an error; i is the index into the hash where
// there was an element.
int64_t elem = data_[i];
// We don't know the number of bits the user was using for the key vs.
@@ -30,10 +30,9 @@ void Hash::CheckEmpty() {
}
}

void Hash::Resize(int32_t new_num_buckets,
int32_t num_key_bits,
int32_t num_value_bits, // = -1,
bool copy_data) { // = true
void Hash::Resize(int32_t new_num_buckets, int32_t num_key_bits,
int32_t num_value_bits, // = -1,
bool copy_data) { // = true
NVTX_RANGE(K2_FUNC);
if (num_value_bits < 0)
num_value_bits = 64 - num_key_bits;
@@ -66,5 +65,4 @@ void Hash::Resize(int32_t new_num_buckets,
// expect the hash to be empty when destroyed).
}


} // namespace k2
9 changes: 5 additions & 4 deletions k2/csrc/hash_test.cu
@@ -42,7 +42,8 @@ void TestHashConstruct() {
*values_data = values.Data(),
*success_data = success.Data();
int32_t *counts_data = count_per_key.Data();
Hash::Accessor<NUM_KEY_BITS> acc = hash.GetAccessor<Hash::Accessor<NUM_KEY_BITS>>();
Hash::Accessor<NUM_KEY_BITS> acc =
hash.GetAccessor<Hash::Accessor<NUM_KEY_BITS>>();
K2_EVAL(c, num_elems, lambda_insert_pairs, (int32_t i) -> void {
uint32_t key = keys_data[i],
value = values_data[i],
@@ -237,7 +238,7 @@ void TestHashConstructPacked(int32_t num_key_bits,
success_data[i] = success;
});

if (size != 65535) // just for some variety..
if (size != 65535) // just for some variety..
num_value_bits += 1; // Try changing the number of value bits, so we
// can test Resize() with changes in that.

@@ -254,8 +255,8 @@

int32_t num_implicit_key_bits = num_key_bits + num_value_bits - 64,
num_kept_key_bits = num_key_bits - num_implicit_key_bits;
uint64_t implicit_key_bits_mask = (uint64_t(1) << num_implicit_key_bits) - 1;

uint64_t implicit_key_bits_mask =
(uint64_t(1) << num_implicit_key_bits) - 1;

uint64_t val = 0;
uint64_t *key_val_addr = nullptr;
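A worked example of the bit arithmetic above, with hypothetical numbers (the reading of "implicit" bits is an assumption based on this test): with num_key_bits = 40 and num_value_bits = 28, the pair needs 68 > 64 bits, so

// num_implicit_key_bits  = 40 + 28 - 64 = 4
// num_kept_key_bits      = 40 - 4       = 36
// implicit_key_bits_mask = (1 << 4) - 1 = 0xF
// i.e. the low 4 key bits are carried implicitly rather than stored in the
// 64-bit slot alongside the value.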
3 changes: 1 addition & 2 deletions k2/csrc/ragged.h
@@ -325,8 +325,7 @@ struct Ragged {
}

explicit Ragged(const RaggedShape &shape, Dtype dtype = DtypeOf<T>::dtype)
: shape(shape), values(shape.Context(), shape.NumElements(), dtype) {
}
: shape(shape), values(shape.Context(), shape.NumElements(), dtype) {}

// Defined in ragged_ops_inl.h
// This will crash if T == Any.
11 changes: 6 additions & 5 deletions k2/csrc/tensor.h
@@ -106,12 +106,12 @@ class Shape {

int32_t num_axes_; // Must be >= 0

// num_elements_ is the number of distinct tuples of indexes; since strides may
// be zero, we do not guarantee that all these elements occupy distinct memory
// locations. See NumElements()
// num_elements_ is the number of distinct tuples of indexes; since strides
// may be zero, we do not guarantee that all these elements occupy distinct
// memory locations. See NumElements()
int64_t num_elements_;
// see documentation for IsContiguous() for its meaning. This is "derived data";
// it is computed by IsContiguous().
// see documentation for IsContiguous() for its meaning. This is "derived
// data"; it is computed by IsContiguous().
bool is_contiguous_;

// elements of dims_ and strides_ >= num_axes_ are currently not set;
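
A hypothetical example (not from the diff) of the zero-stride case mentioned above: with dims = {3, 4} and strides = {0, 1}, num_elements_ is 3 * 4 = 12 index tuples, but the zero stride on axis 0 makes all three "rows" alias the same four memory locations, so only 4 distinct addresses are actually referenced.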
@@ -233,6 +233,7 @@ class Tensor {
TensorImplPtr Impl() const { return impl_; }
// This is for use by implementation code; be careful with it.
explicit Tensor(TensorImplPtr impl);

private:
// For use when `shape` and `dtype` are already set up; sets data and
// byte_offset.