Fixed bug in GetStateBatches (complex branch) (k2-fsa#266)

qindazhu · web-flow · commit 5e7fce98fdce · 2020-10-16T23:23:12.000+08:00
diff --git a/k2/csrc/fsa_algo.cu b/k2/csrc/fsa_algo.cu
@@ -149,29 +149,24 @@ void Intersect(FsaOrVec &a_fsas, FsaOrVec &b_fsas, FsaVec *out,
 
 Fsa LinearFsa(Array1<int32_t> &symbols) {
   ContextPtr c = symbols.Context();
-  int32_t n = symbols.Dim(),
-      num_states = n + 2,
-      num_arcs = n + 1;
+  int32_t n = symbols.Dim(), num_states = n + 2, num_arcs = n + 1;
   Array1<int32_t> row_splits1 = Range(c, num_states + 1, 0),
-                     row_ids1 = Range(c, num_arcs, 0);
+                  row_ids1 = Range(c, num_arcs, 0);
   Array1<Arc> arcs(c, num_arcs);
   Arc *arcs_data = arcs.Data();
   const int32_t *symbols_data = symbols.Data();
-  auto lambda_set_arcs = [=] __host__ __device__ (int32_t arc_idx01) -> void {
-    int32_t src_state = arc_idx01,
-      dest_state = arc_idx01 + 1,
-    // -1 == kFinalSymbol
-    symbol = (arc_idx01 < n ? symbols_data[n] : -1);
+  auto lambda_set_arcs = [=] __host__ __device__(int32_t arc_idx01) -> void {
+    int32_t src_state = arc_idx01, dest_state = arc_idx01 + 1,
+            // -1 == kFinalSymbol
+        symbol = (arc_idx01 < n ? symbols_data[n] : -1);
     K2_CHECK_NE(symbol, -1);
     float score = 0.0;
     arcs_data[arc_idx01] = Arc(src_state, dest_state, symbol, score);
   };
   Eval(c, num_arcs, lambda_set_arcs);
-  return Ragged<Arc>(RaggedShape2(&row_splits1, &row_ids1, num_arcs),
-                     arcs);
+  return Ragged<Arc>(RaggedShape2(&row_splits1, &row_ids1, num_arcs), arcs);
 }
 
-
 Fsa LinearFsas(Ragged<int32_t> &symbols) {
   K2_CHECK(symbols.NumAxes() == 2);
   ContextPtr c = symbols.Context();
@@ -180,61 +175,59 @@ Fsa LinearFsas(Ragged<int32_t> &symbols) {
   RaggedShape states_shape = ChangeSublistSize(symbols.shape, 2);
 
   int32_t num_states = states_shape.NumElements(),
-            num_arcs = symbols.NumElements() + symbols.Dim0();
+          num_arcs = symbols.NumElements() + symbols.Dim0();
 
   // row_splits2 maps from state_idx01 to arc_idx012; row_ids2 does the reverse.
   // We'll set them in the lambda below.
-  Array1<int32_t> row_splits2(c, num_states + 2),
-      row_ids2(c, num_arcs);
-
+  Array1<int32_t> row_splits2(c, num_states + 2), row_ids2(c, num_arcs);
 
   int32_t *row_ids2_data = row_ids2.Data(),
-       *row_splits2_data = row_splits2.Data();
+          *row_splits2_data = row_splits2.Data();
   const int32_t *row_ids1_data = states_shape.RowIds(1).Data(),
-             *row_splits1_data = states_shape.RowSplits(1).Data(),
-                 *symbols_data = symbols.values.Data();
+                *row_splits1_data = states_shape.RowSplits(1).Data(),
+                *symbols_data = symbols.values.Data();
   Array1<Arc> arcs(c, num_arcs);
   Arc *arcs_data = arcs.Data();
-  auto lambda = [=] __host__ __device__ (int32_t state_idx01) -> void {
+  auto lambda = [=] __host__ __device__(int32_t state_idx01) -> void {
     int32_t fsa_idx0 = row_ids1_data[state_idx01],
-      state_idx0x = row_splits1_data[fsa_idx0],
-      next_state_idx0x = row_splits1_data[fsa_idx0 + 1],
-      idx1 = state_idx01 - state_idx0x;
+            state_idx0x = row_splits1_data[fsa_idx0],
+            next_state_idx0x = row_splits1_data[fsa_idx0 + 1],
+            idx1 = state_idx01 - state_idx0x;
 
     // the following works because each FSA has one fewer arcs than states.
     int32_t arc_idx0xx = state_idx0x - fsa_idx0,
-      next_arc_idx0xx = next_state_idx0x - (fsa_idx0 + 1),
-    // the following may look a bit wrong.. here, the idx1 is the same as
-    // the idx12 if the arc exists, because each state has one arc leaving
-    // it (except the last state).
-    arc_idx012 = arc_idx0xx + idx1;
+            next_arc_idx0xx = next_state_idx0x - (fsa_idx0 + 1),
+            // the following may look a bit wrong.. here, the idx1 is the same
+            // as the idx12 if the arc exists, because each state has one arc
+            // leaving it (except the last state).
+        arc_idx012 = arc_idx0xx + idx1;
     // the following works because each FSA has one fewer symbols than arcs
     // (however it doesn't work for the last arc of each FSA; we check below.)
     int32_t symbol_idx01 = arc_idx012 - fsa_idx0;
     if (arc_idx012 < next_arc_idx0xx) {
-      int32_t src_state = idx1,
-             dest_state = idx1 + 1,
-                 symbol = (arc_idx012 + 1 < next_arc_idx0xx ?
-                           symbols_data[symbol_idx01] : -1);  // kFinalSymbol
+      int32_t src_state = idx1, dest_state = idx1 + 1,
+              symbol =
+                  (arc_idx012 + 1 < next_arc_idx0xx ? symbols_data[symbol_idx01]
+                                                    : -1);  // kFinalSymbol
       float score = 0.0;
       arcs_data[arc_idx012] = Arc(src_state, dest_state, symbol, score);
       row_ids2_data[arc_idx012] = state_idx01;
     } else {
       // The following ensures that the last element of row_splits1_data
-      // (i.e. row_splits1[num_states]) is set to num_arcs.  It also writes something
-      // unnecessary for the last state of each FSA but the last one, which will
-      // cause 2 threads to write the same item to the same location.
+      // (i.e. row_splits1[num_states]) is set to num_arcs.  It also writes
+      // something unnecessary for the last state of each FSA but the last one,
+      // which will cause 2 threads to write the same item to the same location.
       // Note that there is no arc with index `arc_idx01`, if you reach here.
-      row_splits2_data[state_idx01+1] = arc_idx012;
+      row_splits2_data[state_idx01 + 1] = arc_idx012;
     }
     row_splits2_data[state_idx01] = arc_idx012;
   };
   Eval(c, num_states, lambda);
 
-  return Ragged<Arc>(RaggedShape3(&states_shape.RowSplits(1),
-                                  &states_shape.RowIds(1), num_states,
-                                  &row_splits2, &row_splits2, num_arcs),
-                     arcs);
+  return Ragged<Arc>(
+      RaggedShape3(&states_shape.RowSplits(1), &states_shape.RowIds(1),
+                   num_states, &row_splits2, &row_splits2, num_arcs),
+      arcs);
 }
 
 namespace {
diff --git a/k2/csrc/fsa_utils.cu b/k2/csrc/fsa_utils.cu
@@ -651,7 +651,7 @@ Ragged<int32_t> GetStateBatches(FsaVec &fsas, bool transpose) {
           *batch_starts_data = batch_starts.Data();
   const int32_t *fsas_row_splits1_data = fsas.RowSplits(1).Data();
 
-#if 1
+#if 0
   // This is a simple version of the kernel that demonstrates what we're trying
   // to do with the more complex code.
   auto lambda_set_batch_info_simple = [=] __host__ __device__(int32_t fsa_idx) {
@@ -710,15 +710,16 @@ Ragged<int32_t> GetStateBatches(FsaVec &fsas, bool transpose) {
 
     int32_t begin_state_idx01 = fsas_row_splits1_data[fsa_idx],
             end_state_idx01 = fsas_row_splits1_data[fsa_idx + 1];
+    int32_t num_states_this_fsa = end_state_idx01 - begin_state_idx01;
     int32_t i = 0, cur_state_idx01 = begin_state_idx01;
 
-    if (task_idx >= end_state_idx01 - begin_state_idx01) return;
+    if (task_idx >= num_states_this_fsa) return;
 
     // The next loop advances `cur_state_idx01` by
     // a number of steps equal to `task_idx`.
     for (int32_t m = 0; m < log_power; ++m) {
       int32_t n = 1 << m;
-      if (task_idx % n != 0) {
+      if ((task_idx & n) != 0) {
         i += n;
         int32_t next = dest_states_powers_acc(m, cur_state_idx01);
         if (next >= end_state_idx01) return;
@@ -728,18 +729,20 @@ Ragged<int32_t> GetStateBatches(FsaVec &fsas, bool transpose) {
     K2_CHECK_EQ(i, task_idx);
 
     while (1) {
+      if (i >= num_states_this_fsa) return;
       batch_starts_data[begin_state_idx01 + i] = cur_state_idx01;
-      int32_t next_state_idx01 =
-          dest_states_powers_acc(log_power, cur_state_idx01);
+      int32_t next_state_idx01 = dest_states_powers_acc(
+          log_power,
+          cur_state_idx01);  // advance jobs_per_fsa = (1 << log_power) steps
       if (next_state_idx01 >= end_state_idx01) {
         // if exactly one step would also be enough to take us past the
         // boundary...
-        if (dest_states_powers_acc(0, cur_state_idx01) >= next_state_idx01) {
+        if (dest_states_powers_acc(0, cur_state_idx01) >= end_state_idx01) {
           num_batches_per_fsa_data[fsa_idx] = i + 1;
         }
         return;
       } else {
-        i += cur_state_idx01;
+        i += jobs_per_fsa;
         cur_state_idx01 = next_state_idx01;
       }
     }
@@ -757,7 +760,7 @@ Ragged<int32_t> GetStateBatches(FsaVec &fsas, bool transpose) {
   int32_t *ans_row_splits2_data = ans_row_splits2.Data();
   ans_row_splits2.Range(num_batches, 1) = num_states;  // The kernel below won't
                                                        // set this last element
-  auto lambda_set_ans_row_ids2 =
+  auto lambda_set_ans_row_splits2 =
       [=] __host__ __device__(int32_t idx01) -> void {
     int32_t idx0 = ans_row_ids1_data[idx01],  // Fsa index
         idx0x = ans_row_splits1_data[idx0], idx1 = idx01 - idx0x,
@@ -770,7 +773,7 @@ Ragged<int32_t> GetStateBatches(FsaVec &fsas, bool transpose) {
         this_batch_start = batch_starts_data[fsas_idx01];
     ans_row_splits2_data[idx01] = this_batch_start;
   };
-  Eval(c, num_batches, lambda_set_ans_row_ids2);
+  Eval(c, num_batches, lambda_set_ans_row_splits2);
 
   RaggedShape ans_shape =
       RaggedShape3(&ans_row_splits1, &ans_row_ids1, num_batches,
diff --git a/k2/csrc/ragged_ops.cu b/k2/csrc/ragged_ops.cu
@@ -946,59 +946,58 @@ RaggedShape ChangeSublistSize(RaggedShape &src, int32_t size_delta) {
   std::vector<RaggedShapeDim> ans_axes(src.NumAxes() - 1);
   int32_t last_axis = src.NumAxes() - 1;
   // The following will only do something if src.NumAxes() > 2.
-  for (int32_t i = 0; i + 1 < last_axis; i++)
-    ans_axes[i] = src.Axes()[i];
+  for (int32_t i = 0; i + 1 < last_axis; i++) ans_axes[i] = src.Axes()[i];
 
   ContextPtr c = src.Context();
   int32_t num_rows = src.TotSize(last_axis - 1),
-    src_num_elems = src.TotSize(last_axis),
-    num_elems = src_num_elems + size_delta * num_rows;
+          src_num_elems = src.TotSize(last_axis),
+          num_elems = src_num_elems + size_delta * num_rows;
   ans_axes[0].row_splits = Array1<int32_t>(c, num_rows + 1);
   ans_axes[0].row_ids = Array1<int32_t>(c, num_elems);
   ans_axes[0].cached_tot_size = num_elems;
   const int32_t *src_row_splits_data = src.RowSplits(last_axis).Data(),
-    *src_row_ids_data = src.RowIds(last_axis).Data();
+                *src_row_ids_data = src.RowIds(last_axis).Data();
   int32_t *row_splits_data = ans_axes[0].row_splits.Data(),
-    *row_ids_data = ans_axes[0].row_ids.Data();
+          *row_ids_data = ans_axes[0].row_ids.Data();
 
   {
     ParallelRunner pr(c);
     {
       With w(pr.NewStream());
-      auto lambda_set_row_splits = [=] __host__ __device__ (int32_t idx0) -> void {
+      auto lambda_set_row_splits =
+          [=] __host__ __device__(int32_t idx0) -> void {
         row_splits_data[idx0] = src_row_splits_data[idx0] + size_delta * idx0;
       };
       Eval(c, num_rows + 1, lambda_set_row_splits);
     }
 
     {
       With w(pr.NewStream());
-      auto lambda_set_row_ids1 = [=] __host__ __device__ (int32_t src_idx01) -> void {
+      auto lambda_set_row_ids1 =
+          [=] __host__ __device__(int32_t src_idx01) -> void {
         int32_t src_idx0 = src_row_ids_data[src_idx01],
-        src_idx0x = src_row_splits_data[src_idx0],
-        src_idx1 = src_idx01 - src_idx0x,
-        new_idx0x = row_splits_data[src_idx0],
-        new_idx0x_next = row_splits_data[src_idx0 + 1],
-        new_idx01 = new_idx0x + src_idx1;
+                src_idx0x = src_row_splits_data[src_idx0],
+                src_idx1 = src_idx01 - src_idx0x,
+                new_idx0x = row_splits_data[src_idx0],
+                new_idx0x_next = row_splits_data[src_idx0 + 1],
+                new_idx01 = new_idx0x + src_idx1;
         // it's only necessary to guard the next statement with in 'if' because
         // size_delta might be negative.
-        if (new_idx01 < new_idx0x_next)
-          row_ids_data[new_idx01] = src_idx0;
+        if (new_idx01 < new_idx0x_next) row_ids_data[new_idx01] = src_idx0;
       };
       Eval(c, num_elems, lambda_set_row_ids1);
     }
     if (size_delta > 0) {
       // This sets the row-ids that are not set by lambda_set_row_ids1.
       With w(pr.NewStream());
-      auto lambda_set_row_ids2 = [=] __host__ __device__ (int32_t i) -> void {
-        int32_t idx0 = i / size_delta, n = i % size_delta,
-        next_idx0 = idx0 + 1;
+      auto lambda_set_row_ids2 = [=] __host__ __device__(int32_t i) -> void {
+        int32_t idx0 = i / size_delta, n = i % size_delta, next_idx0 = idx0 + 1;
         // The following formula is the same as the one in
         // lambda_set_row_splits; we want to compute the new value of
         // row_splits_data[next_idx0] without waiting for that kernel to
         // terminate.
-        int32_t next_idx0x = src_row_splits_data[next_idx0] +
-        size_delta * next_idx0;
+        int32_t next_idx0x =
+            src_row_splits_data[next_idx0] + size_delta * next_idx0;
         row_ids_data[next_idx0x - 1 - n] = idx0;
       };
       Eval(c, num_rows * size_delta, lambda_set_row_ids2);
@@ -1010,5 +1009,4 @@ RaggedShape ChangeSublistSize(RaggedShape &src, int32_t size_delta) {
   return RaggedShape(ans_axes);
 }
 
-
 }  // namespace k2