@@ -4,18 +4,24 @@
  *
  * @copyright
  * Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu)
+ *                      Mobvoi Inc.        (authors: Fangjun Kuang)
  *
  * @copyright
  * See LICENSE for clarification regarding multiple authors
  */
 
-#include <cub/cub.cuh>
+#include <algorithm>
+#include <memory>
 #include <vector>
 
+#include "cub/cub.cuh"
 #include "k2/csrc/array_ops.h"
 #include "k2/csrc/math.h"
+#include "k2/csrc/moderngpu_allocator.h"
 #include "k2/csrc/ragged.h"
 #include "k2/csrc/ragged_ops.h"
+#include "moderngpu/kernel_mergesort.hxx"
+
 namespace {
 
 /*
@@ -806,4 +812,69 @@ Ragged<int32_t> GetCountsPartitioned(Ragged<int32_t> &src,
   return Ragged<int32_t>(ans_ragged_shape, counts);
 }
 
+static Array1<int32_t> GetTransposeReorderingCpu(Ragged<int32_t> &src,
+                                                 int32_t num_cols) {
+  std::vector<std::vector<int32_t>> column_indexes(num_cols);  // [column][row]
+  const int32_t *values_data = src.values.Data();
+  int32_t n = src.values.Dim();
+
+  for (int32_t i = 0; i != n; ++i) {
+    int32_t bucket = values_data[i];
+    column_indexes[bucket].push_back(i);
+  }
+
+  Array1<int32_t> ans(src.Context(), n);
+  int32_t *ans_data = ans.Data();
+  for (int32_t i = 0; i != num_cols; ++i) {
+    std::copy(column_indexes[i].begin(), column_indexes[i].end(), ans_data);
+    ans_data += column_indexes[i].size();
+  }
+  return ans;
+}
+
+Array1<int32_t> GetTransposeReordering(Ragged<int32_t> &src, int32_t num_cols) {
+  ContextPtr &context = src.Context();
+  if (src.NumAxes() < 2) {
+    // src is empty
+    return Array1<int32_t>(context, 0);
+  }
+
+  DeviceType device_type = context->GetDeviceType();
+  if (device_type == kCpu) return GetTransposeReorderingCpu(src, num_cols);
+
+  K2_CHECK_EQ(device_type, kCuda);
+
+  const int32_t *row_splits1_data = src.RowSplits(src.NumAxes() - 1).Data();
+  const int32_t *row_ids1_data = src.RowIds(src.NumAxes() - 1).Data();
+  const int32_t *value_data = src.values.Data();
+  int32_t n = src.values.Dim();
+  Array1<int32_t> ans = Range(context, n, 0);
+
+  auto lambda_comp = [=] __device__(int32_t a_idx01, int32_t b_idx01) -> bool {
+    int32_t a_idx0 = row_ids1_data[a_idx01];
+    int32_t b_idx0 = row_ids1_data[b_idx01];
+
+    int32_t a_col_index = value_data[a_idx01];
+    int32_t b_col_index = value_data[b_idx01];
+
+    if (a_col_index < b_col_index) return true;  // sort by column indexes
+    if (a_col_index > b_col_index) return false;
+
+    // now we have a_col_index == b_col_index
+    if (a_idx0 < b_idx0) return true;  // sort by row indexes
+    if (a_idx0 > b_idx0) return false;
+
+    // now we have a_idx0 == b_idx0 && a_col_index == b_col_index,
+    // i.e. this entry is duplicated in the sparse matrix. Return false,
+    // not true, so that the comparator stays a strict weak ordering.
+    return false;
+  };
+  std::unique_ptr<mgpu::context_t> mgpu_context =
+      GetModernGpuAllocator(context->GetDeviceId());
+
+  K2_CUDA_SAFE_CALL(mgpu::mergesort(ans.Data(), n, lambda_comp, *mgpu_context));
+
+  return ans;
+}
+
 }  // namespace k2
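
To make the new function's contract concrete, here is a small self-contained sketch of the counting-sort idea behind the CPU path above. It deliberately avoids k2's Ragged and Array1 types (plain std::vector stands in for them), so TransposeReordering, values and num_cols are illustrative names for this sketch, not part of the k2 API:

#include <cstdint>
#include <iostream>
#include <vector>

// Given the flattened column indexes of a sparse matrix's nonzero entries,
// stored row by row (CSR order), return a permutation of entry indexes that
// visits the entries column by column (CSC order), using the same bucketing
// scheme as GetTransposeReorderingCpu.
static std::vector<int32_t> TransposeReordering(
    const std::vector<int32_t> &values, int32_t num_cols) {
  std::vector<std::vector<int32_t>> column_indexes(num_cols);
  for (int32_t i = 0; i != static_cast<int32_t>(values.size()); ++i)
    column_indexes[values[i]].push_back(i);  // bucket entry i by its column

  std::vector<int32_t> ans;
  ans.reserve(values.size());
  for (const auto &column : column_indexes)  // concatenate buckets in order
    ans.insert(ans.end(), column.begin(), column.end());
  return ans;
}

int main() {
  // Column indexes of a 3x4 sparse matrix, row by row:
  //   row 0: columns 1 3;  row 1: columns 0 2;  row 2: columns 1 2
  std::vector<int32_t> values = {1, 3, 0, 2, 1, 2};
  for (int32_t i : TransposeReordering(values, 4)) std::cout << i << ' ';
  std::cout << '\n';  // prints: 2 0 4 3 5 1
  return 0;
}

Because each column bucket is filled in increasing entry order, ties within a column come out sorted by row index, which is exactly the tie-break the GPU comparator above enforces with its second comparison.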