danpovey
diff --git a/‎.github/workflows/build.yml
+6-1 b/‎.github/workflows/build.yml
+6-1
diff --git a/‎.gitignore
+6 b/‎.gitignore
+6
diff --git a/‎CMakeLists.txt
+34-2 b/‎CMakeLists.txt
+34-2
diff --git a/‎LICENSE
+9-2 b/‎LICENSE
+9-2
diff --git a/‎README.md
+6 b/‎README.md
+6
diff --git a/‎cmake/cub.cmake
+30 b/‎cmake/cub.cmake
+30
diff --git a/‎k2/csrc/CMakeLists.txt
+2 b/‎k2/csrc/CMakeLists.txt
+2
diff --git a/‎k2/csrc/cuda/CMakeLists.txt
+28 b/‎k2/csrc/cuda/CMakeLists.txt
+28
diff --git a/‎k2/csrc/cuda/README.md
+24 b/‎k2/csrc/cuda/README.md
+24
diff --git a/‎k2/csrc/cuda/algorithms.h
+39 b/‎k2/csrc/cuda/algorithms.h
+39
@@ -10,15 +10,20 @@ on:
   push:
     branches:
       - master
+      - cuda
   pull_request:
     branches:
       - master
+      - cuda
 
 env:
   BUILD_TYPE: Debug
 
 jobs:
   build:
+    # Disable CI for now: GitHub Actions does not support CUDA, so the
+    # build always fails.
+    if: false
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
@@ -55,4 +60,4 @@ jobs:
       - name: Test
         shell: bash
         working-directory: ${{runner.workspace}}/build
-        run: ctest --verbose --build-config $BUILD_TYPE
+        run: ctest --verbose --exclude-regex Cuda --build-config $BUILD_TYPE
@@ -1,9 +1,15 @@
 # Build folder
 **/build*
 
+# emacs saves
+[#]*[#]
+.[#]*
+*~
+
 # Prerequisites
 *.d
 
+
 # Compiled Object files
 *.slo
 *.lo
 
@@ -9,9 +9,9 @@ to build this project"
   )
 endif()
 
-cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 
-project(k2)
+project(k2 CUDA CXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
@@ -26,19 +26,51 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release CACHE STRING
     "Set the build type. Available values are: Debug Release RelWithDebInfo MinSizeRel"
     FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
+    Debug Release RelWithDebInfo MinSizeRel
+  )
 endif()
 
 if(WIN32 AND BUILD_SHARED_LIBS)
   message(STATUS "Set BUILD_SHARED_LIBS to OFF for Windows")
   set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
 endif()
 
+# the following settings are modified from cub/CMakeLists.txt
+#[[ start settings for CUB ]]
+
+set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.")
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
+
+# Force CUDA C++ standard to be the same as the C++ standard used.
+#
+# Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597
+# which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11
+# versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03).
+# In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly.
+if(DEFINED CMAKE_CUDA_STANDARD)
+  message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD"
+    " is used as the C++ standard version for both C++ and CUDA.")
+endif()
+unset(CMAKE_CUDA_STANDARD CACHE)
+set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
+
+# Compute capabilities for which device code is generated.
+set(K2_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72)
+
+# --expt-extended-lambda does not depend on the architecture, so add it exactly
+# once instead of repeating it for every entry of K2_COMPUTE_ARCHS.
+string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda")
+
+# One -gencode pair per requested architecture.
+foreach(COMPUTE_ARCH IN LISTS K2_COMPUTE_ARCHS)
+  string(APPEND CMAKE_CUDA_FLAGS
+    " -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}")
+endforeach()
+
+#[[ end settings for CUB ]]
+
 enable_testing()
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
 include(cpplint)
 include(glog)
 include(googletest)
 include(pybind11)
+include(cub)
 
 add_subdirectory(k2)
@@ -1,7 +1,14 @@
 MIT License
 
-Copyright (c) 2020 Daniel Povey
-Copyright (c) 2020- The Authors (see individual files for names)
+Copyright (c) 2020- The Authors (see commit history and individual files
+        for names)
+All Rights Reserved
+
+ NOTE (this is not from the MIT license): The copyright model is that authors
+ (or their employers, if noted in individual files) own their individual
+ contributions.  The authors' contributions can be discerned from the git
+ history.
+
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -4,3 +4,9 @@
 
 # k2
 FSA/FST algorithms, intended to (eventually) be interoperable with PyTorch and similar.
+
+## Quick start
+
+Want to try it out without installing anything? We have set up a [Google Colab][1].
+
+[1]: https://colab.research.google.com/drive/1qbHUhNZUX7AYEpqnZyf29Lrz2IPHBGlX?usp=sharing
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 Fangjun Kuang ([email protected])
+# See ../LICENSE for clarification regarding multiple authors
+
+# Downloads the pinned CUB release archive with FetchContent and exposes it as
+# the INTERFACE library target `cub`, whose include directory is the root of
+# the downloaded source tree.
+#
+# Takes no arguments. Example:
+#   download_cub()
+#   target_link_libraries(my_target PUBLIC cub)
+function(download_cub)
+  if(CMAKE_VERSION VERSION_LESS 3.11)
+    # Older CMake releases get an extra module directory; presumably it holds
+    # a bundled copy of the FetchContent module -- TODO confirm.
+    list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
+  endif()
+
+  include(FetchContent)
+
+  set(cub_URL  "https://github.com/NVlabs/cub/archive/1.9.10.tar.gz")
+  set(cub_HASH "SHA256=2bd7077a3d9741f0689e6c1eb58c6278fc96eccc27d964168bc8be1bc3a9040f")
+
+  FetchContent_Declare(cub
+    URL               ${cub_URL}
+    URL_HASH          ${cub_HASH}
+  )
+
+  # FetchContent_GetProperties() reports the population state in
+  # cub_POPULATED. Testing the never-defined variable `cub` was always true,
+  # so the guard did not actually protect FetchContent_Populate().
+  FetchContent_GetProperties(cub)
+  if(NOT cub_POPULATED)
+    message(STATUS "Downloading cub")
+    FetchContent_Populate(cub)
+  endif()
+  message(STATUS "cub is downloaded to ${cub_SOURCE_DIR}")
+
+  # CUB is consumed through its headers only, hence an INTERFACE target.
+  add_library(cub INTERFACE)
+  target_include_directories(cub INTERFACE ${cub_SOURCE_DIR})
+
+endfunction()
+
+download_cub()
@@ -61,3 +61,5 @@ set(fsa_tests
 foreach(name IN LISTS fsa_tests)
   k2_add_fsa_test(${name})
 endforeach()
+
+add_subdirectory(cuda)
@@ -0,0 +1,28 @@
+# Library built from context.cu. Everything is PUBLIC, so targets linking
+# against `context` inherit the source-root include directory as well as the
+# cub and glog dependencies.
+add_library(context context.cu)
+target_include_directories(context PUBLIC ${CMAKE_SOURCE_DIR})
+target_link_libraries(context
+  PUBLIC
+    cub
+    glog
+)
+
+# Builds the test executable <name> from <name>.cu, links it privately against
+# context, gtest and gtest_main, and registers it with CTest under the name
+# "Test.Cuda.<name>".
+#
+# Example:
+#   k2_add_cuda_test(ops_test)
+function(k2_add_cuda_test name)
+  add_executable(${name} "${name}.cu")
+  target_link_libraries(${name} PRIVATE context gtest gtest_main)
+  add_test(
+    NAME "Test.Cuda.${name}"
+    COMMAND $<TARGET_FILE:${name}>
+  )
+endfunction()
+
+# Names of the CUDA tests to register.
+# Please keep this list sorted alphabetically.
+set(cuda_tests
+  ops_test
+  utils_test
+)
+
+# Register every listed test.
+foreach(cuda_test ${cuda_tests})
+  k2_add_cuda_test(${cuda_test})
+endforeach()
@@ -0,0 +1,24 @@
+
+
+ So far this directory just contains some notes on implementation; all the code
+ is just a VERY EARLY DRAFT.  The goal here is to show *in principle* how we parallelize
+ things, building up from low-level primitives, but without actually creating any
+ CUDA code.
+
+ Actually we probably shouldn't separate this into a separate directory from the CPU code,
+ since most of it is general purpose.
+
+ Notes on build, and types of file:
+
+ Currently the plan is for *all* of these files to be put through the CUDA compiler
+ (nvcc).  Most of it is host code, but some of it leads to CUDA dependencies
+ (e.g. one of the constructors of Array1 is a template which can instantiate
+ CUDA code).
+
+ Eventually I'd like to make compilation conditional, so we can create a version of this
+ that runs on CPU with no CUDA dependency.  That can be done later though.
+ (Would involve a bunch of #ifdefs, plus defining things like __host__ and __device__ to
+ be the empty string).
+
+ For CUDA streams, I intend to always use cudaStreamPerThread as the stream.  This will
+ keep usage of the library relatively simple (no need to pass streams around).
@@ -0,0 +1,39 @@
+// k2/csrc/cuda/algorithms.h
+
+// Copyright (c)  2020  Xiaomi Corporation (authors: Daniel Povey)
+
+// See ../../LICENSE for clarification regarding multiple authors
+
+#ifndef K2_CSRC_CUDA_ALGORITHMS_H_
+#define K2_CSRC_CUDA_ALGORITHMS_H_
+
+#include "k2/csrc/cuda/array.h"
+
+// This header contains various utilities that are useful for k2 algorithms.
+namespace k2 {
+
+// Declares accessors for three arrays: a `kept` array of char and two int32_t
+// index arrays, `new2old` and `old2new`.
+// NOTE(review): presumably `kept` flags which old elements survive and the two
+// index arrays map between old and new numbering; the implementation is not
+// visible here -- confirm.
+class Renumbering {
+ public:
+  // `explicit` stops a plain int32_t from being silently converted into a
+  // Renumbering (cpplint flags non-explicit single-argument constructors).
+  explicit Renumbering(int32_t num_old_elems);
+
+  int32_t NumOldElems();
+  int32_t NumNewElems();
+
+  // Mutable access to the `kept` array.
+  Array1<char> &Kept();
+
+  Array1<int32_t> &New2Old();  // dim is NumNewElems()
+
+  Array1<int32_t> &Old2New();  // dim is NumOldElems()
+
+ private:
+  Array1<char> kept;
+  Array1<int32_t> new2old;
+  Array1<int32_t> old2new;
+};
+
+
+
+}  // namespace k2
+
+#endif  // K2_CSRC_CUDA_ALGORITHMS_H_