Add type annotations and mypy checks for cuda.parallel (#3180)
* Refactor the source layout for cuda.parallel

* Add initial type annotations

* Update pre-commit config

* More typing

* Fix bad merge

* Fix TYPE_CHECKING and numpy annotations

* Type bindings.py correctly

* Address review feedback

---------

Co-authored-by: Ashwin Srinath <[email protected]>
shwina authored Dec 19, 2024
1 parent 6adaca3 commit 7b19adb
Showing 7 changed files with 109 additions and 89 deletions.
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -58,5 +58,16 @@ repos:
CITATION.md
)
- repo: https://github.com/pre-commit/mirrors-mypy
  rev: 'v1.13.0'
  hooks:
  - id: mypy
    additional_dependencies: [types-cachetools, numpy]
    args: ["--config-file=python/cuda_parallel/pyproject.toml",
           "python/cuda_parallel/cuda/"]
    pass_filenames: false


default_language_version:
  python: python3
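
For a quick local run of the same check outside of pre-commit, mypy's programmatic API can be called with the hook's arguments. This is a minimal sketch that assumes mypy (and the additional dependencies listed above) are installed in the current environment.

from mypy import api

# Reuse the config file and target directory from the pre-commit hook above.
stdout, stderr, exit_status = api.run(
    ["--config-file=python/cuda_parallel/pyproject.toml", "python/cuda_parallel/cuda/"]
)
print(stdout or stderr)
print("mypy exit status:", exit_status)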
Empty file.
63 changes: 29 additions & 34 deletions python/cuda_parallel/cuda/parallel/experimental/_bindings.py
@@ -5,13 +5,15 @@

import os
import shutil
import importlib
from importlib.resources import files, as_file
import ctypes
from functools import lru_cache
from typing import List, Optional

from . import _cccl as cccl


def _get_cuda_path():
def _get_cuda_path() -> Optional[str]:
cuda_path = os.environ.get("CUDA_PATH", "")
if os.path.exists(cuda_path):
return cuda_path
@@ -27,18 +29,10 @@ def _get_cuda_path():
return None


_bindings = None
_paths = None


def get_bindings():
global _bindings
if _bindings is not None:
return _bindings
include_path = importlib.resources.files("cuda.parallel.experimental").joinpath(
"cccl"
)
cccl_c_path = os.path.join(include_path, "libcccl.c.parallel.so")
@lru_cache()
def get_bindings() -> ctypes.CDLL:
with as_file(files("cuda.parallel.experimental")) as f:
cccl_c_path = str(f / "cccl" / "libcccl.c.parallel.so")
_bindings = ctypes.CDLL(cccl_c_path)
_bindings.cccl_device_reduce.restype = ctypes.c_int
_bindings.cccl_device_reduce.argtypes = [
@@ -56,23 +50,24 @@ def get_bindings():
return _bindings
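
As context for the declarations above, here is a minimal, self-contained sketch of the same ctypes pattern: load a shared library once, then set restype/argtypes so calls are type-checked at the boundary. The C math library stands in for libcccl.c.parallel.so purely for illustration.

import ctypes
import ctypes.util

# Load a shared library and declare a prototype, as get_bindings does for
# cccl_device_reduce; libm is only an illustrative stand-in.
libm = ctypes.CDLL(ctypes.util.find_library("m"))
libm.cos.restype = ctypes.c_double
libm.cos.argtypes = [ctypes.c_double]
print(libm.cos(0.0))  # 1.0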


def get_paths():
global _paths
if _paths is not None:
return _paths
# Using `.parent` for compatibility with pip install --editable:
include_path = importlib.resources.files("cuda.parallel").parent.joinpath(
"_include"
)
include_path_str = str(include_path)
include_option = "-I" + include_path_str
cub_path = include_option.encode()
thrust_path = cub_path
libcudacxx_path_str = str(os.path.join(include_path, "libcudacxx"))
libcudacxx_option = "-I" + libcudacxx_path_str
libcudacxx_path = libcudacxx_option.encode()
cuda_include_str = os.path.join(_get_cuda_path(), "include")
cuda_include_option = "-I" + cuda_include_str
cuda_include_path = cuda_include_option.encode()
_paths = cub_path, thrust_path, libcudacxx_path, cuda_include_path
return _paths
@lru_cache()
def get_paths() -> List[bytes]:
with as_file(files("cuda.parallel")) as f:
# Using `.parent` for compatibility with pip install --editable:
cub_include_path = str(f.parent / "_include")
thrust_include_path = cub_include_path
libcudacxx_include_path = str(os.path.join(cub_include_path, "libcudacxx"))
cuda_include_path = None
if cuda_path := _get_cuda_path():
cuda_include_path = str(os.path.join(cuda_path, "include"))
paths = [
f"-I{path}".encode()
for path in (
cub_include_path,
thrust_include_path,
libcudacxx_include_path,
cuda_include_path,
)
if path is not None
]
return paths
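
Both get_bindings and get_paths now rely on functools.lru_cache instead of module-level globals. A minimal sketch of that memoization pattern, with a hypothetical loader standing in for the real path construction, looks like this.

from functools import lru_cache

@lru_cache()
def _load_once() -> tuple:
    # Hypothetical stand-in for building the include-path flags.
    print("computed once")
    return (b"-I/path/to/cub", b"-I/path/to/libcudacxx")

flags_a = _load_once()
flags_b = _load_once()  # served from the cache; "computed once" is not printed again
assert flags_a is flags_b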
44 changes: 22 additions & 22 deletions python/cuda_parallel/cuda/parallel/experimental/_cccl.py
@@ -6,8 +6,11 @@
import numba
import functools
import ctypes
import numpy as np
from numba import types, cuda

from .iterators._iterators import IteratorBase


# MUST match `cccl_type_enum` in c/include/cccl/c/types.h
class TypeEnum(ctypes.c_int):
@@ -91,29 +94,29 @@ class Value(ctypes.Structure):


_TYPE_TO_ENUM = {
types.int8: TypeEnum.INT8,
types.int16: TypeEnum.INT16,
types.int32: TypeEnum.INT32,
types.int64: TypeEnum.INT64,
types.uint8: TypeEnum.UINT8,
types.uint16: TypeEnum.UINT16,
types.uint32: TypeEnum.UINT32,
types.uint64: TypeEnum.UINT64,
types.float32: TypeEnum.FLOAT32,
types.float64: TypeEnum.FLOAT64,
types.int8: TypeEnum(TypeEnum.INT8),
types.int16: TypeEnum(TypeEnum.INT16),
types.int32: TypeEnum(TypeEnum.INT32),
types.int64: TypeEnum(TypeEnum.INT64),
types.uint8: TypeEnum(TypeEnum.UINT8),
types.uint16: TypeEnum(TypeEnum.UINT16),
types.uint32: TypeEnum(TypeEnum.UINT32),
types.uint64: TypeEnum(TypeEnum.UINT64),
types.float32: TypeEnum(TypeEnum.FLOAT32),
types.float64: TypeEnum(TypeEnum.FLOAT64),
}


def _type_to_enum(numba_type):
def _type_to_enum(numba_type: types.Type) -> TypeEnum:
if numba_type in _TYPE_TO_ENUM:
return _TYPE_TO_ENUM[numba_type]
return TypeEnum.STORAGE
return TypeEnum(TypeEnum.STORAGE)


# TODO: replace with functools.cache once our docs build environment
# is upgraded to at least Python 3.9
@functools.lru_cache(maxsize=None)
def _numba_type_to_info(numba_type):
def _numba_type_to_info(numba_type: types.Type) -> TypeInfo:
context = cuda.descriptor.cuda_target.target_context
value_type = context.get_value_type(numba_type)
size = value_type.get_abi_size(context.target_data)
@@ -122,12 +125,12 @@ def _numba_type_to_info(numba_type):


@functools.lru_cache(maxsize=None)
def _numpy_type_to_info(numpy_type):
def _numpy_type_to_info(numpy_type: np.dtype) -> TypeInfo:
numba_type = numba.from_dtype(numpy_type)
return _numba_type_to_info(numba_type)


def _device_array_to_cccl_iter(array):
def _device_array_to_cccl_iter(array) -> Iterator:
info = _numpy_type_to_info(array.dtype)
return Iterator(
info.size,
Expand All @@ -143,7 +146,7 @@ def _device_array_to_cccl_iter(array):
)


def _iterator_to_cccl_iter(it):
def _iterator_to_cccl_iter(it: IteratorBase) -> Iterator:
context = cuda.descriptor.cuda_target.target_context
numba_type = it.numba_type
size = context.get_value_type(numba_type).get_abi_size(context.target_data)
@@ -180,8 +183,7 @@ def _iterator_to_cccl_iter(it):
)


def type_enum_as_name(enum_value):
assert isinstance(enum_value, int)
def type_enum_as_name(enum_value: int) -> str:
return (
"int8",
"int16",
@@ -197,14 +199,12 @@ def type_enum_as_name(enum_value):
)[enum_value]


def to_cccl_iter(array_or_iterator):
from cuda.parallel.experimental.iterators._iterators import IteratorBase

def to_cccl_iter(array_or_iterator) -> Iterator:
if isinstance(array_or_iterator, IteratorBase):
return _iterator_to_cccl_iter(array_or_iterator)
return _device_array_to_cccl_iter(array_or_iterator)


def host_array_to_value(array):
def host_array_to_value(array: np.ndarray) -> Value:
info = _numpy_type_to_info(array.dtype)
return Value(info, array.ctypes.data)
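
The _TYPE_TO_ENUM mapping above now stores TypeEnum instances rather than bare class attributes. A minimal sketch of why that helps the annotations, using illustrative values rather than the real cccl_type_enum ones:

import ctypes

class TypeEnum(ctypes.c_int):
    # Class attributes are plain ints; the values here are illustrative only.
    INT8 = 0
    FLOAT32 = 8

def enum_for(name: str) -> TypeEnum:
    # Wrapping with TypeEnum(...) returns an instance, which is what a
    # "-> TypeEnum" annotation promises to mypy.
    return TypeEnum({"int8": TypeEnum.INT8, "float32": TypeEnum.FLOAT32}[name])

print(enum_for("float32").value)  # 8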
@@ -3,24 +3,26 @@
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import numba
import ctypes
import numba
import numpy as np
from numba import cuda
from numba.cuda.cudadrv import enums
from typing import Callable

from .. import _cccl as cccl
from .._bindings import get_paths, get_bindings


class _Op:
def __init__(self, dtype, op):
def __init__(self, dtype: np.dtype, op: Callable):
value_type = numba.from_dtype(dtype)
self.ltoir, _ = cuda.compile(
op, sig=value_type(value_type, value_type), output="ltoir"
)
self.name = op.__name__.encode("utf-8")

def handle(self):
def handle(self) -> cccl.Op:
return cccl.Op(
cccl.OpKind.STATELESS,
self.name,
@@ -39,7 +41,7 @@ def _dtype_validation(dt1, dt2):

class _Reduce:
# TODO: constructor shouldn't require concrete `d_in`, `d_out`:
def __init__(self, d_in, d_out, op, h_init):
def __init__(self, d_in, d_out, op: Callable, h_init: np.ndarray):
d_in_cccl = cccl.to_cccl_iter(d_in)
self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name(
d_in_cccl.value_type.type.value
@@ -70,7 +72,7 @@ def __init__(self, d_in, d_out, op, h_init):
if error != enums.CUDA_SUCCESS:
raise ValueError("Error building reduce")

def __call__(self, temp_storage, d_in, d_out, num_items, h_init):
def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray):
d_in_cccl = cccl.to_cccl_iter(d_in)
if d_in_cccl.type.value == cccl.IteratorKind.ITERATOR:
assert num_items is not None
@@ -119,7 +121,7 @@ def __del__(self):

# TODO Figure out `sum` without operator and initial value
# TODO Accept stream
def reduce_into(d_in, d_out, op, h_init):
def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray):
"""Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``.
Example:
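A minimal usage sketch based only on the signatures shown in this diff; the import path of reduce_into and the convention of passing temp_storage=None first to query the scratch-space size are assumptions, not details confirmed here.

import numpy as np
from numba import cuda
from cuda.parallel.experimental import reduce_into  # import path is an assumption

def add(a, b):
    return a + b

d_in = cuda.to_device(np.arange(10, dtype=np.int32))
d_out = cuda.device_array(1, dtype=np.int32)
h_init = np.array([0], dtype=np.int32)

reducer = reduce_into(d_in, d_out, add, h_init)         # build phase
nbytes = reducer(None, d_in, d_out, d_in.size, h_init)  # query temp storage size (assumed convention)
temp = cuda.device_array(nbytes, dtype=np.uint8)
reducer(temp, d_in, d_out, d_in.size, h_init)           # run the reduction
print(d_out.copy_to_host()[0])                          # 45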
