21 changes: 11 additions & 10 deletions .github/workflows/torchao_experimental_test.yml
@@ -54,28 +54,29 @@ jobs:
python torchao/experimental/tests/test_embedding_xbit_quantizer.py
python torchao/experimental/tests/test_quant_passes.py
pytest -s test/prototype/test_dynamic_activation_lut.py
- name: Run kernels/cpu/aarch64/tests
- name: torchao/csrc/cpu - build and run C++ tests
if: runner.os == 'macOS'
run: |
conda activate venv
pushd torchao/experimental/kernels/cpu/aarch64/tests
pushd torchao/csrc/cpu
sh build_and_run_tests.sh
rm -rf /tmp/cmake-out
rm -rf cmake-out
popd
- name: Run torchao/experimental/ops/tests
- name: torchao/csrc/cpu - build benchmarks
if: runner.os == 'macOS'
run: |
conda activate venv
pushd torchao/experimental/ops/tests
sh build_and_run_tests.sh
rm -rf /tmp/cmake-out
pushd torchao/csrc/cpu
sh build_and_run_benchmarks.sh build_only
rm -rf cmake-out
popd
- name: ET ops build
- name: torchao/csrc/cpu - build shared_kernels with ExecuTorch
if: runner.os == 'macOS'
run: |
conda activate venv
pushd torchao/experimental
sh build_torchao_ops.sh executorch
pushd torchao/csrc/cpu
sh build_shared_kernels.sh executorch
rm -rf cmake-out
popd
# test-mps-ops:
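The renamed CI steps above can be reproduced locally with a short shell sketch like the one below; it is not part of this PR, and the conda environment name "venv" is taken from the workflow rather than being a requirement.

# Sketch only: mirrors the macOS-only CI steps for torchao/csrc/cpu.
conda activate venv
cd torchao/csrc/cpu
sh build_and_run_tests.sh                  # build and run the C++ tests
rm -rf cmake-out
sh build_and_run_benchmarks.sh build_only  # build the benchmarks without running them
rm -rf cmake-out
sh build_shared_kernels.sh executorch      # build shared_kernels against ExecuTorch
rm -rf cmake-out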
44 changes: 33 additions & 11 deletions setup.py
@@ -73,7 +73,7 @@ def read_version(file_path="version.txt"):
# ├── USE_CPU_KERNELS="1" + Linux → Include optimized CPU kernels (AVX512, etc.)
# └── ARM64 + macOS → Auto-enable experimental builds (build_macos_arm_auto)
#
# Level 3: Experimental builds (cmake-based)
# Level 3: Shared CPU kernel builds (cmake-based)
# ├── BUILD_TORCHAO_EXPERIMENTAL="1" → Force experimental builds
# ├── build_macos_arm_auto → Auto-enable on ARM64 macOS
# └── When enabled, provides access to:
@@ -322,6 +322,19 @@ def build_cmake(self, ext):
ext_filename = os.path.basename(self.get_ext_filename(ext.name))
ext_basename = os.path.splitext(ext_filename)[0]

print(
"CMAKE COMMANG",
[
"cmake",
ext.cmake_lists_dir,
]
+ ext.cmake_args
+ [
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
"-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
],
)

subprocess.check_call(
[
"cmake",
@@ -472,10 +485,22 @@ def get_extensions():

# Collect C++ source files
sources = list(glob.glob(os.path.join(extensions_dir, "**/*.cpp"), recursive=True))

# Exclude C++ CPU sources that are built by CMake
cpu_cmake_sources = glob.glob(
os.path.join(extensions_dir, "cpu", "torch_free_kernels", "**", "*.cpp"),
recursive=True,
)
cpu_cmake_sources += glob.glob(
os.path.join(extensions_dir, "cpu", "shared_kernels", "**", "*.cpp"),
recursive=True,
)
sources = [s for s in sources if s not in cpu_cmake_sources]

if not use_cpu_kernels or not is_linux:
# Remove csrc/cpu/*.cpp
excluded_sources = list(
glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=True)
glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=False)
)
sources = [s for s in sources if s not in excluded_sources]

@@ -616,6 +641,7 @@ def get_extensions():

ext_modules = []
if len(sources) > 0:
print("SOURCES", sources)
# Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
sources = [
s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
@@ -703,7 +729,7 @@ def get_extensions():
)
)

# Build CMakeLists from /torchao/experimental - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
# Build CMakeLists from /torchao/csrc/cpu - additional options become available: TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
build_options = BuildOptions()

@@ -716,24 +742,20 @@ def bool_to_on_off(value):

ext_modules.append(
CMakeExtension(
"torchao._experimental_aten_ops",
cmake_lists_dir="torchao/experimental",
"torchao._C_cpu_shared_kernels_aten",
cmake_lists_dir="torchao/csrc/cpu",
cmake_args=(
[
f"-DCMAKE_BUILD_TYPE={'Debug' if use_debug_mode() else 'Release'}",
f"-DTORCHAO_BUILD_CPU_AARCH64={bool_to_on_off(build_options.build_cpu_aarch64)}",
f"-DTORCHAO_BUILD_KLEIDIAI={bool_to_on_off(build_options.build_kleidi_ai)}",
f"-DTORCHAO_BUILD_MPS_OPS={bool_to_on_off(build_options.build_experimental_mps)}",
f"-DTORCHAO_ENABLE_ARM_NEON_DOT={bool_to_on_off(build_options.enable_arm_neon_dot)}",
f"-DTORCHAO_ENABLE_ARM_I8MM={bool_to_on_off(build_options.enable_arm_i8mm)}",
f"-DTORCHAO_PARALLEL_BACKEND={build_options.parallel_backend}",
"-DTORCHAO_BUILD_TESTS=OFF",
"-DTORCHAO_BUILD_BENCHMARKS=OFF",
"-DTorch_DIR=" + torch_dir,
]
+ (
["-DCMAKE_INSTALL_PREFIX=cmake-out"]
if build_options.build_experimental_mps
else []
)
),
)
)
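As a hedged usage sketch (not part of this diff), the cmake path above is typically exercised through a source install; the BUILD_TORCHAO_EXPERIMENTAL flag comes from the condition in get_extensions(), while the editable-install invocation itself is an assumption about how the package is built.

# Sketch only: force the cmake-based shared-kernels build on a non-ARM64-macOS host.
# On ARM64 macOS, build_macos_arm_auto enables it automatically.
BUILD_TORCHAO_EXPERIMENTAL=1 pip install -e .
# The resulting extension is torchao._C_cpu_shared_kernels_aten, built from torchao/csrc/cpu.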
6 changes: 1 addition & 5 deletions torchao/__init__.py
@@ -31,11 +31,7 @@
torch.ops.load_library(str(file))
from . import ops

# The following library contains CPU kernels from torchao/experimental
# They are built automatically by ao/setup.py if on an ARM machine.
# They can also be built outside of the torchao install process by
# running the script `torchao/experimental/build_torchao_ops.sh <aten|executorch>`
# For more information, see https://github.com/pytorch/ao/blob/main/torchao/experimental/docs/readme.md
# The following import registers meta kernels for experimental ops
from torchao.experimental.op_lib import * # noqa: F403
except Exception as e:
logger.debug(f"Skipping import of cpp extensions: {e}")
232 changes: 232 additions & 0 deletions torchao/csrc/cpu/CMakeLists.txt
@@ -0,0 +1,232 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)
include(CMakeDependentOption)

project(torchao)

set(CMAKE_CXX_STANDARD 17)

if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

# Platform options
option(TORCHAO_BUILD_ATEN_OPS "Building torchao ops for ATen." ON)
option(TORCHAO_BUILD_EXECUTORCH_OPS "Building torchao ops for ExecuTorch." OFF)
option(TORCHAO_BUILD_CPU_AARCH64 "Build torchao's CPU aarch64 kernels" OFF)
option(TORCHAO_BUILD_KLEIDIAI "Download, build, and link against Arm KleidiAI library (arm64 only)" OFF)
option(TORCHAO_ENABLE_ARM_NEON_DOT "Enable ARM Neon Dot Product extension" OFF)
option(TORCHAO_ENABLE_ARM_I8MM "Enable ARM 8-bit Integer Matrix Multiply instructions" OFF)
option(TORCHAO_BUILD_TESTS "Build tests" OFF)
option(TORCHAO_BUILD_BENCHMARKS "Build tests" OFF)

# Set default compiler options
add_compile_options("-fPIC" "-Wall" "-Werror" "-Wno-deprecated")
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
add_compile_options(
"-Wno-error=unknown-pragmas"
"-Wno-array-parameter"
"-Wno-maybe-uninitialized"
"-Wno-sign-compare"
)
elseif (APPLE)
add_compile_options("-Wno-shorten-64-to-32")
endif()



if (NOT TARGET cpuinfo)
cmake_policy(PUSH)
cmake_policy(VERSION 3.5) # cpuinfo requires CMake 3.5

# For some reason cpuinfo package has unused functions/variables
# TODO (T215533422): fix upstream
add_compile_options(-Wno-unused-function -Wno-unused-variable)

# set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
include(FetchContent)
set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE)
set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "" FORCE)
set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
FetchContent_Declare(cpuinfo
GIT_REPOSITORY https://github.com/pytorch/cpuinfo.git
GIT_TAG c61fe919607bbc534d7a5a5707bdd7041e72c5ff
)
FetchContent_MakeAvailable(cpuinfo)

cmake_policy(POP)
endif()

if (TORCHAO_BUILD_TESTS)
include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
FetchContent_MakeAvailable(googletest)
endif()

if (TORCHAO_BUILD_BENCHMARKS)
include(FetchContent)
FetchContent_Declare(googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG main) # need main for benchmark::benchmark

set(BENCHMARK_ENABLE_TESTING OFF)
FetchContent_MakeAvailable(googlebenchmark)
endif()

if(NOT TORCHAO_INCLUDE_DIRS)
set(TORCHAO_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
endif()

if(NOT DEFINED TORCHAO_PARALLEL_BACKEND)
set(TORCHAO_PARALLEL_BACKEND aten_openmp)
endif()

# Include helper modules and set up include directories

include(CMakePrintHelpers)
include(${CMAKE_CURRENT_SOURCE_DIR}/shared_kernels/Utils.cmake)

message("TORCHAO_INCLUDE_DIRS: ${TORCHAO_INCLUDE_DIRS}")
include_directories(${TORCHAO_INCLUDE_DIRS})


# Build fallback kernels
add_subdirectory(torch_free_kernels/fallback)

# Build cpu/aarch64 kernels
if(TORCHAO_BUILD_CPU_AARCH64)
message(STATUS "Building with cpu/aarch64")
add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)

if(TORCHAO_ENABLE_ARM_NEON_DOT)
message(STATUS "Building with ARM NEON dot product support")
add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
add_compile_options("-march=armv8.4-a+dotprod")
endif()

if(TORCHAO_ENABLE_ARM_I8MM)
message(STATUS "Building with ARM I8MM support")
add_compile_definitions(TORCHAO_ENABLE_ARM_I8MM)
add_compile_options("-march=armv8.6-a")
endif()

if(TORCHAO_BUILD_KLEIDIAI)
message(STATUS "Building with Arm KleidiAI library")
add_compile_definitions(TORCHAO_ENABLE_KLEIDI)
if (NOT TARGET kleidiai)
include(FetchContent)
# KleidiAI is an open-source library that provides optimized
# performance-critical routines, also known as micro-kernels, for artificial
# intelligence (AI) workloads tailored for Arm® CPUs.
set(KLEIDIAI_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(KLEIDIAI_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
FetchContent_Declare(kleidiai
GIT_REPOSITORY https://git.gitlab.arm.com/kleidi/kleidiai.git
GIT_TAG v1.12.0
)
FetchContent_MakeAvailable(kleidiai)
endif()
endif()

# Defines torchao_kernels_aarch64
add_subdirectory(torch_free_kernels/aarch64)
endif()

# Build ATen ops
if(TORCHAO_BUILD_ATEN_OPS)
find_package(Torch REQUIRED)
set(_torchao_op_srcs_aten)
list(APPEND _torchao_op_srcs_aten
shared_kernels/embedding_xbit/op_embedding_xbit_aten.cpp
shared_kernels/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
shared_kernels/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp
shared_kernels/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
shared_kernels/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_aten.cpp
)
list(TRANSFORM _torchao_op_srcs_aten PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")

# Use the Python extension name if provided
add_library(torchao_ops_aten SHARED ${_torchao_op_srcs_aten})
if(DEFINED TORCHAO_CMAKE_EXT_SO_NAME)
message(STATUS "Setting output name to: ${TORCHAO_CMAKE_EXT_SO_NAME}.so")
set_target_properties(torchao_ops_aten PROPERTIES
OUTPUT_NAME ${TORCHAO_CMAKE_EXT_SO_NAME}
PREFIX "" # Remove "lib" prefix for Python extensions
SUFFIX ".so" # Add ".so" suffix for Python extensions
)
endif()

target_link_torchao_parallel_backend(torchao_ops_aten "${TORCHAO_PARALLEL_BACKEND}")
if (TORCHAO_BUILD_CPU_AARCH64)
target_link_libraries(torchao_ops_aten PRIVATE torchao_kernels_aarch64)
if (TORCHAO_BUILD_KLEIDIAI)
target_link_libraries(torchao_ops_aten PRIVATE kleidiai)
endif()
endif()
target_link_libraries(torchao_ops_aten PRIVATE cpuinfo)
target_include_directories(torchao_ops_aten PRIVATE "${TORCH_INCLUDE_DIRS}")
target_link_libraries(torchao_ops_aten PRIVATE "${TORCH_LIBRARIES}")
target_compile_definitions(torchao_ops_aten PRIVATE TORCHAO_SHARED_KERNELS_BUILD_ATEN=1)

if (TORCHAO_BUILD_TESTS)
add_subdirectory(shared_kernels/tests)
endif()

if (TORCHAO_BUILD_BENCHMARKS)
add_subdirectory(shared_kernels/benchmarks)
endif()

# Install ATen targets
install(
TARGETS torchao_ops_aten
EXPORT _targets
DESTINATION lib
)
endif()


# Build ExecuTorch ops
if(TORCHAO_BUILD_EXECUTORCH_OPS)
# ExecuTorch package is not required, but EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES must
# be defined and EXECUTORCH_LIBRARIES must include the following libraries installed by ExecuTorch:
# libexecutorch.a
# libextension_threadpool.a
# libcpuinfo.a
# libpthreadpool.a
if(NOT DEFINED EXECUTORCH_INCLUDE_DIRS AND NOT DEFINED EXECUTORCH_LIBRARIES)
message(WARNING "EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES are not defined. Looking for ExecuTorch.")
find_package(ExecuTorch HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
endif()
set(_torchao_op_srcs_executorch)
list(APPEND _torchao_op_srcs_executorch
shared_kernels/embedding_xbit/op_embedding_xbit_executorch.cpp
shared_kernels/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
shared_kernels/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch.cpp
shared_kernels/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
shared_kernels/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_executorch.cpp)

list(TRANSFORM _torchao_op_srcs_executorch PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")
add_library(torchao_ops_executorch STATIC ${_torchao_op_srcs_executorch})

target_compile_definitions(torchao_ops_executorch PRIVATE TORCHAO_SHARED_KERNELS_BUILD_EXECUTORCH=1)

# This links to ExecuTorch
target_link_torchao_parallel_backend(torchao_ops_executorch executorch)
if (TORCHAO_BUILD_CPU_AARCH64)
target_link_libraries(torchao_ops_executorch PRIVATE torchao_kernels_aarch64)
if (TORCHAO_BUILD_KLEIDIAI)
target_link_libraries(torchao_ops_executorch PRIVATE kleidiai)
endif()
endif()
target_link_libraries(torchao_ops_executorch PRIVATE cpuinfo)
endif()
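For reference, a standalone configure/build of this CMakeLists.txt might look like the sketch below (not included in the PR); the -S/-B layout and the way Torch_DIR is derived from the installed torch package are assumptions, while the option names come from the file itself.

# Sketch only: build the ATen shared kernels plus tests out of tree.
TORCH_CMAKE_DIR=$(python -c "import torch, os; print(os.path.join(os.path.dirname(torch.__file__), 'share', 'cmake', 'Torch'))")
cmake -S torchao/csrc/cpu -B cmake-out \
  -DCMAKE_BUILD_TYPE=Release \
  -DTORCHAO_BUILD_CPU_AARCH64=ON \
  -DTORCHAO_ENABLE_ARM_NEON_DOT=ON \
  -DTORCHAO_BUILD_TESTS=ON \
  -DTorch_DIR="${TORCH_CMAKE_DIR}"
cmake --build cmake-out -j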