21 changes: 11 additions & 10 deletions .github/workflows/torchao_experimental_test.yml
@@ -54,28 +54,29 @@ jobs:
python torchao/experimental/tests/test_embedding_xbit_quantizer.py
python torchao/experimental/tests/test_quant_passes.py
pytest -s test/prototype/test_dynamic_activation_lut.py
- name: Run kernels/cpu/aarch64/tests
- name: torchao/csrc/cpu - build and run C++ tests
if: runner.os == 'macOS'
run: |
conda activate venv
pushd torchao/experimental/kernels/cpu/aarch64/tests
pushd torchao/csrc/cpu
sh build_and_run_tests.sh
rm -rf /tmp/cmake-out
rm -rf cmake-out
popd
- name: Run torchao/experimental/ops/tests
- name: torchao/csrc/cpu - build benchmarks
if: runner.os == 'macOS'
run: |
conda activate venv
pushd torchao/experimental/ops/tests
sh build_and_run_tests.sh
rm -rf /tmp/cmake-out
pushd torchao/csrc/cpu
sh build_and_run_benchmarks.sh build_only
rm -rf cmake-out
popd
- name: ET ops build
- name: torchao/csrc/cpu - build shared_kernels with ExecuTorch
if: runner.os == 'macOS'
run: |
conda activate venv
pushd torchao/experimental
sh build_torchao_ops.sh executorch
pushd torchao/csrc/cpu
sh build_shared_kernels.sh executorch
rm -rf cmake-out
popd
# test-mps-ops:
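The renamed CI steps above can be reproduced locally with a short shell sketch like the one below; it is not part of this PR, and the conda environment name "venv" is taken from the workflow rather than being a requirement.

# Sketch only: mirrors the macOS-only CI steps for torchao/csrc/cpu.
conda activate venv
cd torchao/csrc/cpu
sh build_and_run_tests.sh                  # build and run the C++ tests
rm -rf cmake-out
sh build_and_run_benchmarks.sh build_only  # build the benchmarks without running them
rm -rf cmake-out
sh build_shared_kernels.sh executorch      # build shared_kernels against ExecuTorch
rm -rf cmake-out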
44 changes: 33 additions & 11 deletions setup.py
@@ -73,7 +73,7 @@ def read_version(file_path="version.txt"):
# ├── USE_CPU_KERNELS="1" + Linux → Include optimized CPU kernels (AVX512, etc.)
# └── ARM64 + macOS → Auto-enable experimental builds (build_macos_arm_auto)
#
# Level 3: Experimental builds (cmake-based)
# Level 3: Shared CPU kernel builds (cmake-based)
# ├── BUILD_TORCHAO_EXPERIMENTAL="1" → Force experimental builds
# ├── build_macos_arm_auto → Auto-enable on ARM64 macOS
# └── When enabled, provides access to:
@@ -322,6 +322,19 @@ def build_cmake(self, ext):
ext_filename = os.path.basename(self.get_ext_filename(ext.name))
ext_basename = os.path.splitext(ext_filename)[0]

print(
"CMAKE COMMANG",
[
"cmake",
ext.cmake_lists_dir,
]
+ ext.cmake_args
+ [
"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
"-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
],
)

subprocess.check_call(
[
"cmake",
@@ -472,10 +485,22 @@ def get_extensions():

# Collect C++ source files
sources = list(glob.glob(os.path.join(extensions_dir, "**/*.cpp"), recursive=True))

# Exclude C++ CPU sources that are built by CMake
cpu_cmake_sources = glob.glob(
os.path.join(extensions_dir, "cpu", "torch_free_kernels", "**", "*.cpp"),
recursive=True,
)
cpu_cmake_sources += glob.glob(
os.path.join(extensions_dir, "cpu", "shared_kernels", "**", "*.cpp"),
recursive=True,
)
sources = [s for s in sources if s not in cpu_cmake_sources]

if not use_cpu_kernels or not is_linux:
# Remove csrc/cpu/*.cpp
excluded_sources = list(
glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=True)
glob.glob(os.path.join(extensions_dir, "cpu/*.cpp"), recursive=False)
)
sources = [s for s in sources if s not in excluded_sources]

@@ -616,6 +641,7 @@ def get_extensions():

ext_modules = []
if len(sources) > 0:
print("SOURCES", sources)
# Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
sources = [
s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
@@ -703,7 +729,7 @@ def get_extensions():
)
)

# Build CMakeLists from /torchao/experimental - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
# Build CMakeLists from /torchao/csrc/cpu - additional options become available: TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
build_options = BuildOptions()

@@ -716,24 +742,20 @@ def bool_to_on_off(value):

ext_modules.append(
CMakeExtension(
"torchao._experimental_aten_ops",
cmake_lists_dir="torchao/experimental",
"torchao._C_cpu_shared_kernels_aten",
cmake_lists_dir="torchao/csrc/cpu",
cmake_args=(
[
f"-DCMAKE_BUILD_TYPE={'Debug' if use_debug_mode() else 'Release'}",
f"-DTORCHAO_BUILD_CPU_AARCH64={bool_to_on_off(build_options.build_cpu_aarch64)}",
f"-DTORCHAO_BUILD_KLEIDIAI={bool_to_on_off(build_options.build_kleidi_ai)}",
f"-DTORCHAO_BUILD_MPS_OPS={bool_to_on_off(build_options.build_experimental_mps)}",
f"-DTORCHAO_ENABLE_ARM_NEON_DOT={bool_to_on_off(build_options.enable_arm_neon_dot)}",
f"-DTORCHAO_ENABLE_ARM_I8MM={bool_to_on_off(build_options.enable_arm_i8mm)}",
f"-DTORCHAO_PARALLEL_BACKEND={build_options.parallel_backend}",
"-DTORCHAO_BUILD_TESTS=OFF",
"-DTORCHAO_BUILD_BENCHMARKS=OFF",
"-DTorch_DIR=" + torch_dir,
]
+ (
["-DCMAKE_INSTALL_PREFIX=cmake-out"]
if build_options.build_experimental_mps
else []
)
),
)
)
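As a hedged usage sketch (not part of this diff), the cmake path above is typically exercised through a source install; the BUILD_TORCHAO_EXPERIMENTAL flag comes from the condition in get_extensions(), while the editable-install invocation itself is an assumption about how the package is built.

# Sketch only: force the cmake-based shared-kernels build on a non-ARM64-macOS host.
# On ARM64 macOS, build_macos_arm_auto enables it automatically.
BUILD_TORCHAO_EXPERIMENTAL=1 pip install -e .
# The resulting extension is torchao._C_cpu_shared_kernels_aten, built from torchao/csrc/cpu.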
6 changes: 1 addition & 5 deletions torchao/__init__.py
@@ -31,11 +31,7 @@
torch.ops.load_library(str(file))
from . import ops

# The following library contains CPU kernels from torchao/experimental
# They are built automatically by ao/setup.py if on an ARM machine.
# They can also be built outside of the torchao install process by
# running the script `torchao/experimental/build_torchao_ops.sh <aten|executorch>`
# For more information, see https://github.com/pytorch/ao/blob/main/torchao/experimental/docs/readme.md
# The following import registers meta kernels for experimental ops
from torchao.experimental.op_lib import * # noqa: F403
except Exception as e:
logger.debug(f"Skipping import of cpp extensions: {e}")
232 changes: 232 additions & 0 deletions torchao/csrc/cpu/CMakeLists.txt
@@ -0,0 +1,232 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.19)
include(CMakeDependentOption)

project(torchao)

set(CMAKE_CXX_STANDARD 17)

if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

# Platform options
option(TORCHAO_BUILD_ATEN_OPS "Building torchao ops for ATen." ON)
option(TORCHAO_BUILD_EXECUTORCH_OPS "Building torchao ops for ExecuTorch." OFF)
option(TORCHAO_BUILD_CPU_AARCH64 "Build torchao's CPU aarch64 kernels" OFF)
option(TORCHAO_BUILD_KLEIDIAI "Download, build, and link against Arm KleidiAI library (arm64 only)" OFF)
option(TORCHAO_ENABLE_ARM_NEON_DOT "Enable ARM Neon Dot Product extension" OFF)
option(TORCHAO_ENABLE_ARM_I8MM "Enable ARM 8-bit Integer Matrix Multiply instructions" OFF)
option(TORCHAO_BUILD_TESTS "Build tests" OFF)
option(TORCHAO_BUILD_BENCHMARKS "Build tests" OFF)

# Set default compiler options
add_compile_options("-fPIC" "-Wall" "-Werror" "-Wno-deprecated")
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
add_compile_options(
"-Wno-error=unknown-pragmas"
"-Wno-array-parameter"
"-Wno-maybe-uninitialized"
"-Wno-sign-compare"
)
elseif (APPLE)
add_compile_options("-Wno-shorten-64-to-32")
endif()



if (NOT TARGET cpuinfo)
cmake_policy(PUSH)
cmake_policy(VERSION 3.5) # cpuinfo requires CMake 3.5

# For some reason cpuinfo package has unused functions/variables
# TODO (T215533422): fix upstream
add_compile_options(-Wno-unused-function -Wno-unused-variable)

# set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
include(FetchContent)
set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE)
set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "" FORCE)
set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
FetchContent_Declare(cpuinfo
GIT_REPOSITORY https://github.com/pytorch/cpuinfo.git
GIT_TAG c61fe919607bbc534d7a5a5707bdd7041e72c5ff
)
FetchContent_MakeAvailable(cpuinfo)

cmake_policy(POP)
endif()

if (TORCHAO_BUILD_TESTS)
include(FetchContent)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
FetchContent_MakeAvailable(googletest)
endif()

if (TORCHAO_BUILD_BENCHMARKS)
include(FetchContent)
FetchContent_Declare(googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG main) # need main for benchmark::benchmark

set(BENCHMARK_ENABLE_TESTING OFF)
FetchContent_MakeAvailable(googlebenchmark)
endif()

if(NOT TORCHAO_INCLUDE_DIRS)
set(TORCHAO_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
endif()

if(NOT DEFINED TORCHAO_PARALLEL_BACKEND)
set(TORCHAO_PARALLEL_BACKEND aten_openmp)
endif()

# Include helper modules and set up include directories

include(CMakePrintHelpers)
include(${CMAKE_CURRENT_SOURCE_DIR}/shared_kernels/Utils.cmake)

message("TORCHAO_INCLUDE_DIRS: ${TORCHAO_INCLUDE_DIRS}")
include_directories(${TORCHAO_INCLUDE_DIRS})


# Build fallback kernels
add_subdirectory(torch_free_kernels/fallback)

# Build cpu/aarch64 kernels
if(TORCHAO_BUILD_CPU_AARCH64)
message(STATUS "Building with cpu/aarch64")
add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)

if(TORCHAO_ENABLE_ARM_NEON_DOT)
message(STATUS "Building with ARM NEON dot product support")
add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
add_compile_options("-march=armv8.4-a+dotprod")
endif()

if(TORCHAO_ENABLE_ARM_I8MM)
message(STATUS "Building with ARM I8MM support")
add_compile_definitions(TORCHAO_ENABLE_ARM_I8MM)
add_compile_options("-march=armv8.6-a")
endif()

if(TORCHAO_BUILD_KLEIDIAI)
message(STATUS "Building with Arm KleidiAI library")
add_compile_definitions(TORCHAO_ENABLE_KLEIDI)
if (NOT TARGET kleidiai)
include(FetchContent)
# KleidiAI is an open-source library that provides optimized
# performance-critical routines, also known as micro-kernels, for artificial
# intelligence (AI) workloads tailored for Arm® CPUs.
set(KLEIDIAI_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(KLEIDIAI_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
FetchContent_Declare(kleidiai
GIT_REPOSITORY https://git.gitlab.arm.com/kleidi/kleidiai.git
GIT_TAG v1.12.0
)
FetchContent_MakeAvailable(kleidiai)
endif()
endif()

# Defines torchao_kernels_aarch64
add_subdirectory(torch_free_kernels/aarch64)
endif()

# Build ATen ops
if(TORCHAO_BUILD_ATEN_OPS)
find_package(Torch REQUIRED)
set(_torchao_op_srcs_aten)
list(APPEND _torchao_op_srcs_aten
shared_kernels/embedding_xbit/op_embedding_xbit_aten.cpp
shared_kernels/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
shared_kernels/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp
shared_kernels/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
shared_kernels/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_aten.cpp
)
list(TRANSFORM _torchao_op_srcs_aten PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")

# Use the Python extension name if provided
add_library(torchao_ops_aten SHARED ${_torchao_op_srcs_aten})
if(DEFINED TORCHAO_CMAKE_EXT_SO_NAME)
message(STATUS "Setting output name to: ${TORCHAO_CMAKE_EXT_SO_NAME}.so")
set_target_properties(torchao_ops_aten PROPERTIES
OUTPUT_NAME ${TORCHAO_CMAKE_EXT_SO_NAME}
PREFIX "" # Remove "lib" prefix for Python extensions
SUFFIX ".so" # Add ".so" suffix for Python extensions
)
endif()

target_link_torchao_parallel_backend(torchao_ops_aten "${TORCHAO_PARALLEL_BACKEND}")
if (TORCHAO_BUILD_CPU_AARCH64)
target_link_libraries(torchao_ops_aten PRIVATE torchao_kernels_aarch64)
if (TORCHAO_BUILD_KLEIDIAI)
target_link_libraries(torchao_ops_aten PRIVATE kleidiai)
endif()
endif()
target_link_libraries(torchao_ops_aten PRIVATE cpuinfo)
target_include_directories(torchao_ops_aten PRIVATE "${TORCH_INCLUDE_DIRS}")
target_link_libraries(torchao_ops_aten PRIVATE "${TORCH_LIBRARIES}")
target_compile_definitions(torchao_ops_aten PRIVATE TORCHAO_SHARED_KERNELS_BUILD_ATEN=1)

if (TORCHAO_BUILD_TESTS)
add_subdirectory(shared_kernels/tests)
endif()

if (TORCHAO_BUILD_BENCHMARKS)
add_subdirectory(shared_kernels/benchmarks)
endif()

# Install ATen targets
install(
TARGETS torchao_ops_aten
EXPORT _targets
DESTINATION lib
)
endif()


# Build ExecuTorch ops
if(TORCHAO_BUILD_EXECUTORCH_OPS)
# ExecuTorch package is not required, but EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES must
# be defined and EXECUTORCH_LIBRARIES must include the following libraries installed by ExecuTorch:
# libexecutorch.a
# libextension_threadpool.a
# libcpuinfo.a
# libpthreadpool.a
if(NOT DEFINED EXECUTORCH_INCLUDE_DIRS AND NOT DEFINED EXECUTORCH_LIBRARIES)
message(WARNING "EXECUTORCH_INCLUDE_DIRS and EXECUTORCH_LIBRARIES are not defined. Looking for ExecuTorch.")
find_package(ExecuTorch HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
endif()
set(_torchao_op_srcs_executorch)
list(APPEND _torchao_op_srcs_executorch
shared_kernels/embedding_xbit/op_embedding_xbit_executorch.cpp
shared_kernels/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
shared_kernels/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch.cpp
shared_kernels/groupwise_lowbit_weight_lut/groupwise_lowbit_weight_lut.cpp
shared_kernels/groupwise_lowbit_weight_lut/op_groupwise_lowbit_weight_lut_executorch.cpp)

list(TRANSFORM _torchao_op_srcs_executorch PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/")
add_library(torchao_ops_executorch STATIC ${_torchao_op_srcs_executorch})

target_compile_definitions(torchao_ops_executorch PRIVATE TORCHAO_SHARED_KERNELS_BUILD_EXECUTORCH=1)

# This links to ExecuTorch
target_link_torchao_parallel_backend(torchao_ops_executorch executorch)
if (TORCHAO_BUILD_CPU_AARCH64)
target_link_libraries(torchao_ops_executorch PRIVATE torchao_kernels_aarch64)
if (TORCHAO_BUILD_KLEIDIAI)
target_link_libraries(torchao_ops_executorch PRIVATE kleidiai)
endif()
endif()
target_link_libraries(torchao_ops_executorch PRIVATE cpuinfo)
endif()
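For reference, a standalone configure/build of this CMakeLists.txt might look like the sketch below (not included in the PR); the -S/-B layout and the way Torch_DIR is derived from the installed torch package are assumptions, while the option names come from the file itself.

# Sketch only: build the ATen shared kernels plus tests out of tree.
TORCH_CMAKE_DIR=$(python -c "import torch, os; print(os.path.join(os.path.dirname(torch.__file__), 'share', 'cmake', 'Torch'))")
cmake -S torchao/csrc/cpu -B cmake-out \
  -DCMAKE_BUILD_TYPE=Release \
  -DTORCHAO_BUILD_CPU_AARCH64=ON \
  -DTORCHAO_ENABLE_ARM_NEON_DOT=ON \
  -DTORCHAO_BUILD_TESTS=ON \
  -DTorch_DIR="${TORCH_CMAKE_DIR}"
cmake --build cmake-out -j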