From 4f04a2307e3f4637b9e4bb4dfb8a5384428132da Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Tue, 24 Jun 2025 20:16:47 +0200 Subject: [PATCH 1/4] cmake: Reduce unnecessary nesting --- ggml/src/ggml-cpu/CMakeLists.txt | 90 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 71b1d67b8d0a6..35ddc9e81b91d 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -154,53 +154,51 @@ function(ggml_add_cpu_backend_variant_impl tag_name) check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }") list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}") - else() - if (GGML_CPU_ARM_ARCH) - list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) - elseif(GGML_CPU_ALL_VARIANTS) - # Begin with the lowest baseline - set(ARM_MCPU "armv8-a") - set(ARCH_TAGS "") - set(ARCH_DEFINITIONS "") - - # When a feature is selected, bump the MCPU to the first - # version that supported it - if (GGML_INTERNAL_DOTPROD) - set(ARM_MCPU "armv8.2-a") - set(ARCH_TAGS "${ARCH_TAGS}+dotprod") - list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD) - endif() - if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC) - set(ARM_MCPU "armv8.2-a") - set(ARCH_TAGS "${ARCH_TAGS}+fp16") - list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC) - endif() - if (GGML_INTERNAL_SVE) - set(ARM_MCPU "armv8.2-a") - set(ARCH_TAGS "${ARCH_TAGS}+sve") - list(APPEND ARCH_DEFINITIONS GGML_USE_SVE) - endif() - if (GGML_INTERNAL_MATMUL_INT8) - set(ARM_MCPU "armv8.6-a") - set(ARCH_TAGS "${ARCH_TAGS}+i8mm") - list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8) - endif() - if (GGML_INTERNAL_SVE2) - set(ARM_MCPU "armv8.6-a") - set(ARCH_TAGS "${ARCH_TAGS}+sve2") - list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2) - endif() - if (GGML_INTERNAL_NOSVE) - set(ARCH_TAGS "${ARCH_TAGS}+nosve") - endif() - if (GGML_INTERNAL_SME) - 
set(ARM_MCPU "armv9.2-a") - set(ARCH_TAGS "${ARCH_TAGS}+sme") - list(APPEND ARCH_DEFINITIONS GGML_USE_SME) - endif() - list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}") - ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS}) + elseif (GGML_CPU_ARM_ARCH) + list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) + elseif(GGML_CPU_ALL_VARIANTS) + # Begin with the lowest baseline + set(ARM_MCPU "armv8-a") + set(ARCH_TAGS "") + set(ARCH_DEFINITIONS "") + + # When a feature is selected, bump the MCPU to the first + # version that supported it + if (GGML_INTERNAL_DOTPROD) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD) + endif() + if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+fp16") + list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC) + endif() + if (GGML_INTERNAL_SVE) + set(ARM_MCPU "armv8.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE) + endif() + if (GGML_INTERNAL_MATMUL_INT8) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8) + endif() + if (GGML_INTERNAL_SVE2) + set(ARM_MCPU "armv8.6-a") + set(ARCH_TAGS "${ARCH_TAGS}+sve2") + list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2) + endif() + if (GGML_INTERNAL_NOSVE) + set(ARCH_TAGS "${ARCH_TAGS}+nosve") + endif() + if (GGML_INTERNAL_SME) + set(ARM_MCPU "armv9.2-a") + set(ARCH_TAGS "${ARCH_TAGS}+sme") + list(APPEND ARCH_DEFINITIONS GGML_USE_SME) endif() + list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}") + ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS}) endif() # show enabled features From 15dd2f7bc914030fac92b797f7949f5b1f8de066 Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Tue, 24 Jun 2025 20:31:53 +0200 Subject: [PATCH 2/4] ggml-cpu: Add ARM variant targeting neoverse-v2 --- ggml/src/CMakeLists.txt | 7 ++++--- 
ggml/src/ggml-cpu/CMakeLists.txt | 25 ++++++++++++++++--------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9cb2c228dcfb2..632112d7e5cb4 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -317,9 +317,7 @@ if (GGML_CPU_ALL_VARIANTS) endif() elseif(GGML_SYSTEM_ARCH STREQUAL "ARM") if (CMAKE_SYSTEM_NAME MATCHES "Linux") - # Many of these features are optional so we build versions with popular - # combinations and name the backends based on the version they were - # first released with + # Generic ARM builds ggml_add_cpu_backend_variant(armv8.0_1) ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD) ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) @@ -328,6 +326,9 @@ if (GGML_CPU_ALL_VARIANTS) ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2) ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME) ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME) + # Builds optimized for specific cores + # neoverse-v2: AWS Graviton4, NVIDIA Grace + ggml_add_cpu_backend_variant(neoverse-v2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2) elseif (CMAKE_SYSTEM_NAME MATCHES "Android") # Android-specific backends with SoC-compatible feature sets ggml_add_cpu_backend_variant(android_armv8.0_1) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 35ddc9e81b91d..a7f6a064155ba 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -157,30 +157,30 @@ function(ggml_add_cpu_backend_variant_impl tag_name) elseif (GGML_CPU_ARM_ARCH) list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) elseif(GGML_CPU_ALL_VARIANTS) - # Begin with the lowest baseline - set(ARM_MCPU "armv8-a") + # For the generic builds, begin with the lowest supported baseline + set(ARM_GENERIC_ARCH "armv8-a") set(ARCH_TAGS "") 
set(ARCH_DEFINITIONS "") - # When a feature is selected, bump the MCPU to the first - # version that supported it + # When a feature is selected, bump GENERIC_ARCH to the earliest + # version which supported that feature if (GGML_INTERNAL_DOTPROD) - set(ARM_MCPU "armv8.2-a") + set(ARM_GENERIC_ARCH "armv8.2-a") set(ARCH_TAGS "${ARCH_TAGS}+dotprod") list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD) endif() if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC) - set(ARM_MCPU "armv8.2-a") + set(ARM_GENERIC_ARCH "armv8.2-a") set(ARCH_TAGS "${ARCH_TAGS}+fp16") list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC) endif() if (GGML_INTERNAL_SVE) - set(ARM_MCPU "armv8.2-a") + set(ARM_GENERIC_ARCH "armv8.2-a") set(ARCH_TAGS "${ARCH_TAGS}+sve") list(APPEND ARCH_DEFINITIONS GGML_USE_SVE) endif() if (GGML_INTERNAL_MATMUL_INT8) - set(ARM_MCPU "armv8.6-a") + set(ARM_GENERIC_ARCH "armv8.6-a") set(ARCH_TAGS "${ARCH_TAGS}+i8mm") list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8) endif() @@ -197,7 +197,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) set(ARCH_TAGS "${ARCH_TAGS}+sme") list(APPEND ARCH_DEFINITIONS GGML_USE_SME) endif() - list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}") + + # CPU targeted builds first, else do the generic build + if (${tag_name} STREQUAL "neoverse-v2") + list(APPEND ARCH_FLAGS "-mcpu=neoverse-v2${ARCH_TAGS}") + list(APPEND ARCH_DEFINITIONS GGML_ARM_MCPU=NEOVERSE_V2) + else() + list(APPEND ARCH_FLAGS "-march=${ARM_GENERIC_ARCH}${ARCH_TAGS}") + endif() ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS}) endif() From c02e0da8ad50d98d676d94dc0a362707826013dc Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Tue, 24 Jun 2025 20:59:46 +0200 Subject: [PATCH 3/4] ggml-cpu: Split ARM backend scores This allows for ranking backends when they otherwise support the same features. 
--- ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp b/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp index 67369147ce851..0bd2e00a16d26 100644 --- a/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +++ b/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp @@ -61,29 +61,33 @@ static int ggml_backend_cpu_aarch64_score() { int score = 1; aarch64_features af; + // Bits 2-8 are used to rank backends by architecture or core when they + // otherwise have identical features. + + // Bits 9+: Features always trump architecture or core. #ifdef GGML_USE_DOTPROD if (!af.has_dotprod) { return 0; } - score += 1<<1; + score += 1<<8; #endif #ifdef GGML_USE_FP16_VECTOR_ARITHMETIC if (!af.has_fp16_va) { return 0; } - score += 1<<2; + score += 1<<9; #endif #ifdef GGML_USE_SVE if (!af.has_sve) { return 0; } - score += 1<<3; + score += 1<<10; #endif #ifdef GGML_USE_MATMUL_INT8 if (!af.has_i8mm) { return 0; } - score += 1<<4; + score += 1<<11; #endif #ifdef GGML_USE_SVE2 if (!af.has_sve2) { return 0; } - score += 1<<5; + score += 1<<12; #endif #ifdef GGML_USE_SME if (!af.has_sme) { return 0; } - score += 1<<6; + score += 1<<13; #endif return score; From 14ca242d0d924e0e12df9ff1a78f0d991fa9a3c5 Mon Sep 17 00:00:00 2001 From: Christian Kastner Date: Wed, 25 Jun 2025 20:24:15 +0200 Subject: [PATCH 4/4] ggml-cpu: Rank neoverse-v2 over generic ARM --- ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp b/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp index 0bd2e00a16d26..fd47621ff113f 100644 --- a/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +++ b/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp @@ -4,6 +4,9 @@ #if defined(__linux__) #include <sys/auxv.h> + +#include <fstream> +#include <string> #elif defined(__APPLE__) #include <sys/sysctl.h> #endif @@ -17,6 +20,7 @@ #endif struct aarch64_features { + int cpu_part = -1; // has_neon not needed, aarch64 has NEON 
guaranteed bool has_dotprod = false; bool has_fp16_va = false; @@ -36,6 +40,17 @@ struct aarch64_features { has_sve2 = !!(hwcap2 & HWCAP2_SVE2); has_i8mm = !!(hwcap2 & HWCAP2_I8MM); has_sme = !!(hwcap2 & HWCAP2_SME); + + std::ifstream cpuinfo("/proc/cpuinfo"); + std::string line; + while (std::getline(cpuinfo, line)) { + if (line.find("CPU part") == 0) { + // Parse the hex number after the colon + cpu_part = std::stoi(line.substr(line.find(':') + 1), nullptr, 16); + break; + } + } + cpuinfo.close(); #elif defined(__APPLE__) int oldp = 0; size_t size = sizeof(oldp); @@ -63,6 +78,11 @@ static int ggml_backend_cpu_aarch64_score() { // Bits 2-8 are used to rank backends by architecture or core when they // otherwise have identical features. +#if defined(GGML_ARM_MCPU) && GGML_ARM_MCPU == NEOVERSE_V2 + if (af.cpu_part == 0xd4f) { + score += 1<<5; + } +#endif // Bits 9+: Features always trump architecture or core. #ifdef GGML_USE_DOTPROD