ggml: Fix PowerPC build and enable MMA optimizations

This change resolves PowerPC build breakage and enables Matrix Math
Accelerator (MMA) optimizations on supported hardware.

Key changes:
- Apply required upstream ggml fixes for PowerPC builds, including:
  - vector macro collision fixes
  - conditional POWER11 backend enablement
- Enable Matrix Math Accelerator (MMA) support for Power10.
- Add architecture-specific compiler flags to enable optimized code paths:
  - `-mcpu=power10` when built with the `ppc64le.power10` build tag
    (enables MMA-based kernels, including llamafile_sgemm)
  - `-mcpu=power9` when built with the `ppc64le.power9` build tag
    (enables VSX optimizations)

Build instructions:
- Power10:
    go build --tags ppc64le.power10 .
- Power9:
    go build --tags ppc64le.power9 .

Performance impact:
- ~30% inference time reduction on Power10 with MMA enabled.
- Measured using:
    ollama run llama3:8b (Q4_0)
    ~50-word summarization, 512-token prompt
  - With MMA: ~6.05s
  - Without MMA: ~8.45s

Improves performance for Q4_0, Q8_0, FP32, and BF16 models on Power10.
This commit is contained in:
Shalini Salomi Bodapati 2026-01-16 06:08:20 -06:00
parent 55d0b6e8b9
commit 4115e4f58f
7 changed files with 106 additions and 2 deletions

View file

@ -0,0 +1,44 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
Date: Mon, 22 Dec 2025 01:57:01 -0600
Subject: [PATCH] ggml: fix vector macro collision on Power
When compiling with MMA enabled, 'vector' may conflict with
compiler headers or language keywords on Power platforms.
Map 'vector' to '__vector' to avoid macro collisions and
restore successful compilation.
Signed-off-by: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
---
ggml/src/ggml-cpu/llamafile/sgemm.cpp | 3 +++
ggml/src/ggml-cpu/simd-mappings.h | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index a0cce10aa..f1331de21 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -117,6 +117,9 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
#endif
#if defined(__MMA__)
+#ifndef vector
+#define vector __vector
+#endif
#include "sgemm-ppc.h"
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h
index 101a9c086..6f742d2d6 100644
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -631,7 +631,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4
-#define GGML_F32x4 vector float
+#define GGML_F32x4 __vector float
#define GGML_F32x4_ZERO {0.0f}
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)

View file

@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
Date: Mon, 22 Dec 2025 01:58:58 -0600
Subject: [PATCH] ggml: conditionally enable POWER11 CPU backend based on
compiler support
Guard POWER11 backend creation behind a compiler flag check for
-mcpu=power11. This avoids build failures on current GCC/Clang
toolchains while preserving forward compatibility once POWER11
support becomes available.
Signed-off-by: Shalini Salomi Bodapati <Shalini.Salomi.Bodapati@ibm.com>
---
ggml/src/CMakeLists.txt | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 9a134b7af..ce2208201 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -403,7 +403,14 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
- ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+ # POWER11 backend: only if compiler supports -mcpu=power11
+ check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
+ if (GGML_CXX_SUPPORTS_POWER11)
+ message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
+ ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+ else()
+ message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
+ endif()
else()
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
endif()

View file

@ -403,7 +403,14 @@ if (GGML_CPU_ALL_VARIANTS)
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
- ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+ # POWER11 backend: only if compiler supports -mcpu=power11
+ check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
+ if (GGML_CXX_SUPPORTS_POWER11)
+ message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
+ ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+ else()
+ message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
+ endif()
else()
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
endif()

View file

@ -0,0 +1,7 @@
// +build ppc64le.power10
package llamafile
// #cgo CXXFLAGS: -std=c++17 -mcpu=power10
// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../.. -I${SRCDIR}/../../../include
import "C"

View file

@ -0,0 +1,7 @@
// +build ppc64le.power9
package llamafile
// #cgo CXXFLAGS: -std=c++17 -mcpu=power9
// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../.. -I${SRCDIR}/../../../include
import "C"

View file

@ -117,6 +117,9 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
#endif
#if defined(__MMA__)
+#ifndef vector
+#define vector __vector
+#endif
#include "sgemm-ppc.h"
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////

View file

@ -631,7 +631,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4
-#define GGML_F32x4 vector float
+#define GGML_F32x4 __vector float
#define GGML_F32x4_ZERO {0.0f}
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)