diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d389dccbfb42..05e9b23f50a07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,10 @@ if (NOT DEFINED GGML_LLAMAFILE) set(GGML_LLAMAFILE_DEFAULT ON) endif() +if (NOT DEFINED GGML_OPENMP_SIMD) + set(GGML_OPENMP_SIMD_DEFAULT ON) +endif() + if (NOT DEFINED GGML_AMX) set(GGML_AMX ON) endif() diff --git a/Makefile b/Makefile index 83adcef28cb7d..e494c53fa36a4 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,10 @@ GGML_NO_OPENMP := 1 DEPRECATE_WARNING := 1 endif +ifdef LLAMA_NO_OPENMP_SIMD +GGML_NO_OPENMP_SIMD := 1 +endif + ifdef LLAMA_NO_METAL GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 @@ -542,6 +546,12 @@ ifndef GGML_NO_OPENMP MK_CXXFLAGS += -fopenmp endif # GGML_NO_OPENMP +ifndef GGML_NO_OPENMP_SIMD + MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD + MK_CFLAGS += -fopenmp-simd + MK_CXXFLAGS += -fopenmp-simd +endif # GGML_NO_OPENMP_SIMD + ifdef GGML_OPENBLAS MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) @@ -948,12 +958,14 @@ OBJ_GGML = \ $(DIR_GGML)/src/ggml-alloc.o \ $(DIR_GGML)/src/ggml-backend.o \ $(DIR_GGML)/src/ggml-backend-reg.o \ + $(DIR_GGML)/src/ggml-fp8.o \ $(DIR_GGML)/src/ggml-opt.o \ $(DIR_GGML)/src/ggml-quants.o \ $(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-fp8.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ $(OBJ_GGML_EXT) @@ -1094,17 +1106,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) # Default target all: $(BUILD_TARGETS) +# force c++ build for source file that have same name as c file # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files -# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml -$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ - ggml/src/ggml-cpu/ggml-cpu.cpp \ - ggml/include/ggml-backend.h \ - ggml/include/ggml.h \ - ggml/include/ggml-alloc.h \ - ggml/src/ggml-backend-impl.h \ - ggml/include/ggml-cpu.h \ - ggml/src/ggml-impl.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c diff --git a/Package.swift b/Package.swift index 1e75aa7e2538b..54ea096e6cd07 100644 --- a/Package.swift +++ b/Package.swift @@ -20,6 +20,8 @@ var sources = [ "ggml/src/ggml-cpu/ggml-cpu-quants.c", "ggml/src/ggml-threading.cpp", "ggml/src/ggml-quants.c", + "ggml/src/ggml-fp8.cpp", + "ggml/src/ggml-cpu/ggml-cpu-fp8.cpp", ] var resources: [Resource] = [] @@ -88,5 +90,5 @@ let package = Package( linkerSettings: linkerSettings ) ], - cxxLanguageStandard: .cxx11 + cxxLanguageStandard: .cxx17 ) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 64a84607c22d8..f3ff90d4b0dae 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1797,9 +1797,9 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { total_seconds = total_seconds % (60*60); } LOG("%.2f minutes\n", total_seconds / 60.0); + LOG("\n"); + LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); } - LOG("\n"); - LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? 
? logits.data() : llama_get_logits(ctx);
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index b989932107dba..079b11112a482 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -51,6 +51,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "E4M3_Q", LLAMA_FTYPE_MOSTLY_E4M3_Q, "12.21G, 0.0050 kld @ Mistral-Nemo", },
+    { "E3M4_Q", LLAMA_FTYPE_MOSTLY_E3M4_Q, "12.21G, 0.0016 kld @ Mistral-Nemo", },
     { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 789fa3b0c42f4..17d92518ea22c 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -62,6 +62,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
     set(GGML_LLAMAFILE_DEFAULT OFF)
 endif()
 
+if (NOT GGML_OPENMP_SIMD_DEFAULT)
+    set(GGML_OPENMP_SIMD_DEFAULT OFF)
+endif()
+
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
     set(GGML_CUDA_GRAPHS_DEFAULT OFF)
 endif()
@@ -112,6 +116,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_SVE "ggml: enable SVE" OFF)
+option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})
 
 if (WIN32)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c444bb7..3cc6fec78f4eb 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -392,6 +392,10 @@ extern "C" {
         GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
+        GGML_TYPE_E5M2 = 39,
+        GGML_TYPE_E4M3 = 40,
+        GGML_TYPE_E4M3_Q = 41,
+        GGML_TYPE_E3M4_Q = 42,
         GGML_TYPE_COUNT,
     };
 
@@ -436,6 +440,10 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
+        GGML_FTYPE_MOSTLY_E5M2 = 28, // except 1d tensors
+        GGML_FTYPE_MOSTLY_E4M3 = 29, // except 1d tensors
+        GGML_FTYPE_MOSTLY_E4M3_Q = 30, // except 1d tensors
+        GGML_FTYPE_MOSTLY_E3M4_Q = 31, // except 1d tensors
     };
 
     // available tensor operations:
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 19289f32beaac..f925f91817bb7 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -222,7 +222,10 @@ add_library(ggml-base
             ggml-quants.c
             ggml-quants.h
             ggml-aarch64.c
-            ggml-aarch64.h)
+            ggml-aarch64.h
+            ggml-fp8.cpp
+            ggml-fp8.h
+            )
 
 target_include_directories(ggml-base PRIVATE .)
 
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 27253a6c2b3ca..42026c16fe4a2 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -6,7 +6,20 @@
 typedef uint16_t ggml_half;
 typedef uint32_t ggml_half2;
 
-#define GGML_COMMON_AGGR
+#define GGML_COMMON_AGGR_U
+#define GGML_COMMON_AGGR_S
 
 #define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+// std-c++ allows anonymous unions, but some compilers warn on them
+#define GGML_COMMON_AGGR_U data
+// std-c++ does not allow anonymous structs
+#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,9 +213,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); @@ -261,9 +278,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +305,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +322,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -424,6 +441,24 @@ typedef struct { } block_iq4_nlx4; static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); +// fp8 support +// - fp8 simple type +typedef struct { uint8_t bits; } ggml_e5m2_t; +typedef struct { uint8_t bits; } ggml_e4m3_t; + +// - fp8 with bloc delta => 8.125 bpw +typedef struct { + float d; // delta + uint8_t qs[QK_K]; +} 
block_e4m3_q;
+static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");
+
+typedef struct {
+    float d; // delta
+    uint8_t qs[QK_K];
+} block_e3m4_q;
+static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
+
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
 
@@ -437,6 +472,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
 
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index fe2222084e05a..c61aefeb88941 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -7,6 +7,8 @@ list (APPEND GGML_CPU_SOURCES
     ggml-cpu-aarch64.h
     ggml-cpu-quants.c
     ggml-cpu-quants.h
+    ggml-cpu-fp8.cpp
+    ggml-cpu-fp8.h
     amx/amx.cpp
     amx/amx.h
     amx/mmq.cpp
@@ -45,6 +47,18 @@ if (GGML_OPENMP)
     endif()
 endif()
 
+if (GGML_OPENMP_SIMD)
+    check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
+    if (SUPPORTS_OPENMP_SIMD)
+        # OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
+        message(STATUS "Using OPENMP_SIMD.")
+        add_compile_definitions(GGML_USE_OPENMP_SIMD)
+        set(OPENMP_SIMD_FLAGS -fopenmp-simd)
+    else()
+        message(WARNING "C++ compiler lacks OPENMP_SIMD support.")
+    endif()
+endif()
+
 if (GGML_LLAMAFILE)
     message(STATUS "Using llamafile")
 
@@ -304,3 +318,11 @@ set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "
 if (EMSCRIPTEN)
     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
+
+# FP8
+if (OPENMP_SIMD_FLAGS)
+    # set_source_files_properties(ggml-cpu-fp8.cpp PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
+    set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
+endif()
diff --git a/ggml/src/ggml-cpu/ggml-cpu-fp8.cpp b/ggml/src/ggml-cpu/ggml-cpu-fp8.cpp
new file mode 100644
index 0000000000000..db5a7cf45035d
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-fp8.cpp
@@ -0,0 +1,260 @@
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml.h"
+
+#include "ggml-cpu-fp8.h"
+
+namespace fp8 {
+union fp32_int32 {
+    float f;
+    uint32_t bits;
+};
+
+#ifdef GGML_USE_OPENMP_SIMD
+#pragma omp declare simd
+#endif
+template<typename FP8>
+inline uint8_t from_float(float value) {
+    FP8 out;
+    fp32_int32 in = {value};
+    out.bits = (in.bits >> 24) & 0x80;
+    in.bits &= 0x7fffffff;
+    if (in.f >= FP8::MAX) {
+        out.bits |= 0x7E;
+    } else if (in.f < FP8::MIN) { // => 0.
+    } else {
+        in.f *= exp_f2<FP8::E_BIAS-127>();
+        uint32_t eps = (0x3fffff>>FP8::M) + ((in.bits >> (23-FP8::M)) & 0x1);
+        in.bits += eps;
+        out.bits |= (in.bits >> (23-FP8::M)) & 0x7F;
+    }
+    return out.bits;
+}
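+// ---------------------------------------------------------------------------
+// [editor's note - illustration only, not part of the patch]
+// The eps term above implements round-to-nearest-even in integer space:
+// half of the discarded mantissa range (0x3fffff>>M) plus the lowest kept
+// mantissa bit, so exact ties round to the even code. A standalone sketch
+// for E4M3 (M = 3), assuming the same fp32 bit layout as from_float():
+//
+//     #include <cstdint>
+//     #include <cstdio>
+//
+//     static uint8_t keep_top_mantissa(uint32_t bits) { // sign handled elsewhere
+//         const int M = 3;                               // mantissa bits kept
+//         uint32_t eps = (0x3fffff >> M)                 // half a discarded ULP
+//                      + ((bits >> (23 - M)) & 0x1);     // +1 if kept LSB is odd
+//         return ((bits + eps) >> (23 - M)) & 0x7F;      // carry may bump the exponent
+//     }
+//
+//     int main() {
+//         union { float f; uint32_t u; } one = {1.0f}, tie = {1.0625f};
+//         // 1.0625 lies exactly between the codes for 1.0 and 1.125;
+//         // nearest-even picks the even mantissa, i.e. the code of 1.0:
+//         printf("%d\n", keep_top_mantissa(tie.u) == keep_top_mantissa(one.u)); // 1
+//     }
+// ---------------------------------------------------------------------------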
+
+#ifdef GGML_USE_OPENMP_SIMD
+#pragma omp declare simd
+#endif
+template<typename FP8>
+inline float to_float(const FP8& in) {
+    fp32_int32 out = {0};
+    out.bits = in.bits & 0x80;
+    out.bits <<= 24;
+    uint32_t _bits = in.bits & 0x7F;
+    _bits <<= (23-FP8::M);
+    out.bits |= _bits;
+    out.f *= exp_f2<127-FP8::E_BIAS>();
+    return out.f;
+}
+} // namespace fp8
+
+template<typename FP8>
+static inline void conv(const float* x, FP8* y, int64_t size) {
+#ifdef GGML_USE_OPENMP_SIMD
+    #pragma omp simd
+#endif
+    for (int64_t i=0; i<size; ++i) {
+        y[i].bits = fp8::from_float<FP8>(x[i]);
+    }
+}
+
+template<typename FP8>
+static inline float dot(const FP8* x, const float* y, int64_t size) {
+    float z = 0;
+#ifdef GGML_USE_OPENMP_SIMD
+    #pragma omp simd reduction(+:z)
+#endif
+    for (int64_t i=0; i<size; ++i) {
+        z += fp8::to_float(x[i])*y[i];
+    }
+    return z;
+}
+
+template<int QK, typename FP8>
+struct bloc_fp8 {
+    float d;
+    FP8 qs[QK];
+};
+
+template<int QK, typename FP8>
+static inline void conv(const float* x, bloc_fp8<QK, FP8>* y, int64_t size) {
+    const auto qk_size = size / QK;
+    for (int64_t q=0; q<qk_size; ++q) {
+        float m = 0;
+        for (int64_t i=0; i<QK; ++i) {
+            m = std::max(std::abs(x[q*QK+i]), m);
+        }
+        const float D = FP8::MAX/m;
+        y[q].d = m/FP8::MAX;
+#ifdef GGML_USE_OPENMP_SIMD
+        #pragma omp simd
+#endif
+        for (int64_t i=0; i<QK; ++i) {
+            y[q].qs[i] = fp8::from_float<FP8>(x[q*QK+i]*D);
+        }
+    }
+}
+
+template<int QK, typename FP8>
+static inline float dot(const bloc_fp8<QK, FP8>* x, const float* y, int64_t size) {
+    float z = 0;
+    const auto qk_size = size / QK;
+    for (int64_t q=0; q<qk_size; ++q) {
+        float zq = 0;
+#ifdef GGML_USE_OPENMP_SIMD
+        #pragma omp simd reduction(+:zq)
+#endif
+        for (int64_t i=0; i<QK; ++i) {
+            zq += fp8::to_float(x[q].qs[i])*y[q*QK+i];
+        }
+        z += zq*x[q].d;
+    }
+    return z;
+}
+
+template<int VECT_SIZE, int NB_REG, int QK, typename FP8, typename _Y>
+float dot_reg(const bloc_fp8<QK, FP8>* x, const _Y* y, int64_t size) {
+    static_assert(QK%(VECT_SIZE*NB_REG)==0, "size not supported");
+    using fp8_t = FP8;
+
+    float z = 0;
+    float Z[NB_REG][VECT_SIZE];
+    for(int64_t r=0; r<NB_REG; ++r) {
+        for(int64_t v=0; v<VECT_SIZE; ++v) Z[r][v] = float();
+    }
+    const auto qk_size = size / QK;
+    for (int64_t q=0; q<qk_size; ++q) {
+        for (int64_t i=0; i<QK; i+=VECT_SIZE*NB_REG) {
+            for(int64_t r=0; r<NB_REG; ++r) {
+                for(int64_t v=0; v<VECT_SIZE; ++v) {
+                    const int64_t k = i + r*VECT_SIZE + v;
+                    Z[r][v] += x[q].d * fp8::to_float(x[q].qs[k]) * (float)y[q*QK+k];
+                }
+            }
+        }
+    }
+    // reduction 1
+    for(int64_t r=1; r<NB_REG; ++r) {
+        for(int64_t v=0; v<VECT_SIZE; ++v) Z[0][v] += Z[r][v];
+    }
+    // reduction 2
+    for(int64_t v=0; v<VECT_SIZE; ++v) z += Z[0][v];
+    return z;
+}
+
+// the C API
+void ggml_fp32_to_e5m2_row(const float * x, ggml_e5m2_t * y, int64_t k) {
+    conv(x, reinterpret_cast<FP8<5>*>(y), k);
+}
+
+void ggml_fp32_to_e4m3_row(const float * x, ggml_e4m3_t * y, int64_t k) {
+    conv(x, reinterpret_cast<FP8<4>*>(y), k);
+}
+
+void quantize_row_e4m3_q(const float * x, block_e4m3_q * y, int64_t k) {
+    assert(k % QK_K == 0);
+    conv(x, reinterpret_cast<bloc_fp8<QK_K, FP8<4>>*>(y), k);
+}
+
+void quantize_row_e3m4_q(const float * x, block_e3m4_q * y, int64_t k) {
+    assert(k % QK_K == 0);
+    conv(x, reinterpret_cast<bloc_fp8<QK_K, FP8<3>>*>(y), k);
+}
+
+// the dot product for FP8 weight
+void ggml_vec_dot_e5m2(int n, float * s, size_t bs, const ggml_e5m2_t * vx, size_t bx, const float * vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+    *s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
+}
+
+void ggml_vec_dot_e4m3(int n, float * s, size_t bs, const ggml_e4m3_t * vx, size_t bx, const float * vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+    *s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
+}
+
+void ggml_vec_dot_e4m3_q(int n, float * s, size_t bs, const block_e4m3_q * vx, size_t bx, const float * vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+#if defined(__AVX512F__) // 32xfloat32x16_t
+    *s = dot_reg<16,4>(reinterpret_cast<const bloc_fp8<QK_K, FP8<4>>*>(vx), vy, n);
+#elif defined(__AVX__) || defined(__AVX2__) // 16xfloat32x8_t
+    *s = dot_reg<8,4>(reinterpret_cast<const bloc_fp8<QK_K, FP8<4>>*>(vx), vy, n);
+#elif defined(__ARM_NEON) // 32xfloat32x4_t
+    *s = dot_reg<4,4>(reinterpret_cast<const bloc_fp8<QK_K, FP8<4>>*>(vx), vy, n);
+// #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) // 32xfloat16x8_t
+#else
+    *s = dot(reinterpret_cast<const bloc_fp8<QK_K, FP8<4>>*>(vx), vy, n);
+#endif
+}
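+// ---------------------------------------------------------------------------
+// [editor's note - illustration only, not part of the patch]
+// Why the quantize help text says the *_Q types cost 8.125 bpw: each block
+// stores one fp32 delta plus QK_K (= 256 in this tree) one-byte FP8 codes,
+// so (4 + 256) bytes * 8 / 256 values = 8.125 bits per weight:
+//
+//     static_assert(sizeof(block_e4m3_q) == 4 + QK_K, "4-byte delta + QK_K codes");
+//
+// dot_reg<VECT_SIZE, NB_REG> above mirrors the SIMD width: it keeps
+// NB_REG * VECT_SIZE scalar accumulators alive so the compiler can map them
+// onto NB_REG vector registers (e.g. <16,4> for AVX-512, <4,4> for NEON).
+// ---------------------------------------------------------------------------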
+
+void ggml_vec_dot_e3m4_q(int n, float * s, size_t bs, const block_e3m4_q * vx, size_t bx, const float * vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+#if defined(__AVX512F__) // 32xfloat32x16_t
+    *s = dot_reg<16,4>(reinterpret_cast<const bloc_fp8<QK_K, FP8<3>>*>(vx), vy, n);
+#elif defined(__AVX__) || defined(__AVX2__) // 16xfloat32x8_t
+    *s = dot_reg<8,4>(reinterpret_cast<const bloc_fp8<QK_K, FP8<3>>*>(vx), vy, n);
+#elif defined(__ARM_NEON) // 32xfloat32x4_t
+    *s = dot_reg<4,4>(reinterpret_cast<const bloc_fp8<QK_K, FP8<3>>*>(vx), vy, n);
+// #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) // 32xfloat16x8_t
+#else
+    *s = dot(reinterpret_cast<const bloc_fp8<QK_K, FP8<3>>*>(vx), vy, n);
+#endif
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-fp8.h b/ggml/src/ggml-cpu/ggml-cpu-fp8.h
new file mode 100644
index 0000000000000..3f15c35a6fdd1
--- /dev/null
+++ b/ggml/src/ggml-cpu/ggml-cpu-fp8.h
@@ -0,0 +1,21 @@
+#include "ggml-fp8.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_fp32_to_e5m2_row(const float * x, ggml_e5m2_t * y, int64_t k);
+void ggml_fp32_to_e4m3_row(const float * x, ggml_e4m3_t * y, int64_t k);
+void quantize_row_e4m3_q(const float * x, block_e4m3_q * y, int64_t k);
+void quantize_row_e3m4_q(const float * x, block_e3m4_q * y, int64_t k);
+
+// TODO: the best choice depends on the CPU (fp32 / bf16 / fp16)
+#define GGML_FP8_VECT_DOT_TYPE GGML_TYPE_F32
+void ggml_vec_dot_e5m2  (int n, float * s, size_t bs, const ggml_e5m2_t * vx, size_t bx, const float * vy, size_t by, int nrc);
+void ggml_vec_dot_e4m3  (int n, float * s, size_t bs, const ggml_e4m3_t * vx, size_t bx, const float * vy, size_t by, int nrc);
+void ggml_vec_dot_e4m3_q(int n, float * s, size_t bs, const block_e4m3_q * vx, size_t bx, const float * vy, size_t by, int nrc);
+void ggml_vec_dot_e3m4_q(int n, float * s, size_t bs, const block_e3m4_q * vx, size_t bx, const float * vy, size_t by, int nrc);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10cd520..2ecaed0f07855 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1,11 +1,15 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 #include "ggml-cpu-aarch64.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
+#include "ggml-cpu-fp8.h"
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
@@ -457,6 +461,30 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .gemv = ggml_gemv_iq4_nl_4x4_q8_0,
         .gemm = ggml_gemm_iq4_nl_4x4_q8_0,
     },
+    [GGML_TYPE_E5M2] = {
+        .from_float = (ggml_from_float_t) ggml_fp32_to_e5m2_row,
+        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e5m2,
+        .vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
+        .nrows = 1,
+    },
+    [GGML_TYPE_E4M3] = {
+        .from_float = (ggml_from_float_t) ggml_fp32_to_e4m3_row,
+        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e4m3,
+        .vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
+        .nrows = 1,
+    },
+    [GGML_TYPE_E4M3_Q] = {
+        .from_float = (ggml_from_float_t) quantize_row_e4m3_q,
+        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e4m3_q,
+        .vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
+        .nrows = 1,
+    },
+    [GGML_TYPE_E3M4_Q] = {
+        .from_float = (ggml_from_float_t) quantize_row_e3m4_q,
+        .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_e3m4_q,
+        .vec_dot_type = GGML_FP8_VECT_DOT_TYPE,
+        .nrows = 1,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -4509,6 +4537,10 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_E5M2 :
+        case GGML_TYPE_E4M3 :
+        case GGML_TYPE_E4M3_Q:
+        case GGML_TYPE_E3M4_Q:
             {
                 ggml_compute_forward_add_q_f32(params, dst);
             } break;
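// ---------------------------------------------------------------------------
// [editor's note - illustration only, not part of the patch]
// GGML_FP8_VECT_DOT_TYPE is GGML_TYPE_F32, so unlike the integer quants
// (whose vec_dot_type is Q8_0/Q8_K) the activations are not quantized: the
// second operand of vec_dot stays raw fp32. Conceptually, one output of the
// mat-mul reduces to (ne00 / row / y stand in for the usual mat-mul locals,
// named here only for illustration):
//
//     float sum;
//     ggml_vec_dot_e4m3_q(ne00, &sum, 0,
//                         (const block_e4m3_q *) row, 0, // fp8 weights
//                         y, 0,                          // fp32 activations
//                         1);
// ---------------------------------------------------------------------------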
@@ -4889,6 +4921,10 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q4_0_4_4:
         case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_E5M2 :
+        case GGML_TYPE_E4M3 :
+        case GGML_TYPE_E4M3_Q:
+        case GGML_TYPE_E3M4_Q:
             {
                 ggml_compute_forward_add1_q_f32(params, dst);
             } break;
@@ -4992,33 +5028,6 @@ static void ggml_compute_forward_acc(
             {
                 ggml_compute_forward_acc_f32(params, dst);
             } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");
@@ -8259,33 +8268,6 @@ static void ggml_compute_forward_set(
             {
                 ggml_compute_forward_set_f32(params, dst);
             } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");
@@ -8550,6 +8532,10 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_0_4_4:
        case GGML_TYPE_Q4_0_4_8:
         case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_E5M2 :
+        case GGML_TYPE_E4M3 :
+        case GGML_TYPE_E4M3_Q:
+        case GGML_TYPE_E3M4_Q:
             {
                 ggml_compute_forward_get_rows_q(params, dst);
             } break;
@@ -9114,41 +9100,7 @@ static void ggml_compute_forward_clamp(
             {
                 ggml_compute_forward_clamp_f32(params, dst);
             } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_TQ1_0:
-        case GGML_TYPE_TQ2_0:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
-        case GGML_TYPE_IQ4_NL_4_4:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
+        default:
             {
                 GGML_ABORT("fatal error");
             }
diff --git a/ggml/src/ggml-fp8.cpp b/ggml/src/ggml-fp8.cpp
new file mode 100644
index 0000000000000..0fd81e8638fa0
--- /dev/null
+++ b/ggml/src/ggml-fp8.cpp
@@ -0,0 +1,134 @@
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
"ggml.h" + +#include "ggml-fp8.h" + +union fp32_int32 { + float f; + uint32_t bits; +}; + +template +inline FP8 float_to_fp8(float value) { + FP8 out; + fp32_int32 in = {value}; + // the sign + out.bits = (in.bits >> 24) & 0x80; + // value without sign + in.bits &= 0x7fffffff; + //GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NAN + if (in.f >= FP8::MAX) { + out.bits |= 0x7E; + } else if (in.f < FP8::MIN) { // => 0. + // OK: S.0000000 + } else { + in.f *= exp_f2::E_BIAS-127>(); + // - trunc + //uint32_t eps = 0; + // - rounding half away from zero + //uint32_t eps = 0x400000>>FP8::M; + // - rounding half toward zero + //uint32_t eps = 0x3fffff>>FP8::M; + // - rounding to nearest even + uint32_t eps = (0x3fffff>>FP8::M) + ((in.bits >> (23-FP8::M)) & 0x1); + // shift mantissa. + in.bits += eps; + out.bits |= (in.bits >> (23-FP8::M)) & 0x7F; + } + return out; +} + +template +inline float fp8_to_float(const FP8& in) { + fp32_int32 out = {0}; + out.bits = in.bits & 0x80; + out.bits <<= 24; + uint32_t _bits = in.bits & 0x7F; + _bits <<= (23-FP8::M); + out.bits |= _bits; + out.f *= exp_f2<127-FP8::E_BIAS>(); + return out.f; +} + +template +static inline void conv(const FP8* x, float* y, int64_t size) { + for (int64_t i=0; i +static inline void conv(const float* x, FP8* y, int64_t size) { + for (int64_t i=0; i(x[i]); + } +} + +template +struct bloc_fp8 { + float d; + FP8 qs[QK]; +}; + +template +static inline void conv(const bloc_fp8* x, float* y, int64_t size) { + const auto qk_size = size / QK; + for (int64_t q=0; q +static inline void conv(const float* x, bloc_fp8* y, int64_t size) { + const auto qk_size = size / QK; + for (int64_t q=0; q::MAX/m; + y[q].d = m/FP8::MAX; + for (int64_t i=0; i(x[q*QK+i]*D); + } + } +} + +// the C API. +void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + conv(reinterpret_cast*>(x), y, k); +} +void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) { + conv(x, reinterpret_cast*>(y), k); +} + +void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + conv(reinterpret_cast*>(x), y, k); +} +void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) { + conv(x, reinterpret_cast*>(y), k); +} + +void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + conv(reinterpret_cast*>(x), y, k); +} +void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + conv(x, reinterpret_cast*>(y), k); +} + +void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + conv(reinterpret_cast*>(x), y, k); +} +void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + conv(x, reinterpret_cast*>(y), k); +} diff --git a/ggml/src/ggml-fp8.h b/ggml/src/ggml-fp8.h new file mode 100644 index 0000000000000..da7784d46010e --- /dev/null +++ b/ggml/src/ggml-fp8.h @@ -0,0 +1,45 @@ +// this is more a .inc. +#ifdef __cplusplus +template +constexpr int exp_i2() { + return 1 << N; +} + +template +constexpr float exp_f2() { + if constexpr (N>0) return exp_f2()*2; + if constexpr (N<0) return exp_f2()/2; + if constexpr (N==0) return 1.; +} + + +template //, int M=7-E> 1.7 bits! 
+template<int _E> //, int M=7-_E> 1.7 bits!
+struct FP8 {
+    uint8_t bits;
+    using type = FP8<_E>;
+    static constexpr int E = _E;
+    static constexpr int M = (7-_E);
+    static constexpr int E_BIAS = exp_i2<_E-1>()-1;
+    static constexpr float MAX = (2-exp_f2<-M+1>())*exp_f2<exp_i2<_E-1>()>();
+    static constexpr float MIN = exp_f2<-M>()*exp_f2<2-exp_i2<_E-1>()>();
+};
+
+extern "C" {
+#endif
+
+    // Note: types are defined in ggml-common.h
+    GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    GGML_API void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
+
+    GGML_API void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    GGML_API void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
+
+    GGML_API void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    GGML_API void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
+
+    GGML_API void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+    GGML_API void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
+
+#ifdef __cplusplus
+}
+#endif
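// ---------------------------------------------------------------------------
// [editor's note - illustration only, not part of the patch]
// Working the FP8<_E> constants through for the two block element types:
//   FP8<4> (E4M3): M = 3, E_BIAS = 2^3-1 = 7,
//                  MAX = (2 - 2^-2) * 2^(2^3) = 1.75 * 256 = 448,
//                  MIN = 2^-3 * 2^(2 - 2^3)   = 2^-9 (smallest subnormal).
//   FP8<3> (E3M4): M = 4, E_BIAS = 2^2-1 = 3,
//                  MAX = (2 - 2^-3) * 2^(2^2) = 1.875 * 16 = 30,
//                  MIN = 2^-4 * 2^(2 - 2^2)   = 2^-6.
// E3M4 trades dynamic range for one extra mantissa bit, consistent with the
// lower KL divergence reported for E3M4_Q than E4M3_Q in quantize.cpp above.
// ---------------------------------------------------------------------------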
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6caab8..31b8bdc801ac1 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -5229,7 +5229,26 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes)
         {
             VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
         } break;
-
+        case GGML_TYPE_E4M3_Q:
+        case GGML_TYPE_E3M4_Q:
+        {
+            // Not really clean, but the test is the same for E4M3.
+            const block_e3m4_q * q = (const block_e3m4_q *) data;
+            int nans = 0;
+            for (size_t i = 0; i < nb; ++i) {
+                if (!validate_float(q[i].d, i)) {
+                    return false;
+                }
+                // NAN
+                for (size_t k = 0; k < QK_K; ++k) {
+                    nans += (q[i].qs[k] & 0x7f) == 0x7f;
+                }
+            }
+            if (nans) {
+                fprintf(stderr, "%s: found %d NaNs in row of %zu FP8 values\n", __func__, nans, nb*QK_K);
+                return false;
+            }
+        } break;
     case GGML_TYPE_I8:
     case GGML_TYPE_I16:
     case GGML_TYPE_I32:
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efaf7f39..d2c9f99e0a888 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -9,6 +9,7 @@
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
 #include "ggml-aarch64.h"
+#include "ggml-fp8.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -840,6 +841,38 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = NULL,
         .from_float_ref = NULL,
     },
+    [GGML_TYPE_E5M2] = {
+        .type_name = "fp8_e5m2",
+        .blck_size = 1,
+        .type_size = sizeof(ggml_e5m2_t),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) ggml_e5m2_to_fp32_row,
+        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_e5m2_row_ref,
+    },
+    [GGML_TYPE_E4M3] = {
+        .type_name = "fp8_e4m3",
+        .blck_size = 1,
+        .type_size = sizeof(ggml_e4m3_t),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) ggml_e4m3_to_fp32_row,
+        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_e4m3_row_ref,
+    },
+    [GGML_TYPE_E4M3_Q] = {
+        .type_name = "fp8_e4m3_q",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_e4m3_q),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_e4m3_q,
+        .from_float_ref = (ggml_from_float_t) quantize_row_e4m3_q_ref,
+    },
+    [GGML_TYPE_E3M4_Q] = {
+        .type_name = "fp8_e3m4_q",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_e3m4_q),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_e3m4_q,
+        .from_float_ref = (ggml_from_float_t) quantize_row_e3m4_q_ref,
+    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -1271,6 +1304,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
         case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
         case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
+        case GGML_FTYPE_MOSTLY_E5M2: wtype = GGML_TYPE_E5M2; break;
+        case GGML_FTYPE_MOSTLY_E4M3: wtype = GGML_TYPE_E4M3; break;
+        case GGML_FTYPE_MOSTLY_E4M3_Q: wtype = GGML_TYPE_E4M3_Q; break;
+        case GGML_FTYPE_MOSTLY_E3M4_Q: wtype = GGML_TYPE_E3M4_Q; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -6274,6 +6311,26 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_E5M2 :
+            { // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row);
+                ggml_fp32_to_e5m2_row_ref(src + start, (ggml_e5m2_t*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row);
+                result = nrows * row_size;
+            } break;
+        case GGML_TYPE_E4M3 :
+            { //
move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row); + ggml_fp32_to_e4m3_row_ref(src + start, (ggml_e4m3_t*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row); + result = nrows * row_size; + } break; + case GGML_TYPE_E4M3_Q: + { // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row); + quantize_row_e4m3_q_ref(src + start, (block_e4m3_q*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row); + result = nrows * row_size; + } break; + case GGML_TYPE_E3M4_Q: + { // move to ggml-cpu.c : type_traits[type].from_float(src + start, (char *) dst + start_row * row_size, (int64_t)nrows*n_per_row); + quantize_row_e3m4_q_ref(src + start, (block_e3m4_q*)((char *) dst + start_row * row_size), (int64_t)nrows*n_per_row); + result = nrows * row_size; + } break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index ab5e376e6c7f2..c16d1973f84fc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -176,6 +176,10 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors + LLAMA_FTYPE_MOSTLY_E5M2 = 38, // except 1d tensors + LLAMA_FTYPE_MOSTLY_E4M3 = 39, // except 1d tensors + LLAMA_FTYPE_MOSTLY_E4M3_Q = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_E3M4_Q = 41, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/scripts/hf.sh b/scripts/hf.sh index 85c2c4d9a952e..b251925fa453f 100755 --- a/scripts/hf.sh +++ b/scripts/hf.sh @@ -26,7 +26,7 @@ function has_cmd { } if has_cmd wget; then - cmd="wget -q --show-progress -c -O %s/%s %s" + cmd="wget -q -c -O %s/%s %s" elif has_cmd curl; then cmd="curl -C - -f --output-dir %s -o %s -L %s" else diff --git a/src/llama.cpp b/src/llama.cpp index 22b951ba2a946..3f9e961eaeb22 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4517,6 +4517,10 @@ struct llama_model_loader { case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; + case GGML_TYPE_E5M2: ftype = LLAMA_FTYPE_MOSTLY_E5M2; break; + case GGML_TYPE_E4M3: ftype = LLAMA_FTYPE_MOSTLY_E4M3; break; + case GGML_TYPE_E4M3_Q: ftype = LLAMA_FTYPE_MOSTLY_E4M3_Q; break; + case GGML_TYPE_E3M4_Q: ftype = LLAMA_FTYPE_MOSTLY_E3M4_Q; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -5283,6 +5287,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; + case LLAMA_FTYPE_MOSTLY_E5M2: return "E5M2"; + case LLAMA_FTYPE_MOSTLY_E4M3: return "E4M3"; + case LLAMA_FTYPE_MOSTLY_E4M3_Q: return "E4M3_Q"; + case LLAMA_FTYPE_MOSTLY_E3M4_Q: return "E3M4_Q"; default: return "unknown, may not work"; } @@ -18422,6 +18430,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_E4M3_Q) { + new_type = GGML_TYPE_E4M3_Q; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_E3M4_Q) { + new_type = GGML_TYPE_E3M4_Q; + } else if (new_type != GGML_TYPE_Q8_0) { new_type = 
GGML_TYPE_Q6_K; } @@ -18447,6 +18461,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_E4M3_Q || ftype == LLAMA_FTYPE_MOSTLY_E3M4_Q) { + new_type = tensor->type; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -18634,7 +18651,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || - new_type == GGML_TYPE_IQ1_M) { + new_type == GGML_TYPE_IQ1_M || new_type == GGML_TYPE_E4M3_Q || new_type == GGML_TYPE_E3M4_Q) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -18661,6 +18678,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + case GGML_TYPE_E4M3_Q: + case GGML_TYPE_E3M4_Q: new_type = tensor->type; break; default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); } if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { @@ -18770,6 +18789,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; + // FP8 + case LLAMA_FTYPE_MOSTLY_E5M2: default_type = GGML_TYPE_E5M2; break; + case LLAMA_FTYPE_MOSTLY_E4M3: default_type = GGML_TYPE_E4M3; break; + case LLAMA_FTYPE_MOSTLY_E4M3_Q: default_type = GGML_TYPE_E4M3_Q; break; + case LLAMA_FTYPE_MOSTLY_E3M4_Q: default_type = GGML_TYPE_E3M4_Q; break; + default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index c77c8ed1388d7..e812e83680ea3 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -88,10 +88,16 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type); qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size); - vdot->from_float(test_data2, tmp_q2.data(), test_size); + if (qfns_cpu->vec_dot_type != GGML_TYPE_F32) { + vdot->from_float(test_data2, tmp_q2.data(), test_size); + } float result = INFINITY; - qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + if (qfns_cpu->vec_dot_type != GGML_TYPE_F32) { + qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); + } else { + qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, test_data2, 0, 1); + } const float dot_ref = dot_product(test_data1, test_data2, test_size); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 2882884938388..92886df0434eb 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -325,7 +325,7 @@ int main(int argc, char * argv[]) { printf("\n"); } - if 
(params.op_quantize_row_q_dot) { + if (params.op_quantize_row_q_dot && ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type)->from_float) { printf(" quantize_row_q_dot\n"); for (size_t size : params.test_sizes) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
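// ---------------------------------------------------------------------------
// [editor's note - illustration only, not part of the patch]
// With the patch applied, the new file types go through the existing
// quantize flow; for example (file names illustrative):
//
//     cmake -B build && cmake --build build -t llama-quantize
//     ./build/bin/llama-quantize mistral-nemo-f16.gguf mistral-nemo-E4M3_Q.gguf E4M3_Q
//
// and the test-quantize-fns/test-quantize-perf changes above make the fp32
// branch run for these types, since their vec_dot_type
// (GGML_FP8_VECT_DOT_TYPE) is GGML_TYPE_F32 rather than a Q8 quant.
// ---------------------------------------------------------------------------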