Skip to content

Commit b3036a8

Browse files
ikawrakow and Iwan Kawrakow authored
Option to enable/disable the IQK CPU FA kernels (#429)
Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent c35a383 commit b3036a8

File tree

4 files changed

13 additions and 4 deletions (lines changed)

4 files changed

+13
-4
lines changed

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
131131
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
132132
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
133133

134+
option(GGML_IQK_FLASH_ATTENTION "ggml: enable the IQK FlashAttention CPU kernels" ON)
134135
option(GGML_IQK_FA_ALL_QUANTS "ggml: compile all quants for IQK FlashAttention" OFF)
135136

136137
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)

ggml/src/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -260,9 +260,15 @@ if (GGML_IQK_MUL_MAT)
260260
add_compile_definitions(GGML_USE_IQK_MULMAT)
261261
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp iqk/iqk_flash_attn.cpp)
262262
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h iqk/iqk_flash_impl.h)
263-
if (GGML_IQK_FA_ALL_QUANTS)
264-
message(STATUS "Including all IQK FA kernels")
265-
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
263+
if (GGML_IQK_FLASH_ATTENTION)
264+
message(STATUS "Enabling IQK Flash Attention kernels")
265+
add_compile_definitions(GGML_IQK_FLASH_ATTENTION)
266+
if (GGML_IQK_FA_ALL_QUANTS)
267+
message(STATUS "Including all IQK FA kernels")
268+
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
269+
endif()
270+
else()
271+
message(STATUS "Disabling IQK Flash Attention kernels")
266272
endif()
267273
endif()
268274

ggml/src/iqk/iqk_flash_attn.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include "iqk_mul_mat.h"
99
#include "iqk_flash_impl.h"
1010

11-
#ifdef IQK_IMPLEMENT
11+
#if defined IQK_IMPLEMENT && defined GGML_IQK_FLASH_ATTENTION
1212

1313
#include <algorithm>
1414
#include <cstdio>

ggml/src/iqk/iqk_mul_mat.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15875,6 +15875,7 @@ void MulMat::relu(int n, const float * x, float * y) {
1587515875
#endif
1587615876
} // namespace
1587715877

15878+
#ifdef GGML_IQK_FLASH_ATTENTION
1587815879
namespace {
1587915880

1588015881
template <int k_step>
@@ -18663,6 +18664,7 @@ bool iqk_flash_attn_impl(int int_type_k, // type of k
1866318664

1866418665
return true;
1866518666
}
18667+
#endif
1866618668

1866718669
#else // IQK_IMPLEMENT
1866818670

0 commit comments

Comments
 (0)