Skip to content

Commit e5a0668

Browse files
author
Iwan Kawrakow
committed
CPU
1 parent eff6fd7 commit e5a0668

File tree

1 file changed

+10
-24
lines changed

1 file changed

+10
-24
lines changed

ggml/src/iqk/iqk_gemm_ktquants.cpp

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -100,33 +100,24 @@ struct Trellis2 {
100100

101101
template <bool is_8 = false>
102102
struct Trellis3 {
103-
constexpr static uint32_t ka = 89226354;
104-
constexpr static uint32_t kb = 64248484;
103+
constexpr static uint32_t ka = 0xCBAC1FED;
105104
constexpr static uint32_t ka1 = ka*ka;
106-
constexpr static uint32_t kb1 = kb*ka+kb;
107105
constexpr static uint32_t ka2 = ka1*ka;
108-
constexpr static uint32_t kb2 = kb1*ka+kb;
109106
constexpr static uint32_t ka3 = ka2*ka;
110-
constexpr static uint32_t kb3 = kb2*ka+kb;
111107
constexpr static uint32_t ka4 = ka3*ka;
112-
constexpr static uint32_t kb4 = kb3*ka+kb;
113108
constexpr static uint32_t ka5 = ka4*ka;
114-
constexpr static uint32_t kb5 = kb4*ka+kb;
115109
constexpr static uint32_t ka6 = ka5*ka;
116-
constexpr static uint32_t kb6 = kb5*ka+kb;
117110
constexpr static uint32_t ka7 = ka6*ka;
118-
constexpr static uint32_t kb7 = kb6*ka+kb;
119111
const __m256i mka = is_8 ? _mm256_setr_epi32(ka, ka1, ka2, ka3, ka4, ka5, ka6, ka7) : _mm256_setr_epi32(ka, ka1, ka2, ka3, ka, ka1, ka2, ka3);
120-
const __m256i mkb = is_8 ? _mm256_setr_epi32(kb, kb1, kb2, kb3, kb4, kb5, kb6, kb7) : _mm256_setr_epi32(kb, kb1, kb2, kb3, kb, kb1, kb2, kb3);
121112
const __m256i shuffle = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
122113

123114
inline __m256i next8(uint32_t val1, uint32_t val2) const {
124115
__m256i mval = MM256_SET_M128I(_mm_set1_epi32(val2), _mm_set1_epi32(val1));
125-
return _mm256_add_epi32(_mm256_mullo_epi32(mval, mka), mkb);
116+
return _mm256_mullo_epi32(mval, mka);
126117
}
127118
inline __m256i next8(uint32_t val) const {
128119
__m256i mval = _mm256_set1_epi32(val);
129-
return _mm256_add_epi32(_mm256_mullo_epi32(mval, mka), mkb);
120+
return _mm256_mullo_epi32(mval, mka);
130121
}
131122
inline __m256 gen8(uint32_t val1, uint32_t val2) const {
132123
auto v8 = _mm256_and_si256(next8(val1, val2), _mm256_set1_epi32(0x3f3f3f3f));
@@ -189,11 +180,11 @@ struct Trellis3 {
189180
template <bool is_unsigned = false>
190181
inline void next64(const uint32_t * val, __m256i * result) const {
191182
const __m256i offset = is_unsigned ? _mm256_setzero_si256() : _mm256_set1_epi32(-126);
192-
auto vka3 = _mm256_set1_epi32(ka3), vkb3 = _mm256_set1_epi32(kb3);
183+
auto vka3 = _mm256_set1_epi32(ka3);
193184
__m256i aux[8];
194185
for (int i = 0; i < 4; ++i) {
195186
auto i8_1 = next8(val[2*i+0], val[2*i+1]);
196-
auto i8_2 = _mm256_add_epi32(_mm256_mullo_epi32(i8_1, vka3), vkb3);
187+
auto i8_2 = _mm256_mullo_epi32(i8_1, vka3);
197188
i8_1 = _mm256_and_si256(i8_1, _mm256_set1_epi32(0x3f3f3f3f));
198189
i8_2 = _mm256_and_si256(i8_2, _mm256_set1_epi32(0x3f3f3f3f));
199190
#ifdef HAVE_FANCY_SIMD
@@ -1419,22 +1410,17 @@ void mul_mat_iq4_kt_F32_T(int n, const void * vx, size_t bx, const DataInfo& inf
14191410
}
14201411

14211412
struct Trellis3 {
1422-
constexpr static uint32_t ka = 89226354;
1423-
constexpr static uint32_t kb = 64248484;
1413+
constexpr static uint32_t ka = 0xCBAC1FED;
14241414
constexpr static uint32_t ka1 = ka*ka;
1425-
constexpr static uint32_t kb1 = kb*ka+kb;
14261415
constexpr static uint32_t ka2 = ka1*ka;
1427-
constexpr static uint32_t kb2 = kb1*ka+kb;
14281416
constexpr static uint32_t ka3 = ka2*ka;
1429-
constexpr static uint32_t kb3 = kb2*ka+kb;
14301417
const uint32x4_t mka = uint32x4_t{ka, ka1, ka2, ka3};
1431-
const uint32x4_t mkb = uint32x4_t{kb, kb1, kb2, kb3};
14321418
const uint8x16_t shuffle = load_shuffle();
14331419

14341420
inline uint32x4x2_t next8(uint32_t val1, uint32_t val2) const {
14351421
uint32x4x2_t result{vdupq_n_u32(val1), vdupq_n_u32(val2)};
1436-
result.val[0] = vmlaq_u32(mkb, mka, result.val[0]);
1437-
result.val[1] = vmlaq_u32(mkb, mka, result.val[1]);
1422+
result.val[0] = vmulq_u32(mka, result.val[0]);
1423+
result.val[1] = vmulq_u32(mka, result.val[1]);
14381424
return result;
14391425
}
14401426
inline int8x16x2_t next32(const uint32_t * val) const {
@@ -1457,12 +1443,12 @@ struct Trellis3 {
14571443
int8x16x2_t result = {vdupq_n_s8(-126), vdupq_n_s8(-126)};
14581444
int8x16x2_t i8;
14591445
for (int i = 0; i < 2; ++i) {
1460-
i8.val[0] = vmlaq_u32(mkb, mka, vdupq_n_u32(val[2*i+0]+v0));
1446+
i8.val[0] = vmulq_u32(mka, vdupq_n_u32(val[2*i+0]+v0));
14611447
i8.val[1] = vmlaq_u32(vkb3, vka3, i8.val[0]);
14621448
i8.val[0] = vandq_u32(i8.val[0], vdupq_n_u32(0x3f3f3f3f));
14631449
i8.val[1] = vandq_u32(i8.val[1], vdupq_n_u32(0x3f3f3f3f));
14641450
auto s1 = vpaddq_s8(vreinterpretq_s8_u32(i8.val[0]), vreinterpretq_s8_u32(i8.val[1]));
1465-
i8.val[0] = vmlaq_u32(mkb, mka, vdupq_n_u32(val[2*i+1]+v0));
1451+
i8.val[0] = vmulq_u32(mka, vdupq_n_u32(val[2*i+1]+v0));
14661452
i8.val[1] = vmlaq_u32(vkb3, vka3, i8.val[0]);
14671453
i8.val[0] = vandq_u32(i8.val[0], vdupq_n_u32(0x3f3f3f3f));
14681454
i8.val[1] = vandq_u32(i8.val[1], vdupq_n_u32(0x3f3f3f3f));

0 commit comments

Comments
 (0)