@@ -100,33 +100,24 @@ struct Trellis2 {
100100
101101template <bool is_8 = false >
102102struct Trellis3 {
103- constexpr static uint32_t ka = 89226354 ;
104- constexpr static uint32_t kb = 64248484 ;
103+ constexpr static uint32_t ka = 0xCBAC1FED ;
105104 constexpr static uint32_t ka1 = ka*ka;
106- constexpr static uint32_t kb1 = kb*ka+kb;
107105 constexpr static uint32_t ka2 = ka1*ka;
108- constexpr static uint32_t kb2 = kb1*ka+kb;
109106 constexpr static uint32_t ka3 = ka2*ka;
110- constexpr static uint32_t kb3 = kb2*ka+kb;
111107 constexpr static uint32_t ka4 = ka3*ka;
112- constexpr static uint32_t kb4 = kb3*ka+kb;
113108 constexpr static uint32_t ka5 = ka4*ka;
114- constexpr static uint32_t kb5 = kb4*ka+kb;
115109 constexpr static uint32_t ka6 = ka5*ka;
116- constexpr static uint32_t kb6 = kb5*ka+kb;
117110 constexpr static uint32_t ka7 = ka6*ka;
118- constexpr static uint32_t kb7 = kb6*ka+kb;
119111 const __m256i mka = is_8 ? _mm256_setr_epi32(ka, ka1, ka2, ka3, ka4, ka5, ka6, ka7) : _mm256_setr_epi32(ka, ka1, ka2, ka3, ka, ka1, ka2, ka3);
120- const __m256i mkb = is_8 ? _mm256_setr_epi32(kb, kb1, kb2, kb3, kb4, kb5, kb6, kb7) : _mm256_setr_epi32(kb, kb1, kb2, kb3, kb, kb1, kb2, kb3);
121112 const __m256i shuffle = _mm256_set_epi32(7 , 3 , 6 , 2 , 5 , 1 , 4 , 0 );
122113
123114 inline __m256i next8 (uint32_t val1, uint32_t val2) const {
124115 __m256i mval = MM256_SET_M128I (_mm_set1_epi32 (val2), _mm_set1_epi32 (val1));
125- return _mm256_add_epi32 ( _mm256_mullo_epi32 (mval, mka), mkb );
116+ return _mm256_mullo_epi32 (mval, mka);
126117 }
127118 inline __m256i next8 (uint32_t val) const {
128119 __m256i mval = _mm256_set1_epi32 (val);
129- return _mm256_add_epi32 ( _mm256_mullo_epi32 (mval, mka), mkb );
120+ return _mm256_mullo_epi32 (mval, mka);
130121 }
131122 inline __m256 gen8 (uint32_t val1, uint32_t val2) const {
132123 auto v8 = _mm256_and_si256 (next8 (val1, val2), _mm256_set1_epi32 (0x3f3f3f3f ));
@@ -189,11 +180,11 @@ struct Trellis3 {
189180 template <bool is_unsigned = false >
190181 inline void next64 (const uint32_t * val, __m256i * result) const {
191182 const __m256i offset = is_unsigned ? _mm256_setzero_si256 () : _mm256_set1_epi32 (-126 );
192- auto vka3 = _mm256_set1_epi32 (ka3), vkb3 = _mm256_set1_epi32 (kb3) ;
183+ auto vka3 = _mm256_set1_epi32 (ka3);
193184 __m256i aux[8 ];
194185 for (int i = 0 ; i < 4 ; ++i) {
195186 auto i8_1 = next8 (val[2 *i+0 ], val[2 *i+1 ]);
196- auto i8_2 = _mm256_add_epi32 ( _mm256_mullo_epi32 (i8_1, vka3), vkb3 );
187+ auto i8_2 = _mm256_mullo_epi32 (i8_1, vka3);
197188 i8_1 = _mm256_and_si256 (i8_1, _mm256_set1_epi32 (0x3f3f3f3f ));
198189 i8_2 = _mm256_and_si256 (i8_2, _mm256_set1_epi32 (0x3f3f3f3f ));
199190#ifdef HAVE_FANCY_SIMD
@@ -1419,22 +1410,17 @@ void mul_mat_iq4_kt_F32_T(int n, const void * vx, size_t bx, const DataInfo& inf
14191410}
14201411
14211412struct Trellis3 {
1422- constexpr static uint32_t ka = 89226354 ;
1423- constexpr static uint32_t kb = 64248484 ;
1413+ constexpr static uint32_t ka = 0xCBAC1FED ;
14241414 constexpr static uint32_t ka1 = ka*ka;
1425- constexpr static uint32_t kb1 = kb*ka+kb;
14261415 constexpr static uint32_t ka2 = ka1*ka;
1427- constexpr static uint32_t kb2 = kb1*ka+kb;
14281416 constexpr static uint32_t ka3 = ka2*ka;
1429- constexpr static uint32_t kb3 = kb2*ka+kb;
14301417 const uint32x4_t mka = uint32x4_t {ka, ka1, ka2, ka3};
1431- const uint32x4_t mkb = uint32x4_t {kb, kb1, kb2, kb3};
14321418 const uint8x16_t shuffle = load_shuffle();
14331419
14341420 inline uint32x4x2_t next8 (uint32_t val1, uint32_t val2) const {
14351421 uint32x4x2_t result{vdupq_n_u32 (val1), vdupq_n_u32 (val2)};
1436- result.val [0 ] = vmlaq_u32 (mkb, mka, result.val [0 ]);
1437- result.val [1 ] = vmlaq_u32 (mkb, mka, result.val [1 ]);
1422+ result.val [0 ] = vmulq_u32 ( mka, result.val [0 ]);
1423+ result.val [1 ] = vmulq_u32 ( mka, result.val [1 ]);
14381424 return result;
14391425 }
14401426 inline int8x16x2_t next32 (const uint32_t * val) const {
@@ -1457,12 +1443,12 @@ struct Trellis3 {
14571443 int8x16x2_t result = {vdupq_n_s8 (-126 ), vdupq_n_s8 (-126 )};
14581444 int8x16x2_t i8 ;
14591445 for (int i = 0 ; i < 2 ; ++i) {
1460- i8 .val [0 ] = vmlaq_u32 (mkb, mka, vdupq_n_u32 (val[2 *i+0 ]+v0));
1446+ i8 .val [0 ] = vmulq_u32 ( mka, vdupq_n_u32 (val[2 *i+0 ]+v0));
14611447 i8 .val [1 ] = vmlaq_u32 (vkb3, vka3, i8 .val [0 ]);
14621448 i8 .val [0 ] = vandq_u32 (i8 .val [0 ], vdupq_n_u32 (0x3f3f3f3f ));
14631449 i8 .val [1 ] = vandq_u32 (i8 .val [1 ], vdupq_n_u32 (0x3f3f3f3f ));
14641450 auto s1 = vpaddq_s8 (vreinterpretq_s8_u32 (i8 .val [0 ]), vreinterpretq_s8_u32 (i8 .val [1 ]));
1465- i8 .val [0 ] = vmlaq_u32 (mkb, mka, vdupq_n_u32 (val[2 *i+1 ]+v0));
1451+ i8 .val [0 ] = vmulq_u32 ( mka, vdupq_n_u32 (val[2 *i+1 ]+v0));
14661452 i8 .val [1 ] = vmlaq_u32 (vkb3, vka3, i8 .val [0 ]);
14671453 i8 .val [0 ] = vandq_u32 (i8 .val [0 ], vdupq_n_u32 (0x3f3f3f3f ));
14681454 i8 .val [1 ] = vandq_u32 (i8 .val [1 ], vdupq_n_u32 (0x3f3f3f3f ));
0 commit comments