|
| 1 | +--- a/k_quants.c |
| 2 | ++++ b/k_quants.c |
| 3 | +@@ -43,6 +43,89 @@ |
| 4 | + // 2-6 bit quantization in super-blocks |
| 5 | + // |
| 6 | + |
| 7 | ++#if defined(__ARM_NEON) |
| 8 | ++ |
| 9 | ++#if !defined(__aarch64__) |
| 10 | ++ |
| 11 | ++inline static uint16_t vaddvq_u8(uint8x16_t v) { |
| 12 | ++ return |
| 13 | ++ (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + |
| 14 | ++ (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + |
| 15 | ++ (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + |
| 16 | ++ (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + |
| 17 | ++ (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + |
| 18 | ++ (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + |
| 19 | ++ (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + |
| 20 | ++ (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); |
| 21 | ++} |
| 22 | ++ |
| 23 | ++inline static int16_t vaddvq_s8(int8x16_t v) { |
| 24 | ++ return |
| 25 | ++ (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + |
| 26 | ++ (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + |
| 27 | ++ (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + |
| 28 | ++ (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + |
| 29 | ++ (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + |
| 30 | ++ (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + |
| 31 | ++ (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + |
| 32 | ++ (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); |
| 33 | ++} |
| 34 | ++ |
| 35 | ++inline static int32_t vaddvq_s16(int16x8_t v) { |
| 36 | ++ return |
| 37 | ++ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + |
| 38 | ++ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + |
| 39 | ++ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + |
| 40 | ++ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); |
| 41 | ++} |
| 42 | ++ |
| 43 | ++inline static uint32_t vaddvq_u16(uint16x8_t v) { |
| 44 | ++ return |
| 45 | ++ (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + |
| 46 | ++ (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + |
| 47 | ++ (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + |
| 48 | ++ (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); |
| 49 | ++} |
| 50 | ++ |
| 51 | ++inline static int32_t vaddvq_s32(int32x4_t v) { |
| 52 | ++ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); |
| 53 | ++} |
| 54 | ++ |
| 55 | ++inline static float vaddvq_f32(float32x4_t v) { |
| 56 | ++ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); |
| 57 | ++} |
| 58 | ++ |
| 59 | ++inline static float vminvq_f32(float32x4_t v) { |
| 60 | ++ return |
| 61 | ++ MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), |
| 62 | ++ MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); |
| 63 | ++} |
| 64 | ++ |
| 65 | ++inline static float vmaxvq_f32(float32x4_t v) { |
| 66 | ++ return |
| 67 | ++ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), |
| 68 | ++ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); |
| 69 | ++} |
| 70 | ++ |
| 71 | ++inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { |
| 72 | ++ int32x4_t res; |
| 73 | ++ |
| 74 | ++ res[0] = roundf(vgetq_lane_f32(v, 0)); |
| 75 | ++ res[1] = roundf(vgetq_lane_f32(v, 1)); |
| 76 | ++ res[2] = roundf(vgetq_lane_f32(v, 2)); |
| 77 | ++ res[3] = roundf(vgetq_lane_f32(v, 3)); |
| 78 | ++ |
| 79 | ++ return res; |
| 80 | ++} |
| 81 | ++ |
| 82 | ++inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { |
| 83 | ++ const int16x4_t c = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); |
| 84 | ++ const int16x4_t d = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); |
| 85 | ++ return vcombine_s16(c, d); |
| 86 | ++} |
| 87 | ++ |
| 88 | ++#endif |
| 89 | ++#endif |
| 90 | + |
| 91 | + // |
| 92 | + // ===================== Helper functions |
0 commit comments