Skip to content

Commit 7abdf2b

Browse files
ikawrakow and Iwan Kawrakow authored
IQ5_KS_R4: row-interleaved IQ5_KS (#426)
* iq5_ks_r4: basics
* iq5_ks_r4: Zen4 works
* iq5_ks_r4: AVX2 works
* iq5_ks_r4: NEON
* Fix iq5_ks on NEON

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 134d548 commit 7abdf2b

File tree

10 files changed

+441
-51
lines changed

10 files changed

+441
-51
lines changed

examples/quantize/quantize.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
6767
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
6868
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
6969
{ "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
70+
{ "IQ5_KS_R4",LLAMA_FTYPE_MOSTLY_IQ5_KS_R4,"IQ5_KS repacked", },
7071
{ "IQ4_KSS", LLAMA_FTYPE_MOSTLY_IQ4_KSS, " 4.0 bpw non-linear quantization", },
7172
{ "IQ5_KS", LLAMA_FTYPE_MOSTLY_IQ5_KS, " 5.25 bpw non-linear quantization", },
7273
{ "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw non-linear quantization",},

ggml/include/ggml.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -452,6 +452,7 @@ extern "C" {
452452
GGML_TYPE_IQ4_K_R4 = 339,
453453
GGML_TYPE_IQ5_K_R4 = 340,
454454
GGML_TYPE_IQ4_KS_R4 = 344,
455+
GGML_TYPE_IQ5_KS_R4 = 352,
455456
GGML_TYPE_Q8_KV_R8 = 398,
456457
GGML_TYPE_Q8_K_R8 = 399,
457458
GGML_TYPE_COUNT,
@@ -540,6 +541,7 @@ extern "C" {
540541
GGML_FTYPE_MOSTLY_IQ4_K_R4 = 332, // except 1d tensors
541542
GGML_FTYPE_MOSTLY_IQ5_K_R4 = 333, // except 1d tensors
542543
GGML_FTYPE_MOSTLY_IQ4_KS_R4 = 337, // except 1d tensors
544+
GGML_FTYPE_MOSTLY_IQ5_KS_R4 = 341, // except 1d tensors
543545
GGML_FTYPE_MOSTLY_Q8_KV_R8 = 398, // except 1d tensors
544546
GGML_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors
545547
};

ggml/src/ggml-common.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -694,6 +694,13 @@ typedef struct {
694694
} block_iq5_ks;
695695
static_assert(sizeof(block_iq5_ks) == QK_K/32 + QK_K/2 + QK_K/8, "wrong iq5_ks block size/padding");
696696

697+
typedef struct {
698+
uint8_t scales[QK_K/8];
699+
uint8_t qs[QK_K*2];
700+
uint8_t qh[QK_K/2];
701+
} block_iq5_ks_r4;
702+
static_assert(sizeof(block_iq5_ks_r4) == 4*sizeof(block_iq5_ks), "wrong iq5_ks_r4 block size/padding");
703+
697704

698705
#endif // GGML_COMMON_DECL
699706
#endif // GGML_COMMON_DECL

ggml/src/ggml-quants.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15451,6 +15451,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
1545115451
case GGML_TYPE_IQ4_K_R4: break;
1545215452
case GGML_TYPE_IQ5_K_R4: break;
1545315453
case GGML_TYPE_IQ4_KS_R4:break;
15454+
case GGML_TYPE_IQ5_KS_R4:break;
1545415455
case GGML_TYPE_Q8_KV_R8: break;
1545515456
case GGML_TYPE_Q8_K_R8: break;
1545615457
case GGML_TYPE_Q8_KV: break;

ggml/src/ggml.c

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1339,6 +1339,23 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
13391339
.vec_dot_type = GGML_TYPE_Q8_K32,
13401340
#else
13411341
.vec_dot_type = GGML_TYPE_Q8_K,
1342+
#endif
1343+
.nrows = 1,
1344+
.row_meta_size = 4,
1345+
},
1346+
[GGML_TYPE_IQ5_KS_R4] = {
1347+
.type_name = "iq5_ks_r4",
1348+
.blck_size = QK_K,
1349+
.type_size = sizeof(block_iq5_ks),
1350+
.is_quantized = true,
1351+
.to_float = (ggml_to_float_t) dequantize_row_iq5_ks_r4,
1352+
.from_float = quantize_row_iq5_ks_r4,
1353+
.from_float_ref = (ggml_from_float_t)quantize_row_iq5_ks_r4_ref,
1354+
.vec_dot = vec_dot_iq5_ks_r4_q8_k,
1355+
#if defined __AVX2__
1356+
.vec_dot_type = GGML_TYPE_Q8_K32,
1357+
#else
1358+
.vec_dot_type = GGML_TYPE_Q8_K,
13421359
#endif
13431360
.nrows = 1,
13441361
.row_meta_size = 4,
@@ -4478,6 +4495,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
44784495
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
44794496
case GGML_FTYPE_MOSTLY_IQ4_KS: wtype = GGML_TYPE_IQ4_KS; break;
44804497
case GGML_FTYPE_MOSTLY_IQ4_KS_R4: wtype = GGML_TYPE_IQ4_KS_R4;break;
4498+
case GGML_FTYPE_MOSTLY_IQ5_KS_R4: wtype = GGML_TYPE_IQ5_KS_R4;break;
44814499
case GGML_FTYPE_MOSTLY_IQ4_KSS: wtype = GGML_TYPE_IQ4_KSS; break;
44824500
case GGML_FTYPE_MOSTLY_IQ5_KS: wtype = GGML_TYPE_IQ5_KS; break;
44834501
case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break;
@@ -11242,6 +11260,7 @@ static void ggml_compute_forward_add(
1124211260
case GGML_TYPE_IQ4_XS:
1124311261
case GGML_TYPE_IQ4_KS:
1124411262
case GGML_TYPE_IQ4_KS_R4:
11263+
case GGML_TYPE_IQ5_KS_R4:
1124511264
case GGML_TYPE_IQ4_KSS:
1124611265
case GGML_TYPE_IQ5_KS:
1124711266
case GGML_TYPE_IQ2_K:
@@ -11715,6 +11734,7 @@ static void ggml_compute_forward_add1(
1171511734
case GGML_TYPE_IQ4_XS:
1171611735
case GGML_TYPE_IQ4_KS:
1171711736
case GGML_TYPE_IQ4_KS_R4:
11737+
case GGML_TYPE_IQ5_KS_R4:
1171811738
case GGML_TYPE_IQ4_KSS:
1171911739
case GGML_TYPE_IQ5_KS:
1172011740
case GGML_TYPE_IQ2_K:
@@ -11885,6 +11905,7 @@ static void ggml_compute_forward_acc(
1188511905
case GGML_TYPE_IQ4_XS:
1188611906
case GGML_TYPE_IQ4_KS:
1188711907
case GGML_TYPE_IQ4_KS_R4:
11908+
case GGML_TYPE_IQ5_KS_R4:
1188811909
case GGML_TYPE_IQ4_KSS:
1188911910
case GGML_TYPE_IQ5_KS:
1189011911
case GGML_TYPE_IQ2_K:
@@ -15382,6 +15403,7 @@ static void ggml_compute_forward_out_prod(
1538215403
case GGML_TYPE_IQ4_XS:
1538315404
case GGML_TYPE_IQ4_KS:
1538415405
case GGML_TYPE_IQ4_KS_R4:
15406+
case GGML_TYPE_IQ5_KS_R4:
1538515407
case GGML_TYPE_IQ4_KSS:
1538615408
case GGML_TYPE_IQ5_KS:
1538715409
case GGML_TYPE_IQ2_K:
@@ -15792,6 +15814,7 @@ static void ggml_compute_forward_set(
1579215814
case GGML_TYPE_IQ4_XS:
1579315815
case GGML_TYPE_IQ4_KS:
1579415816
case GGML_TYPE_IQ4_KS_R4:
15817+
case GGML_TYPE_IQ5_KS_R4:
1579515818
case GGML_TYPE_IQ4_KSS:
1579615819
case GGML_TYPE_IQ5_KS:
1579715820
case GGML_TYPE_IQ2_K:
@@ -16108,6 +16131,7 @@ static void ggml_compute_forward_get_rows(
1610816131
case GGML_TYPE_IQ4_XS:
1610916132
case GGML_TYPE_IQ4_KS:
1611016133
case GGML_TYPE_IQ4_KS_R4:
16134+
case GGML_TYPE_IQ5_KS_R4:
1611116135
case GGML_TYPE_IQ4_KSS:
1611216136
case GGML_TYPE_IQ5_KS:
1611316137
case GGML_TYPE_IQ2_K:
@@ -16741,6 +16765,7 @@ static void ggml_compute_forward_clamp(
1674116765
case GGML_TYPE_IQ4_XS:
1674216766
case GGML_TYPE_IQ4_KS:
1674316767
case GGML_TYPE_IQ4_KS_R4:
16768+
case GGML_TYPE_IQ5_KS_R4:
1674416769
case GGML_TYPE_IQ4_KSS:
1674516770
case GGML_TYPE_IQ5_KS:
1674616771
case GGML_TYPE_IQ2_K:
@@ -23810,6 +23835,7 @@ size_t ggml_quantize_chunk(
2381023835
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2381123836
case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2381223837
case GGML_TYPE_IQ4_KS_R4:result = quantize_iq4_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
23838+
case GGML_TYPE_IQ5_KS_R4:result = quantize_iq5_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2381323839
case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2381423840
case GGML_TYPE_IQ5_KS: result = quantize_iq5_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2381523841
case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

0 commit comments

Comments
 (0)