llama.cpp: Definitions of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8

llama.cpp: Definitions of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures

作者：XD / 发表： 2024年1月25日 01:05 / 更新： 2024年1月25日 01:15 / 编程笔记 / 阅读量：4719

The source file ggml-quants.c in llama.cpp contains detailed definitions of the quantization structures used to store model weights. These structures, named Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K, are designed for efficient representation and processing of weights in a quantized format, reducing the memory footprint while maintaining an acceptable level of accuracy.

//
// Super-block quantization structures
//

// Super-block size selection.
// QK_K is the number of weights in one super-block; the array lengths of the
// quantization structs in this file are expressed in terms of it.
// K_SCALE_SIZE is the byte length of the packed scales array used by the
// default (256-element) block_q4_K and block_q5_K layouts.
// Defining GGML_QKK_64 at build time switches to 64-element super-blocks.
#if defined(GGML_QKK_64)
#  define QK_K         64
#  define K_SCALE_SIZE 4
#else
#  define QK_K         256
#  define K_SCALE_SIZE 12
#endif

// 2-bit quantization (Q2_K)
// Reconstruction: x = a * q + b, where a is the scale and b is the minimum.
// With QK_K = 256 a super-block is split into 16 blocks of 16 elements.
//
// Storage for QK_K = 256, taking ggml_fp16_t as 2 bytes:
//   scales 16 B + qs 64 B + d 2 B + dmin 2 B = 84 B = 672 bits,
//   i.e. 672 / 256 = 2.625 bits per weight (bpw).
typedef struct {
    uint8_t     scales[QK_K/16];  // block scales and minimums, quantized with 4 bits
    uint8_t     qs[QK_K/4];       // quantized values
    ggml_fp16_t d;                // super-block scale for the quantized scales
    ggml_fp16_t dmin;             // super-block scale for the quantized minimums
} block_q2_K;
// Terms follow field order; the check fails to compile if padding is inserted.
static_assert(sizeof(block_q2_K) == QK_K/16 + QK_K/4 + 2*sizeof(ggml_fp16_t), "wrong q2_K block size/padding");

// 3-bit quantization (Q3_K)
// Reconstruction: x = a * q -- only a scale factor, no minimum.
// Each quantized value is stored split: its low 2 bits in qs, its high bit
// in hmask.
#if defined(GGML_QKK_64)
// 64-element super-block layout.
typedef struct {
    uint8_t     hmask[QK_K/8];  // high bit of the quantized values
    uint8_t     qs[QK_K/4];     // low 2 bits of the quantized values
    uint8_t     scales[2];      // scale values
    ggml_fp16_t d;              // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == QK_K/8 + QK_K/4 + 2 + sizeof(ggml_fp16_t), "wrong q3_K block size/padding");
#else
// 256-element super-block layout: 16 blocks of 16 elements each.
// Storage, taking ggml_fp16_t as 2 bytes:
//   hmask 32 B + qs 64 B + scales 12 B + d 2 B = 110 B = 880 bits,
//   i.e. 880 / 256 = 3.4375 bits per weight (bpw).
typedef struct {
    uint8_t     hmask[QK_K/8];  // high bit of the quantized values
    uint8_t     qs[QK_K/4];     // low 2 bits of the quantized values
    uint8_t     scales[12];     // block scales, quantized with 6 bits
    ggml_fp16_t d;              // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == QK_K/8 + QK_K/4 + 12 + sizeof(ggml_fp16_t), "wrong q3_K block size/padding");
#endif

// 4-bit quantization (Q4_K)
// Reconstruction: x = a * q + b.
// With QK_K = 256 a super-block is split into 8 blocks of 32 elements.
// Storage for QK_K = 256, taking ggml_fp16_t as 2 bytes:
//   d 2 B + dmin 2 B + scales 12 B + qs 128 B = 144 B = 1152 bits,
//   i.e. 1152 / 256 = 4.5 bits per weight (bpw).
#if defined(GGML_QKK_64)
// 64-element super-block layout: both super-block factors share one array.
typedef struct {
    ggml_fp16_t d[2];           // super-block scales/mins
    uint8_t     scales[2];      // 4-bit block scales/mins
    uint8_t     qs[QK_K/2];     // 4-bit quantized values
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 2 + QK_K/2, "wrong q4_K block size/padding");
#else
// 256-element super-block layout.
typedef struct {
    ggml_fp16_t d;                     // super-block scale for the quantized scales
    ggml_fp16_t dmin;                  // super-block scale for the quantized mins
    uint8_t     scales[K_SCALE_SIZE];  // block scales and mins, quantized with 6 bits
    uint8_t     qs[QK_K/2];            // 4-bit quantized values
} block_q4_K;
static_assert(sizeof(block_q4_K) == sizeof(ggml_fp16_t) + sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif

// 5-bit quantization (Q5_K)
// Reconstruction: x = a * q + b.
// Each quantized value is stored split: its low 4 bits in qs, its high bit
// in qh.
// With QK_K = 256 a super-block is split into 8 blocks of 32 elements.
// Storage for QK_K = 256, taking ggml_fp16_t as 2 bytes:
//   d 2 B + dmin 2 B + scales 12 B + qh 32 B + qs 128 B = 176 B = 1408 bits,
//   i.e. 1408 / 256 = 5.5 bits per weight (bpw).
#if defined(GGML_QKK_64)
// 64-element super-block layout. Note that this variant has no dmin field:
// it stores a single super-block scale plus signed 8-bit block scales.
typedef struct {
    ggml_fp16_t d;                // super-block scale
    int8_t      scales[QK_K/16];  // 8-bit block scales
    uint8_t     qh[QK_K/8];       // high bit of the quantized values
    uint8_t     qs[QK_K/2];       // low 4 bits of the quantized values
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/16 + QK_K/8 + QK_K/2, "wrong q5_K block size/padding");
#else
// 256-element super-block layout.
typedef struct {
    ggml_fp16_t d;                     // super-block scale for the quantized scales
    ggml_fp16_t dmin;                  // super-block scale for the quantized mins
    uint8_t     scales[K_SCALE_SIZE];  // block scales and mins, quantized with 6 bits
    uint8_t     qh[QK_K/8];            // high bit of the quantized values
    uint8_t     qs[QK_K/2];            // low 4 bits of the quantized values
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/8 + QK_K/2, "wrong q5_K block size/padding");
#endif

// 6-bit quantization (Q6_K)
// Reconstruction: x = a * q -- only a scale factor, no minimum.
// Each quantized value is stored split: its lower 4 bits in ql, its upper
// 2 bits in qh.
// With QK_K = 256 a super-block is split into 16 blocks of 16 elements.
// Storage for QK_K = 256, taking ggml_fp16_t as 2 bytes:
//   ql 128 B + qh 64 B + scales 16 B + d 2 B = 210 B = 1680 bits,
//   i.e. 1680 / 256 = 6.5625 bits per weight (bpw).
typedef struct {
    uint8_t     ql[QK_K/2];       // lower 4 bits of the quantized values
    uint8_t     qh[QK_K/4];       // upper 2 bits of the quantized values
    int8_t      scales[QK_K/16];  // block scales, quantized with 8 bits
    ggml_fp16_t d;                // super-block scale
} block_q6_K;
// Terms follow field order (ql, qh, scales, d).
static_assert(sizeof(block_q6_K) == QK_K/2 + QK_K/4 + QK_K/16 + sizeof(ggml_fp16_t), "wrong q6_K block size/padding");

// 8-bit block (Q8_K), used only for intermediate quantization and dot products.
// Unlike the other super-block structs in this file, the scale is a full float.
typedef struct {
    float   d;                // delta (scale) value for quantization
    int8_t  qs[QK_K];         // quantized values, one byte each
    int16_t bsums[QK_K/16];   // sums of the quants, taken over groups of 16
} block_q8_K;
// Terms follow field order (d, qs, bsums).
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K*sizeof(int8_t) + (QK_K/16)*sizeof(int16_t), "wrong q8_K block size/padding");

// "True" 2-bit quantization structure, adjusted for block usage in ggml design.
// Results in 2.0625 bits per weight due to 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;             // super-block scale (one per super-block)
    uint16_t    qs[QK_K/8];    // quantized values, packed into 16-bit words
} block_iq2_xxs;
// Terms follow field order (d, qs).
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + (QK_K/8)*sizeof(uint16_t), "wrong iq2_xxs block size/padding");

// 2.3125 bpw (bits per weight) quantization (IQ2_XS)
// Storage for QK_K = 256, taking ggml_fp16_t as 2 bytes:
//   d 2 B + qs 64 B + scales 8 B = 74 B = 592 bits,
//   i.e. 592 / 256 = 2.3125 bits per weight.
typedef struct {
    ggml_fp16_t d;                // super-block scale
    uint16_t    qs[QK_K/8];       // quantized values, packed into 16-bit words
    uint8_t     scales[QK_K/32];  // scales for quantization
} block_iq2_xs;
// Terms follow field order (d, qs, scales).
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + (QK_K/8)*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

本文作者：XD 转载请标明出处：http://www.eadst.com/blog/232

本站采用知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议进行许可。

上一篇
llama.cpp: Efficient 6-bit Data Packing in an 8-bit Array

下一篇
Use md5sum to Verify File Integrity

原 llama.cpp: Definitions of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures

作者：XD / 发表： 2024年1月25日 01:05 / 更新： 2024年1月25日 01:15 / 编程笔记 / 阅读量：4719

llama.cpp: Definitions of Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, and Q8_K Structures