diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 680757c6bf3561..007d5a1872a3bd 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -12,8 +12,8 @@ int main(int argc, char ** argv) {
 
     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         return 1;
     }
 
@@ -27,7 +27,15 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const int itype = atoi(argv[3]);
+    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0:
+        case LLAMA_FTYPE_MOSTLY_Q4_1:
+            break;
+        default:
+            fprintf(stderr, "Invalid model file type %d\n", ftype);
+            return 1;
+    }
 
     const int64_t t_main_start_us = ggml_time_us();
 
@@ -37,7 +45,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
diff --git a/ggml.c b/ggml.c
index 59e84ab45d120a..71f40f83234602 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2578,29 +2578,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
 //
 
 static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    QK,
-    QK,
-    1,
-    1,
-    1,
-    1,
-    1,
+    [GGML_TYPE_F32]  = 1,
+    [GGML_TYPE_F16]  = 1,
+    [GGML_TYPE_Q4_0] = QK,
+    [GGML_TYPE_Q4_1] = QK,
+    [GGML_TYPE_I8]   = 1,
+    [GGML_TYPE_I16]  = 1,
+    [GGML_TYPE_I32]  = 1,
 };
-
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    sizeof(block_q4_0),
-    sizeof(block_q4_1),
-    sizeof(int8_t ),
-    sizeof(int16_t),
-    sizeof(int32_t),
-    sizeof(ggml_fp16_t),
-    sizeof(float ),
+    [GGML_TYPE_F32]  = sizeof(float),
+    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
+    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
+    [GGML_TYPE_I8]   = sizeof(int8_t),
+    [GGML_TYPE_I16]  = sizeof(int16_t),
+    [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-
-// don't forget to update the array above when adding new types
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
diff --git a/ggml.h b/ggml.h
index ad962b109ea89c..167c4857e8d99c 100644
--- a/ggml.h
+++ b/ggml.h
@@ -198,13 +198,14 @@ struct ggml_object;
 struct ggml_context;
 
 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32  = 0,
+    GGML_TYPE_F16  = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };
 
diff --git a/llama.cpp b/llama.cpp
index e4517959a8c778..56948ea70efe41 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -36,6 +36,7 @@
     } \
 } while (0)
 
+static const char * ttype_str[] = { "f32", "f16", "q4_0", "q4_1" };
 
 // determine number of model parts based on the dimension
 static const std::unordered_map<int, size_t> LLAMA_N_PARTS = {
@@ -100,7 +101,7 @@ struct llama_hparams {
     int32_t n_head  = 32;
     int32_t n_layer = 32;
     int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    int32_t ftype   = LLAMA_FTYPE_MOSTLY_F16;
 };
 
 struct llama_layer {
@@ -424,7 +425,7 @@ static bool llama_model_load(
         fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
         hparams.n_ctx = n_ctx;
 
@@ -435,7 +436,7 @@ static bool llama_model_load(
         }
 
         // temp warning to tell the user to use "--n_parts"
-        if (hparams.f16 == 4 && n_parts != 1) {
+        if (hparams.ftype == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
             fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
             fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
         }
@@ -463,7 +464,7 @@ static bool llama_model_load(
         fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: ftype   = %d\n", __func__, hparams.ftype);
         fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
         fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
         fprintf(stderr, "%s: type    = %d\n", __func__, model.type);
@@ -507,16 +508,19 @@ static bool llama_model_load(
         // in order to save memory and also to speed up the computation
         // wtype is for per-layer weights, while vtype is for other weights
         ggml_type wtype, vtype;
-        switch (model.hparams.f16) {
-            case 0: wtype = vtype = GGML_TYPE_F32;  break;
-            case 1: wtype = vtype = GGML_TYPE_F16;  break;
-            case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-            case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
-            case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
+        switch (model.hparams.ftype) {
+            case LLAMA_FTYPE_ALL_F32:     wtype = vtype = GGML_TYPE_F32;  break;
+            case LLAMA_FTYPE_MOSTLY_F16:  wtype = vtype = GGML_TYPE_F16;  break;
+            case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
+            case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
+            case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
+                wtype = GGML_TYPE_Q4_1;
+                vtype = GGML_TYPE_F16;
+                break;
             default:
                     {
-                        fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
-                                __func__, fname.c_str(), model.hparams.f16);
+                        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                                __func__, fname.c_str(), model.hparams.ftype);
                         return false;
                     }
         }
@@ -647,11 +651,11 @@ static bool llama_model_load(
             while (true) {
                 int32_t n_dims;
                 int32_t length;
-                int32_t ftype;
+                int32_t ttype;
 
                 fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
                 fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-                fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+                fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
                 if (fin.eof()) {
                     break;
                 }
@@ -684,20 +688,19 @@ static bool llama_model_load(
                     return false;
                 }
                 if (0) {
-                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ttype_str[ttype]);
                 }
 
-                switch (ftype) {
-                    case 0: // f32
-                    case 1: // f16
+                switch (ttype) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
                         break;
-                    case 2: // q4_0
-                    case 3: // q4_1
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
                         assert(ne[0] % 64 == 0);
                         break;
                     default:
-                        fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                        fprintf(stderr, "%s: unknown tensor type %d in model file\n", __func__, ttype);
                         return false;
                 };
@@ -1289,20 +1292,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
 //
 
 // TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
+    ggml_type qtype;
 
-    switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, ftype); return false;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
-        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
-        return false;
-    }
-
     llama_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1357,7 +1355,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
@@ -1365,7 +1363,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
         printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -1374,7 +1372,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &itype,           sizeof(hparams.f16));
+        int32_t iftype = ftype;
+        fout.write((char *) &iftype,          sizeof(hparams.ftype));
     }
 
     // load vocab
@@ -1426,11 +1425,11 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         while (true) {
             int32_t n_dims;
             int32_t length;
-            int32_t ftype;
+            int32_t ttype;
 
             finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
             finp.read(reinterpret_cast<char *>(&length), sizeof(length));
-            finp.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
             if (finp.eof()) {
                 break;
             }
@@ -1454,8 +1453,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 
             {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
+                printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ttype_str[ttype]);
             }
 
             // regexes of tensor names to be quantized
@@ -1475,12 +1473,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             quantize &= (n_dims == 2);
 
             if (quantize) {
-                if (ftype != 0 && ftype != 1) {
-                    fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
+                if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                    fprintf(stderr, "%s: unsupported tensor type %d for integer quantization\n", __func__, ttype);
                     return false;
                 }
 
-                if (ftype == 1) {
+                if (ttype == GGML_TYPE_F16) {
                     data_f16.resize(nelements);
                     finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                     data_f32.resize(nelements);
@@ -1492,9 +1490,9 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                     finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
                 }
 
-                ftype = itype;
+                ttype = qtype;
             } else {
-                const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+                const int bpe = ggml_type_size((ggml_type)ttype);
 
                 data_u8.resize(nelements*bpe);
                 finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
@@ -1502,7 +1500,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
 
             fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
             fout.write(reinterpret_cast<char *>(&length), sizeof(length));
-            fout.write(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
             for (int i = 0; i < n_dims; ++i) {
                 fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
             }
@@ -1522,7 +1520,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                 size_t cur_size = 0;
                 std::vector<int64_t> hist_cur(1 << 4, 0);
 
-                switch (type) {
+                switch (qtype) {
                     case GGML_TYPE_Q4_0:
                         {
                             cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
@@ -1533,7 +1531,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
                         } break;
                     default:
                         {
-                            fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
+                            fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, qtype);
                             return false;
                         }
                 }
@@ -1675,8 +1673,8 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-             int itype) {
-    if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
+  enum llama_ftype ftype) {
+    if (!llama_model_quantize_internal(fname_inp, fname_out, ftype)) {
         fprintf(stderr, "%s: failed to quantize\n", __func__);
         return 1;
     }
diff --git a/llama.h b/llama.h
index 04e2bf71cd9c01..7c155449246c96 100644
--- a/llama.h
+++ b/llama.h
@@ -64,6 +64,15 @@ extern "C" {
         void * progress_callback_user_data;
     };
 
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // but tok_embeddings.weight and output.weight are F16
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
 
     // Various functions for loading a ggml llama model.
@@ -81,7 +90,7 @@ extern "C" {
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            int itype);
+            enum llama_ftype ftype);
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
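
For reference, here is a minimal caller-side sketch of the public API surface after this patch. It is not part of the change itself; the model paths are placeholders, and the error handling simply mirrors examples/quantize/quantize.cpp above.

    // Hypothetical caller: quantize an F16 model to Q4_0 using the new
    // llama_ftype enum instead of the old magic integer 2.
    #include <cstdio>
    #include "llama.h"

    int main() {
        const enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_Q4_0;

        // llama_model_quantize() returns non-zero on failure, as in the
        // quantize example; the file names below are placeholders.
        if (llama_model_quantize("ggml-model-f16.bin",   // input (placeholder path)
                                 "ggml-model-q4_0.bin",  // output (placeholder path)
                                 ftype)) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }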