Add enum llama_ftype, sync ggml_type to model files
sw committed Apr 5, 2023
1 parent 3416298 commit 2dab8f5
Showing 5 changed files with 91 additions and 78 deletions.
16 changes: 12 additions & 4 deletions examples/quantize/quantize.cpp
@@ -12,8 +12,8 @@ int main(int argc, char ** argv) {

if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
return 1;
}

@@ -27,7 +27,15 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];

const int itype = atoi(argv[3]);
const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q4_1:
break;
default:
fprintf(stderr, "Invalid model file type %d\n", ftype);
return 1;
}

const int64_t t_main_start_us = ggml_time_us();

@@ -37,7 +45,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();

if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
35 changes: 16 additions & 19 deletions ggml.c
@@ -2578,29 +2578,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
//

static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
QK,
QK,
1,
1,
1,
1,
1,
[GGML_TYPE_F32] = 1,
[GGML_TYPE_F16] = 1,
[GGML_TYPE_Q4_0] = QK,
[GGML_TYPE_Q4_1] = QK,
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};

static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated");

static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
sizeof(block_q4_0),
sizeof(block_q4_1),
sizeof(int8_t ),
sizeof(int16_t),
sizeof(int32_t),
sizeof(ggml_fp16_t),
sizeof(float ),
[GGML_TYPE_F32] = sizeof(float),
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};

// don't forget to update the array above when adding new types
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");

static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
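Note: the tables above now use C99 designated initializers keyed by the ggml_type enum, so each entry stays bound to its type regardless of declaration order, and the static_asserts flag the tables as outdated when GGML_TYPE_COUNT grows. A minimal, self-contained sketch of that pattern with made-up names (illustrative only, not part of the commit):

#include <assert.h>

enum fruit { FRUIT_APPLE = 0, FRUIT_PEAR = 1, FRUIT_PLUM = 2, FRUIT_COUNT };

// each entry is bound to its enum value, not to its position in the initializer list
static const char * fruit_name[FRUIT_COUNT] = {
    [FRUIT_APPLE] = "apple",
    [FRUIT_PEAR]  = "pear",
    [FRUIT_PLUM]  = "plum",
};

// fails to compile once FRUIT_COUNT changes, reminding you to update the table above
static_assert(FRUIT_COUNT == 3, "fruit_name is outdated");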
9 changes: 5 additions & 4 deletions ggml.h
@@ -198,13 +198,14 @@ struct ggml_object;
struct ggml_context;

enum ggml_type {
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_F16,
GGML_TYPE_F32,
GGML_TYPE_COUNT,
};

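Note: pinning GGML_TYPE_F32 through GGML_TYPE_Q4_1 to 0-3 makes the enum agree with the tensor type codes already stored in llama.cpp model files, so a value read from disk can be range-checked and cast straight to ggml_type. A hedged sketch of that idea; the helper below is hypothetical and not part of the commit:

#include <stdint.h>
#include <stdio.h>
#include "ggml.h"

// hypothetical helper: read a tensor type code from a model file and
// interpret it directly as a ggml_type
static int read_tensor_type(FILE * f, enum ggml_type * out) {
    int32_t ttype;
    if (fread(&ttype, sizeof(ttype), 1, f) != 1) {
        return 0; // read error or EOF
    }
    if (ttype < 0 || ttype >= GGML_TYPE_COUNT) {
        return 0; // unknown type code
    }
    *out = (enum ggml_type) ttype;
    return 1;
}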
98 changes: 48 additions & 50 deletions llama.cpp
@@ -36,6 +36,7 @@
} \
} while (0)

static const char * ttype_str[] = { "f32", "f16", "q4_0", "q4_1" };

// determine number of model parts based on the dimension
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
@@ -100,7 +101,7 @@ struct llama_hparams {
int32_t n_head = 32;
int32_t n_layer = 32;
int32_t n_rot = 64;
int32_t f16 = 1;
int32_t ftype = LLAMA_FTYPE_MOSTLY_F16;
};

struct llama_layer {
@@ -424,7 +425,7 @@ static bool llama_model_load(
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

hparams.n_ctx = n_ctx;

@@ -435,7 +436,7 @@
}

// temp warning to tell the user to use "--n_parts"
if (hparams.f16 == 4 && n_parts != 1) {
if (hparams.ftype == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
}
@@ -463,7 +464,7 @@ static bool llama_model_load(
fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
fprintf(stderr, "%s: ftype = %d\n", __func__, hparams.ftype);
fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
@@ -507,16 +508,19 @@ static bool llama_model_load(
// in order to save memory and also to speed up the computation
// wtype is for per-layer weights, while vtype is for other weights
ggml_type wtype, vtype;
switch (model.hparams.f16) {
case 0: wtype = vtype = GGML_TYPE_F32; break;
case 1: wtype = vtype = GGML_TYPE_F16; break;
case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
switch (model.hparams.ftype) {
case LLAMA_FTYPE_ALL_F32: wtype = vtype = GGML_TYPE_F32; break;
case LLAMA_FTYPE_MOSTLY_F16: wtype = vtype = GGML_TYPE_F16; break;
case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
wtype = GGML_TYPE_Q4_1;
vtype = GGML_TYPE_F16;
break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
}
@@ -647,11 +651,11 @@ static bool llama_model_load(
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
int32_t ttype;

fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

if (fin.eof()) {
break;
@@ -684,20 +688,19 @@ static bool llama_model_load(
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ttype_str[ttype]);
}

switch (ftype) {
case 0: // f32
case 1: // f16
switch (ttype) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
break;
case 2: // q4_0
case 3: // q4_1
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
assert(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
fprintf(stderr, "%s: unknown tensor type %d in model file\n", __func__, ttype);
return false;
};

@@ -1289,20 +1292,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
//

// TODO: reuse code from the llama_model_load() somehow
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
ggml_type qtype;

switch (itype) {
case 2: type = GGML_TYPE_Q4_0; break;
case 3: type = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, ftype); return false;
};

if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
return false;
}

llama_vocab vocab;

printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1357,15 +1355,15 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: ftype = %d\n", __func__, hparams.ftype);

fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -1374,7 +1372,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &itype, sizeof(hparams.f16));
int32_t iftype = ftype;
fout.write((char *) &iftype, sizeof(hparams.ftype));
}

// load vocab
@@ -1426,11 +1425,11 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
int32_t ttype;

finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

if (finp.eof()) {
break;
@@ -1454,8 +1453,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
}

{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ttype_str[ttype]);
}

// regexes of tensor names to be quantized
@@ -1475,12 +1473,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= (n_dims == 2);

if (quantize) {
if (ftype != 0 && ftype != 1) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
fprintf(stderr, "%s: unsupported tensor type %d for integer quantization\n", __func__, ttype);
return false;
}

if (ftype == 1) {
if (ttype == GGML_TYPE_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
@@ -1492,17 +1490,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}

ftype = itype;
ttype = qtype;
} else {
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
const int bpe = ggml_type_size((ggml_type)ttype);

data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}

fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
for (int i = 0; i < n_dims; ++i) {
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
@@ -1522,7 +1520,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);

switch (type) {
switch (qtype) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
@@ -1533,7 +1531,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
} break;
default:
{
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, qtype);
return false;
}
}
@@ -1675,8 +1673,8 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
enum llama_ftype ftype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, ftype)) {
fprintf(stderr, "%s: failed to quantize\n", __func__);
return 1;
}
11 changes: 10 additions & 1 deletion llama.h
@@ -64,6 +64,15 @@ extern "C" {
void * progress_callback_user_data;
};

// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // but tok_embeddings.weight and output.weight are F16
};

LLAMA_API struct llama_context_params llama_context_default_params();

// Various functions for loading a ggml llama model.
@@ -81,7 +90,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype);
enum llama_ftype ftype);

// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
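Note: with llama_ftype in place, callers pass a named file type to llama_model_quantize instead of a bare integer. A minimal usage sketch, assuming only the declarations shown above; the file paths are placeholders and this is not code from the repository:

#include <stdio.h>
#include "llama.h"

int main(void) {
    // placeholder paths for illustration
    const char * fname_inp = "models/7B/ggml-model-f16.bin";
    const char * fname_out = "models/7B/ggml-model-q4_0.bin";

    // llama_model_quantize returns non-zero on failure
    if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        fprintf(stderr, "failed to quantize %s\n", fname_inp);
        return 1;
    }
    return 0;
}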
