Add enum llama_ftype, sync ggml_type to model files
sw committed Apr 5, 2023
1 parent 3416298 commit 2dab8f5
Showing 5 changed files with 91 additions and 78 deletions.
16 changes: 12 additions & 4 deletions examples/quantize/quantize.cpp
@@ -12,8 +12,8 @@ int main(int argc, char ** argv) {

if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
return 1;
}

@@ -27,7 +27,15 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];

const int itype = atoi(argv[3]);
const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0:
case LLAMA_FTYPE_MOSTLY_Q4_1:
break;
default:
fprintf(stderr, "Invalid model file type %d\n", ftype);
return 1;
}

const int64_t t_main_start_us = ggml_time_us();

@@ -37,7 +45,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();

if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
35 changes: 16 additions & 19 deletions ggml.c
@@ -2578,29 +2578,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
//

static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
QK,
QK,
1,
1,
1,
1,
1,
[GGML_TYPE_F32] = 1,
[GGML_TYPE_F16] = 1,
[GGML_TYPE_Q4_0] = QK,
[GGML_TYPE_Q4_1] = QK,
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};

static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated");

static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
sizeof(block_q4_0),
sizeof(block_q4_1),
sizeof(int8_t ),
sizeof(int16_t),
sizeof(int32_t),
sizeof(ggml_fp16_t),
sizeof(float ),
[GGML_TYPE_F32] = sizeof(float),
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};

// don't forget to update the array above when adding new types
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");

static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
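Note: the tables above now use C99 designated initializers keyed by the ggml_type enum, so each entry stays bound to its type regardless of declaration order, and the static_asserts flag the tables as outdated when GGML_TYPE_COUNT grows. A minimal, self-contained sketch of that pattern with made-up names (illustrative only, not part of the commit):

#include <assert.h>

enum fruit { FRUIT_APPLE = 0, FRUIT_PEAR = 1, FRUIT_PLUM = 2, FRUIT_COUNT };

// each entry is bound to its enum value, not to its position in the initializer list
static const char * fruit_name[FRUIT_COUNT] = {
    [FRUIT_APPLE] = "apple",
    [FRUIT_PEAR]  = "pear",
    [FRUIT_PLUM]  = "plum",
};

// fails to compile once FRUIT_COUNT changes, reminding you to update the table above
static_assert(FRUIT_COUNT == 3, "fruit_name is outdated");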
9 changes: 5 additions & 4 deletions ggml.h
@@ -198,13 +198,14 @@ struct ggml_object;
struct ggml_context;

enum ggml_type {
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_F16,
GGML_TYPE_F32,
GGML_TYPE_COUNT,
};

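Note: pinning GGML_TYPE_F32 through GGML_TYPE_Q4_1 to 0-3 makes the enum agree with the tensor type codes already stored in llama.cpp model files, so a value read from disk can be range-checked and cast straight to ggml_type. A hedged sketch of that idea; the helper below is hypothetical and not part of the commit:

#include <stdint.h>
#include <stdio.h>
#include "ggml.h"

// hypothetical helper: read a tensor type code from a model file and
// interpret it directly as a ggml_type
static int read_tensor_type(FILE * f, enum ggml_type * out) {
    int32_t ttype;
    if (fread(&ttype, sizeof(ttype), 1, f) != 1) {
        return 0; // read error or EOF
    }
    if (ttype < 0 || ttype >= GGML_TYPE_COUNT) {
        return 0; // unknown type code
    }
    *out = (enum ggml_type) ttype;
    return 1;
}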
98 changes: 48 additions & 50 deletions llama.cpp
@@ -36,6 +36,7 @@
} \
} while (0)

static const char * ttype_str[] = { "f32", "f16", "q4_0", "q4_1" };

// determine number of model parts based on the dimension
static const std::unordered_map<int, int> LLAMA_N_PARTS = {
@@ -100,7 +101,7 @@ struct llama_hparams {
int32_t n_head = 32;
int32_t n_layer = 32;
int32_t n_rot = 64;
int32_t f16 = 1;
int32_t ftype = LLAMA_FTYPE_MOSTLY_F16;
};

struct llama_layer {
@@ -424,7 +425,7 @@ static bool llama_model_load(
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

hparams.n_ctx = n_ctx;

@@ -435,7 +436,7 @@
}

// temp warning to tell the user to use "--n_parts"
if (hparams.f16 == 4 && n_parts != 1) {
if (hparams.ftype == LLAMA_FTYPE_PER_LAYER_IS_Q4_1 && n_parts != 1) {
fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
}
@@ -463,7 +464,7 @@ static bool llama_model_load(
fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
fprintf(stderr, "%s: ftype = %d\n", __func__, hparams.ftype);
fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
@@ -507,16 +508,19 @@ static bool llama_model_load(
// in order to save memory and also to speed up the computation
// wtype is for per-layer weights, while vtype is for other weights
ggml_type wtype, vtype;
switch (model.hparams.f16) {
case 0: wtype = vtype = GGML_TYPE_F32; break;
case 1: wtype = vtype = GGML_TYPE_F16; break;
case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
switch (model.hparams.ftype) {
case LLAMA_FTYPE_ALL_F32: wtype = vtype = GGML_TYPE_F32; break;
case LLAMA_FTYPE_MOSTLY_F16: wtype = vtype = GGML_TYPE_F16; break;
case LLAMA_FTYPE_MOSTLY_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_PER_LAYER_IS_Q4_1:
wtype = GGML_TYPE_Q4_1;
vtype = GGML_TYPE_F16;
break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
}
@@ -647,11 +651,11 @@ static bool llama_model_load(
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
int32_t ttype;

fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

if (fin.eof()) {
break;
@@ -684,20 +688,19 @@ static bool llama_model_load(
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ttype_str[ttype]);
}

switch (ftype) {
case 0: // f32
case 1: // f16
switch (ttype) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
break;
case 2: // q4_0
case 3: // q4_1
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
assert(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
fprintf(stderr, "%s: unknown tensor type %d in model file\n", __func__, ttype);
return false;
};

@@ -1289,20 +1292,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
//

// TODO: reuse code from the llama_model_load() somehow
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
ggml_type qtype;

switch (itype) {
case 2: type = GGML_TYPE_Q4_0; break;
case 3: type = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, ftype); return false;
};

if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
return false;
}

llama_vocab vocab;

printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1357,15 +1355,15 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));

printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
printf("%s: ftype = %d\n", __func__, hparams.ftype);

fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -1374,7 +1372,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &itype, sizeof(hparams.f16));
int32_t iftype = ftype;
fout.write((char *) &iftype, sizeof(hparams.ftype));
}

// load vocab
@@ -1426,11 +1425,11 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
int32_t ttype;

finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));

if (finp.eof()) {
break;
@@ -1454,8 +1453,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
}

{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ttype_str[ttype]);
}

// regexes of tensor names to be quantized
@@ -1475,12 +1473,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= (n_dims == 2);

if (quantize) {
if (ftype != 0 && ftype != 1) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
fprintf(stderr, "%s: unsupported tensor type %d for integer quantization\n", __func__, ttype);
return false;
}

if (ftype == 1) {
if (ttype == GGML_TYPE_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
@@ -1492,17 +1490,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}

ftype = itype;
ttype = qtype;
} else {
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
const int bpe = ggml_type_size((ggml_type)ttype);

data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}

fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
for (int i = 0; i < n_dims; ++i) {
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
@@ -1522,7 +1520,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);

switch (type) {
switch (qtype) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
@@ -1533,7 +1531,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
} break;
default:
{
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, qtype);
return false;
}
}
@@ -1675,8 +1673,8 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
enum llama_ftype ftype) {
if (!llama_model_quantize_internal(fname_inp, fname_out, ftype)) {
fprintf(stderr, "%s: failed to quantize\n", __func__);
return 1;
}
11 changes: 10 additions & 1 deletion llama.h
@@ -64,6 +64,15 @@ extern "C" {
void * progress_callback_user_data;
};

// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_PER_LAYER_IS_Q4_1 = 4, // but tok_embeddings.weight and output.weight are F16
};

LLAMA_API struct llama_context_params llama_context_default_params();

// Various functions for loading a ggml llama model.
@@ -81,7 +90,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype);
enum llama_ftype ftype);

// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
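Note: with llama_ftype in place, callers pass a named file type to llama_model_quantize instead of a bare integer. A minimal usage sketch, assuming only the declarations shown above; the file paths are placeholders and this is not code from the repository:

#include <stdio.h>
#include "llama.h"

int main(void) {
    // placeholder paths for illustration
    const char * fname_inp = "models/7B/ggml-model-f16.bin";
    const char * fname_out = "models/7B/ggml-model-q4_0.bin";

    // llama_model_quantize returns non-zero on failure
    if (llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        fprintf(stderr, "failed to quantize %s\n", fname_inp);
        return 1;
    }
    return 0;
}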
