Improve usability of --model-url & related flags (#6930)
* args: default --model to models/ + filename from --model-url or --hf-file (else the legacy models/7B/ggml-model-f16.gguf); see the sketch after this list

* args: main & server now call gpt_params_handle_model_default

* args: define DEFAULT_MODEL_PATH + update cli docs

* curl: check url of previous download (.json metadata w/ url, etag & lastModified)

* args: fix update to quantize-stats.cpp

* curl: support legacy .etag / .lastModified companion files

* curl: rm legacy .etag file support

* curl: reuse regex across headers callback calls

* curl: unique_ptr to manage lifecycle of curl & outfile

* curl: nit: no need for multiline regex flag

* curl: update the failing test (model file collision) + gitignore *.gguf.json
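
A minimal sketch of the default-model logic from the first bullet above, using an illustrative stand-in struct and function name (the real gpt_params_handle_model_default lives in common/common.cpp, whose diff is not rendered below):

```cpp
// Sketch only: gpt_params_sketch and handle_model_default are illustrative
// stand-ins, not the actual llama.cpp definitions.
#include <string>

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct gpt_params_sketch {
    std::string model;     // -m/--model, now defaults to ""
    std::string model_url; // -mu/--model-url
    std::string hf_file;   // -hff/--hf-file
};

static void handle_model_default(gpt_params_sketch & params) {
    if (!params.model.empty()) {
        return; // an explicit -m/--model always wins
    }
    if (!params.hf_file.empty()) {
        // models/ + filename from --hf-file
        params.model = "models/" + params.hf_file.substr(params.hf_file.find_last_of('/') + 1);
    } else if (!params.model_url.empty()) {
        // models/ + last path segment of --model-url, minus any query string
        std::string f = params.model_url.substr(params.model_url.find_last_of('/') + 1);
        params.model = "models/" + f.substr(0, f.find('?'));
    } else {
        params.model = DEFAULT_MODEL_PATH; // legacy fallback
    }
}
```

Under this scheme, e.g. `-mu https://host/repo/bar.gguf` with no `-m` resolves to `models/bar.gguf`.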
ochafik authored Apr 29, 2024
1 parent b8c1476 commit 8843a98
Showing 7 changed files with 144 additions and 133 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
*.a
*.so
*.gguf
+ *.gguf.json
*.bin
*.exe
*.dll
260 changes: 132 additions & 128 deletions common/common.cpp

Large diffs are not rendered by default.
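
Since the common/common.cpp diff is large and not rendered, here is a hedged sketch of the companion-metadata check described in the bullets above: a <model>.gguf.json written next to the download records its url, etag and lastModified, and the file is re-fetched only when those no longer match. Helper names here (json_get, needs_redownload) are illustrative, not the actual API:

```cpp
// Sketch only: the real code parses the companion JSON properly rather than
// scanning for keys.
#include <fstream>
#include <sstream>
#include <string>

// Crude "key":"value" lookup in a flat JSON string.
static std::string json_get(const std::string & js, const std::string & key) {
    size_t pos = js.find("\"" + key + "\":\"");
    if (pos == std::string::npos) {
        return "";
    }
    pos += key.size() + 4; // skip past "key":" to the start of the value
    return js.substr(pos, js.find('"', pos) - pos);
}

// True if <model_path>.json is missing or disagrees with the requested
// url / the server's current ETag.
static bool needs_redownload(const std::string & model_path,
                             const std::string & url,
                             const std::string & remote_etag) {
    std::ifstream f(model_path + ".json");
    if (!f) {
        return true; // no metadata from a previous download
    }
    std::stringstream ss;
    ss << f.rdbuf();
    const std::string js = ss.str();
    if (json_get(js, "url") != url) {
        return true; // same local file name, different source URL
    }
    return json_get(js, "etag") != remote_etag;
}
```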

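Likewise, a sketch of the lifecycle and regex changes under the same assumed-names caveat: unique_ptrs own the curl handle and the output FILE so both are released on every return path, and the header regex is constructed once and reused across curl's per-header callback calls (each call delivers one header line, so no multiline flag is needed):

```cpp
// Sketch only: fetch() is an assumed name, not the llama.cpp function.
#include <cstdio>
#include <memory>
#include <regex>
#include <string>
#include <curl/curl.h>

static size_t header_callback(char * buffer, size_t size, size_t n_items, void * userdata) {
    // Constructed once and reused across the many calls curl makes per response.
    static const std::regex etag_re("ETag: (.*)", std::regex_constants::icase);
    const std::string header(buffer, size * n_items);
    std::smatch m;
    if (std::regex_search(header, m, etag_re)) {
        *static_cast<std::string *>(userdata) = m[1].str(); // trailing \r not trimmed here
    }
    return size * n_items;
}

static bool fetch(const char * url, const char * path, std::string & etag) {
    // unique_ptrs guarantee curl_easy_cleanup/fclose run on every return path.
    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
    std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path, "wb"), &fclose);
    if (!curl || !outfile) {
        return false;
    }
    curl_easy_setopt(curl.get(), CURLOPT_URL, url);
    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); // default write cb -> fwrite
    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, header_callback);
    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &etag);
    return curl_easy_perform(curl.get()) == CURLE_OK;
}
```
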
6 changes: 5 additions & 1 deletion common/common.h
@@ -31,6 +31,8 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
@@ -92,7 +94,7 @@ struct gpt_params {
// sampling parameters
struct llama_sampling_params sparams;

- std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+ std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
@@ -171,6 +173,8 @@ struct gpt_params {
std::vector<std::string> image; // path to image file(s)
};

+ void gpt_params_handle_model_default(gpt_params & params);

bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
2 changes: 1 addition & 1 deletion examples/main/README.md
@@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt

In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:

- - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+ - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
2 changes: 1 addition & 1 deletion examples/quantize-stats/quantize-stats.cpp
@@ -23,7 +23,7 @@
#endif

struct quantize_stats_params {
- std::string model = "models/7B/ggml-model-f16.gguf";
+ std::string model = DEFAULT_MODEL_PATH;
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;
4 changes: 3 additions & 1 deletion examples/server/server.cpp
@@ -2353,7 +2353,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" disable KV offload\n");
}
printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: %s)\n", params.model.c_str());
+ printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: unused)\n");
printf(" -hfr REPO, --hf-repo REPO\n");
Expand Down Expand Up @@ -2835,6 +2835,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
}
}

+ gpt_params_handle_model_default(params);

if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0;
2 changes: 1 addition & 1 deletion examples/server/tests/features/embeddings.feature
@@ -5,7 +5,7 @@ Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
- And a model file ggml-model-f16.gguf
+ And a model file bert-bge-small.gguf
And a model alias bert-bge-small
And 42 as server seed
And 2 slots
