From 57ef494862a8be1c721682c174df25901f8551a4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 19:42:05 +0200 Subject: [PATCH 1/8] feat(llama.cpp): support distributed llama.cpp Signed-off-by: Ettore Di Giacinto --- .env | 5 +++++ Makefile | 11 +++++++++-- backend/cpp/llama/grpc-server.cpp | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.env b/.env index ea2d4e358f52..95a515bc850f 100644 --- a/.env +++ b/.env @@ -71,6 +71,11 @@ ### Define the number of parallel LLAMA.cpp workers (Defaults to 1) # LLAMACPP_PARALLEL=1 +### Define a list of GRPC Servers for llama-cpp workers to distribute the load +# https://github.com/ggerganov/llama.cpp/pull/6829 +# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md +# LLAMACPP_GRPC_SERVERS="" + ### Enable to run parallel requests # LOCALAI_PARALLEL_REQUESTS=true diff --git a/Makefile b/Makefile index 8140377c6e33..bb7879924b5b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BINARY_NAME=local-ai # llama.cpp versions GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8 +CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12 # gpt4all version GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all @@ -314,7 +314,7 @@ build: prepare backend-assets grpcs ## Build the project CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./ build-minimal: - BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build + BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build build-api: BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build @@ -764,3 +764,10 @@ docker-image-intel-xpu: .PHONY: swagger swagger: swag init -g core/http/app.go --output swagger + +backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-grpc + $(MAKE) -C backend/cpp/llama-grpc purge + $(info ${GREEN}I llama-cpp build info:grpc${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc \ No newline at end of file diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index f9673b33ccfa..fb1e1388f2d2 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request, } else { params.n_parallel = 1; } + + const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); + if (llama_grpc_servers != NULL) { + params.rpc_servers = std::string(llama_grpc_servers); + } + // TODO: Add yarn if (!request->tensorsplit().empty()) { From 1f23930dc371ca77148d9e8ff94911261139e59d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 19:42:25 +0200 Subject: [PATCH 2/8] feat: let tweak how chat messages are merged together Signed-off-by: Ettore Di Giacinto --- core/config/backend_config.go | 33 ++++++++++++++++++++++++------ core/http/endpoints/openai/chat.go | 7 ++++++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 41c792fb1da3..6b9aa54e1d80 100644 --- a/core/config/backend_config.go +++ 
b/core/config/backend_config.go @@ -93,6 +93,8 @@ type Diffusers struct { ControlNet string `yaml:"control_net"` } +// LLMConfig is a struct that holds the configuration that are +// generic for most of the LLM backends. type LLMConfig struct { SystemPrompt string `yaml:"system_prompt"` TensorSplit string `yaml:"tensor_split"` @@ -144,6 +146,7 @@ type LLMConfig struct { YarnBetaSlow float32 `yaml:"yarn_beta_slow"` } +// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend type AutoGPTQ struct { ModelBaseName string `yaml:"model_base_name"` Device string `yaml:"device"` @@ -151,13 +154,31 @@ type AutoGPTQ struct { UseFastTokenizer bool `yaml:"use_fast_tokenizer"` } +// TemplateConfig is a struct that holds the configuration of the templating system type TemplateConfig struct { - Chat string `yaml:"chat"` - ChatMessage string `yaml:"chat_message"` - Completion string `yaml:"completion"` - Edit string `yaml:"edit"` - Functions string `yaml:"function"` - UseTokenizerTemplate bool `yaml:"use_tokenizer_template"` + // Chat is the template used in the chat completion endpoint + Chat string `yaml:"chat"` + + // ChatMessage is the template used for chat messages + ChatMessage string `yaml:"chat_message"` + + // Completion is the template used for completion requests + Completion string `yaml:"completion"` + + // Edit is the template used for edit completion requests + Edit string `yaml:"edit"` + + // Functions is the template used when tools are present in the client requests + Functions string `yaml:"function"` + + // UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used. + // Note: this is mostly consumed for backends such as vllm and transformers + // that can use the tokenizers specified in the JSON config files of the models + UseTokenizerTemplate bool `yaml:"use_tokenizer_template"` + + // JoinChatMessagesByCharacter is a string that will be used to join chat messages together. 
+ // It defaults to \n + JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"` } func (c *BackendConfig) SetFunctionCallString(s string) { diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index ccbf094667d4..c49ef263c197 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup mess = append(mess, content) } - predInput = strings.Join(mess, "\n") + joinCharacter := "\n" + if config.TemplateConfig.JoinChatMessagesByCharacter != nil { + joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter + } + + predInput = strings.Join(mess, joinCharacter) log.Debug().Msgf("Prompt (before templating): %s", predInput) templateFile := "" From 077ba78b28417d39416cd54bf462e0e7e3268093 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 20:02:58 +0200 Subject: [PATCH 3/8] refactor Signed-off-by: Ettore Di Giacinto --- Makefile | 14 +++--- pkg/model/initializers.go | 94 ++++++++++++++++++++++++++------------- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index bb7879924b5b..d4666936dc40 100644 --- a/Makefile +++ b/Makefile @@ -691,6 +691,13 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda +backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-grpc + $(MAKE) -C backend/cpp/llama-grpc purge + $(info ${GREEN}I llama-cpp build info:grpc${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc + backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/ @@ -764,10 +771,3 @@ docker-image-intel-xpu: .PHONY: swagger swagger: swag init -g core/http/app.go --output swagger - -backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc - cp -rf backend/cpp/llama backend/cpp/llama-grpc - $(MAKE) -C backend/cpp/llama-grpc purge - $(info ${GREEN}I llama-cpp build info:grpc${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server - cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc \ No newline at end of file diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 115a12a0cf4c..bd0316d6d0b0 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -29,13 +29,14 @@ var Aliases map[string]string = map[string]string{ const ( LlamaGGML = "llama-ggml" - LLamaCPP = "llama-cpp" + LLamaCPP = "llama-cpp" LLamaCPPCUDA12 = "llama-cpp-cuda12" LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" LLamaCPPCUDA = "llama-cpp-cuda" + 
LLamaCPPGRPC = "llama-cpp-grpc" Gpt4AllLlamaBackend = "gpt4all-llama" Gpt4AllMptBackend = "gpt4all-mpt" @@ -81,7 +82,8 @@ ENTRY: } } - foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false + // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false if _, ok := backends[LLamaCPP]; !ok { for _, e := range entry { if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { @@ -96,16 +98,23 @@ ENTRY: backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) foundLCPPFallback = true } + if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) + foundLCPPGRPC = true + } } } // order backends from the asset directory. // as we scan for backends, we want to keep some order which backends are tried of. // for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last. - // sets a priority list - // First has more priority + + // sets a priority list - first has more priority priorityList := []string{ - // First llama.cpp and llama-ggml + + // First llama.cpp(variants) and llama-ggml to follow. + // We keep the fallback to prevent that if the llama.cpp variants + // that depends on shared libs if breaks have still a safety net. LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback, } @@ -142,6 +151,50 @@ ENTRY: return orderedBackends, nil } +// selectGRPCProcess selects the GRPC process to start based on system capabilities +func selectGRPCProcess(backend, assetDir string) string { + foundCUDA := false + var grpcProcess string + + // Select backend now just for llama.cpp + if backend != LLamaCPP { + return "" + } + + // Note: This environment variable is read by the LocalAI's llama.cpp grpc-server + if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" { + return backendPath(assetDir, LLamaCPPGRPC) + } + + gpus, err := xsysinfo.GPUs() + if err == nil { + for _, gpu := range gpus { + if strings.Contains(gpu.String(), "nvidia") { + log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPCUDA) + if _, err := os.Stat(grpcProcess); err == nil { + foundCUDA = true + } + } + } + } + + if !foundCUDA { + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPFallback) + } + } + + return grpcProcess +} + // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) { @@ -192,33 +245,10 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } else { grpcProcess := backendPath(o.assetDir, backend) - foundCUDA := false - // for llama-cpp, check CPU capabilities and load the appropriate variant - if backend == LLamaCPP { - gpus, err := xsysinfo.GPUs() - if err == nil { - for _, gpu := range gpus { - if strings.Contains(gpu.String(), "nvidia") { - log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA) - if _, err := 
os.Stat(grpcProcess); err == nil { - foundCUDA = true - } - } - } - } - - if !foundCUDA { - if cpu.X86.HasAVX2 { - log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { - log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) - } else { - log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) - } + if os.Getenv("DISABLE_AUTODETECT") != "true" { + // autoDetect GRPC process to start based on system capabilities + if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" { + grpcProcess = selectedProcess } } From 2c5ae68fe59afb24f293d2679ad03a09ca20e6d6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 20:04:39 +0200 Subject: [PATCH 4/8] Makefile: register to ALL_GRPC_BACKENDS Signed-off-by: Ettore Di Giacinto --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index d4666936dc40..5b9d5191415e 100644 --- a/Makefile +++ b/Makefile @@ -158,6 +158,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml +ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper From cacdf676a2e26e9679c2e425937a4d864472d872 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 20:29:05 +0200 Subject: [PATCH 5/8] refactoring, allow disable auto-detection of backends Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 110 +++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index bd0316d6d0b0..a349406c356b 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -60,7 +60,7 @@ func backendPath(assetDir, backend string) string { // backendsInAssetDir returns the list of backends in the asset directory // that should be loaded -func backendsInAssetDir(assetDir string) (*orderedmap.OrderedMap[string, any], error) { +func backendsInAssetDir(assetDir string) ([]string, error) { // Exclude backends from automatic loading excludeBackends := []string{LocalStoreBackend} entry, err := os.ReadDir(backendPath(assetDir, "")) @@ -75,32 +75,41 @@ ENTRY: continue ENTRY } } - if !e.IsDir() { - if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) { - backends[e.Name()] = []string{} - } + if e.IsDir() { + continue + } + + // Skip the llama.cpp variants if we are autoDetecting + // But we always load the fallback variant if it exists + if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect { + continue } + + backends[e.Name()] = []string{} } - // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) - foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false - if _, ok := backends[LLamaCPP]; !ok { - for _, e := range entry { - if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) - foundLCPPAVX2 = true - } - if strings.Contains(e.Name(), LLamaCPPAVX) && 
!foundLCPPAVX { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX) - foundLCPPAVX = true - } - if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) - foundLCPPFallback = true - } - if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) - foundLCPPGRPC = true + // if we are autoDetecting, we want to show the llama.cpp variants as a single backend + if autoDetect { + // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false + if _, ok := backends[LLamaCPP]; !ok { + for _, e := range entry { + if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) + foundLCPPAVX2 = true + } + if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX) + foundLCPPAVX = true + } + if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) + foundLCPPFallback = true + } + if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) + foundLCPPGRPC = true + } } } } @@ -148,7 +157,7 @@ ENTRY: } } - return orderedBackends, nil + return orderedBackends.Keys(), nil } // selectGRPCProcess selects the GRPC process to start based on system capabilities @@ -171,30 +180,35 @@ func selectGRPCProcess(backend, assetDir string) string { for _, gpu := range gpus { if strings.Contains(gpu.String(), "nvidia") { log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPCUDA) - if _, err := os.Stat(grpcProcess); err == nil { + p := backendPath(assetDir, LLamaCPPCUDA) + if _, err := os.Stat(p); err == nil { + grpcProcess = p foundCUDA = true } } } } - if !foundCUDA { - if cpu.X86.HasAVX2 { - log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { - log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPAVX) - } else { - log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPFallback) - } + if foundCUDA { + return grpcProcess + } + + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPFallback) } return grpcProcess } +var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" + // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) { @@ -245,7 +259,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } else { grpcProcess := backendPath(o.assetDir, backend) - if os.Getenv("DISABLE_AUTODETECT") != "true" { + if autoDetect { // autoDetect GRPC 
process to start based on system capabilities if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" { grpcProcess = selectedProcess @@ -393,28 +407,24 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { var err error - // autoload also external backends - allBackendsToAutoLoad := orderedmap.NewOrderedMap[string, any]() + // get backends embedded in the binary autoLoadBackends, err := backendsInAssetDir(o.assetDir) if err != nil { return nil, err } - log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends) - - for _, k := range autoLoadBackends.Keys() { - v, _ := autoLoadBackends.Get(k) - allBackendsToAutoLoad.Set(k, v) - } + // append externalBackends supplied by the user via the CLI for _, b := range o.externalBackends { - allBackendsToAutoLoad.Set(b, []string{}) + autoLoadBackends = append(autoLoadBackends, b) } + log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends) + if o.model != "" { - log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, allBackendsToAutoLoad.Keys()) + log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, autoLoadBackends) } - for _, key := range allBackendsToAutoLoad.Keys() { + for _, key := range autoLoadBackends { log.Info().Msgf("[%s] Attempting to load", key) options := []Option{ WithBackendString(key), From 793c45d7c43b67398f15c64c19736a4e0b8b9686 Mon Sep 17 00:00:00 2001 From: mudler Date: Tue, 14 May 2024 23:34:54 +0200 Subject: [PATCH 6/8] minor fixups Signed-off-by: mudler --- pkg/model/initializers.go | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index a349406c356b..d013740ce5d6 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -12,9 +12,9 @@ import ( grpc "github.com/go-skynet/LocalAI/pkg/grpc" "github.com/go-skynet/LocalAI/pkg/xsysinfo" + "github.com/klauspost/cpuid/v2" "github.com/phayes/freeport" "github.com/rs/zerolog/log" - "golang.org/x/sys/cpu" "github.com/elliotchance/orderedmap/v2" ) @@ -26,12 +26,13 @@ var Aliases map[string]string = map[string]string{ "langchain-huggingface": LCHuggingFaceBackend, } +var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" + const ( LlamaGGML = "llama-ggml" LLamaCPP = "llama-cpp" - LLamaCPPCUDA12 = "llama-cpp-cuda12" LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" @@ -90,8 +91,9 @@ ENTRY: // if we are autoDetecting, we want to show the llama.cpp variants as a single backend if autoDetect { - // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) - foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false + // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up + // when starting the service + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda := false, false, false, false, false if _, ok := backends[LLamaCPP]; !ok { for _, e := range entry { if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { @@ -110,6 +112,10 @@ ENTRY: backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) foundLCPPGRPC = true } + if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA) + foundLCPPCuda = true + } } } } @@ -172,6 +178,7 @@ func 
selectGRPCProcess(backend, assetDir string) string { // Note: This environment variable is read by the LocalAI's llama.cpp grpc-server if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" { + log.Info().Msgf("[%s] attempting to load with GRPC variant", LLamaCPPGRPC) return backendPath(assetDir, LLamaCPPGRPC) } @@ -179,11 +186,13 @@ func selectGRPCProcess(backend, assetDir string) string { if err == nil { for _, gpu := range gpus { if strings.Contains(gpu.String(), "nvidia") { - log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) p := backendPath(assetDir, LLamaCPPCUDA) if _, err := os.Stat(p); err == nil { + log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) grpcProcess = p foundCUDA = true + } else { + log.Info().Msgf("GPU device found but no CUDA backend present") } } } @@ -193,10 +202,10 @@ func selectGRPCProcess(backend, assetDir string) string { return grpcProcess } - if cpu.X86.HasAVX2 { + if xsysinfo.HasCPUCaps(cpuid.AVX2) { log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) grpcProcess = backendPath(assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { + } else if xsysinfo.HasCPUCaps(cpuid.AVX) { log.Info().Msgf("[%s] attempting to load with AVX variant", backend) grpcProcess = backendPath(assetDir, LLamaCPPAVX) } else { @@ -207,8 +216,6 @@ func selectGRPCProcess(backend, assetDir string) string { return grpcProcess } -var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" - // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) { From 3e53f452a73b9e7fda4fcabc367b8a4cda216330 Mon Sep 17 00:00:00 2001 From: mudler Date: Wed, 15 May 2024 00:14:08 +0200 Subject: [PATCH 7/8] feat: add cmd to start rpc-server from llama.cpp Signed-off-by: mudler --- Makefile | 5 +++++ core/cli/cli.go | 9 +++++---- core/cli/llamacppworker.go | 37 +++++++++++++++++++++++++++++++++++++ pkg/assets/extract.go | 6 +++++- 4 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 core/cli/llamacppworker.go diff --git a/Makefile b/Makefile index 5b9d5191415e..0b6ff9593e5f 100644 --- a/Makefile +++ b/Makefile @@ -159,6 +159,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc +ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper @@ -699,6 +700,10 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc +backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc + mkdir -p backend-assets/util/ + cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server + backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml 
./backend/go/llm/llama-ggml/ diff --git a/core/cli/cli.go b/core/cli/cli.go index 2f2dcd8ba1bb..71f877b8105d 100644 --- a/core/cli/cli.go +++ b/core/cli/cli.go @@ -13,8 +13,9 @@ type Context struct { var CLI struct { Context `embed:""` - Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"` - Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"` - TTS TTSCMD `cmd:"" help:"Convert text to speech"` - Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"` + Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"` + Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"` + TTS TTSCMD `cmd:"" help:"Convert text to speech"` + Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"` + LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"` } diff --git a/core/cli/llamacppworker.go b/core/cli/llamacppworker.go new file mode 100644 index 000000000000..832b5bde3fc3 --- /dev/null +++ b/core/cli/llamacppworker.go @@ -0,0 +1,37 @@ +package cli + +import ( + "os" + "syscall" + + "github.com/go-skynet/LocalAI/pkg/assets" + "github.com/rs/zerolog/log" +) + +type LLAMACPPWorkerCMD struct { + Args []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"` + BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"` +} + +func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error { + // Extract files from the embedded FS + err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath) + log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath) + if err != nil { + log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err) + } + + return syscall.Exec( + assets.ResolvePath( + r.BackendAssetsPath, + "util", + "llama-cpp-rpc-server", + ), + append([]string{ + assets.ResolvePath( + r.BackendAssetsPath, + "util", + "llama-cpp-rpc-server", + )}, r.Args...), + os.Environ()) +} diff --git a/pkg/assets/extract.go b/pkg/assets/extract.go index b795cb30a94e..8f668a1aaeda 100644 --- a/pkg/assets/extract.go +++ b/pkg/assets/extract.go @@ -8,6 +8,10 @@ import ( "path/filepath" ) +func ResolvePath(dir string, paths ...string) string { + return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...) 
+} + func ExtractFiles(content embed.FS, extractDir string) error { // Create the target directory if it doesn't exist err := os.MkdirAll(extractDir, 0750) @@ -39,7 +43,7 @@ func ExtractFiles(content embed.FS, extractDir string) error { } // Create the file in the target directory - err = os.WriteFile(targetFile, fileData, 0600) + err = os.WriteFile(targetFile, fileData, 0700) if err != nil { return fmt.Errorf("failed to write file: %v", err) } From b33224b4d21be529d9aba660fe0cabf5e29ea0c6 Mon Sep 17 00:00:00 2001 From: mudler Date: Wed, 15 May 2024 00:20:29 +0200 Subject: [PATCH 8/8] ci: add ccache Signed-off-by: mudler --- .github/workflows/release.yaml | 4 ++-- Dockerfile | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 30b6d9507034..0245725d00a8 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -29,7 +29,7 @@ jobs: - name: Dependencies run: | sudo apt-get update - sudo apt-get install build-essential ffmpeg protobuf-compiler + sudo apt-get install build-essential ffmpeg protobuf-compiler ccache - name: Install CUDA Dependencies run: | curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb @@ -86,7 +86,7 @@ jobs: cache: false - name: Dependencies run: | - sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler + sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest go install google.golang.org/protobuf/cmd/protoc-gen-go@latest - name: Build stablediffusion diff --git a/Dockerfile b/Dockerfile index 9680ba5c6edc..9cde257cecdb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts" RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ + ccache \ ca-certificates \ cmake \ curl \
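
Taken together, the patches above wire LocalAI up to llama.cpp's RPC support: worker nodes run the bundled llama-cpp-rpc-server binary (extracted from backend-assets by the new CLI subcommand), while the main instance picks the llama-cpp-grpc variant whenever LLAMACPP_GRPC_SERVERS is set and forwards that list to llama.cpp's params.rpc_servers. A minimal usage sketch follows, assuming the kong-derived subcommand name "llamacpp-worker" and that the worker arguments are a plain host/port pair as hinted by the CLI help text; the exact arguments ultimately depend on the upstream rpc-server binary and are not confirmed by the patches themselves.

    # On each worker node: extract the bundled rpc-server and start listening
    # (subcommand name and argument form are assumptions, see note above)
    local-ai llamacpp-worker 0.0.0.0 50052

    # On the main instance: list the workers as comma-separated host:port pairs.
    # Setting LLAMACPP_GRPC_SERVERS makes selectGRPCProcess() choose the
    # llama-cpp-grpc variant, and grpc-server.cpp passes the list on to
    # llama.cpp's params.rpc_servers.
    LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" local-ai run

    # Variant autodetection can be disabled entirely if needed:
    DISABLE_AUTODETECT=true local-ai run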