From 57ef494862a8be1c721682c174df25901f8551a4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 19:42:05 +0200 Subject: [PATCH 1/8] feat(llama.cpp): support distributed llama.cpp Signed-off-by: Ettore Di Giacinto --- .env | 5 +++++ Makefile | 11 +++++++++-- backend/cpp/llama/grpc-server.cpp | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/.env b/.env index ea2d4e358f52..95a515bc850f 100644 --- a/.env +++ b/.env @@ -71,6 +71,11 @@ ### Define the number of parallel LLAMA.cpp workers (Defaults to 1) # LLAMACPP_PARALLEL=1 +### Define a list of GRPC Servers for llama-cpp workers to distribute the load +# https://github.com/ggerganov/llama.cpp/pull/6829 +# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md +# LLAMACPP_GRPC_SERVERS="" + ### Enable to run parallel requests # LOCALAI_PARALLEL_REQUESTS=true diff --git a/Makefile b/Makefile index 8140377c6e33..bb7879924b5b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ BINARY_NAME=local-ai # llama.cpp versions GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8 +CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12 # gpt4all version GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all @@ -314,7 +314,7 @@ build: prepare backend-assets grpcs ## Build the project CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./ build-minimal: - BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build + BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build build-api: BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build @@ -764,3 +764,10 @@ docker-image-intel-xpu: .PHONY: swagger swagger: swag init -g core/http/app.go --output swagger + +backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-grpc + $(MAKE) -C backend/cpp/llama-grpc purge + $(info ${GREEN}I llama-cpp build info:grpc${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc \ No newline at end of file diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index f9673b33ccfa..fb1e1388f2d2 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request, } else { params.n_parallel = 1; } + + const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS"); + if (llama_grpc_servers != NULL) { + params.rpc_servers = std::string(llama_grpc_servers); + } + // TODO: Add yarn if (!request->tensorsplit().empty()) { From 1f23930dc371ca77148d9e8ff94911261139e59d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 19:42:25 +0200 Subject: [PATCH 2/8] feat: let tweak how chat messages are merged together Signed-off-by: Ettore Di Giacinto --- core/config/backend_config.go | 33 ++++++++++++++++++++++++------ core/http/endpoints/openai/chat.go | 7 ++++++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 41c792fb1da3..6b9aa54e1d80 100644 --- a/core/config/backend_config.go +++ 
b/core/config/backend_config.go @@ -93,6 +93,8 @@ type Diffusers struct { ControlNet string `yaml:"control_net"` } +// LLMConfig is a struct that holds the configuration that are +// generic for most of the LLM backends. type LLMConfig struct { SystemPrompt string `yaml:"system_prompt"` TensorSplit string `yaml:"tensor_split"` @@ -144,6 +146,7 @@ type LLMConfig struct { YarnBetaSlow float32 `yaml:"yarn_beta_slow"` } +// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend type AutoGPTQ struct { ModelBaseName string `yaml:"model_base_name"` Device string `yaml:"device"` @@ -151,13 +154,31 @@ type AutoGPTQ struct { UseFastTokenizer bool `yaml:"use_fast_tokenizer"` } +// TemplateConfig is a struct that holds the configuration of the templating system type TemplateConfig struct { - Chat string `yaml:"chat"` - ChatMessage string `yaml:"chat_message"` - Completion string `yaml:"completion"` - Edit string `yaml:"edit"` - Functions string `yaml:"function"` - UseTokenizerTemplate bool `yaml:"use_tokenizer_template"` + // Chat is the template used in the chat completion endpoint + Chat string `yaml:"chat"` + + // ChatMessage is the template used for chat messages + ChatMessage string `yaml:"chat_message"` + + // Completion is the template used for completion requests + Completion string `yaml:"completion"` + + // Edit is the template used for edit completion requests + Edit string `yaml:"edit"` + + // Functions is the template used when tools are present in the client requests + Functions string `yaml:"function"` + + // UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used. + // Note: this is mostly consumed for backends such as vllm and transformers + // that can use the tokenizers specified in the JSON config files of the models + UseTokenizerTemplate bool `yaml:"use_tokenizer_template"` + + // JoinChatMessagesByCharacter is a string that will be used to join chat messages together. 
+ // It defaults to \n + JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"` } func (c *BackendConfig) SetFunctionCallString(s string) { diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index ccbf094667d4..c49ef263c197 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup mess = append(mess, content) } - predInput = strings.Join(mess, "\n") + joinCharacter := "\n" + if config.TemplateConfig.JoinChatMessagesByCharacter != nil { + joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter + } + + predInput = strings.Join(mess, joinCharacter) log.Debug().Msgf("Prompt (before templating): %s", predInput) templateFile := "" From 077ba78b28417d39416cd54bf462e0e7e3268093 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 20:02:58 +0200 Subject: [PATCH 3/8] refactor Signed-off-by: Ettore Di Giacinto --- Makefile | 14 +++--- pkg/model/initializers.go | 94 ++++++++++++++++++++++++++------------- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index bb7879924b5b..d4666936dc40 100644 --- a/Makefile +++ b/Makefile @@ -691,6 +691,13 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda +backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-grpc + $(MAKE) -C backend/cpp/llama-grpc purge + $(info ${GREEN}I llama-cpp build info:grpc${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc + backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/ @@ -764,10 +771,3 @@ docker-image-intel-xpu: .PHONY: swagger swagger: swag init -g core/http/app.go --output swagger - -backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc - cp -rf backend/cpp/llama backend/cpp/llama-grpc - $(MAKE) -C backend/cpp/llama-grpc purge - $(info ${GREEN}I llama-cpp build info:grpc${RESET}) - CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server - cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc \ No newline at end of file diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 115a12a0cf4c..bd0316d6d0b0 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -29,13 +29,14 @@ var Aliases map[string]string = map[string]string{ const ( LlamaGGML = "llama-ggml" - LLamaCPP = "llama-cpp" + LLamaCPP = "llama-cpp" LLamaCPPCUDA12 = "llama-cpp-cuda12" LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" LLamaCPPCUDA = "llama-cpp-cuda" + 
LLamaCPPGRPC = "llama-cpp-grpc" Gpt4AllLlamaBackend = "gpt4all-llama" Gpt4AllMptBackend = "gpt4all-mpt" @@ -81,7 +82,8 @@ ENTRY: } } - foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false + // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false if _, ok := backends[LLamaCPP]; !ok { for _, e := range entry { if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { @@ -96,16 +98,23 @@ ENTRY: backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) foundLCPPFallback = true } + if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) + foundLCPPGRPC = true + } } } // order backends from the asset directory. // as we scan for backends, we want to keep some order which backends are tried of. // for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last. - // sets a priority list - // First has more priority + + // sets a priority list - first has more priority priorityList := []string{ - // First llama.cpp and llama-ggml + + // First llama.cpp(variants) and llama-ggml to follow. + // We keep the fallback to prevent that if the llama.cpp variants + // that depends on shared libs if breaks have still a safety net. LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback, } @@ -142,6 +151,50 @@ ENTRY: return orderedBackends, nil } +// selectGRPCProcess selects the GRPC process to start based on system capabilities +func selectGRPCProcess(backend, assetDir string) string { + foundCUDA := false + var grpcProcess string + + // Select backend now just for llama.cpp + if backend != LLamaCPP { + return "" + } + + // Note: This environment variable is read by the LocalAI's llama.cpp grpc-server + if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" { + return backendPath(assetDir, LLamaCPPGRPC) + } + + gpus, err := xsysinfo.GPUs() + if err == nil { + for _, gpu := range gpus { + if strings.Contains(gpu.String(), "nvidia") { + log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPCUDA) + if _, err := os.Stat(grpcProcess); err == nil { + foundCUDA = true + } + } + } + } + + if !foundCUDA { + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPFallback) + } + } + + return grpcProcess +} + // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) { @@ -192,33 +245,10 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } else { grpcProcess := backendPath(o.assetDir, backend) - foundCUDA := false - // for llama-cpp, check CPU capabilities and load the appropriate variant - if backend == LLamaCPP { - gpus, err := xsysinfo.GPUs() - if err == nil { - for _, gpu := range gpus { - if strings.Contains(gpu.String(), "nvidia") { - log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA) - if _, err := 
os.Stat(grpcProcess); err == nil { - foundCUDA = true - } - } - } - } - - if !foundCUDA { - if cpu.X86.HasAVX2 { - log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { - log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) - } else { - log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) - } + if os.Getenv("DISABLE_AUTODETECT") != "true" { + // autoDetect GRPC process to start based on system capabilities + if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" { + grpcProcess = selectedProcess } } From 2c5ae68fe59afb24f293d2679ad03a09ca20e6d6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 20:04:39 +0200 Subject: [PATCH 4/8] Makefile: register to ALL_GRPC_BACKENDS Signed-off-by: Ettore Di Giacinto --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index d4666936dc40..5b9d5191415e 100644 --- a/Makefile +++ b/Makefile @@ -158,6 +158,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml +ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper From cacdf676a2e26e9679c2e425937a4d864472d872 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 14 May 2024 20:29:05 +0200 Subject: [PATCH 5/8] refactoring, allow disable auto-detection of backends Signed-off-by: Ettore Di Giacinto --- pkg/model/initializers.go | 110 +++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index bd0316d6d0b0..a349406c356b 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -60,7 +60,7 @@ func backendPath(assetDir, backend string) string { // backendsInAssetDir returns the list of backends in the asset directory // that should be loaded -func backendsInAssetDir(assetDir string) (*orderedmap.OrderedMap[string, any], error) { +func backendsInAssetDir(assetDir string) ([]string, error) { // Exclude backends from automatic loading excludeBackends := []string{LocalStoreBackend} entry, err := os.ReadDir(backendPath(assetDir, "")) @@ -75,32 +75,41 @@ ENTRY: continue ENTRY } } - if !e.IsDir() { - if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) { - backends[e.Name()] = []string{} - } + if e.IsDir() { + continue + } + + // Skip the llama.cpp variants if we are autoDetecting + // But we always load the fallback variant if it exists + if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect { + continue } + + backends[e.Name()] = []string{} } - // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) - foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false - if _, ok := backends[LLamaCPP]; !ok { - for _, e := range entry { - if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) - foundLCPPAVX2 = true - } - if strings.Contains(e.Name(), LLamaCPPAVX) && 
!foundLCPPAVX { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX) - foundLCPPAVX = true - } - if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) - foundLCPPFallback = true - } - if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC { - backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) - foundLCPPGRPC = true + // if we are autoDetecting, we want to show the llama.cpp variants as a single backend + if autoDetect { + // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false + if _, ok := backends[LLamaCPP]; !ok { + for _, e := range entry { + if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2) + foundLCPPAVX2 = true + } + if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX) + foundLCPPAVX = true + } + if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback) + foundLCPPFallback = true + } + if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) + foundLCPPGRPC = true + } } } } @@ -148,7 +157,7 @@ ENTRY: } } - return orderedBackends, nil + return orderedBackends.Keys(), nil } // selectGRPCProcess selects the GRPC process to start based on system capabilities @@ -171,30 +180,35 @@ func selectGRPCProcess(backend, assetDir string) string { for _, gpu := range gpus { if strings.Contains(gpu.String(), "nvidia") { log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPCUDA) - if _, err := os.Stat(grpcProcess); err == nil { + p := backendPath(assetDir, LLamaCPPCUDA) + if _, err := os.Stat(p); err == nil { + grpcProcess = p foundCUDA = true } } } } - if !foundCUDA { - if cpu.X86.HasAVX2 { - log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { - log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPAVX) - } else { - log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = backendPath(assetDir, LLamaCPPFallback) - } + if foundCUDA { + return grpcProcess + } + + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(assetDir, LLamaCPPFallback) } return grpcProcess } +var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" + // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) { @@ -245,7 +259,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } else { grpcProcess := backendPath(o.assetDir, backend) - if os.Getenv("DISABLE_AUTODETECT") != "true" { + if autoDetect { // autoDetect GRPC 
process to start based on system capabilities if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" { grpcProcess = selectedProcess @@ -393,28 +407,24 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) { var err error - // autoload also external backends - allBackendsToAutoLoad := orderedmap.NewOrderedMap[string, any]() + // get backends embedded in the binary autoLoadBackends, err := backendsInAssetDir(o.assetDir) if err != nil { return nil, err } - log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends) - - for _, k := range autoLoadBackends.Keys() { - v, _ := autoLoadBackends.Get(k) - allBackendsToAutoLoad.Set(k, v) - } + // append externalBackends supplied by the user via the CLI for _, b := range o.externalBackends { - allBackendsToAutoLoad.Set(b, []string{}) + autoLoadBackends = append(autoLoadBackends, b) } + log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends) + if o.model != "" { - log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, allBackendsToAutoLoad.Keys()) + log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, autoLoadBackends) } - for _, key := range allBackendsToAutoLoad.Keys() { + for _, key := range autoLoadBackends { log.Info().Msgf("[%s] Attempting to load", key) options := []Option{ WithBackendString(key), From 793c45d7c43b67398f15c64c19736a4e0b8b9686 Mon Sep 17 00:00:00 2001 From: mudler Date: Tue, 14 May 2024 23:34:54 +0200 Subject: [PATCH 6/8] minor fixups Signed-off-by: mudler --- pkg/model/initializers.go | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index a349406c356b..d013740ce5d6 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -12,9 +12,9 @@ import ( grpc "github.com/go-skynet/LocalAI/pkg/grpc" "github.com/go-skynet/LocalAI/pkg/xsysinfo" + "github.com/klauspost/cpuid/v2" "github.com/phayes/freeport" "github.com/rs/zerolog/log" - "golang.org/x/sys/cpu" "github.com/elliotchance/orderedmap/v2" ) @@ -26,12 +26,13 @@ var Aliases map[string]string = map[string]string{ "langchain-huggingface": LCHuggingFaceBackend, } +var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" + const ( LlamaGGML = "llama-ggml" LLamaCPP = "llama-cpp" - LLamaCPPCUDA12 = "llama-cpp-cuda12" LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" @@ -90,8 +91,9 @@ ENTRY: // if we are autoDetecting, we want to show the llama.cpp variants as a single backend if autoDetect { - // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) - foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC := false, false, false, false + // if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up + // when starting the service + foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda := false, false, false, false, false if _, ok := backends[LLamaCPP]; !ok { for _, e := range entry { if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 { @@ -110,6 +112,10 @@ ENTRY: backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC) foundLCPPGRPC = true } + if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda { + backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA) + foundLCPPCuda = true + } } } } @@ -172,6 +178,7 @@ func 
selectGRPCProcess(backend, assetDir string) string { // Note: This environment variable is read by the LocalAI's llama.cpp grpc-server if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" { + log.Info().Msgf("[%s] attempting to load with GRPC variant", LLamaCPPGRPC) return backendPath(assetDir, LLamaCPPGRPC) } @@ -179,11 +186,13 @@ func selectGRPCProcess(backend, assetDir string) string { if err == nil { for _, gpu := range gpus { if strings.Contains(gpu.String(), "nvidia") { - log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) p := backendPath(assetDir, LLamaCPPCUDA) if _, err := os.Stat(p); err == nil { + log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) grpcProcess = p foundCUDA = true + } else { + log.Info().Msgf("GPU device found but no CUDA backend present") } } } @@ -193,10 +202,10 @@ func selectGRPCProcess(backend, assetDir string) string { return grpcProcess } - if cpu.X86.HasAVX2 { + if xsysinfo.HasCPUCaps(cpuid.AVX2) { log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) grpcProcess = backendPath(assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { + } else if xsysinfo.HasCPUCaps(cpuid.AVX) { log.Info().Msgf("[%s] attempting to load with AVX variant", backend) grpcProcess = backendPath(assetDir, LLamaCPPAVX) } else { @@ -207,8 +216,6 @@ func selectGRPCProcess(backend, assetDir string) string { return grpcProcess } -var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true" - // starts the grpcModelProcess for the backend, and returns a grpc client // It also loads the model func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) { From 3e53f452a73b9e7fda4fcabc367b8a4cda216330 Mon Sep 17 00:00:00 2001 From: mudler Date: Wed, 15 May 2024 00:14:08 +0200 Subject: [PATCH 7/8] feat: add cmd to start rpc-server from llama.cpp Signed-off-by: mudler --- Makefile | 5 +++++ core/cli/cli.go | 9 +++++---- core/cli/llamacppworker.go | 37 +++++++++++++++++++++++++++++++++++++ pkg/assets/extract.go | 6 +++++- 4 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 core/cli/llamacppworker.go diff --git a/Makefile b/Makefile index 5b9d5191415e..0b6ff9593e5f 100644 --- a/Makefile +++ b/Makefile @@ -159,6 +159,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc +ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper @@ -699,6 +700,10 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc +backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc + mkdir -p backend-assets/util/ + cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server + backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml 
./backend/go/llm/llama-ggml/ diff --git a/core/cli/cli.go b/core/cli/cli.go index 2f2dcd8ba1bb..71f877b8105d 100644 --- a/core/cli/cli.go +++ b/core/cli/cli.go @@ -13,8 +13,9 @@ type Context struct { var CLI struct { Context `embed:""` - Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"` - Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"` - TTS TTSCMD `cmd:"" help:"Convert text to speech"` - Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"` + Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"` + Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"` + TTS TTSCMD `cmd:"" help:"Convert text to speech"` + Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"` + LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"` } diff --git a/core/cli/llamacppworker.go b/core/cli/llamacppworker.go new file mode 100644 index 000000000000..832b5bde3fc3 --- /dev/null +++ b/core/cli/llamacppworker.go @@ -0,0 +1,37 @@ +package cli + +import ( + "os" + "syscall" + + "github.com/go-skynet/LocalAI/pkg/assets" + "github.com/rs/zerolog/log" +) + +type LLAMACPPWorkerCMD struct { + Args []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"` + BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"` +} + +func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error { + // Extract files from the embedded FS + err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath) + log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath) + if err != nil { + log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err) + } + + return syscall.Exec( + assets.ResolvePath( + r.BackendAssetsPath, + "util", + "llama-cpp-rpc-server", + ), + append([]string{ + assets.ResolvePath( + r.BackendAssetsPath, + "util", + "llama-cpp-rpc-server", + )}, r.Args...), + os.Environ()) +} diff --git a/pkg/assets/extract.go b/pkg/assets/extract.go index b795cb30a94e..8f668a1aaeda 100644 --- a/pkg/assets/extract.go +++ b/pkg/assets/extract.go @@ -8,6 +8,10 @@ import ( "path/filepath" ) +func ResolvePath(dir string, paths ...string) string { + return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...) 
+} + func ExtractFiles(content embed.FS, extractDir string) error { // Create the target directory if it doesn't exist err := os.MkdirAll(extractDir, 0750) @@ -39,7 +43,7 @@ func ExtractFiles(content embed.FS, extractDir string) error { } // Create the file in the target directory - err = os.WriteFile(targetFile, fileData, 0600) + err = os.WriteFile(targetFile, fileData, 0700) if err != nil { return fmt.Errorf("failed to write file: %v", err) } From b33224b4d21be529d9aba660fe0cabf5e29ea0c6 Mon Sep 17 00:00:00 2001 From: mudler Date: Wed, 15 May 2024 00:20:29 +0200 Subject: [PATCH 8/8] ci: add ccache Signed-off-by: mudler --- .github/workflows/release.yaml | 4 ++-- Dockerfile | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 30b6d9507034..0245725d00a8 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -29,7 +29,7 @@ jobs: - name: Dependencies run: | sudo apt-get update - sudo apt-get install build-essential ffmpeg protobuf-compiler + sudo apt-get install build-essential ffmpeg protobuf-compiler ccache - name: Install CUDA Dependencies run: | curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb @@ -86,7 +86,7 @@ jobs: cache: false - name: Dependencies run: | - sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler + sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest go install google.golang.org/protobuf/cmd/protoc-gen-go@latest - name: Build stablediffusion diff --git a/Dockerfile b/Dockerfile index 9680ba5c6edc..9cde257cecdb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts" RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ + ccache \ ca-certificates \ cmake \ curl \
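
Taken together, the patches above wire LocalAI up to llama.cpp's RPC support: worker nodes run the bundled llama-cpp-rpc-server binary (extracted from backend-assets by the new CLI subcommand), while the main instance picks the llama-cpp-grpc variant whenever LLAMACPP_GRPC_SERVERS is set and forwards that list to llama.cpp's params.rpc_servers. A minimal usage sketch follows, assuming the kong-derived subcommand name "llamacpp-worker" and that the worker arguments are a plain host/port pair as hinted by the CLI help text; the exact arguments ultimately depend on the upstream rpc-server binary and are not confirmed by the patches themselves.

    # On each worker node: extract the bundled rpc-server and start listening
    # (subcommand name and argument form are assumptions, see note above)
    local-ai llamacpp-worker 0.0.0.0 50052

    # On the main instance: list the workers as comma-separated host:port pairs.
    # Setting LLAMACPP_GRPC_SERVERS makes selectGRPCProcess() choose the
    # llama-cpp-grpc variant, and grpc-server.cpp passes the list on to
    # llama.cpp's params.rpc_servers.
    LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" local-ai run

    # Variant autodetection can be disabled entirely if needed:
    DISABLE_AUTODETECT=true local-ai run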