feat(llama.cpp): add distributed llama.cpp inferencing (#2324)
* feat(llama.cpp): support distributed llama.cpp

Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat: allow tweaking how chat messages are merged together

Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactor

Signed-off-by: Ettore Di Giacinto <[email protected]>

* Makefile: register to ALL_GRPC_BACKENDS

Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactoring: allow disabling auto-detection of backends

Signed-off-by: Ettore Di Giacinto <[email protected]>

* minor fixups

Signed-off-by: mudler <[email protected]>

* feat: add cmd to start rpc-server from llama.cpp

Signed-off-by: mudler <[email protected]>

* ci: add ccache

Signed-off-by: mudler <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
Signed-off-by: mudler <[email protected]>
mudler authored May 14, 2024
1 parent 2990966 commit c89271b
Showing 11 changed files with 220 additions and 80 deletions.
5 changes: 5 additions & 0 deletions .env
@@ -71,6 +71,11 @@
### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
# LLAMACPP_PARALLEL=1

### Define a list of GRPC Servers for llama-cpp workers to distribute the load
# https://github.com/ggerganov/llama.cpp/pull/6829
# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
# LLAMACPP_GRPC_SERVERS=""

### Enable to run parallel requests
# LOCALAI_PARALLEL_REQUESTS=true

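A minimal sketch of how this variable might be used when launching LocalAI (addresses and ports are illustrative; the comma-separated host:port format follows the llama.cpp RPC example linked above):

# Point the llama.cpp backend at two already-running RPC workers.
# The value is forwarded to llama.cpp as its list of RPC servers.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" ./local-ai run
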
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -29,7 +29,7 @@ jobs:
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -86,7 +86,7 @@ jobs:
cache: false
- name: Dependencies
run: |
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
- name: Build stablediffusion
1 change: 1 addition & 0 deletions Dockerfile
@@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
cmake \
curl \
17 changes: 15 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8
CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -158,6 +158,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -314,7 +316,7 @@ build: prepare backend-assets grpcs ## Build the project
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build

build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
@@ -691,6 +693,17 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc

backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server

backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
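
Both new targets are registered in ALL_GRPC_BACKENDS, so a plain "make build" picks them up; they can also be built on their own. A sketch, assuming the usual build prerequisites are already installed:

# Build only the RPC-enabled llama.cpp backend and the bundled rpc-server helper
make backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server
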
6 changes: 6 additions & 0 deletions backend/cpp/llama/grpc-server.cpp
@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
} else {
params.n_parallel = 1;
}

const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
if (llama_grpc_servers != NULL) {
params.rpc_servers = std::string(llama_grpc_servers);
}

// TODO: Add yarn

if (!request->tensorsplit().empty()) {
9 changes: 5 additions & 4 deletions core/cli/cli.go
@@ -13,8 +13,9 @@ type Context struct {
var CLI struct {
Context `embed:""`

Run RunCMD `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
Run RunCMD `cmd:"" help:"Run LocalAI, this is the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
}
37 changes: 37 additions & 0 deletions core/cli/llamacppworker.go
@@ -0,0 +1,37 @@
package cli

import (
"os"
"syscall"

"github.com/go-skynet/LocalAI/pkg/assets"
"github.com/rs/zerolog/log"
)

type LLAMACPPWorkerCMD struct {
Args []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
}

func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
// Extract files from the embedded FS
err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
if err != nil {
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
}

return syscall.Exec(
assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
),
append([]string{
assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
)}, r.Args...),
os.Environ())
}
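
In effect, the new subcommand extracts the embedded backend assets and then exec()s the bundled llama.cpp rpc-server, forwarding its arguments verbatim. A hypothetical invocation follows; the exact subcommand name is whatever the CLI framework derives from the LLAMACPPWorker field, and the argument syntax is that of llama.cpp's rpc-server, so treat this strictly as a sketch:

# Hypothetical: start a worker that the main node can reference via LLAMACPP_GRPC_SERVERS.
# Everything after the subcommand is passed straight through to the rpc-server binary.
local-ai llamacpp-worker 0.0.0.0 50052

# With the default backend assets path, the command above resolves to exec()ing:
#   /tmp/localai/backend_data/backend-assets/util/llama-cpp-rpc-server 0.0.0.0 50052
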
33 changes: 27 additions & 6 deletions core/config/backend_config.go
@@ -93,6 +93,8 @@ type Diffusers struct {
ControlNet string `yaml:"control_net"`
}

// LLMConfig is a struct that holds the configuration that is
// generic for most of the LLM backends.
type LLMConfig struct {
SystemPrompt string `yaml:"system_prompt"`
TensorSplit string `yaml:"tensor_split"`
@@ -144,20 +146,39 @@ type LLMConfig struct {
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
}

// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}

// TemplateConfig is a struct that holds the configuration of the templating system
type TemplateConfig struct {
Chat string `yaml:"chat"`
ChatMessage string `yaml:"chat_message"`
Completion string `yaml:"completion"`
Edit string `yaml:"edit"`
Functions string `yaml:"function"`
UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
// Chat is the template used in the chat completion endpoint
Chat string `yaml:"chat"`

// ChatMessage is the template used for chat messages
ChatMessage string `yaml:"chat_message"`

// Completion is the template used for completion requests
Completion string `yaml:"completion"`

// Edit is the template used for edit completion requests
Edit string `yaml:"edit"`

// Functions is the template used when tools are present in the client requests
Functions string `yaml:"function"`

// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
// Note: this is mostly consumed for backends such as vllm and transformers
// that can use the tokenizers specified in the JSON config files of the models
UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`

// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
// It defaults to \n
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
}

func (c *BackendConfig) SetFunctionCallString(s string) {
7 changes: 6 additions & 1 deletion core/http/endpoints/openai/chat.go
@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
mess = append(mess, content)
}

predInput = strings.Join(mess, "\n")
joinCharacter := "\n"
if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
}

predInput = strings.Join(mess, joinCharacter)
log.Debug().Msgf("Prompt (before templating): %s", predInput)

templateFile := ""
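
Tying the two pieces together: a model definition can now override the default "\n" separator used when the templated chat messages are concatenated. A sketch of such a config, assuming the usual template: section of a LocalAI model YAML file (the file name and the surrounding fields are illustrative; only join_chat_messages_by_character comes from this commit):

# Write a model config that joins chat messages with a blank line instead of "\n"
cat > models/example.yaml <<'EOF'
name: example
parameters:
  model: example.gguf
template:
  # join templated chat messages with a blank line instead of the default "\n"
  join_chat_messages_by_character: "\n\n"
EOF
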
6 changes: 5 additions & 1 deletion pkg/assets/extract.go
@@ -8,6 +8,10 @@ import (
"path/filepath"
)

func ResolvePath(dir string, paths ...string) string {
return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
}

func ExtractFiles(content embed.FS, extractDir string) error {
// Create the target directory if it doesn't exist
err := os.MkdirAll(extractDir, 0750)
@@ -39,7 +43,7 @@ func ExtractFiles(content embed.FS, extractDir string) error {
}

// Create the file in the target directory
err = os.WriteFile(targetFile, fileData, 0600)
err = os.WriteFile(targetFile, fileData, 0700)
if err != nil {
return fmt.Errorf("failed to write file: %v", err)
}