Skip to content

Commit

Permalink
Add LumiOpen/Viking-7B tokenizer support
Browse files — browse the repository at this point in the history
  • Loading branch information
akx committed May 17, 2024
1 parent 3afb494 commit ab842e3
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 3 deletions.
1 change: 1 addition & 0 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "viking-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B" },
]


Expand Down
3 changes: 3 additions & 0 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking-7b"

if res is None:
logger.warning("\n")
Expand Down
7 changes: 4 additions & 3 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4426,9 +4426,10 @@ static void llm_load_vocab(
tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe") {
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe" ||
tokenizer_pre == "viking-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
} else if (
tokenizer_pre == "deepseek-llm") {
Expand Down

0 comments on commit ab842e3

Please sign in to comment.