Commit
Merge pull request #4 from bhavnicksm/development
Bump version to 0.1.1 and enhance _AutoTikTokenizer functionality
bhavnicksm authored Nov 5, 2024
2 parents 8cefed6 + d32dc77 commit d37b10b
Showing 4 changed files with 90 additions and 36 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "autotiktokenizer"
version = "0.1.0"
version = "0.1.1"
description = "🧰 The AutoTokenizer that TikToken always needed -- Load any tokenizer with TikToken now! ✨"
readme = "README.md"
requires-python = ">=3.8"
2 changes: 1 addition & 1 deletion src/autotiktokenizer/__init__.py
@@ -1,5 +1,5 @@
from .autotiktokenizer import AutoTikTokenizer

__version__ = '0.1.0'
__version__ = '0.1.1'
__author__ = 'Bhavnick Singh Minhas'
__all__ = [ 'AutoTikTokenizer' ]
90 changes: 56 additions & 34 deletions src/autotiktokenizer/autotiktokenizer.py
@@ -2,10 +2,11 @@
import json
import tiktoken


class _AutoTikTokenizer:
"""
_AutoTikTokenizer is a class designed to interface with HuggingFace tokenizers to provide a TikToken tokenizer
that can be used for the tokenization process. It mimics the functionality of AutoTokenizer in HuggingFace
_AutoTikTokenizer is a class designed to interface with HuggingFace tokenizers to provide a TikToken tokenizer
that can be used for the tokenization process. It mimics the functionality of AutoTokenizer in HuggingFace
but is tailored for TikToken.
Attributes:
tokenizer (Tokenizer): The HuggingFace tokenizer instance.
@@ -31,66 +32,87 @@ class _AutoTikTokenizer:
        __call__():
            Returns the TikToken encoding.
    """
    def __init__(self):
        self.tokenizer = None
        self.name = None
        self.vocab = None
        self.tokenizer_config = None
        self.mergeable_ranks = None
        self.special_tokens = None
        self.pattern = None
    def __init__(self) -> None:
        self.bytes_encoder = self._bytes_to_unicode()
        self.bytes_decoder = {v:k for k,v in self.bytes_encoder.items()}

    def _bytes_to_unicode(self):
        """
        Returns a mapping between utf-8 bytes and corresponding unicode strings.
        The reversible bpe codes work on unicode strings.
        This means you need a large number of unicode characters in your vocab if you want to avoid UNKs.
        When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
        This is a significant percentage of your normal, say, 32K bpe vocab.
        To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
        and we avoid mapping to whitespace/control characters that the bpe code barfs on.
        """
        bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
        cs = bs[:]
        n = 0
        for b in range(2**8):
            if b not in bs:
                bs.append(b)
                cs.append(2**8+n)
                n += 1
        cs = [chr(n) for n in cs]
        return dict(zip(bs, cs))

    def _normalize_token_bytes(self, token):
        """Convert a unicode-mapped vocab token back to its raw bytes."""
        result = bytearray([self.bytes_decoder[b] for b in token])
        result = bytes(result)
        return result

    def get_mergable_ranks(self):
        # Convert vocab to binary mergeable_ranks
    def get_mergable_ranks(self, vocab, special_tokens):
        """Convert vocab to binary mergeable_ranks."""
        self.mergeable_ranks = {}

        # Sort vocab by token id to ensure correct ordering
        sorted_vocab = sorted(self.vocab.items(), key=lambda x: x[1])

        # Create binary format ranks starting from 1
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
        for rank, (token, _) in enumerate(sorted_vocab, start=0):
            # Handle GPT-2 style tokens
            if token.startswith('Ġ'):
                token = ' ' + token[1:]
            self.mergeable_ranks[token.encode('utf-8')] = rank

            if token not in special_tokens:
                key = self._normalize_token_bytes(token)
            else:
                key = token.encode()
            self.mergeable_ranks[key] = rank
        return self.mergeable_ranks

    def get_special_tokens(self):
        self.special_tokens = {}
        sp = self.tokenizer.get_added_tokens_decoder()
        for idx, token in sp.items():
            self.special_tokens[token.content] = idx
        return self.special_tokens

    def get_pattern_str(self):
        self.pattern = r'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s'
        return self.pattern

    def get_tiktoken_encoding(self):
    def get_tiktoken_encoding(self, vocab):
        special_tokens = self.get_special_tokens()
        mergeable_ranks = self.get_mergable_ranks()
        mergeable_ranks = self.get_mergable_ranks(vocab, special_tokens)
        pattern = self.get_pattern_str()

        encoding = tiktoken.Encoding(
            self.name,
            self.name,
            pat_str=pattern,
            mergeable_ranks=mergeable_ranks,
            special_tokens=special_tokens,
        )

        return encoding

    def from_pretrained(self, tokenizer_name_or_path: str):
        self.tokenizer_name_or_path = tokenizer_name_or_path
        self.tokenizer = Tokenizer.from_pretrained(tokenizer_name_or_path)
        self.vocab = self.tokenizer.get_vocab()
        vocab = self.tokenizer.get_vocab()

        self.tokenizer_config = dict(json.loads(self.tokenizer.to_str()))
        self.name = self.tokenizer_name_or_path.split('/')[-1]
        return self.get_tiktoken_encoding()

    def __call__(self):
        return self.get_tiktoken_encoding()
        return self.get_tiktoken_encoding(vocab)

    def __call__(self, tokenizer_name_or_path: str):
        return self.from_pretrained(tokenizer_name_or_path)

    def __repr__(self) -> str:
        return "AutoTikTokenizer"

AutoTikTokenizer = _AutoTikTokenizer()
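
The substance of this change is the byte-level normalization: rather than special-casing the GPT-2 'Ġ' space marker, get_mergable_ranks now maps every non-special token from its unicode vocab form back to raw bytes before ranking it, while special tokens (e.g. GPT-2's <|endoftext|>) are encoded directly. A minimal standalone sketch of that round trip follows; it is not part of the commit, and the helper names bytes_to_unicode and token_to_bytes are illustrative only.

# Standalone sketch of the byte <-> unicode round trip described above (illustrative names).
def bytes_to_unicode():
    """Map every byte 0-255 to a printable unicode character (GPT-2 style)."""
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)  # park unprintable bytes at higher codepoints
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

def token_to_bytes(token: str) -> bytes:
    """Undo the mapping: turn a vocab token string back into raw bytes."""
    return bytes(byte_decoder[ch] for ch in token)

# GPT-2's vocab spells ' hello' as 'Ġhello'; the round trip recovers the leading space,
# which is what the new get_mergable_ranks relies on instead of rewriting 'Ġ' by hand.
print(token_to_bytes("Ġhello"))  # b' hello'
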
32 changes: 32 additions & 0 deletions tests/test_gpt2.py
@@ -11,6 +11,29 @@ def tiktokenizer():
def tokenizer():
    return Tokenizer.from_pretrained('gpt2')

@pytest.fixture
def sample_text():
text = """# Chunking Strategies in Retrieval-Augmented Generation: A Comprehensive Analysis\n\nIn the rapidly evolving landscape of natural language processing, Retrieval-Augmented Generation (RAG) has emerged as a groundbreaking approach that bridges the gap between large language models and external knowledge bases. At the heart of these systems lies a crucial yet often overlooked process: chunking. This fundamental operation, which involves the systematic decomposition of large text documents into smaller, semantically meaningful units, plays a pivotal role in determining the overall effectiveness of RAG implementations.
The process of text chunking in RAG applications represents a delicate balance between competing requirements. On one side, we have the need for semantic coherence – ensuring that each chunk maintains meaningful context that can be understood and processed independently. On the other, we must optimize for information density, ensuring that each chunk carries sufficient signal without excessive noise that might impede retrieval accuracy. This balancing act becomes particularly crucial when we consider the downstream implications for vector databases and embedding models that form the backbone of modern RAG systems.
The selection of appropriate chunk size emerges as a fundamental consideration that significantly impacts system performance. Through extensive experimentation and real-world implementations, researchers have identified that chunks typically perform optimally in the range of 256 to 1024 tokens. However, this range should not be treated as a rigid constraint but rather as a starting point for optimization based on specific use cases and requirements. The implications of chunk size selection ripple throughout the entire RAG pipeline, affecting everything from storage requirements to retrieval accuracy and computational overhead.
Fixed-size chunking represents the most straightforward approach to document segmentation, offering predictable memory usage and consistent processing time. However, this apparent simplicity comes with significant drawbacks. By arbitrarily dividing text based on token or character count, fixed-size chunking risks fragmenting semantic units and disrupting the natural flow of information. Consider, for instance, a technical document where a complex concept is explained across several paragraphs – fixed-size chunking might split this explanation at critical junctures, potentially compromising the system's ability to retrieve and present this information coherently.
In response to these limitations, semantic chunking has gained prominence as a more sophisticated alternative. This approach leverages natural language understanding to identify meaningful boundaries within the text, respecting the natural structure of the document. Semantic chunking can operate at various levels of granularity, from simple sentence-based segmentation to more complex paragraph-level or topic-based approaches. The key advantage lies in its ability to preserve the inherent semantic relationships within the text, leading to more meaningful and contextually relevant retrieval results.
Recent advances in the field have given rise to hybrid approaches that attempt to combine the best aspects of both fixed-size and semantic chunking. These methods typically begin with semantic segmentation but impose size constraints to prevent extreme variations in chunk length. Furthermore, the introduction of sliding window techniques with overlap has proved particularly effective in maintaining context across chunk boundaries. This overlap, typically ranging from 10% to 20% of the chunk size, helps ensure that no critical information is lost at segment boundaries, albeit at the cost of increased storage requirements.
The implementation of chunking strategies must also consider various technical factors that can significantly impact system performance. Vector database capabilities, embedding model constraints, and runtime performance requirements all play crucial roles in determining the optimal chunking approach. Moreover, content-specific factors such as document structure, language characteristics, and domain-specific requirements must be carefully considered. For instance, technical documentation might benefit from larger chunks that preserve detailed explanations, while news articles might perform better with smaller, more focused segments.
The future of chunking in RAG systems points toward increasingly sophisticated approaches. Current research explores the potential of neural chunking models that can learn optimal segmentation strategies from large-scale datasets. These models show promise in adapting to different content types and query patterns, potentially leading to more efficient and effective retrieval systems. Additionally, the emergence of cross-lingual chunking strategies addresses the growing need for multilingual RAG applications, while real-time adaptive chunking systems attempt to optimize segment boundaries based on user interaction patterns and retrieval performance metrics.
The effectiveness of RAG systems heavily depends on the thoughtful implementation of appropriate chunking strategies. While the field continues to evolve, practitioners must carefully consider their specific use cases and requirements when designing chunking solutions. Factors such as document characteristics, retrieval patterns, and performance requirements should guide the selection and optimization of chunking strategies. As we look to the future, the continued development of more sophisticated chunking approaches promises to further enhance the capabilities of RAG systems, enabling more accurate and efficient information retrieval and generation.
Through careful consideration of these various aspects and continued experimentation with different approaches, organizations can develop chunking strategies that effectively balance the competing demands of semantic coherence, computational efficiency, and retrieval accuracy. As the field continues to evolve, we can expect to see new innovations that further refine our ability to segment and process textual information in ways that enhance the capabilities of RAG systems while maintaining their practical utility in real-world applications."""
    return text

def test_simple_sentence(tiktokenizer, tokenizer):
    sentence = "Hey, I am Bhavnick Singh Minhas and I am building a tool to use TikToken tokenizers."
    ttk_enc = tiktokenizer.encode(sentence)
@@ -20,4 +43,13 @@ def test_simple_sentence(tiktokenizer, tokenizer):
    assert tokenizer.decode(hf_enc) == sentence, f"{tokenizer.decode(hf_enc)} != {sentence}"
    assert tiktokenizer.decode(ttk_enc) == sentence, f"{tiktokenizer.decode(ttk_enc)} != {sentence}"
    assert tokenizer.decode(hf_enc) == tiktokenizer.decode(ttk_enc), f"{tokenizer.decode(hf_enc)} != {tiktokenizer.decode(ttk_enc)}"
    assert tiktokenizer.decode(hf_enc) == tokenizer.decode(ttk_enc), f"{tiktokenizer.decode(hf_enc)} != {tokenizer.decode(ttk_enc)}"

def test_long_text(tiktokenizer, tokenizer, sample_text):
    ttk_enc = tiktokenizer.encode(sample_text)
    hf_enc = tokenizer.encode(sample_text).ids

    assert tokenizer.decode(hf_enc) == sample_text, f"{tokenizer.decode(hf_enc)} != {sample_text}"
    assert tiktokenizer.decode(ttk_enc) == sample_text, f"{tiktokenizer.decode(ttk_enc)} != {sample_text}"
    assert tokenizer.decode(hf_enc) == tiktokenizer.decode(ttk_enc), f"{tokenizer.decode(hf_enc)} != {tiktokenizer.decode(ttk_enc)}"
    assert tiktokenizer.decode(hf_enc) == tokenizer.decode(ttk_enc), f"{tiktokenizer.decode(hf_enc)} != {tokenizer.decode(ttk_enc)}"
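
End to end, the behaviour these tests pin down reduces to the following usage sketch (assuming the 'gpt2' tokenizer can be fetched from the HuggingFace Hub; a sketch, not code taken from the repository):

# Usage sketch for autotiktokenizer 0.1.1 (assumes network access to the HF Hub).
from autotiktokenizer import AutoTikTokenizer

# from_pretrained builds a tiktoken.Encoding from the HuggingFace tokenizer's vocab.
encoding = AutoTikTokenizer.from_pretrained("gpt2")

text = "Hey, I am Bhavnick Singh Minhas and I am building a tool to use TikToken tokenizers."
ids = encoding.encode(text)

# The round trip should be exact, which is precisely what the tests above assert.
assert encoding.decode(ids) == text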
