Commit
Merge pull request #4 from bhavnicksm/development
Bump version to 0.1.1 and enhance _AutoTikTokenizer functionality
bhavnicksm authored Nov 5, 2024
2 parents 8cefed6 + d32dc77 commit d37b10b
Showing 4 changed files with 90 additions and 36 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "autotiktokenizer"
version = "0.1.0"
version = "0.1.1"
description = "🧰 The AutoTokenizer that TikToken always needed -- Load any tokenizer with TikToken now! ✨"
readme = "README.md"
requires-python = ">=3.8"
2 changes: 1 addition & 1 deletion src/autotiktokenizer/__init__.py
@@ -1,5 +1,5 @@
from .autotiktokenizer import AutoTikTokenizer

__version__ = '0.1.0'
__version__ = '0.1.1'
__author__ = 'Bhavnick Singh Minhas'
__all__ = [ 'AutoTikTokenizer' ]
90 changes: 56 additions & 34 deletions src/autotiktokenizer/autotiktokenizer.py
@@ -2,10 +2,11 @@
import json
import tiktoken


class _AutoTikTokenizer:
"""
_AutoTikTokenizer is a class designed to interface with HuggingFace tokenizers to provide a TikToken tokenizer
that can be used for the tokenization process. It mimics the functionality of AutoTokenizer in HuggingFace
_AutoTikTokenizer is a class designed to interface with HuggingFace tokenizers to provide a TikToken tokenizer
that can be used for the tokenization process. It mimics the functionality of AutoTokenizer in HuggingFace
but is tailored for TikToken.
Attributes:
tokenizer (Tokenizer): The HuggingFace tokenizer instance.
@@ -31,66 +32,87 @@ class _AutoTikTokenizer:
        __call__():
            Returns the TikToken encoding.
    """
    def __init__(self):
        self.tokenizer = None
        self.name = None
        self.vocab = None
        self.tokenizer_config = None
        self.mergeable_ranks = None
        self.special_tokens = None
        self.pattern = None
    def __init__(self) -> None:
        self.bytes_encoder = self._bytes_to_unicode()
        self.bytes_decoder = {v:k for k,v in self.bytes_encoder.items()}

    def _bytes_to_unicode(self):
        """
        Returns a mapping between utf-8 bytes and corresponding unicode strings.
        The reversible bpe codes work on unicode strings.
        This means you need a large number of unicode characters in your vocab if you want to avoid UNKs.
        When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
        This is a significant percentage of your normal, say, 32K bpe vocab.
        To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
        and we avoid mapping to whitespace/control characters that the bpe code barfs on.
        """
        bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
        cs = bs[:]
        n = 0
        for b in range(2**8):
            if b not in bs:
                bs.append(b)
                cs.append(2**8+n)
                n += 1
        cs = [chr(n) for n in cs]
        return dict(zip(bs, cs))

    def _normalize_token_bytes(self, token):
        """Convert a unicode-mapped vocab token back to its raw bytes."""
        result = bytearray([self.bytes_decoder[b] for b in token])
        result = bytes(result)
        return result

    def get_mergable_ranks(self):
        # Convert vocab to binary mergeable_ranks
    def get_mergable_ranks(self, vocab, special_tokens):
        """Convert vocab to binary mergeable_ranks."""
        self.mergeable_ranks = {}

        # Sort vocab by token id to ensure correct ordering
        sorted_vocab = sorted(self.vocab.items(), key=lambda x: x[1])

        # Create binary format ranks starting from 1
        sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
        for rank, (token, _) in enumerate(sorted_vocab, start=0):
            # Handle GPT-2 style tokens
            if token.startswith('Ġ'):
                token = ' ' + token[1:]
            self.mergeable_ranks[token.encode('utf-8')] = rank

            if token not in special_tokens:
                key = self._normalize_token_bytes(token)
            else:
                key = token.encode()
            self.mergeable_ranks[key] = rank
        return self.mergeable_ranks

    def get_special_tokens(self):
        self.special_tokens = {}
        sp = self.tokenizer.get_added_tokens_decoder()
        for idx, token in sp.items():
            self.special_tokens[token.content] = idx
        return self.special_tokens

    def get_pattern_str(self):
        self.pattern = r'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s++$|\s+(?!\S)|\s'
        return self.pattern

    def get_tiktoken_encoding(self):
    def get_tiktoken_encoding(self, vocab):
        special_tokens = self.get_special_tokens()
        mergeable_ranks = self.get_mergable_ranks()
        mergeable_ranks = self.get_mergable_ranks(vocab, special_tokens)
        pattern = self.get_pattern_str()

        encoding = tiktoken.Encoding(
            self.name,
            self.name,
            pat_str=pattern,
            mergeable_ranks=mergeable_ranks,
            special_tokens=special_tokens,
        )

        return encoding

    def from_pretrained(self, tokenizer_name_or_path: str):
        self.tokenizer_name_or_path = tokenizer_name_or_path
        self.tokenizer = Tokenizer.from_pretrained(tokenizer_name_or_path)
        self.vocab = self.tokenizer.get_vocab()
        vocab = self.tokenizer.get_vocab()

        self.tokenizer_config = dict(json.loads(self.tokenizer.to_str()))
        self.name = self.tokenizer_name_or_path.split('/')[-1]
        return self.get_tiktoken_encoding()

    def __call__(self):
        return self.get_tiktoken_encoding()
        return self.get_tiktoken_encoding(vocab)

    def __call__(self, tokenizer_name_or_path: str):
        return self.from_pretrained(tokenizer_name_or_path)

    def __repr__(self) -> str:
        return "AutoTikTokenizer"

AutoTikTokenizer = _AutoTikTokenizer()
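
The substance of this change is the byte-level normalization: rather than special-casing the GPT-2 'Ġ' space marker, get_mergable_ranks now maps every non-special token from its unicode vocab form back to raw bytes before ranking it, while special tokens (e.g. GPT-2's <|endoftext|>) are encoded directly. A minimal standalone sketch of that round trip follows; it is not part of the commit, and the helper names bytes_to_unicode and token_to_bytes are illustrative only.

# Standalone sketch of the byte <-> unicode round trip described above (illustrative names).
def bytes_to_unicode():
    """Map every byte 0-255 to a printable unicode character (GPT-2 style)."""
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)  # park unprintable bytes at higher codepoints
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

def token_to_bytes(token: str) -> bytes:
    """Undo the mapping: turn a vocab token string back into raw bytes."""
    return bytes(byte_decoder[ch] for ch in token)

# GPT-2's vocab spells ' hello' as 'Ġhello'; the round trip recovers the leading space,
# which is what the new get_mergable_ranks relies on instead of rewriting 'Ġ' by hand.
print(token_to_bytes("Ġhello"))  # b' hello'
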
32 changes: 32 additions & 0 deletions tests/test_gpt2.py
@@ -11,6 +11,29 @@ def tiktokenizer():
def tokenizer():
    return Tokenizer.from_pretrained('gpt2')

@pytest.fixture
def sample_text():
text = """# Chunking Strategies in Retrieval-Augmented Generation: A Comprehensive Analysis\n\nIn the rapidly evolving landscape of natural language processing, Retrieval-Augmented Generation (RAG) has emerged as a groundbreaking approach that bridges the gap between large language models and external knowledge bases. At the heart of these systems lies a crucial yet often overlooked process: chunking. This fundamental operation, which involves the systematic decomposition of large text documents into smaller, semantically meaningful units, plays a pivotal role in determining the overall effectiveness of RAG implementations.
The process of text chunking in RAG applications represents a delicate balance between competing requirements. On one side, we have the need for semantic coherence – ensuring that each chunk maintains meaningful context that can be understood and processed independently. On the other, we must optimize for information density, ensuring that each chunk carries sufficient signal without excessive noise that might impede retrieval accuracy. This balancing act becomes particularly crucial when we consider the downstream implications for vector databases and embedding models that form the backbone of modern RAG systems.
The selection of appropriate chunk size emerges as a fundamental consideration that significantly impacts system performance. Through extensive experimentation and real-world implementations, researchers have identified that chunks typically perform optimally in the range of 256 to 1024 tokens. However, this range should not be treated as a rigid constraint but rather as a starting point for optimization based on specific use cases and requirements. The implications of chunk size selection ripple throughout the entire RAG pipeline, affecting everything from storage requirements to retrieval accuracy and computational overhead.
Fixed-size chunking represents the most straightforward approach to document segmentation, offering predictable memory usage and consistent processing time. However, this apparent simplicity comes with significant drawbacks. By arbitrarily dividing text based on token or character count, fixed-size chunking risks fragmenting semantic units and disrupting the natural flow of information. Consider, for instance, a technical document where a complex concept is explained across several paragraphs – fixed-size chunking might split this explanation at critical junctures, potentially compromising the system's ability to retrieve and present this information coherently.
In response to these limitations, semantic chunking has gained prominence as a more sophisticated alternative. This approach leverages natural language understanding to identify meaningful boundaries within the text, respecting the natural structure of the document. Semantic chunking can operate at various levels of granularity, from simple sentence-based segmentation to more complex paragraph-level or topic-based approaches. The key advantage lies in its ability to preserve the inherent semantic relationships within the text, leading to more meaningful and contextually relevant retrieval results.
Recent advances in the field have given rise to hybrid approaches that attempt to combine the best aspects of both fixed-size and semantic chunking. These methods typically begin with semantic segmentation but impose size constraints to prevent extreme variations in chunk length. Furthermore, the introduction of sliding window techniques with overlap has proved particularly effective in maintaining context across chunk boundaries. This overlap, typically ranging from 10% to 20% of the chunk size, helps ensure that no critical information is lost at segment boundaries, albeit at the cost of increased storage requirements.
The implementation of chunking strategies must also consider various technical factors that can significantly impact system performance. Vector database capabilities, embedding model constraints, and runtime performance requirements all play crucial roles in determining the optimal chunking approach. Moreover, content-specific factors such as document structure, language characteristics, and domain-specific requirements must be carefully considered. For instance, technical documentation might benefit from larger chunks that preserve detailed explanations, while news articles might perform better with smaller, more focused segments.
The future of chunking in RAG systems points toward increasingly sophisticated approaches. Current research explores the potential of neural chunking models that can learn optimal segmentation strategies from large-scale datasets. These models show promise in adapting to different content types and query patterns, potentially leading to more efficient and effective retrieval systems. Additionally, the emergence of cross-lingual chunking strategies addresses the growing need for multilingual RAG applications, while real-time adaptive chunking systems attempt to optimize segment boundaries based on user interaction patterns and retrieval performance metrics.
The effectiveness of RAG systems heavily depends on the thoughtful implementation of appropriate chunking strategies. While the field continues to evolve, practitioners must carefully consider their specific use cases and requirements when designing chunking solutions. Factors such as document characteristics, retrieval patterns, and performance requirements should guide the selection and optimization of chunking strategies. As we look to the future, the continued development of more sophisticated chunking approaches promises to further enhance the capabilities of RAG systems, enabling more accurate and efficient information retrieval and generation.
Through careful consideration of these various aspects and continued experimentation with different approaches, organizations can develop chunking strategies that effectively balance the competing demands of semantic coherence, computational efficiency, and retrieval accuracy. As the field continues to evolve, we can expect to see new innovations that further refine our ability to segment and process textual information in ways that enhance the capabilities of RAG systems while maintaining their practical utility in real-world applications."""
    return text

def test_simple_sentence(tiktokenizer, tokenizer):
    sentence = "Hey, I am Bhavnick Singh Minhas and I am building a tool to use TikToken tokenizers."
    ttk_enc = tiktokenizer.encode(sentence)
@@ -20,4 +43,13 @@ def test_simple_sentence(tiktokenizer, tokenizer):
    assert tokenizer.decode(hf_enc) == sentence, f"{tokenizer.decode(hf_enc)} != {sentence}"
    assert tiktokenizer.decode(ttk_enc) == sentence, f"{tiktokenizer.decode(ttk_enc)} != {sentence}"
    assert tokenizer.decode(hf_enc) == tiktokenizer.decode(ttk_enc), f"{tokenizer.decode(hf_enc)} != {tiktokenizer.decode(ttk_enc)}"
    assert tiktokenizer.decode(hf_enc) == tokenizer.decode(ttk_enc), f"{tiktokenizer.decode(hf_enc)} != {tokenizer.decode(ttk_enc)}"

def test_long_text(tiktokenizer, tokenizer, sample_text):
    ttk_enc = tiktokenizer.encode(sample_text)
    hf_enc = tokenizer.encode(sample_text).ids

    assert tokenizer.decode(hf_enc) == sample_text, f"{tokenizer.decode(hf_enc)} != {sample_text}"
    assert tiktokenizer.decode(ttk_enc) == sample_text, f"{tiktokenizer.decode(ttk_enc)} != {sample_text}"
    assert tokenizer.decode(hf_enc) == tiktokenizer.decode(ttk_enc), f"{tokenizer.decode(hf_enc)} != {tiktokenizer.decode(ttk_enc)}"
    assert tiktokenizer.decode(hf_enc) == tokenizer.decode(ttk_enc), f"{tiktokenizer.decode(hf_enc)} != {tokenizer.decode(ttk_enc)}"
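
End to end, the behaviour these tests pin down reduces to the following usage sketch (assuming the 'gpt2' tokenizer can be fetched from the HuggingFace Hub; a sketch, not code taken from the repository):

# Usage sketch for autotiktokenizer 0.1.1 (assumes network access to the HF Hub).
from autotiktokenizer import AutoTikTokenizer

# from_pretrained builds a tiktoken.Encoding from the HuggingFace tokenizer's vocab.
encoding = AutoTikTokenizer.from_pretrained("gpt2")

text = "Hey, I am Bhavnick Singh Minhas and I am building a tool to use TikToken tokenizers."
ids = encoding.encode(text)

# The round trip should be exact, which is precisely what the tests above assert.
assert encoding.decode(ids) == text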
