Skip to content

Commit

Permalink
Return skip_special_tokens in _tokenizer.py
Browse files Browse the repository at this point in the history
  • Loading branch information
krammnic authored Oct 11, 2024
1 parent 4315ee7 commit 33c6d54
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions torchtune/models/phi3/_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,13 @@ def encode(
trim_leading_whitespace=trim_leading_whitespace,
)

def decode(self, ids: List[int]) -> str:
def decode(self, ids: List[int], skip_special_tokens: bool = True) -> str:
"""Decode token IDs to strings.
Args:
ids (List[int]): The input token IDs to be decoded.
skip_special_tokens (bool): If True, special tokens are filtered out of the decoded string;
if False, they are kept. Default is True.
Returns:
str: The decoded text.
Expand All @@ -114,7 +116,7 @@ def decode(self, ids: List[int]) -> str:
for token_id in ids:
# Filter out special tokens and the placeholder tokens added
# by the Phi3 team
if token_id >= 32_000 and token_id <= 32_064:
if skip_special_tokens and (token_id >= 32_000 and token_id <= 32_064):
continue
else:
ids_for_decode.append(token_id)
Expand Down

0 comments on commit 33c6d54

Please sign in to comment.