Overlap for xgrammar
merrymercy committed Dec 6, 2024
1 parent 8da9ef6 commit 8e408f0
Showing 3 changed files with 12 additions and 5 deletions.
python/sglang/srt/constrained/outlines_backend.py (4 changes: 4 additions & 0 deletions)

```diff
@@ -85,6 +85,10 @@ def allocate_vocab_mask(
     ) -> torch.Tensor:
         return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)
 
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask
+
     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
         tokens = torch.tensor(
             self.guide.get_next_instruction(self.state).tokens, dtype=torch.int64
```
python/sglang/srt/constrained/xgrammar_backend.py (9 changes: 4 additions & 5 deletions)

```diff
@@ -86,12 +86,11 @@ def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
         self.matcher.fill_next_token_bitmask(vocab_mask, idx)
 
     @staticmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        if vocab_mask.device.type != logits.device.type:
-            # vocab_mask must then be on the same device as logits
-            # when applying the token bitmask, so we check and move if needed
-            vocab_mask = vocab_mask.to(logits.device)
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
         apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
```
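The substance of the commit is in the hunk above: the host-to-device copy of the token bitmask is no longer hidden inside `apply_vocab_mask`; it is issued explicitly by `move_vocab_mask` with `non_blocking=True`, so the transfer can be started early and overlapped with other CPU-side work before the logits are masked. Below is a minimal sketch of that pattern, not the backend code itself: the mask dtype, the `masked_fill_` apply step, and the tensor shapes are stand-ins, and genuinely asynchronous copies generally also require the CPU tensor to be pinned.

```python
import torch

# Sketch only: stand-in shapes/dtypes, not SGLang's actual bitmask format.
if torch.cuda.is_available():
    device = torch.device("cuda")

    # The mask is produced on the CPU; pin_memory() lets the non_blocking copy
    # truly run asynchronously with respect to the host.
    cpu_mask = torch.zeros(8, 32000, dtype=torch.bool).pin_memory()
    cpu_mask[:, ::2] = True  # pretend every other token is disallowed

    # Analogue of move_vocab_mask: start the copy now ...
    gpu_mask = cpu_mask.to(device, non_blocking=True)

    # ... and let other CPU work (scheduling, building the next batch, ...)
    # overlap with the transfer instead of blocking on it here.

    # Analogue of apply_vocab_mask: the kernel is queued on the same CUDA
    # stream as the copy, so it runs only after the mask has arrived.
    logits = torch.randn(8, 32000, device=device)
    logits.masked_fill_(gpu_mask, float("-inf"))
```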
python/sglang/srt/sampling/sampling_batch_info.py (4 changes: 4 additions & 0 deletions)

```diff
@@ -168,10 +168,14 @@ def update_regex_vocab_mask(self):
         )
         self.apply_mask = type(grammar).apply_vocab_mask  # force to use static method
 
+        # Apply the mask
         for i, grammar in enumerate(self.grammars):
             if grammar and not grammar.finished:
                 grammar.fill_vocab_mask(self.vocab_mask, i)
 
+        # Move the mask to the device if needed
+        self.vocab_mask = grammar.move_vocab_mask(self.vocab_mask, self.device)
+
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
```
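Taken together, the caller-side order after this change is: allocate the mask, fill one row per request, move it to the target device once, then apply it to the logits. The sketch below walks through that order with a stand-in grammar class; the class name `DummyGrammar`, the boolean True-means-disallowed convention, and the toy fill rule are invented for illustration and are not the real SGLang backends.

```python
import torch


class DummyGrammar:
    """Hypothetical stand-in for a grammar backend; illustrates the call order only."""

    @staticmethod
    def allocate_vocab_mask(vocab_size: int, batch_size: int, device) -> torch.Tensor:
        # Allocated on the CPU so each request's row can be filled there.
        return torch.zeros(batch_size, vocab_size, dtype=torch.bool)

    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
        vocab_mask[idx, ::2] = True  # toy rule: block every other token

    @staticmethod
    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
        # One transfer for the whole batch, after all rows are filled.
        return vocab_mask.to(device, non_blocking=True)

    @staticmethod
    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
        logits.masked_fill_(vocab_mask, float("-inf"))


device = "cuda" if torch.cuda.is_available() else "cpu"
grammars = [DummyGrammar(), DummyGrammar()]
vocab_size = 16

vocab_mask = DummyGrammar.allocate_vocab_mask(vocab_size, len(grammars), device)
for i, grammar in enumerate(grammars):
    grammar.fill_vocab_mask(vocab_mask, i)

vocab_mask = DummyGrammar.move_vocab_mask(vocab_mask, device)  # single device move

logits = torch.randn(len(grammars), vocab_size, device=device)
DummyGrammar.apply_vocab_mask(logits, vocab_mask)
```

For a backend like Outlines that already builds the mask on the target device, `move_vocab_mask` is simply a pass-through, as shown in the first hunk.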
