sgl-project · Snowdar · Nov 26, 2024 · Nov 26, 2024 · Dec 1, 2024 · Dec 2, 2024
@@ -50,6 +50,12 @@ def forward(
         else:
             # Post process logits
             logits.div_(sampling_info.temperatures)
+
+            if sampling_info.need_top_n_sigma_sampling:
+                logits = apply_top_n_sigma_to_logits_torch(
+                    logits, sampling_info.top_n_sigmas
+                )
+
             probs = torch.softmax(logits, dim=-1)
             logits = None
             del logits
@@ -113,3 +119,16 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     sampled_index = torch.multinomial(probs_sort, num_samples=1)
     batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
     return batch_next_token_ids
+
+
+def apply_top_n_sigma_to_logits_torch(logits: torch.Tensor, top_n_sigmas: torch.Tensor):
+    """Apply top-n-sigma filtering to a batch of logits.
+
+    For every row, entries below ``max_logit - top_n_sigma * sigma`` are
+    replaced with ``-inf``, where ``max_logit`` is the row maximum and
+    ``sigma`` is the sample standard deviation of the row's finite entries.
+    Rows whose ``top_n_sigma`` is ``<= 0`` are returned unchanged.
+
+    Args:
+        logits: Logits to filter; statistics are taken over the last
+            dimension (assumed to be ``(batch, vocab)`` -- confirm at caller).
+        top_n_sigmas: One multiplier per row of ``logits``; it is reshaped
+            to a column so that it broadcasts against ``logits``.
+
+    Returns:
+        A new tensor with the same dtype as ``logits``; the input tensor is
+        not modified.
+    """
+    n_sigma = top_n_sigmas.view(-1, 1)
+    max_logit = torch.max(logits, dim=-1, keepdim=True).values
+
+    # torch.std returns NaN as soon as a row holds a non-finite entry (for
+    # example a token that was already masked to -inf) or only one entry.
+    # A NaN threshold fails every comparison and would mask the whole row,
+    # so the standard deviation is computed over the finite entries only.
+    finite = torch.isfinite(logits)
+    # Accumulate in at least float32 so half-precision sums do not overflow.
+    acc_dtype = torch.promote_types(logits.dtype, torch.float32)
+    values = logits.to(acc_dtype).masked_fill(~finite, 0.0)
+    count = finite.sum(dim=-1, keepdim=True)
+    mean = values.sum(dim=-1, keepdim=True) / count.clamp(min=1)
+    deviations = (values - mean).masked_fill(~finite, 0.0)
+    # Divide by (count - 1) to match the default correction of torch.std;
+    # the clamp turns the single-entry case into sigma == 0 instead of NaN.
+    variance = deviations.square().sum(dim=-1, keepdim=True) / (count - 1).clamp(
+        min=1
+    )
+    sigma = variance.sqrt()
+
+    # Keep everything for rows with top_n_sigma <= 0 (feature disabled).
+    keep = (n_sigma <= 0) | (logits >= max_logit - n_sigma * sigma)
+
+    # masked_fill avoids building a CPU scalar tensor and copying it to the
+    # device on every call, and it preserves the dtype of the logits.
+    return logits.masked_fill(~keep, float("-inf"))
@@ -926,6 +926,9 @@ def v1_chat_generate_request(
             "stop": stop,
             "stop_token_ids": request.stop_token_ids,
             "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "top_n_sigma": request.top_n_sigma,
             "presence_penalty": request.presence_penalty,
             "frequency_penalty": request.frequency_penalty,
             "repetition_penalty": request.repetition_penalty,

@@ -283,6 +283,9 @@ class ChatCompletionRequest(BaseModel):
     no_stop_trim: bool = False
     ignore_eos: bool = False
     skip_special_tokens: bool = True
+    min_p: float = 0.0
+    top_k: int = -1
+    top_n_sigma: float = 0.0
 
 
 class ChatMessage(BaseModel):

@@ -23,12 +23,14 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor
     top_ks: torch.Tensor
     min_ps: torch.Tensor
+    top_n_sigmas: torch.Tensor
 
     # All requests use greedy sampling
     is_all_greedy: bool
 
     # Dispatch in CUDA graph
     need_min_p_sampling: bool
+    need_top_n_sigma_sampling: bool
 
     # Bias Tensors
     vocab_size: int
@@ -69,13 +71,20 @@ def from_schedule_batch(
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
+        top_n_sigmas = torch.tensor(
+            [r.sampling_params.top_n_sigma for r in reqs], dtype=torch.float
+        ).to(device, non_blocking=True)
 
         ret = cls(
             temperatures=temperatures,
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
+            top_n_sigmas=top_n_sigmas,
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
+            need_top_n_sigma_sampling=any(
+                r.sampling_params.top_n_sigma > 0 for r in reqs
+            ),
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             vocab_size=vocab_size,
             device=device,
@@ -183,6 +192,7 @@ def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor)
             "top_ps",
             "top_ks",
             "min_ps",
+            "top_n_sigmas",
             "logit_bias",
         ]:
             value = getattr(self, item, None)
@@ -222,6 +232,7 @@ def merge_batch(self, other: "SamplingBatchInfo"):
             "top_ps",
             "top_ks",
             "min_ps",
+            "top_n_sigmas",
         ]:
             self_val = getattr(self, item, None)
             other_val = getattr(other, item, None)

@@ -28,6 +28,7 @@ def __init__(
         top_p: float = 1.0,
         top_k: int = -1,
         min_p: float = 0.0,
+        top_n_sigma: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
@@ -44,6 +45,7 @@ def __init__(
         self.top_p = top_p
         self.top_k = top_k
         self.min_p = min_p
+        self.top_n_sigma = top_n_sigma
         self.frequency_penalty = frequency_penalty
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
@@ -78,6 +80,10 @@ def verify(self):
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
         if not 0.0 <= self.min_p <= 1.0:
             raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
+        if self.top_n_sigma < 0.0:
+            raise ValueError(
+                f"top_n_sigma must be non-negative, got {self.top_n_sigma}."
+            )
         if self.top_k < -1 or self.top_k == 0:
             raise ValueError(
                 f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."