sgl-project · Snowdar · Nov 26, 2024 · Nov 26, 2024 · Dec 1, 2024 · Dec 2, 2024
@@ -50,6 +50,20 @@ def forward(
         else:
             # Post process logits
             logits.div_(sampling_info.temperatures)
+
+            if any(sampling_info.top_n_sigmas > 0):
+                max_logit = torch.max(logits, dim=-1, keepdim=True).values
+                sigma = torch.std(logits, dim=-1, keepdim=True)
+                # Rows with top_n_sigma <= 0 keep every token; other rows keep only logits >= max - top_n_sigma * std
+                mask = (sampling_info.top_n_sigmas.view(-1, 1) <= 0) | (
+                    logits >= max_logit - sampling_info.top_n_sigmas.view(-1, 1) * sigma
+                )
+
+                # Apply mask
+                logits = torch.where(
+                    mask, logits, torch.tensor(float("-inf")).to(logits.device)
+                )
+
             probs = torch.softmax(logits, dim=-1)
             logits = None
             del logits

@@ -926,6 +926,9 @@ def v1_chat_generate_request(
             "stop": stop,
             "stop_token_ids": request.stop_token_ids,
             "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "top_n_sigma": request.top_n_sigma,
             "presence_penalty": request.presence_penalty,
             "frequency_penalty": request.frequency_penalty,
             "repetition_penalty": request.repetition_penalty,

@@ -273,6 +273,8 @@ class ChatCompletionRequest(BaseModel):
     stream_options: Optional[StreamOptions] = None
     temperature: float = 0.7
     top_p: float = 1.0
+    top_k: int = -1
+    top_n_sigma: float = 0.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -283,6 +285,7 @@ class ChatCompletionRequest(BaseModel):
     no_stop_trim: bool = False
     ignore_eos: bool = False
     skip_special_tokens: bool = True
+    min_p: float = 0.0
 
 
 class ChatMessage(BaseModel):

@@ -23,6 +23,7 @@ class SamplingBatchInfo:
     top_ps: torch.Tensor
     top_ks: torch.Tensor
     min_ps: torch.Tensor
+    top_n_sigmas: torch.Tensor
 
     # All requests use greedy sampling
     is_all_greedy: bool
@@ -69,12 +70,16 @@ def from_schedule_batch(
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
+        top_n_sigmas = torch.tensor(
+            [r.sampling_params.top_n_sigma for r in reqs], dtype=torch.float
+        ).to(device, non_blocking=True)
 
         ret = cls(
             temperatures=temperatures,
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
+            top_n_sigmas=top_n_sigmas,
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             vocab_size=vocab_size,
@@ -183,6 +188,7 @@ def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor)
             "top_ps",
             "top_ks",
             "min_ps",
+            "top_n_sigmas",
             "logit_bias",
         ]:
             value = getattr(self, item, None)
@@ -222,6 +228,7 @@ def merge_batch(self, other: "SamplingBatchInfo"):
             "top_ps",
             "top_ks",
             "min_ps",
+            "top_n_sigmas",
         ]:
             self_val = getattr(self, item, None)
             other_val = getattr(other, item, None)

@@ -28,6 +28,7 @@ def __init__(
         top_p: float = 1.0,
         top_k: int = -1,
         min_p: float = 0.0,
+        top_n_sigma: float = 0.0,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
@@ -44,6 +45,7 @@ def __init__(
         self.top_p = top_p
         self.top_k = top_k
         self.min_p = min_p
+        self.top_n_sigma = top_n_sigma
         self.frequency_penalty = frequency_penalty
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
@@ -78,6 +80,10 @@ def verify(self):
             raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
         if not 0.0 <= self.min_p <= 1.0:
             raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
+        if self.top_n_sigma < 0.0:
+            raise ValueError(
+                f"top_n_sigma must be non-negative, got {self.top_n_sigma}."
+            )
         if self.top_k < -1 or self.top_k == 0:
             raise ValueError(
                 f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."