From dfd1bf127b4a511f6c2cf7b11e34f0b3be736a7e Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:24:40 -0500
Subject: [PATCH 01/12] create a read only function for new metric `last_token_latency`

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index cc3d96fc93a79..4db4ee971014c 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -667,6 +667,7 @@ def __init__(
                                       first_scheduled_time=None,
                                       first_token_time=None,
                                       time_in_queue=None)
+        self.last_token_latency = 0.0
         self.lora_request = lora_request
         self.prompt_logprobs: Optional[PromptLogprobs] = None
         self.state = SequenceGroupState()
@@ -773,8 +774,15 @@ def get_last_latency(self, now: float) -> float:
         # Otherwise return token latency.
         latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
-        return latency
 
+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        if self.is_prefill():
+            raise ValueError(
+                "seq_group.get_last_token_latency() should not be called "
+                "if the seq_group is in prefill phase.")
+        return self.last_token_latency
+
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
         # Note: in a case where a sequence_group is swapped and

From 5d7bdb31fd3f6e0970d6bd30350eca5e6c07c513 Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:25:43 -0500
Subject: [PATCH 02/12] set function for setting last token latency and time

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 4db4ee971014c..69a6e20ef0512 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -763,16 +763,14 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
         assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill
         self.init_multi_step(num_steps=num_lookahead_slots + 1)
 
-    def get_last_latency(self, now: float) -> float:
+    def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
         # If still in prefill phase, raise Error.
         if self.is_prefill():
             raise ValueError(
                 "seq_group.get_last_latency() should not be called "
                 "if the seq_group is in prefill phase.")
-
-        # Otherwise return token latency.
-        latency = now - self.metrics.last_token_time
+        self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
 
     def get_last_token_latency(self) -> float:

From ed14f7caa3c9558ca8cc1f8e8f8dce9b5d7e79cf Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:26:44 -0500
Subject: [PATCH 03/12] moved setting last token time to `_process_model_outputs`

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/engine/llm_engine.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index dc2d77d6927cd..d269c48b803b1 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1139,6 +1139,8 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,
@@ -1181,6 +1183,8 @@ def _process_model_outputs(self,
 
             seq_group = scheduled_seq_group.seq_group
             seq_group.maybe_set_first_token_time(now)
+            if not seq_group.is_prefill():
+                seq_group.set_last_token_time(now)
             request_output = RequestOutputFactory.create(
                 seq_group,
                 self.seq_id_to_seq_group,

From 27e0516fdc85f70a088c6fe9c397c3817352f7ff Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:27:28 -0500
Subject: [PATCH 04/12] Calling read only function for last token latency inside `_get_stats`

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/engine/llm_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d269c48b803b1..196e379fb43c6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1705,7 +1705,7 @@ def _get_stats(self,
                 # If the seq_group just finished the prefill state
                 # get TTFT.
                 if not seq_group.is_prefill():
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_to_first_tokens_iter.append(latency)
 
                     # One generation token per finished prefill.
@@ -1713,7 +1713,7 @@ def _get_stats(self,
                         seq_group.num_seqs())
                 else:
                     # TPOTs.
-                    latency = seq_group.get_last_latency(now)
+                    latency = seq_group.get_last_token_latency()
                     time_per_output_tokens_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()

From dbc6cb14fb14597b3638ed7d60c399b83d639ddd Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:35:23 -0500
Subject: [PATCH 05/12] format fix

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 69a6e20ef0512..c5e6729b80aa3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -780,7 +780,7 @@ def get_last_token_latency(self) -> float:
                 "seq_group.get_last_token_latency() should not be called "
                 "if the seq_group is in prefill phase.")
         return self.last_token_latency
-
+
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
         # Note: in a case where a sequence_group is swapped and

From fe42b116ed855e9e3664d5d4d8f3d7c3780e81b2 Mon Sep 17 00:00:00 2001
From: Rajveer Bachkaniwala <46040700+rajveerb@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:39:44 -0500
Subject: [PATCH 06/12] Typo fix

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index c5e6729b80aa3..82f1f74ef8045 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -768,7 +768,7 @@ def set_last_token_time(self, now: float) -> None:
         # If still in prefill phase, raise Error.
         if self.is_prefill():
             raise ValueError(
-                "seq_group.get_last_latency() should not be called "
+                "seq_group.set_last_token_time() should not be called "
                 "if the seq_group is in prefill phase.")
         self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now

From a6e63169e68caa840770731467a3886459741daa Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Mon, 23 Dec 2024 11:19:13 -0500
Subject: [PATCH 07/12] switch to assertion

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 82f1f74ef8045..34f910d47b7d9 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -765,20 +765,18 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
 
     def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.set_last_token_time() should not be called "
-                "if the seq_group is in prefill phase.")
+        # If still in prefill phase, assertion fails.
+        assert not self.is_prefill(), (
+            "seq_group.set_last_token_time() should not be called "
+            "if the seq_group is in prefill phase.")
         self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
 
     def get_last_token_latency(self) -> float:
         """Returns the latency of the last token."""
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_token_latency() should not be called "
-                "if the seq_group is in prefill phase.")
+        assert not self.is_prefill(), (
+            "seq_group.get_last_token_latency() should not be called "
+            "if the seq_group is in prefill phase.")
         return self.last_token_latency
 
     def maybe_set_first_token_time(self, time: float) -> None:

From f9a7519b4e6c709dfcba97de6b9c436f0039467a Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:24:40 -0500
Subject: [PATCH 08/12] create a read only function for new metric `last_token_latency`

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 34f910d47b7d9..e982be1a7c0d4 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -779,6 +779,14 @@ def get_last_token_latency(self) -> float:
             "if the seq_group is in prefill phase.")
         return self.last_token_latency
 
+    def get_last_token_latency(self) -> float:
+        """Returns the latency of the last token."""
+        if self.is_prefill():
+            raise ValueError(
+                "seq_group.get_last_token_latency() should not be called "
+                "if the seq_group is in prefill phase.")
+        return self.last_token_latency
+
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
         # Note: in a case where a sequence_group is swapped and

From d040510b7a219230f1b23221c55cfdd304a2e53e Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:25:43 -0500
Subject: [PATCH 09/12] set function for setting last token latency and time

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index e982be1a7c0d4..584e7bfaa650f 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -765,10 +765,18 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
 
     def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
+<<<<<<< HEAD
         # If still in prefill phase, assertion fails.
         assert not self.is_prefill(), (
             "seq_group.set_last_token_time() should not be called "
             "if the seq_group is in prefill phase.")
+=======
+        # If still in prefill phase, raise Error.
+        if self.is_prefill():
+            raise ValueError(
+                "seq_group.get_last_latency() should not be called "
+                "if the seq_group is in prefill phase.")
+>>>>>>> set function for setting last token latency and time
         self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
 

From e075ac3df2ad18482277549580194cf175e9bf4b Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Fri, 20 Dec 2024 09:35:23 -0500
Subject: [PATCH 10/12] format fix

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 584e7bfaa650f..834ae6c184b13 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -794,7 +794,7 @@ def get_last_token_latency(self) -> float:
                 "seq_group.get_last_token_latency() should not be called "
                 "if the seq_group is in prefill phase.")
         return self.last_token_latency
-
+
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
         # Note: in a case where a sequence_group is swapped and

From d7e39e8e7b1efad88344484af68fc0cdf78a63c5 Mon Sep 17 00:00:00 2001
From: Rajveer Bachkaniwala <46040700+rajveerb@users.noreply.github.com>
Date: Sun, 22 Dec 2024 14:39:44 -0500
Subject: [PATCH 11/12] Typo fix

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 834ae6c184b13..720079de74c26 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -774,7 +774,7 @@ def set_last_token_time(self, now: float) -> None:
         # If still in prefill phase, raise Error.
         if self.is_prefill():
             raise ValueError(
-                "seq_group.get_last_latency() should not be called "
+                "seq_group.set_last_token_time() should not be called "
                 "if the seq_group is in prefill phase.")
 >>>>>>> set function for setting last token latency and time
         self.last_token_latency = now - self.metrics.last_token_time

From 08e8e26ec9d3f69014c6e97d12ceba1cd4d35d4b Mon Sep 17 00:00:00 2001
From: rajveerb <46040700+rajveerb@users.noreply.github.com>
Date: Mon, 23 Dec 2024 11:19:13 -0500
Subject: [PATCH 12/12] switch to assertion

Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com>
---
 vllm/sequence.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 720079de74c26..34f910d47b7d9 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -765,18 +765,10 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int,
 
     def set_last_token_time(self, now: float) -> None:
         """Sets the last token time for Request level timings."""
-<<<<<<< HEAD
         # If still in prefill phase, assertion fails.
         assert not self.is_prefill(), (
             "seq_group.set_last_token_time() should not be called "
             "if the seq_group is in prefill phase.")
-=======
-        # If still in prefill phase, raise Error.
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.set_last_token_time() should not be called "
-                "if the seq_group is in prefill phase.")
->>>>>>> set function for setting last token latency and time
         self.last_token_latency = now - self.metrics.last_token_time
         self.metrics.last_token_time = now
 
@@ -787,14 +779,6 @@ def get_last_token_latency(self) -> float:
             "if the seq_group is in prefill phase.")
         return self.last_token_latency
 
-    def get_last_token_latency(self) -> float:
-        """Returns the latency of the last token."""
-        if self.is_prefill():
-            raise ValueError(
-                "seq_group.get_last_token_latency() should not be called "
-                "if the seq_group is in prefill phase.")
-        return self.last_token_latency
-
     def maybe_set_first_token_time(self, time: float) -> None:
         """Sets the first token time for Request level timings."""
         # Note: in a case where a sequence_group is swapped and
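
Taken together, the series replaces the old `get_last_latency(now)` (which both computed the latency and mutated `metrics.last_token_time` from the logging path) with a `set_last_token_time(now)` writer called from `_process_model_outputs` and a read-only `get_last_token_latency()` reader used by `_get_stats`. The standalone sketch below illustrates that setter/getter split; it is a simplified illustration with a hypothetical class name and no vLLM imports, not code from the patches themselves.

    # Hypothetical stand-in for SequenceGroup's timing bookkeeping; it mirrors
    # the pattern the series converges on, not vLLM's actual class.
    import time

    class SeqGroupTimingSketch:

        def __init__(self, arrival_time: float) -> None:
            self.last_token_time = arrival_time  # like metrics.last_token_time
            self.last_token_latency = 0.0
            self._in_prefill = True

        def is_prefill(self) -> bool:
            return self._in_prefill

        def finish_prefill(self) -> None:
            self._in_prefill = False

        def set_last_token_time(self, now: float) -> None:
            # Writer: called once per decode step from the output path.
            assert not self.is_prefill()
            self.last_token_latency = now - self.last_token_time
            self.last_token_time = now

        def get_last_token_latency(self) -> float:
            # Reader: called from the stats path; it does not mutate state,
            # so reading it repeatedly returns the same stored value.
            assert not self.is_prefill()
            return self.last_token_latency

    group = SeqGroupTimingSketch(arrival_time=time.time())
    group.finish_prefill()
    time.sleep(0.01)  # pretend one decode step elapsed
    group.set_last_token_time(time.time())
    print(f"TPOT sample: {group.get_last_token_latency():.4f} s")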