[v1][bugfix] fix cudagraph with inplace buffer assignment #11596

Merged 4 commits on Dec 29, 2024
vllm/compilation/wrapper.py: 9 additions & 1 deletion
@@ -28,11 +28,12 @@ def __init__(self,
                 compiled_callable: Optional[Callable] = None,
                 compilation_level: int = 0):

+        vllm_config = get_current_vllm_config()
+        self.vllm_config = vllm_config
        if compiled_callable is None:
            # default compilation settings
            # compiling the forward method

-            vllm_config = get_current_vllm_config()
            backend = vllm_config.compilation_config.init_backend(vllm_config)

            compiled_callable = torch.compile(
@@ -82,6 +83,13 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType):

        self.compiled_codes.append(new_code)

+        if self.vllm_config.compilation_config.use_cudagraph and \
+                "update" in new_code.co_names:
+            import depyf
+            src = depyf.decompile(new_code)
+            msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src  # noqa
+            raise RuntimeError(msg)

    @contextmanager
    def dispatch_to_code(self, index: int):
        """Context manager to dispatch to the compiled code.
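For readers hitting the new error, here is a minimal sketch (not vLLM code; BadRope, GoodRope, and cos_sin_cache are made-up names for illustration) of the buffer re-assignment pattern the check rejects when cudagraph is enabled, alongside the fix of preparing the buffer once in __init__. The check itself relies only on the observation, stated in the error message above, that such a captured mutation leaves a reference to a function named `update` in the Dynamo-transformed bytecode, which is what the `"update" in new_code.co_names` test looks for.

import torch
import torch.nn as nn


class BadRope(nn.Module):
    """Illustrative only: mutates a registered buffer inside forward()."""

    def __init__(self, cache: torch.Tensor):
        super().__init__()
        self.register_buffer("cos_sin_cache", cache, persistent=False)

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        # Re-assigning the buffer during the forward pass is exactly the
        # pattern the new bytecode check flags when cudagraph is in use.
        self.cos_sin_cache = self.cos_sin_cache.to(idx.device)
        return torch.index_select(self.cos_sin_cache, 0, idx)


class GoodRope(nn.Module):
    """Illustrative only: the buffer is prepared once in __init__ and is
    read-only inside forward()."""

    def __init__(self, cache: torch.Tensor):
        super().__init__()
        self.register_buffer("cos_sin_cache", cache, persistent=False)

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        # No assignment on self here; device placement is handled by the
        # usual Module.to(...) treatment of buffers.
        return torch.index_select(self.cos_sin_cache, 0, idx)

The rotary embedding cleanup below follows the same idea: the `.to(idx.device)` re-assignment is dropped from forward(), and the combined cache is registered once at construction time.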
vllm/model_executor/layers/rotary_embedding.py: 1 addition & 10 deletions
@@ -541,19 +541,12 @@ def __init__(
        short_cache = self._compute_cos_sin_cache(
            original_max_position_embeddings, short_factor, short_mscale)
        short_cache = short_cache.to(dtype)
-        self.register_buffer("short_cos_sin_cache",
-                             short_cache,
-                             persistent=False)

        long_cache = self._compute_cos_sin_cache(max_position_embeddings,
                                                 long_factor, long_mscale)
        long_cache = long_cache.to(dtype)
-        self.register_buffer("long_cos_sin_cache",
-                             long_cache,
-                             persistent=False)

-        long_short_cache = torch.cat(
-            [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0)
Comment on lines -544 to -556 (youkaichao, Member, Author): These are irrelevant to this PR's bugfix, but I just found they are redundant.
+        long_short_cache = torch.cat([short_cache, long_cache], dim=0)
        self.register_buffer("long_short_cos_sin_cache",
                             long_short_cache,
                             persistent=False)
@@ -593,8 +586,6 @@ def forward(
                              torch.full_like(positions, k)).long()
        idx = (torch.add(positions, long_prompt_offset)
               if long_prompt_offset is not None else positions)
-        self.long_short_cos_sin_cache: torch.Tensor = (
-            self.long_short_cos_sin_cache.to(idx.device))
        idx = torch.add(idx, offsets) if offsets is not None else idx
        cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
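To summarize the resulting structure, a condensed sketch follows (illustrative only; the constructor takes precomputed short_cache / long_cache tensors instead of calling the real _compute_cos_sin_cache): the intermediate caches stay local temporaries, only the concatenated tensor is registered as a non-persistent buffer, and forward() never assigns to self.

import torch
import torch.nn as nn


class LongShortCosSinCache(nn.Module):
    """Illustrative sketch of the post-change structure, not the real class."""

    def __init__(self, short_cache: torch.Tensor, long_cache: torch.Tensor,
                 dtype: torch.dtype):
        super().__init__()
        # Local temporaries; no separate short/long buffers are registered.
        short_cache = short_cache.to(dtype)
        long_cache = long_cache.to(dtype)
        long_short_cache = torch.cat([short_cache, long_cache], dim=0)
        self.register_buffer("long_short_cos_sin_cache",
                             long_short_cache,
                             persistent=False)

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        # No `self.long_short_cos_sin_cache = ...to(idx.device)` here: the
        # buffer moves with the module via model.to(device), so the compiled
        # forward stays free of buffer re-assignment.
        return torch.index_select(self.long_short_cos_sin_cache, 0, idx)

A quick smoke test of the sketch: LongShortCosSinCache(torch.randn(8, 4), torch.randn(8, 4), torch.float32)(torch.tensor([0, 3, 9])) returns three rows of the concatenated 16-row cache.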