Skip to content

Commit

Permalink
[Misc] Move print_*_once from utils to logger (#11298)
Browse files Browse the repository at this point in the history
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Maxime Fournioux <[email protected]>
Co-authored-by: Maxime Fournioux <[email protected]>
  • Loading branch information
DarkLight1337 and mfournioux authored Jan 9, 2025
1 parent 730e959 commit d848800
Show file tree
Hide file tree
Showing 21 changed files with 129 additions and 72 deletions.
1 change: 1 addition & 0 deletions .github/workflows/lint-and-deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ jobs:
run: |
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test
Expand Down
9 changes: 6 additions & 3 deletions vllm/attention/backends/torch_sdpa.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
from vllm.attention.backends.utils import CommonAttentionState
from vllm.attention.ops.ipex_attn import PagedAttention
from vllm.attention.ops.paged_attn import PagedAttentionMetadata
from vllm.utils import make_tensor_with_pad, print_warning_once
from vllm.logger import init_logger
from vllm.utils import make_tensor_with_pad
from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder

logger = init_logger(__name__)


class TorchSDPABackend(AttentionBackend):

Expand Down Expand Up @@ -396,8 +399,8 @@ def __init__(
raise ValueError(
"Torch SPDA does not support block-sparse attention.")
if logits_soft_cap is not None:
print_warning_once("Torch SPDA does not support logits soft cap. "
"Outputs may be slightly off.")
logger.warning_once("Torch SPDA does not support logits soft cap. "
"Outputs may be slightly off.")
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
Expand Down
8 changes: 5 additions & 3 deletions vllm/attention/backends/xformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata)
from vllm.utils import print_warning_once
from vllm.logger import init_logger

logger = init_logger(__name__)


class XFormersBackend(AttentionBackend):
Expand Down Expand Up @@ -385,8 +387,8 @@ def __init__(
raise ValueError(
"XFormers does not support block-sparse attention.")
if logits_soft_cap is not None:
print_warning_once("XFormers does not support logits soft cap. "
"Outputs may be slightly off.")
logger.warning_once("XFormers does not support logits soft cap. "
"Outputs may be slightly off.")
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
Expand Down
9 changes: 4 additions & 5 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, print_warning_once, random_uuid,
resolve_obj_by_qualname)
get_cpu_memory, random_uuid, resolve_obj_by_qualname)

if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
Expand Down Expand Up @@ -314,7 +313,7 @@ def __init__(self,
sliding_window_len_min = get_min_sliding_window(
self.hf_text_config.sliding_window)

print_warning_once(
logger.warning_once(
f"{self.hf_text_config.model_type} has interleaved "
"attention, which is currently not supported by the "
"XFORMERS backend. Disabling sliding window and capping "
Expand Down Expand Up @@ -2758,7 +2757,7 @@ def uuid(self):

def model_post_init(self, __context: Any) -> None:
if not self.enable_reshape and self.enable_fusion:
print_warning_once(
logger.warning_once(
"Fusion enabled but reshape elimination disabled."
"RMSNorm + quant (fp8) fusion might not work")

Expand Down Expand Up @@ -3151,7 +3150,7 @@ def __post_init__(self):
self.scheduler_config.chunked_prefill_enabled and \
self.model_config.dtype == torch.float32 and \
current_platform.get_device_capability() == (7, 5):
print_warning_once(
logger.warning_once(
"Turing devices tensor cores do not support float32 matmul. "
"To workaround this limitation, vLLM will set 'ieee' input "
"precision for chunked prefill triton kernels.")
Expand Down
7 changes: 3 additions & 4 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.utils import MediaConnector
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.utils import print_warning_once

logger = init_logger(__name__)

Expand Down Expand Up @@ -985,14 +984,14 @@ def apply_mistral_chat_template(
**kwargs: Any,
) -> List[int]:
if chat_template is not None:
print_warning_once(
logger.warning_once(
"'chat_template' cannot be overridden for mistral tokenizer.")
if "add_generation_prompt" in kwargs:
print_warning_once(
logger.warning_once(
"'add_generation_prompt' is not supported for mistral tokenizer, "
"so it will be ignored.")
if "continue_final_message" in kwargs:
print_warning_once(
logger.warning_once(
"'continue_final_message' is not supported for mistral tokenizer, "
"so it will be ignored.")

Expand Down
20 changes: 11 additions & 9 deletions vllm/inputs/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from vllm.utils import print_info_once, print_warning_once

from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs,
PromptType, SingletonInputs, SingletonPrompt, token_inputs)
Expand Down Expand Up @@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]:
'''

if not self.model_config.is_encoder_decoder:
print_warning_once("Using None for decoder start token id because "
"this is not an encoder/decoder model.")
logger.warning_once(
"Using None for decoder start token id because "
"this is not an encoder/decoder model.")
return None

if (self.model_config is None or self.model_config.hf_config is None):
print_warning_once("Using None for decoder start token id because "
"model config is not available.")
logger.warning_once(
"Using None for decoder start token id because "
"model config is not available.")
return None

dec_start_token_id = getattr(self.model_config.hf_config,
'decoder_start_token_id', None)
if dec_start_token_id is None:
print_warning_once("Falling back on <BOS> for decoder start token "
"id because decoder start token id is not "
"available.")
logger.warning_once(
"Falling back on <BOS> for decoder start token "
"id because decoder start token id is not "
"available.")
dec_start_token_id = self.get_bos_token_id()

return dec_start_token_id
Expand Down Expand Up @@ -231,7 +233,7 @@ def _can_process_multimodal(self) -> bool:
# updated to use the new multi-modal processor
can_process_multimodal = self.mm_registry.has_processor(model_config)
if not can_process_multimodal:
print_info_once(
logger.info_once(
"Your model uses the legacy input pipeline instead of the new "
"multi-modal processor. Please note that the legacy pipeline "
"will be removed in a future release. For more details, see: "
Expand Down
4 changes: 2 additions & 2 deletions vllm/inputs/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
print_warning_once, resolve_mm_processor_kwargs)
resolve_mm_processor_kwargs)

from .data import ProcessorInputs, SingletonInputs
from .parse import is_encoder_decoder_inputs
Expand Down Expand Up @@ -352,7 +352,7 @@ def dummy_data_for_profiling(
num_tokens = dummy_data.seq_data.prompt_token_ids
if len(num_tokens) < seq_len:
if is_encoder_data:
print_warning_once(
logger.warning_once(
f"Expected at least {seq_len} dummy encoder tokens for "
f"profiling, but found {len(num_tokens)} tokens instead.")
else:
Expand Down
57 changes: 52 additions & 5 deletions vllm/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import logging
import os
import sys
from functools import partial
from functools import lru_cache, partial
from logging import Logger
from logging.config import dictConfig
from os import path
from typing import Dict, Optional
from types import MethodType
from typing import Any, Optional, cast

import vllm.envs as envs

Expand Down Expand Up @@ -49,8 +50,44 @@
}


@lru_cache
def _print_info_once(logger: Logger, msg: str) -> None:
    """Emit ``msg`` at INFO level on ``logger``, skipping repeated calls.

    The function is memoized with :func:`functools.lru_cache`, so a second
    call with the same ``(logger, msg)`` pair returns the cached ``None``
    without running the body again, i.e. nothing is logged.

    Note:
        ``lru_cache`` is used with its default bounded size, so a message
        whose cache entry has been evicted can be logged again.

    Args:
        logger: The logger that should emit the record.
        msg: The already-formatted message text; it is part of the cache key.
    """
    # Set the stacklevel to 2 to print the original caller's line info
    # rather than the location of this helper.
    logger.info(msg, stacklevel=2)


@lru_cache
def _print_warning_once(logger: Logger, msg: str) -> None:
    """Emit ``msg`` at WARNING level on ``logger``, skipping repeated calls.

    The function is memoized with :func:`functools.lru_cache`, so a second
    call with the same ``(logger, msg)`` pair returns the cached ``None``
    without running the body again, i.e. nothing is logged.

    Note:
        ``lru_cache`` is used with its default bounded size, so a message
        whose cache entry has been evicted can be logged again.

    Args:
        logger: The logger that should emit the record.
        msg: The already-formatted message text; it is part of the cache key.
    """
    # Set the stacklevel to 2 to print the original caller's line info
    # rather than the location of this helper.
    logger.warning(msg, stacklevel=2)


class _VllmLogger(Logger):
    """
    Typing-only view of a logger that offers the ``*_once`` helpers.

    Note:
        This class exists purely to provide type information.
        The methods are actually patched directly onto the
        :class:`logging.Logger` instance, which avoids conflicting with
        other libraries such as
        `intel_extension_for_pytorch.utils._logger`.
    """

    def info_once(self, msg: str) -> None:
        """
        Behave like :meth:`info`, except that later calls carrying the
        same message are silently dropped.
        """
        _print_info_once(self, msg)

    def warning_once(self, msg: str) -> None:
        """
        Behave like :meth:`warning`, except that later calls carrying the
        same message are silently dropped.
        """
        _print_warning_once(self, msg)


def _configure_vllm_root_logger() -> None:
logging_config: Dict = {}
logging_config = dict[str, Any]()

if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
raise RuntimeError(
Expand Down Expand Up @@ -84,12 +121,22 @@ def _configure_vllm_root_logger() -> None:
dictConfig(logging_config)


def init_logger(name: str) -> Logger:
def init_logger(name: str) -> _VllmLogger:
"""The main purpose of this function is to ensure that loggers are
retrieved in such a way that we can be sure the root vllm logger has
already been configured."""

return logging.getLogger(name)
logger = logging.getLogger(name)

methods_to_patch = {
"info_once": _print_info_once,
"warning_once": _print_warning_once,
}

for method_name, method in methods_to_patch.items():
setattr(logger, method_name, MethodType(method, logger))

return cast(_VllmLogger, logger)


# The root logger is initialized when the module is imported.
Expand Down
6 changes: 4 additions & 2 deletions vllm/lora/peft_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
from dataclasses import MISSING, dataclass, field, fields
from typing import Literal, Optional, Union

from vllm.utils import print_info_once
from vllm.logger import init_logger

logger = init_logger(__name__)


@dataclass
Expand Down Expand Up @@ -42,7 +44,7 @@ def _validate_features(self):
def __post_init__(self):
self._validate_features()
if self.use_rslora:
print_info_once("Loading LoRA weights trained with rsLoRA.")
logger.info_once("Loading LoRA weights trained with rsLoRA.")
self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r)
else:
self.vllm_lora_scaling_factor = self.lora_alpha / self.r
Expand Down
8 changes: 5 additions & 3 deletions vllm/lora/punica_wrapper/punica_selector.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import print_info_once

from .punica_base import PunicaWrapperBase

logger = init_logger(__name__)


def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
if current_platform.is_cuda_alike():
# Lazy import to avoid ImportError
from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
print_info_once("Using PunicaWrapperGPU.")
logger.info_once("Using PunicaWrapperGPU.")
return PunicaWrapperGPU(*args, **kwargs)
elif current_platform.is_hpu():
# Lazy import to avoid ImportError
from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU
print_info_once("Using PunicaWrapperHPU.")
logger.info_once("Using PunicaWrapperHPU.")
return PunicaWrapperHPU(*args, **kwargs)
else:
raise NotImplementedError
3 changes: 1 addition & 2 deletions vllm/model_executor/custom_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from vllm.config import get_current_vllm_config
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import print_warning_once

logger = init_logger(__name__)

Expand Down Expand Up @@ -91,7 +90,7 @@ def enabled(cls) -> bool:
compilation_config = get_current_vllm_config().compilation_config
custom_ops = compilation_config.custom_ops
if not hasattr(cls, "name"):
print_warning_once(
logger.warning_once(
f"Custom op {cls.__name__} was not registered, "
f"which means it won't appear in the op registry. "
f"It will be enabled/disabled based on the global settings.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import vllm.model_executor.layers.fused_moe # noqa
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
Expand All @@ -16,7 +17,8 @@
all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import print_warning_once

logger = init_logger(__name__)


class GPTQMarlinState(Enum):
Expand Down Expand Up @@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
"activation scales are None.")
if (not all_close_1d(layer.w13_input_scale)
or not all_close_1d(layer.w2_input_scale)):
print_warning_once(
logger.warning_once(
"Found input_scales that are not equal for "
"fp8 MoE layer. Using the maximum across experts "
"for each layer. ")
"for each layer.")
layer.w13_input_scale = torch.nn.Parameter(
layer.w13_input_scale.max(), requires_grad=False)
layer.w2_input_scale = torch.nn.Parameter(
Expand Down
5 changes: 2 additions & 3 deletions vllm/model_executor/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
PerTensorScaleParameter)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import print_warning_once

ACTIVATION_SCHEMES = ["static", "dynamic"]

Expand Down Expand Up @@ -539,10 +538,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
"activation scales are None.")
if (not all_close_1d(layer.w13_input_scale)
or not all_close_1d(layer.w2_input_scale)):
print_warning_once(
logger.warning_once(
"Found input_scales that are not equal for "
"fp8 MoE layer. Using the maximum across experts "
"for each layer. ")
"for each layer.")
layer.w13_input_scale = torch.nn.Parameter(
layer.w13_input_scale.max(), requires_grad=False)
layer.w2_input_scale = torch.nn.Parameter(
Expand Down
Loading

0 comments on commit d848800

Please sign in to comment.