Fix more logging lint errors #21

Merged · 1 commit · Apr 25, 2024
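This PR continues converting logger calls from eager f-string interpolation to lazy printf-style arguments, the pattern flagged by logging-format linters (for example pylint's `logging-fstring-interpolation` or ruff's `G004`), along with a few related cleanups such as replacing the deprecated `logger.warn` alias with `logger.warning`. As a rough sketch of the before/after pattern applied throughout the diffs below (the logger name and message here are invented for illustration, not taken from the repo):

import logging

logger = logging.getLogger("example")
model_name = "my-model"

# Before: the f-string is rendered even when INFO records are filtered out.
logger.info(f"Loading model {model_name}.")

# After: formatting is deferred until a handler actually emits the record,
# and the argument is kept on the LogRecord for structured log consumers.
logger.info("Loading model %s.", model_name)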
docs/source/conf.py (5 changes: 3 additions & 2 deletions)
@@ -98,9 +98,10 @@ def setup(app):
 for mock_target in autodoc_mock_imports:
     if mock_target in sys.modules:
         logger.info(
-            f"Potentially problematic mock target ({mock_target}) found; "
+            "Potentially problematic mock target (%s) found; "
             "autodoc_mock_imports cannot mock modules that have already "
-            "been loaded into sys.modules when the sphinx build starts.")
+            "been loaded into sys.modules when the sphinx build starts.",
+            mock_target)


 class MockedClassDocumenter(autodoc.ClassDocumenter):
setup.py (7 changes: 4 additions & 3 deletions)
@@ -63,7 +63,7 @@ def compute_num_jobs(self):
         num_jobs = os.environ.get("MAX_JOBS", None)
         if num_jobs is not None:
             num_jobs = int(num_jobs)
-            logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
+            logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
         else:
             try:
                 # os.sched_getaffinity() isn't universally available, so fall
@@ -81,8 +81,9 @@ def compute_num_jobs(self):
         nvcc_threads = os.getenv("NVCC_THREADS", None)
         if nvcc_threads is not None:
             nvcc_threads = int(nvcc_threads)
-            logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
-                        " of nvcc threads.")
+            logger.info(
+                "Using NVCC_THREADS=%d as the number of nvcc threads.",
+                nvcc_threads)
         else:
             nvcc_threads = 1
         num_jobs = max(1, num_jobs // nvcc_threads)
vllm/distributed/utils.py (4 changes: 2 additions & 2 deletions)
@@ -112,7 +112,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
             and (not os.path.exists(path)):
         # only the local master process (with local_rank == 0) can
         #  enter this block to calculate the cache
-        logger.info(f"generating GPU P2P access cache for in {path}")
+        logger.info("generating GPU P2P access cache for in %s", path)
         cache = {}
         for _i in range(num_dev):
             for _j in range(num_dev):
@@ -126,7 +126,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
     if is_distributed:
         cpu_world_group = get_cpu_world_group()
         dist.barrier(cpu_world_group)
-    logger.info(f"reading GPU P2P access cache from {path}")
+    logger.info("reading GPU P2P access cache from %s", path)
     with open(path, "r") as f:
         cache = json.load(f)
     _gpu_p2p_access_cache = cache
vllm/engine/async_llm_engine.py (18 changes: 9 additions & 9 deletions)
@@ -117,7 +117,7 @@ def process_request_output(self,
         self._request_streams[request_id].put(request_output)
         if request_output.finished:
             if verbose:
-                logger.info(f"Finished request {request_id}.")
+                logger.info("Finished request %s.", request_id)
             self.abort_request(request_id)

     def process_exception(self,
@@ -128,7 +128,7 @@ def process_exception(self,
         """Propagate an exception from the engine."""
         self._request_streams[request_id].put(exception)
         if verbose:
-            logger.info(f"Finished request {request_id}.")
+            logger.info("Finished request %s.", request_id)
         self.abort_request(request_id)

     def add_request(self, request_id: str,
@@ -151,7 +151,7 @@ def add_request(self, request_id: str,
     def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
         """Abort a request during next background loop iteration."""
         if verbose:
-            logger.info(f"Aborted request {request_id}.")
+            logger.info("Aborted request %s.", request_id)

         self._finished_requests.put_nowait(request_id)

@@ -521,11 +521,11 @@ async def add_request(
                 if shortened_token_ids is not None:
                     shortened_token_ids = shortened_token_ids[:self.
                                                               max_log_len]
-            logger.info(f"Received request {request_id}: "
-                        f"prompt: {shortened_prompt!r}, "
-                        f"sampling_params: {sampling_params}, "
-                        f"prompt_token_ids: {shortened_token_ids}, "
-                        f"lora_request: {lora_request}.")
+            logger.info(
+                "Received request %s: prompt: %r, "
+                "sampling_params: %s, prompt_token_ids: %s, "
+                "lora_request: %s.", request_id, shortened_prompt,
+                sampling_params, shortened_token_ids, lora_request)

         if not self.is_running:
             if self.start_engine_loop:
@@ -717,4 +717,4 @@ async def check_health(self) -> None:
                 raise RuntimeError("Engine is dead.") from e
         else:
             await self.engine.check_health_async()
-        logger.debug(f"Health check took {time.perf_counter()-t}s")
+        logger.debug("Health check took %fs", time.perf_counter() - t)
vllm/engine/ray_utils.py (6 changes: 3 additions & 3 deletions)
@@ -43,9 +43,9 @@ def execute_model_compiled_dag_remote(self, ignored):
             return output

 except ImportError as e:
-    logger.warning(f"Failed to import Ray with {e!r}. "
-                   "For distributed inference, please install Ray with "
-                   "`pip install ray`.")
+    logger.warning(
+        "Failed to import Ray with %r. For distributed inference, "
+        "please install Ray with `pip install ray`.", e)
     ray = None  # type: ignore
     RayWorkerWrapper = None  # type: ignore

vllm/executor/cpu_executor.py (2 changes: 1 addition & 1 deletion)
@@ -69,7 +69,7 @@ def initialize_cache(self, num_gpu_blocks: int,
         # NOTE: `cpu block` for CPU backend is located on CPU memory but is
         # referred as `gpu block`. Because we want to reuse the existing block
         # management procedure.
-        logger.info(f"# CPU blocks: {num_gpu_blocks}")
+        logger.info("# CPU blocks: %d", num_gpu_blocks)
         self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

     def execute_model(self,
vllm/lora/models.py (6 changes: 3 additions & 3 deletions)
@@ -342,8 +342,8 @@ def activate_lora(
         index, _ = first_free_slot
         self._active_loras[lora_id] = None
         lora_model = self._registered_loras[lora_id]
-        logger.debug(
-            f"Activating LoRA. int id: {lora_model.id}, slot index: {index}")
+        logger.debug("Activating LoRA. int id: %d, slot index: %d",
+                     lora_model.id, index)
         self.lora_index_to_id[index] = lora_model.id
         for module_name, module in self.modules.items():
             module_lora = lora_model.get_lora(module_name)
@@ -563,7 +563,7 @@ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[Hashable],
         self.deactivate_lora_fn = deactivate_lora_fn

     def _on_remove(self, key: Hashable, value: LoRAModel):
-        logger.debug(f"Removing LoRA. int id: {key}")
+        logger.debug("Removing LoRA. int id: %d", key)
         self.deactivate_lora_fn(key)
         return super()._on_remove(key, value)

vllm/model_executor/model_loader/tensorizer.py (8 changes: 4 additions & 4 deletions)
@@ -334,10 +334,10 @@ def deserialize(self):
         per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
         after_mem = get_mem_usage()
         deserializer.close()
-        logger.info(f"Deserialized {total_bytes_str} in "
-                    f"{end - start:0.2f}s, {per_second}/s")
-        logger.info(f"Memory usage before: {before_mem}")
-        logger.info(f"Memory usage after: {after_mem}")
+        logger.info("Deserialized %s in %0.2fs, %f/s", total_bytes_str,
+                    end - start, per_second)
+        logger.info("Memory usage before: %s", before_mem)
+        logger.info("Memory usage after: %s", after_mem)

         self._check_tensors_on_meta_device()
         self._resize_lora_embeddings()
vllm/model_executor/model_loader/weight_utils.py (14 changes: 7 additions & 7 deletions)
@@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str,
             allow_patterns = [pattern]
             break

-    logger.info(f"Using model weights format {allow_patterns}")
+    logger.info("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
     # downloading the same model weights at the same time.
     with get_lock(model_name_or_path, cache_dir):
@@ -310,17 +310,17 @@ def kv_cache_scales_loader(
             return layer_scales_map.items()

     except FileNotFoundError:
-        logger.error(f"File or directory '{filename}' not found.")
+        logger.error("File or directory '%s' not found.", filename)
     except json.JSONDecodeError:
-        logger.error(f"Error decoding JSON in file '{filename}'.")
+        logger.error("Error decoding JSON in file '%s'.", filename)
     except Exception as e:
-        logger.error(f"An error occurred while reading '{filename}': {e}")
+        logger.error("An error occurred while reading '%s': %s", filename, e)
     # This section is reached if and only if any of the excepts are hit
     # Return an empty iterable (list) => no KV cache scales are loaded
     # which ultimately defaults to 1.0 scales
-    logger.warning("Defaulting to KV cache scaling factors = 1.0 "
-                   f"for all layers in TP rank {tp_rank} "
-                   "as an error occurred during loading.")
+    logger.warning(
+        "Defaulting to KV cache scaling factors = 1.0 for all "
+        "layers in TP rank %d as an error occurred during loading.", tp_rank)
     return []


vllm/model_executor/models/__init__.py (10 changes: 5 additions & 5 deletions)
@@ -90,8 +90,8 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
                 "ROCm for now.")
         if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
             logger.warning(
-                f"Model architecture {model_arch} is partially supported "
-                "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
+                "Model architecture %s is partially supported by ROCm: %s",
+                model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

     module_name, model_cls_name = _MODELS[model_arch]
     module = importlib.import_module(
@@ -106,9 +106,9 @@ def get_supported_archs() -> List[str]:
 def register_model(model_arch: str, model_cls: Type[nn.Module]):
     if model_arch in _MODELS:
         logger.warning(
-            f"Model architecture {model_arch} is already registered, "
-            "and will be overwritten by the new model "
-            f"class {model_cls.__name__}.")
+            "Model architecture %s is already registered, and will be "
+            "overwritten by the new model class %s.", model_arch,
+            model_cls.__name__)
     global _OOT_MODELS
     _OOT_MODELS[model_arch] = model_cls

vllm/model_executor/models/gemma.py (6 changes: 3 additions & 3 deletions)
@@ -55,10 +55,10 @@ def _get_gemma_act_fn(
             "in the config JSON file when it was initially released. "
             "Changing the activation function to approximate GeLU "
             "(`gelu_pytorch_tanh`). If you want to use the legacy "
-            f"`{hidden_act}`, edit the config JSON to set "
-            f"`hidden_activation={hidden_act}` instead of `hidden_act`. "
+            "`%s`, edit the config JSON to set "
+            "`hidden_activation=%s` instead of `hidden_act`. "
             "See https://github.com/huggingface/transformers/pull/29402 "
-            "for more details.")
+            "for more details.", hidden_act, hidden_act)
         return GeluAndMul(approximate="tanh")
     elif hidden_activation == "gelu_pytorch_tanh":
         return GeluAndMul(approximate="tanh")
vllm/transformers_utils/configs/dbrx.py (13 changes: 7 additions & 6 deletions)
@@ -72,9 +72,10 @@ def from_pretrained(
             and config_dict["model_type"] != cls.model_type
         ):
             logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all configurations of "
+                "models and can yield errors.",
+                config_dict["model_type"], cls.model_type)

         return cls.from_dict(config_dict, **kwargs)

@@ -151,9 +152,9 @@ def from_pretrained(
             and config_dict["model_type"] != cls.model_type
         ):
             logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
+                "You are using a model of type %s to instantiate a model of "
+                "type {cls.model_type}. This is not supported for all "
+                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)

         return cls.from_dict(config_dict, **kwargs)

Review comment from the repository owner on the added line `"type {cls.model_type}. This is not supported for all "`: This needs to be %s. I will fix it.
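For reference, a sketch of the fix the owner describes, replacing the leftover `{cls.model_type}` placeholder with `%s` so both values are passed as lazy arguments (this corrected call mirrors the first dbrx hunk and is not part of the merged diff above):

logger.warning(
    "You are using a model of type %s to instantiate a model of "
    "type %s. This is not supported for all configurations of "
    "models and can yield errors.",
    config_dict["model_type"], cls.model_type)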
vllm/transformers_utils/tokenizer.py (5 changes: 2 additions & 3 deletions)
@@ -137,9 +137,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
         # No tokenizer was found in the LoRA folder,
         # use base model tokenizer
         logger.warning(
-            f"No tokenizer found in {lora_request.lora_local_path}, "
-            "using base model tokenizer instead. "
-            f"(Exception: {str(e)})")
+            "No tokenizer found in %s, using base model tokenizer instead. "
+            "(Exception: %s)", lora_request.lora_local_path, e)
         tokenizer = None
     return tokenizer

vllm/worker/model_runner.py (27 changes: 14 additions & 13 deletions)
@@ -170,8 +170,8 @@ def load_model(self) -> None:
             )

         self.model_memory_usage = m.consumed_memory
-        logger.info(f"Loading model weights took "
-                    f"{self.model_memory_usage / float(2**30):.4f} GB")
+        logger.info("Loading model weights took %.4f GB",
+                    self.model_memory_usage / float(2**30))

         if self.lora_config:
             assert hasattr(self.model, "supported_lora_modules"
@@ -196,18 +196,19 @@ def load_model(self) -> None:
                     self.model.load_kv_cache_scales(
                         self.model_config.quantization_param_path)
                 else:
-                    raise RuntimeError("Using FP8 KV cache and scaling "
-                                       "factors provided but model "
-                                       f"{self.model.__class__} does not "
-                                       "support loading scaling factors.")
+                    raise RuntimeError(
+                        "Using FP8 KV cache and scaling factors provided but "
+                        "model %s does not support loading scaling factors.",
+                        self.model.__class__)
             else:
-                logger.warn("Using FP8 KV cache but no scaling factors "
-                            "provided. Defaulting to scaling factors of 1.0. "
-                            "This may lead to less accurate results!")
+                logger.warning(
+                    "Using FP8 KV cache but no scaling factors "
+                    "provided. Defaulting to scaling factors of 1.0. "
+                    "This may lead to less accurate results!")
         elif self.model_config.quantization_param_path is not None:
-            logger.warn("KV cache scaling factors provided, "
-                        "but the KV cache data type is not FP8. "
-                        "KV cache scaling factors will not be used.")
+            logger.warning("KV cache scaling factors provided, "
+                           "but the KV cache data type is not FP8. "
+                           "KV cache scaling factors will not be used.")

     def set_block_size(self, block_size: int) -> None:
         self.block_size = block_size
@@ -1054,7 +1055,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         end_time = time.perf_counter()
         elapsed_time = end_time - start_time
         # This usually takes < 10 seconds.
-        logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")
+        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)

     def __del__(self) -> None:
         # Delete the CUDA graphs before deleting the pynccl communicator.