
Commit 446ee64

[bug] Fix "Current vLLM config is not set." warnings when FlashInfer attention is used
The vLLM config is set only during the initialization stage, not during the runtime stage, so we should not call get_current_vllm_config() at runtime. Instead, cache the config values we need during the initialization stage and reuse them at runtime. Signed-off-by: Po-Han Huang <[email protected]>
1 parent c6df05e commit 446ee64
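
The fix follows a simple pattern: call config-reading helpers while the engine is being constructed (when the vLLM config context is active) and cache the result, then reuse the cached value on the per-step path. A minimal sketch of that pattern follows; ExampleBuilder is an illustrative toy class, not vLLM's actual builder, while force_use_trtllm_attention() is the real helper touched by this commit.

from vllm.utils.flashinfer import force_use_trtllm_attention

class ExampleBuilder:
    def __init__(self) -> None:
        # Initialization stage: the vLLM config context is active, so helpers
        # that read get_current_vllm_config(), such as
        # force_use_trtllm_attention(), are safe to call here. Cache the result.
        self.force_use_trtllm = force_use_trtllm_attention()

    def build(self) -> bool | None:
        # Runtime stage: the config context may no longer be set, so reading
        # the config here would log "Current vLLM config is not set." on every
        # step. Reuse the cached value instead.
        return self.force_use_trtllm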

2 files changed (+10, -2 lines)

vllm/utils/flashinfer.py

Lines changed: 4 additions & 1 deletion
@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
 
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
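
The new force_use_trtllm parameter is three-state: None means auto-detection, True forces TRTLLM attention on, and False forces it off, mirroring the CLI flag. The sketch below isolates just that decision logic as suggested by the comments in the hunk above; resolve_trtllm_choice and auto_detect_ok are illustrative names, not part of vLLM, and the real use_trtllm_attention() takes additional arguments not shown here.

def resolve_trtllm_choice(force_use_trtllm: bool | None, auto_detect_ok: bool) -> bool:
    # CLI argument explicitly set to 0 - respect it and force TRTLLM off.
    if force_use_trtllm is not None and not force_use_trtllm:
        return False
    # CLI argument explicitly set to 1 - force TRTLLM on.
    if force_use_trtllm:
        return True
    # None - fall back to auto-detection (heuristics elided here).
    return auto_detect_ok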

vllm/v1/attention/backends/flashinfer.py

Lines changed: 6 additions & 1 deletion
@@ -43,6 +43,7 @@
 from vllm.triton_utils import tl, triton
 from vllm.utils.flashinfer import (
     can_use_trtllm_attention,
+    force_use_trtllm_attention,
     use_trtllm_attention,
 )
 from vllm.utils.math_utils import cdiv
@@ -357,7 +358,6 @@ def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
     def supports_sink(cls) -> bool:
         """FlashInfer supports sinks when TRTLLM attention is available (SM100)."""
         from vllm.utils.flashinfer import (
-            force_use_trtllm_attention,
             supports_trtllm_attention,
         )
 
@@ -499,6 +499,10 @@ def __init__(
         assert self.kv_cache_spec.dtype == self.model_config.dtype
         self.kv_cache_dtype = self.kv_cache_spec.dtype
 
+        # Store whether to force use TRTLLM attention since vllm config is only
+        # available during initialization stage.
+        self.force_use_trtllm = force_use_trtllm_attention()
+
         # Use model dtype as q dtype when TRTLLM attn is not supported, or
         # --attention-config.disable_flashinfer_q_quantization is set to 1. Otherwise,
         # try to use fp8 q if kv cache is fp8, and will fall back to model dtype
@@ -779,6 +783,7 @@ def build(
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.force_use_trtllm,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
         )
