vllm/utils/flashinfer.py (5 changes: 4 additions & 1 deletion)

@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:

 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()

     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
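Taken together, the two hunks above move the TRTLLM force flag from an internal lookup to an explicit parameter: force_use_trtllm_attention() is resolved once during initialization (when the vllm config is set), and the caller hands the result to use_trtllm_attention(). Below is a minimal sketch of the resulting tri-state logic; the stubbed supports_trtllm_attention() and the auto-detection condition are placeholders, and the real checks in vllm/utils/flashinfer.py are considerably more involved.

# Stand-in for the real capability check in vllm/utils/flashinfer.py, hard-coded
# so the sketch runs on its own.
def supports_trtllm_attention() -> bool:
    return True


def use_trtllm_attention_sketch(
    is_prefill: bool,
    force_use_trtllm: bool | None = None,  # None = auto, True = force on, False = force off
    has_sinks: bool = False,
) -> bool:
    """Resolve the tri-state force flag into a final yes/no decision."""
    if force_use_trtllm is False:
        # --attention-config.use_trtllm_attention was set to 0: respect it.
        return False
    if force_use_trtllm is True:
        # Explicitly enabled: skip the auto-detection heuristics.
        return True
    # Flag left unset: fall back to auto-detection (placeholder condition only;
    # the real function also inspects dtypes, head counts, sinks, spec decode, ...).
    return supports_trtllm_attention() and is_prefill and not has_sinks


# Example: forcing TRTLLM attention off always wins over auto-detection.
assert use_trtllm_attention_sketch(is_prefill=True, force_use_trtllm=False) is False
assert use_trtllm_attention_sketch(is_prefill=True) is True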
vllm/v1/attention/backends/flashinfer.py (2 changes: 2 additions & 0 deletions)

@@ -429,6 +429,7 @@ def __init__(
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
+        self.attention_config = vllm_config.attention_config
         self._workspace_buffer = None
         self._prefill_wrapper: (
             BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@@ -779,6 +780,7 @@ def build(
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.attention_config.use_trtllm_attention,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
         )
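On the backend side, the builder now captures vllm_config.attention_config in __init__ and forwards the user's choice into use_trtllm_attention() when building prefill metadata. The following is a simplified, self-contained sketch of that wiring; the trimmed-down AttentionConfig/VllmConfig dataclasses and the FlashInferMetadataBuilderSketch class are illustrative stand-ins for the real vLLM config and builder objects, not their actual definitions.

from dataclasses import dataclass


@dataclass
class AttentionConfig:
    # None = auto-detect, True = force TRTLLM attention on, False = force it off
    use_trtllm_attention: bool | None = None


@dataclass
class VllmConfig:
    attention_config: AttentionConfig


class FlashInferMetadataBuilderSketch:
    def __init__(self, vllm_config: VllmConfig) -> None:
        # Mirrors the new line in __init__: the attention config is captured once
        # at construction time instead of being looked up globally later.
        self.attention_config = vllm_config.attention_config

    def build(self) -> bool:
        forced = self.attention_config.use_trtllm_attention
        if forced is not None:
            # --attention-config.use_trtllm_attention was set explicitly: respect it.
            return forced
        # Placeholder for the capability-based auto-detection that
        # use_trtllm_attention() performs when the flag is left unset.
        return False


# Example: a run that forces TRTLLM attention on.
builder = FlashInferMetadataBuilderSketch(
    VllmConfig(attention_config=AttentionConfig(use_trtllm_attention=True))
)
assert builder.build() is True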