 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
 
-NUM_SECONDS_TO_SLEEP = 5
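+# Defaults below can be overridden via same-named environment variables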
+NUM_SECONDS_TO_SLEEP = int(os.getenv("NUM_SECONDS_TO_SLEEP", "5"))
+WORKERS = int(os.getenv("WORKERS", "32"))
 
 try:
     from vllm import LLM, SamplingParams
@@ -50,7 +51,7 @@ class VLLM(lmms):
     - VLLM chat method: https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat
 
     Args:
-        model_version (str): HuggingFace model identifier or path to the model.
+        model (str): HuggingFace model identifier or path to the model.
             Default: "Qwen/Qwen2.5-VL-3B-Instruct"
         tensor_parallel_size (int): Number of GPUs to use for tensor parallelism.
             Default: 1
@@ -81,7 +82,7 @@ class VLLM(lmms):
8182 "--model",
8283 "vllm",
8384 "--model_args",
84- "model_version =meta-llama/Llama-4-Scout-17B-16E-Instruct,"
85+ "model =meta-llama/Llama-4-Scout-17B-16E-Instruct,"
8586 "tensor_parallel_size=4,"
8687 "dtype=bfloat16,"
8788 "max_model_len=10240,"
@@ -118,7 +119,7 @@ class VLLM(lmms):
118119 "--model",
119120 "vllm",
120121 "--model_args",
121- "model_version =deepseek-ai/deepseek-vl2,"
122+ "model =deepseek-ai/deepseek-vl2,"
122123 'hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},' # example of passing model specific arguments, JSON string will be parsed automatically
123124 f"chat_template={chat_template_file}," # chat template file path
124125 "tensor_parallel_size=2,"
@@ -145,12 +146,12 @@ class VLLM(lmms):
 
     def __init__(
         self,
-        model_version: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+        model: str = "Qwen/Qwen2.5-VL-3B-Instruct",
         tensor_parallel_size: int = 1,
+        data_parallel_size: int = 1,
         gpu_memory_utilization: float = 0.8,
         batch_size: int = 1,
         max_frame_num: int = 32,
-        threads: int = 16,  # Threads to use for decoding visuals
         trust_remote_code: Optional[bool] = True,
         chat_template: Optional[str] = None,
         min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
@@ -160,12 +161,13 @@ def __init__(
         # Manually set an image token for GPT4V so that we can search for it
         # and split the text and image
         # Here we just use the same token as llava for convenience
-        self.model_version = model_version
+        self.model = model
         self.max_frame_num = max_frame_num
-        self.threads = threads
+        self.chat_template = chat_template
         self.min_image_pixels = min_image_pixels
+        self.data_parallel_size = data_parallel_size
         # Qwen 2/2.5-VL models enforce minimum image dimensions
-        self._enforce_image_resize = self._is_qwen_vl_model(model_version)
+        self._enforce_image_resize = self._is_qwen_vl_model(model)
 
         # Load chat template during initialization
         self.chat_template = None
@@ -191,13 +193,6 @@ def __init__(
 
         # Set up vllm client
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-        self.client = LLM(
-            model=self.model_version,
-            tensor_parallel_size=tensor_parallel_size,
-            gpu_memory_utilization=gpu_memory_utilization,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
 
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
@@ -211,13 +206,27 @@ def __init__(
         self.accelerator = accelerator
         self._rank = self.accelerator.local_process_index
         self._world_size = self.accelerator.num_processes
+        # TODO: support combining tensor parallelism with data parallelism for more flexible vLLM parallelism
+        if data_parallel_size > 1:
+            assert tensor_parallel_size == 1, "Data parallelism cannot be combined with tensor parallelism in the current vLLM version"
+        if accelerator.num_processes > 1:
+            kwargs["distributed_executor_backend"] = "external_launcher"
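+            # "external_launcher" tells vLLM that worker processes are created by an
+            # outside launcher (here, accelerate) rather than spawned by vLLM itself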
+        self.client = LLM(
+            model=self.model,
+            tensor_parallel_size=tensor_parallel_size,
+            gpu_memory_utilization=gpu_memory_utilization,
+            trust_remote_code=trust_remote_code,
+            disable_log_stats=False,
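+            # fixed seed keeps sampling reproducible and identical across processes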
+            seed=1,
+            **kwargs,
+        )
 
         self.device = self.accelerator.device
         self.batch_size_per_gpu = int(batch_size)
 
-    def _is_qwen_vl_model(self, model_version: str) -> bool:
+    def _is_qwen_vl_model(self, model: str) -> bool:
         qwen_vl_patterns = ["qwen2-vl", "qwen2.5-vl"]
-        return any(pattern in model_version.lower() for pattern in qwen_vl_patterns)
+        return any(pattern in model.lower() for pattern in qwen_vl_patterns)
 
     def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
         # edge-case validation
@@ -294,16 +303,14 @@ def generate_until(self, requests) -> List[str]:
             contexts, gen_kwargs, doc_to_visual, doc_id, task, split = batch_requests[idx].arguments
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
-            if gen_kwargs["max_new_tokens"] > 4096:
-                gen_kwargs["max_new_tokens"] = 4096
             if "temperature" not in gen_kwargs:
                 gen_kwargs["temperature"] = 0
             if "top_p" not in gen_kwargs:
                 gen_kwargs["top_p"] = 0.95
 
             params = {
-                "temperature": gen_kwargs["temperature"],
                 "max_tokens": gen_kwargs["max_new_tokens"],
+                "temperature": gen_kwargs["temperature"],
                 "top_p": gen_kwargs["top_p"],
             }
             sampling_params = SamplingParams(**params)
@@ -316,7 +323,7 @@ def generate_until(self, requests) -> List[str]:
             visuals = self.flatten(visuals)
             imgs = []  # multiple images or frames for video
             all_tasks = []
-            with ThreadPoolExecutor(max_workers=self.threads) as executor:
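+            # the module-level WORKERS constant (env-overridable) replaces the removed `threads` argument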
+            with ThreadPoolExecutor(max_workers=WORKERS) as executor:
                 for visual in visuals:
                     if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
                         all_tasks.append(executor.submit(self.encode_video, visual))
@@ -329,10 +336,10 @@ def generate_until(self, requests) -> List[str]:
                     imgs.append(task.result())
 
             messages = [{"role": "user", "content": []}]
-            # When there is no image token in the context, append the image to the text
-            messages[0]["content"].append({"type": "text", "text": contexts})
+            # Add images first, then text
             for img in self.flatten(imgs):
                 messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
+            messages[0]["content"].append({"type": "text", "text": contexts})
 
             batched_messages.append(messages)
 
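A minimal launch sketch for the new data-parallel path, mirroring the docstring
examples above (the process count, task, and model choice are illustrative):

    python3 -m accelerate.commands.launch --num_processes=8 -m lmms_eval \
        --model vllm \
        --model_args model=Qwen/Qwen2.5-VL-3B-Instruct,data_parallel_size=8,tensor_parallel_size=1 \
        --tasks mme \
        --batch_size 1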