
Commit 4994927

Luodian and brian.li authored
[Feat] fix tasks and vllm to reproduce better results. (#774)
* modify gpt evaluation model name retrieval
* [Enhancement] Improve score processing and validation in mia_bench
* [Refactor] Update MathVerseEvaluator initialization and API key handling
* remove sampling-related parameters to avoid warnings
* fix llm judge with azure
* Refactor mathverse result aggregation by removing redundant file saving steps during evaluation
* Remove unused submission metric from mathverse_testmini.yaml configuration
* Remove unused generation parameters from mmmu_val_thinking.yaml to streamline configuration
* Refactor mathvista result aggregation by removing file saving steps from evaluation process
* [Refactor] Update VLLM model initialization and configuration parameters
* [Refactor] Update model argument naming in VLLM configuration
* [Refactor] Clean up whitespace and improve regex patterns in score processing
* feat(vllm): Add default temperature and top_p to generation params

Co-authored-by: brian.li <[email protected]>
1 parent 8895505 commit 4994927

18 files changed: +258 additions, -174 deletions

examples/models/vllm_qwen2vl.sh

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ export NCCL_DEBUG=DEBUG
  python3 -m lmms_eval \
      --model vllm \
-     --model_args model_version=Qwen/Qwen2-VL-7B-Instruct,tensor_parallel_size=4 \
+     --model_args model=Qwen/Qwen2-VL-7B-Instruct,tensor_parallel_size=4 \
      --tasks mme,gsm8k_cot_self_consistency,mmmu_val \
      --batch_size 64 \
      --log_samples \

lmms_eval/models/llava_onevision.py

Lines changed: 20 additions & 16 deletions
@@ -518,18 +518,6 @@ def _collate(x):
  prompt_question = conv.get_prompt()
  question_input.append(prompt_question)

- # preconfigure gen_kwargs with defaults
- if "max_new_tokens" not in gen_kwargs:
-     gen_kwargs["max_new_tokens"] = 1024
- if "temperature" not in gen_kwargs:
-     gen_kwargs["temperature"] = 0
- if "do_sample" not in gen_kwargs:
-     gen_kwargs["do_sample"] = False
- if "top_p" not in gen_kwargs:
-     gen_kwargs["top_p"] = None
- if "num_beams" not in gen_kwargs:
-     gen_kwargs["num_beams"] = 1
-
  input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input]
  pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
  input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device)
@@ -548,8 +536,18 @@ def _collate(x):
  # These steps are not in LLaVA's original code, but are necessary for generation to work
  # TODO: attention to this major generation step...
+ # preconfigure gen_kwargs with defaults
+ if "max_new_tokens" not in gen_kwargs:
+     gen_kwargs["max_new_tokens"] = 1024
+
  if "image_aspect_ratio" in gen_kwargs.keys():
      gen_kwargs.pop("image_aspect_ratio")
+ # When do_sample=False, remove sampling-related parameters to avoid warnings
+ # These might be in gen_kwargs or in the model's generation_config
+ if not gen_kwargs.get("do_sample", False):
+     gen_kwargs.pop("temperature", None)
+     gen_kwargs.pop("top_p", None)
+     gen_kwargs.pop("top_k", None)
  try:
      with torch.inference_mode():
          cont = self.model.generate(input_ids, attention_mask=attention_masks, pad_token_id=pad_token_ids, images=image_tensor, use_cache=self.use_cache, **gen_kwargs)
@@ -732,12 +730,14 @@ def _collate(x):
  # preconfigure gen_kwargs with defaults
  if "max_new_tokens" not in gen_kwargs:
      gen_kwargs["max_new_tokens"] = 1024
- if "temperature" not in gen_kwargs:
-     gen_kwargs["temperature"] = 0
  if "do_sample" not in gen_kwargs:
      gen_kwargs["do_sample"] = False
- if "top_p" not in gen_kwargs:
-     gen_kwargs["top_p"] = None
+ # Only set temperature and top_p if do_sample is True
+ if gen_kwargs.get("do_sample", False):
+     if "temperature" not in gen_kwargs:
+         gen_kwargs["temperature"] = 1.0  # Default temperature for sampling
+     if "top_p" not in gen_kwargs:
+         gen_kwargs["top_p"] = 1.0  # Default top_p for sampling
  if "num_beams" not in gen_kwargs:
      gen_kwargs["num_beams"] = 1

@@ -761,6 +761,10 @@ def _collate(x):
  # TODO: attention to this major generation step...
  if "image_aspect_ratio" in gen_kwargs.keys():
      gen_kwargs.pop("image_aspect_ratio")
+ # Remove temperature and top_p when do_sample=False to avoid warnings
+ if not gen_kwargs.get("do_sample", False):
+     gen_kwargs.pop("temperature", None)
+     gen_kwargs.pop("top_p", None)
  try:
      with torch.inference_mode():
          cont = self.model.generate(input_ids, attention_mask=attention_masks, pad_token_id=pad_token_ids, images=image_tensor, use_cache=self.use_cache, **gen_kwargs)
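The net effect of these hunks is one rule: max_new_tokens and num_beams still get defaults, but the sampling knobs (temperature, top_p, top_k) are only kept when do_sample is requested, so HuggingFace generate() no longer warns about unused sampling parameters under greedy decoding. A minimal standalone sketch of that filtering follows; the helper name and the example dict are illustrative, not part of the repository:

# Minimal sketch of the gen_kwargs filtering above (hypothetical helper, not in the repo).
def sanitize_gen_kwargs(gen_kwargs):
    gen_kwargs.setdefault("max_new_tokens", 1024)
    gen_kwargs.setdefault("do_sample", False)
    gen_kwargs.setdefault("num_beams", 1)
    gen_kwargs.pop("image_aspect_ratio", None)  # not a generate() argument
    if not gen_kwargs["do_sample"]:
        # Greedy decoding: drop sampling-only knobs so transformers does not warn.
        for key in ("temperature", "top_p", "top_k"):
            gen_kwargs.pop(key, None)
    return gen_kwargs

print(sanitize_gen_kwargs({"temperature": 0, "top_p": 0.95}))
# -> {'max_new_tokens': 1024, 'do_sample': False, 'num_beams': 1}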

lmms_eval/models/vllm.py

Lines changed: 31 additions & 24 deletions
@@ -20,7 +20,8 @@
  from lmms_eval.api.model import lmms
  from lmms_eval.api.registry import register_model

- NUM_SECONDS_TO_SLEEP = 5
+ NUM_SECONDS_TO_SLEEP = int(os.getenv("NUM_SECONDS_TO_SLEEP", "5"))
+ WORKERS = int(os.getenv("WORKERS", "32"))

  try:
      from vllm import LLM, SamplingParams
@@ -50,7 +51,7 @@ class VLLM(lmms):
  - VLLM chat method: https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat

  Args:
-     model_version (str): HuggingFace model identifier or path to the model.
+     model (str): HuggingFace model identifier or path to the model.
          Default: "Qwen/Qwen2.5-VL-3B-Instruct"
      tensor_parallel_size (int): Number of GPUs to use for tensor parallelism.
          Default: 1
@@ -81,7 +82,7 @@ class VLLM(lmms):
  "--model",
  "vllm",
  "--model_args",
- "model_version=meta-llama/Llama-4-Scout-17B-16E-Instruct,"
+ "model=meta-llama/Llama-4-Scout-17B-16E-Instruct,"
  "tensor_parallel_size=4,"
  "dtype=bfloat16,"
  "max_model_len=10240,"
@@ -118,7 +119,7 @@ class VLLM(lmms):
  "--model",
  "vllm",
  "--model_args",
- "model_version=deepseek-ai/deepseek-vl2,"
+ "model=deepseek-ai/deepseek-vl2,"
  'hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},'  # example of passing model specific arguments, JSON string will be parsed automatically
  f"chat_template={chat_template_file},"  # chat template file path
  "tensor_parallel_size=2,"
@@ -145,12 +146,12 @@ class VLLM(lmms):
  def __init__(
      self,
-     model_version: str = "Qwen/Qwen2.5-VL-3B-Instruct",
+     model: str = "Qwen/Qwen2.5-VL-3B-Instruct",
      tensor_parallel_size: int = 1,
+     data_parallel_size: int = 1,
      gpu_memory_utilization: float = 0.8,
      batch_size: int = 1,
      max_frame_num: int = 32,
-     threads: int = 16,  # Threads to use for decoding visuals
      trust_remote_code: Optional[bool] = True,
      chat_template: Optional[str] = None,
      min_image_pixels: int = 28,  # minimum image dimension, required for Qwen 2/2.5-VL models
@@ -160,12 +161,13 @@ def __init__(
  # Manually set a image token for GPT4V so that we can search for it
  # and split the text and image
  # Here we just use the same token as llava for convenient
- self.model_version = model_version
+ self.model = model
  self.max_frame_num = max_frame_num
- self.threads = threads
+ self.chat_template = chat_template
  self.min_image_pixels = min_image_pixels
+ self.data_parallel_size = data_parallel_size
  # Qwen 2/2.5-VL models enforce minimum image dimensions
- self._enforce_image_resize = self._is_qwen_vl_model(model_version)
+ self._enforce_image_resize = self._is_qwen_vl_model(model)

  # Load chat template during initialization
  self.chat_template = None
@@ -191,13 +193,6 @@ def __init__(
  # Set up vllm client
  os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
- self.client = LLM(
-     model=self.model_version,
-     tensor_parallel_size=tensor_parallel_size,
-     gpu_memory_utilization=gpu_memory_utilization,
-     trust_remote_code=trust_remote_code,
-     **kwargs,
- )

  accelerator = Accelerator()
  if accelerator.num_processes > 1:
@@ -211,13 +206,27 @@ def __init__(
  self.accelerator = accelerator
  self._rank = self.accelerator.local_process_index
  self._world_size = self.accelerator.num_processes
+ # TODO: Support tensor parallelism in the future for flexible vllm parallel
+ if data_parallel_size > 1:
+     assert tensor_parallel_size == 1, "Data parallelism is not supported with tensor parallelism. For current vllm version"
+     if accelerator.num_processes > 1:
+         kwargs["distributed_executor_backend"] = "external_launcher"
+ self.client = LLM(
+     model=self.model,
+     tensor_parallel_size=tensor_parallel_size,
+     gpu_memory_utilization=gpu_memory_utilization,
+     trust_remote_code=trust_remote_code,
+     disable_log_stats=False,
+     seed=1,
+     **kwargs,
+ )

  self.device = self.accelerator.device
  self.batch_size_per_gpu = int(batch_size)

- def _is_qwen_vl_model(self, model_version: str) -> bool:
+ def _is_qwen_vl_model(self, model: str) -> bool:
      qwen_vl_patterns = ["qwen2-vl", "qwen2.5-vl"]
-     return any(pattern in model_version.lower() for pattern in qwen_vl_patterns)
+     return any(pattern in model.lower() for pattern in qwen_vl_patterns)

  def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
      # edge‐case validation
@@ -294,16 +303,14 @@ def generate_until(self, requests) -> List[str]:
  contexts, gen_kwargs, doc_to_visual, doc_id, task, split = batch_requests[idx].arguments
  if "max_new_tokens" not in gen_kwargs:
      gen_kwargs["max_new_tokens"] = 1024
- if gen_kwargs["max_new_tokens"] > 4096:
-     gen_kwargs["max_new_tokens"] = 4096
  if "temperature" not in gen_kwargs:
      gen_kwargs["temperature"] = 0
  if "top_p" not in gen_kwargs:
      gen_kwargs["top_p"] = 0.95

  params = {
-     "temperature": gen_kwargs["temperature"],
      "max_tokens": gen_kwargs["max_new_tokens"],
+     "temperature": gen_kwargs["temperature"],
      "top_p": gen_kwargs["top_p"],
  }
  sampling_params = SamplingParams(**params)
@@ -316,7 +323,7 @@ def generate_until(self, requests) -> List[str]:
  visuals = self.flatten(visuals)
  imgs = []  # multiple images or frames for video
  all_tasks = []
- with ThreadPoolExecutor(max_workers=self.threads) as executor:
+ with ThreadPoolExecutor(max_workers=WORKERS) as executor:
      for visual in visuals:
          if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
              all_tasks.append(executor.submit(self.encode_video, visual))
@@ -329,10 +336,10 @@ def generate_until(self, requests) -> List[str]:
          imgs.append(task.result())

  messages = [{"role": "user", "content": []}]
- # When there is no image token in the context, append the image to the text
- messages[0]["content"].append({"type": "text", "text": contexts})
+ # Add images first, then text
  for img in self.flatten(imgs):
      messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
+ messages[0]["content"].append({"type": "text", "text": contexts})

  batched_messages.append(messages)
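In summary, the vllm backend now takes model (instead of model_version), reads NUM_SECONDS_TO_SLEEP and WORKERS from the environment (defaults 5 and 32), no longer clamps max_new_tokens to 4096, and builds chat messages with images before the text prompt. A condensed, standalone sketch of that request-building path under those assumptions; the helper below is illustrative, not the actual class method:

# Standalone sketch of the request-building path after this commit (illustrative helper).
import os

NUM_SECONDS_TO_SLEEP = int(os.getenv("NUM_SECONDS_TO_SLEEP", "5"))
WORKERS = int(os.getenv("WORKERS", "32"))  # max_workers for decoding visuals

def build_request(contexts, b64_images, gen_kwargs):
    # Defaults mirror the diff: no 4096 clamp, temperature 0, top_p 0.95.
    gen_kwargs.setdefault("max_new_tokens", 1024)
    gen_kwargs.setdefault("temperature", 0)
    gen_kwargs.setdefault("top_p", 0.95)
    # These keys map onto vllm.SamplingParams(max_tokens=..., temperature=..., top_p=...).
    params = {
        "max_tokens": gen_kwargs["max_new_tokens"],
        "temperature": gen_kwargs["temperature"],
        "top_p": gen_kwargs["top_p"],
    }
    # Images first, then the text prompt, matching the reordered message build above.
    content = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}} for img in b64_images]
    content.append({"type": "text", "text": contexts})
    return [{"role": "user", "content": content}], params

messages, params = build_request("Describe the image.", ["<base64-png>"], {})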

lmms_eval/tasks/k12/k12.yaml

Lines changed: 0 additions & 5 deletions
@@ -12,11 +12,6 @@ generation_kwargs:
    until:
      - "ASSISTANT:"
    max_new_tokens: 16384
-   temperature: 0
-   top_p: 0.95
-   num_beams: 1
-   do_sample: false
-   repetition_penalty: 1.2
  process_results: !function utils.k12_process_results
  metric_list:
    - metric: llm_as_judge_eval

lmms_eval/tasks/mathverse/mathverse_evals.py

Lines changed: 12 additions & 33 deletions
@@ -4,6 +4,7 @@
  import pandas as pd
  import requests
  from loguru import logger as eval_logger
+ from openai import AzureOpenAI, OpenAI
  from tqdm import tqdm

  DEMO_PROMPT_EXTRACT = """
@@ -74,67 +75,45 @@
  class MathVerseEvaluator:
      API_TYPE = os.getenv("API_TYPE", "openai")
-
      if API_TYPE == "openai":
          API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
          API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
          headers = {
              "Authorization": f"Bearer {API_KEY}",
              "Content-Type": "application/json",
          }
+         client = OpenAI(api_key=API_KEY, base_url=API_URL.rstrip("chat/completions"))
+
      elif API_TYPE == "azure":
          API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
          API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-         headers = {
-             "api-key": API_KEY,
-             "Content-Type": "application/json",
-         }
+         API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")
+         client = AzureOpenAI(azure_endpoint=API_URL, api_version=API_VERSION, api_key=API_KEY)
+     gpt_model = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

-     def __init__(self, api_key, gpt_model="gpt-3.5-turbo", quick_extract=False):
-         self.api_key = api_key
-         self.gpt_model = gpt_model
+     def __init__(self, quick_extract=False):
          self.quick_extract = quick_extract

-     def _post_request(self, payload):
-         headers = {
-             "Authorization": f"Bearer {self.api_key}",
-             "Content-Type": "application/json",
-         }
-         response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
-         response.raise_for_status()
-         return response.json()
-
-     def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0):
+     def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=5, sleep_time=0):
          messages = [
              {"role": "user", "content": prompt},
          ]
-         payload = {"model": self.gpt_model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "n": n}
+         payload = {"model": self.gpt_model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens}

          while patience > 0:
              patience -= 1
              try:
-                 response = self._post_request(payload)
+                 response = self.client.chat.completions.create(**payload)
                  if n == 1:
-                     prediction = response["choices"][0]["message"]["content"].strip()
+                     prediction = response.choices[0].message.content.strip()
                      if prediction and prediction != "":
                          return prediction
                  else:
-                     prediction = [choice["message"]["content"].strip() for choice in response["choices"]]
+                     prediction = [choice.message.content.strip() for choice in response.choices]
                      if prediction and prediction[0] != "":
                          return prediction

              except Exception as e:
-                 # some model may output repetitive answer, which ChatGPT will throw an error.
-                 if "repetitive patterns" in str(e):
-                     print(str(e))
-                     print("Continue with empty answer")
-                     return ""
-                 # some answer may contain some sensitive words, like 'test'
-                 if "sensitive" in str(e) or "400" in str(e):
-                     print(str(e))
-                     print("Continue with empty answer")
-                     return "0"
-
                  if "Rate limit" not in str(e):
                      eval_logger.error(e)
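The judge now goes through the official OpenAI/Azure SDK clients instead of raw requests.post, with the model name taken from MODEL_VERSION and retries capped at 5. A hedged sketch of the same client selection and retry loop, assuming the openai>=1.x SDK and the environment variables used above; the function names are mine, and removesuffix stands in for the rstrip call in the diff:

# Sketch of the env-driven judge client (assumes the openai>=1.x SDK; helper names are illustrative).
import os
from openai import AzureOpenAI, OpenAI

def make_judge_client():
    if os.getenv("API_TYPE", "openai") == "azure":
        return AzureOpenAI(
            azure_endpoint=os.getenv("AZURE_ENDPOINT", ""),
            api_version=os.getenv("AZURE_API_VERSION", "2023-07-01-preview"),
            api_key=os.getenv("AZURE_API_KEY", "YOUR_API_KEY"),
        )
    base_url = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions").removesuffix("/chat/completions")
    return OpenAI(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"), base_url=base_url)

def judge(prompt, model=os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20"), patience=5):
    client = make_judge_client()
    for _ in range(patience):
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=256,
            )
            text = (resp.choices[0].message.content or "").strip()
            if text:
                return text
        except Exception as exc:  # log non-rate-limit errors, then retry
            if "Rate limit" not in str(exc):
                print(exc)
    return ""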

lmms_eval/tasks/mathverse/mathverse_testmini.yaml

Lines changed: 2 additions & 10 deletions
@@ -11,11 +11,6 @@ doc_to_text: !function utils.mathverse_doc_to_text
  doc_to_target: "answer"
  generation_kwargs:
    max_new_tokens: 16384
-   temperature: 0.0
-   do_sample: true
-   top_p: 0.95
-   top_k: 50
-   repetition_penalty: 1.2
    until:
      - "</s>"
      - "Q:"
@@ -25,14 +20,11 @@ metric_list:
    - metric: gpt_eval_score
      aggregation: !function utils.mathverse_aggregate_results_eval
      higher_is_better: true
-   - metric: submission
-     aggregation: !function utils.mathverse_aggregate_results_submission
-     higher_is_better: true
-
+
  lmms_eval_specific_kwargs:
    default:
      shot_type: "format-prompt" # can also be "custom-prompt"
      query_type: "query_wo" # now only support query_wo
  model_specific_generation_kwargs:
    llava:
-     image_aspect_ratio: original
+     image_aspect_ratio: original

lmms_eval/tasks/mathverse/utils.py

Lines changed: 1 addition & 15 deletions
@@ -19,7 +19,7 @@
  config = yaml.safe_load("".join(safe_data))

- mathverse_evaluator = MathVerseEvaluator(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"), gpt_model=config["metadata"]["gpt_eval_model_name"])
+ mathverse_evaluator = MathVerseEvaluator()


  def mathverse_doc_to_visual(doc):
@@ -82,21 +82,7 @@ def mathverse_aggregate_results_submission(results, args, *, calculate_gain=Fals
  def mathverse_aggregate_results_eval(results, args, *, calculate_gain=False, random_scores=None):
      split_flag = results[0]["metadata"]["split"]
      problem_version = results[0]["problem_version"].lower().replace(" ", "_")
-     # save the result first, in case the gpt evaluation fails
-     path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
-     with open(path, "w") as f:
-         json.dump(results, f, indent=4)
-     # gpt evaluation
      results_dict, scores = mathverse_evaluator.eval_results(results, config)
-     # save results
-     path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
-     with open(path, "w") as f:
-         json.dump(results_dict, f, indent=4)
-     # save scores
-     path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_scores.json", args)
-     with open(path, "w") as f:
-         json.dump(scores, f, indent=4)
-     eval_logger.info(f"Saved scores to {path}")
      if scores["average"]["accuracy"] == 0:
          return None
      return scores["average"]["accuracy"]
