2 changes: 1 addition & 1 deletion examples/models/vllm_qwen2vl.sh
@@ -10,7 +10,7 @@ export NCCL_DEBUG=DEBUG

python3 -m lmms_eval \
--model vllm \
--model_args model_version=Qwen/Qwen2-VL-7B-Instruct,tensor_parallel_size=4 \
--model_args model=Qwen/Qwen2-VL-7B-Instruct,tensor_parallel_size=4 \
--tasks mme,gsm8k_cot_self_consistency,mmmu_val \
--batch_size 64 \
--log_samples \
36 changes: 20 additions & 16 deletions lmms_eval/models/llava_onevision.py
@@ -518,18 +518,6 @@ def _collate(x):
prompt_question = conv.get_prompt()
question_input.append(prompt_question)

# preconfigure gen_kwargs with defaults
if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024
if "temperature" not in gen_kwargs:
gen_kwargs["temperature"] = 0
if "do_sample" not in gen_kwargs:
gen_kwargs["do_sample"] = False
if "top_p" not in gen_kwargs:
gen_kwargs["top_p"] = None
if "num_beams" not in gen_kwargs:
gen_kwargs["num_beams"] = 1

input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input]
pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device)
@@ -548,8 +536,18 @@ def _collate(x):

# These steps are not in LLaVA's original code, but are necessary for generation to work
# TODO: attention to this major generation step...
# preconfigure gen_kwargs with defaults
if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024

if "image_aspect_ratio" in gen_kwargs.keys():
gen_kwargs.pop("image_aspect_ratio")
# When do_sample=False, remove sampling-related parameters to avoid warnings
# These might be in gen_kwargs or in the model's generation_config
if not gen_kwargs.get("do_sample", False):
gen_kwargs.pop("temperature", None)
gen_kwargs.pop("top_p", None)
gen_kwargs.pop("top_k", None)
try:
with torch.inference_mode():
cont = self.model.generate(input_ids, attention_mask=attention_masks, pad_token_id=pad_token_ids, images=image_tensor, use_cache=self.use_cache, **gen_kwargs)
@@ -732,12 +730,14 @@ def _collate(x):
# preconfigure gen_kwargs with defaults
if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024
if "temperature" not in gen_kwargs:
gen_kwargs["temperature"] = 0
if "do_sample" not in gen_kwargs:
gen_kwargs["do_sample"] = False
if "top_p" not in gen_kwargs:
gen_kwargs["top_p"] = None
# Only set temperature and top_p if do_sample is True
if gen_kwargs.get("do_sample", False):
if "temperature" not in gen_kwargs:
gen_kwargs["temperature"] = 1.0 # Default temperature for sampling
if "top_p" not in gen_kwargs:
gen_kwargs["top_p"] = 1.0 # Default top_p for sampling
if "num_beams" not in gen_kwargs:
gen_kwargs["num_beams"] = 1

@@ -761,6 +761,10 @@ def _collate(x):
# TODO: attention to this major generation step...
if "image_aspect_ratio" in gen_kwargs.keys():
gen_kwargs.pop("image_aspect_ratio")
# Remove temperature and top_p when do_sample=False to avoid warnings
if not gen_kwargs.get("do_sample", False):
gen_kwargs.pop("temperature", None)
gen_kwargs.pop("top_p", None)
try:
with torch.inference_mode():
cont = self.model.generate(input_ids, attention_mask=attention_masks, pad_token_id=pad_token_ids, images=image_tensor, use_cache=self.use_cache, **gen_kwargs)
59 changes: 30 additions & 29 deletions lmms_eval/models/vllm.py
@@ -20,7 +20,8 @@
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model

NUM_SECONDS_TO_SLEEP = 5
NUM_SECONDS_TO_SLEEP = int(os.getenv("NUM_SECONDS_TO_SLEEP", "5"))
WORKERS = int(os.getenv("WORKERS", "32"))

try:
from vllm import LLM, SamplingParams
@@ -50,7 +51,7 @@ class VLLM(lmms):
- VLLM chat method: https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat

Args:
model_version (str): HuggingFace model identifier or path to the model.
model (str): HuggingFace model identifier or path to the model.
Default: "Qwen/Qwen2.5-VL-3B-Instruct"
tensor_parallel_size (int): Number of GPUs to use for tensor parallelism.
Default: 1
@@ -81,7 +82,7 @@ class VLLM(lmms):
"--model",
"vllm",
"--model_args",
"model_version=meta-llama/Llama-4-Scout-17B-16E-Instruct,"
"model=meta-llama/Llama-4-Scout-17B-16E-Instruct,"
"tensor_parallel_size=4,"
"dtype=bfloat16,"
"max_model_len=10240,"
@@ -118,7 +119,7 @@ class VLLM(lmms):
"--model",
"vllm",
"--model_args",
"model_version=deepseek-ai/deepseek-vl2,"
"model=deepseek-ai/deepseek-vl2,"
'hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},' # example of passing model specific arguments, JSON string will be parsed automatically
f"chat_template={chat_template_file}," # chat template file path
"tensor_parallel_size=2,"
@@ -145,12 +146,12 @@

def __init__(
self,
model_version: str = "Qwen/Qwen2.5-VL-3B-Instruct",
model: str = "Qwen/Qwen2.5-VL-3B-Instruct",
tensor_parallel_size: int = 1,
data_parallel_size: int = 1,
gpu_memory_utilization: float = 0.8,
batch_size: int = 1,
max_frame_num: int = 32,
threads: int = 16, # Threads to use for decoding visuals
trust_remote_code: Optional[bool] = True,
chat_template: Optional[str] = None,
min_image_pixels: int = 28, # minimum image dimension, required for Qwen 2/2.5-VL models
@@ -160,12 +161,13 @@ def __init__(
# Manually set a image token for GPT4V so that we can search for it
# and split the text and image
# Here we just use the same token as llava for convenient
self.model_version = model_version
self.model = model
self.max_frame_num = max_frame_num
self.threads = threads
self.chat_template = chat_template
self.min_image_pixels = min_image_pixels
self.data_parallel_size = data_parallel_size
# Qwen 2/2.5-VL models enforce minimum image dimensions
self._enforce_image_resize = self._is_qwen_vl_model(model_version)
self._enforce_image_resize = self._is_qwen_vl_model(model)

# Load chat template during initialization
self.chat_template = None
Comment on lines +166 to 173

⚠️ Potential issue

Bug: Duplicate and conflicting chat_template initialization.

The self.chat_template attribute is assigned the parameter value on line 166 but then immediately set to None on line 173, discarding the initial assignment. This appears to be an error.

Remove the redundant assignment:

         self.model = model
         self.max_frame_num = max_frame_num
-        self.chat_template = chat_template
         self.min_image_pixels = min_image_pixels
         self.data_parallel_size = data_parallel_size
         # Qwen 2/2.5-VL models enforce minimum image dimensions
         self._enforce_image_resize = self._is_qwen_vl_model(model)
 
         # Load chat template during initialization
         self.chat_template = None

@@ -191,13 +193,6 @@ def __init__(

# Set up vllm client
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
self.client = LLM(
model=self.model_version,
tensor_parallel_size=tensor_parallel_size,
gpu_memory_utilization=gpu_memory_utilization,
trust_remote_code=trust_remote_code,
**kwargs,
)

accelerator = Accelerator()
if accelerator.num_processes > 1:
@@ -211,13 +206,27 @@ def __init__(
self.accelerator = accelerator
self._rank = self.accelerator.local_process_index
self._world_size = self.accelerator.num_processes
# TODO: Support tensor parallelism in the future for flexible vllm parallel
if data_parallel_size > 1:
assert tensor_parallel_size == 1, "Data parallelism is not supported with tensor parallelism. For current vllm version"
if accelerator.num_processes > 1:
kwargs["distributed_executor_backend"] = "external_launcher"
self.client = LLM(
model=self.model,
tensor_parallel_size=tensor_parallel_size,
gpu_memory_utilization=gpu_memory_utilization,
trust_remote_code=trust_remote_code,
disable_log_stats=False,
seed=1,
**kwargs,
)

self.device = self.accelerator.device
self.batch_size_per_gpu = int(batch_size)

def _is_qwen_vl_model(self, model_version: str) -> bool:
def _is_qwen_vl_model(self, model: str) -> bool:
qwen_vl_patterns = ["qwen2-vl", "qwen2.5-vl"]
return any(pattern in model_version.lower() for pattern in qwen_vl_patterns)
return any(pattern in model.lower() for pattern in qwen_vl_patterns)

def _maybe_resize_image(self, img: Image.Image) -> Image.Image:
# edge‐case validation
@@ -294,17 +303,9 @@ def generate_until(self, requests) -> List[str]:
contexts, gen_kwargs, doc_to_visual, doc_id, task, split = batch_requests[idx].arguments
if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024
if gen_kwargs["max_new_tokens"] > 4096:
gen_kwargs["max_new_tokens"] = 4096
if "temperature" not in gen_kwargs:
gen_kwargs["temperature"] = 0
if "top_p" not in gen_kwargs:
gen_kwargs["top_p"] = 0.95

params = {
"temperature": gen_kwargs["temperature"],
"max_tokens": gen_kwargs["max_new_tokens"],
"top_p": gen_kwargs["top_p"],
}
sampling_params = SamplingParams(**params)

@@ -316,7 +317,7 @@ def generate_until(self, requests) -> List[str]:
visuals = self.flatten(visuals)
imgs = [] # multiple images or frames for video
all_tasks = []
with ThreadPoolExecutor(max_workers=self.threads) as executor:
with ThreadPoolExecutor(max_workers=WORKERS) as executor:
for visual in visuals:
if isinstance(visual, str) and (".mp4" in visual or ".avi" in visual or ".mov" in visual or ".flv" in visual or ".wmv" in visual):
all_tasks.append(executor.submit(self.encode_video, visual))
@@ -329,10 +330,10 @@ def generate_until(self, requests) -> List[str]:
imgs.append(task.result())

messages = [{"role": "user", "content": []}]
# When there is no image token in the context, append the image to the text
messages[0]["content"].append({"type": "text", "text": contexts})
# Add images first, then text
for img in self.flatten(imgs):
messages[0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}})
messages[0]["content"].append({"type": "text", "text": contexts})

batched_messages.append(messages)

5 changes: 0 additions & 5 deletions lmms_eval/tasks/k12/k12.yaml
@@ -12,11 +12,6 @@ generation_kwargs:
until:
- "ASSISTANT:"
max_new_tokens: 16384
temperature: 0
top_p: 0.95
num_beams: 1
do_sample: false
repetition_penalty: 1.2
process_results: !function utils.k12_process_results
metric_list:
- metric: llm_as_judge_eval
45 changes: 12 additions & 33 deletions lmms_eval/tasks/mathverse/mathverse_evals.py
@@ -4,6 +4,7 @@
import pandas as pd
import requests
from loguru import logger as eval_logger
from openai import AzureOpenAI, OpenAI
from tqdm import tqdm

DEMO_PROMPT_EXTRACT = """
@@ -74,67 +75,45 @@

class MathVerseEvaluator:
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json",
}
client = OpenAI(api_key=API_KEY, base_url=API_URL.rstrip("chat/completions"))

⚠️ Potential issue

Incorrect use of .rstrip() with multi-character string.

The .rstrip("chat/completions") call removes any combination of the characters in "chat/completions" from the end, not the exact string. This could lead to unexpected behavior.
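
As a quick illustration of the difference (the gateway URL below is hypothetical, not the default OpenAI endpoint):

    # rstrip() strips any trailing characters drawn from the given set, not the exact suffix:
    "https://gateway.com/openai/chat/completions".rstrip("chat/completions")
    # -> 'https://gateway.'   (it also consumed "openai", "com", and the slashes)

    # removesuffix() (Python 3.9+) removes only the exact trailing string:
    "https://gateway.com/openai/chat/completions".removesuffix("/chat/completions")
    # -> 'https://gateway.com/openai'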

Use .removesuffix() for exact string removal (Python 3.9+):

-        client = OpenAI(api_key=API_KEY, base_url=API_URL.rstrip("chat/completions"))
+        client = OpenAI(api_key=API_KEY, base_url=API_URL.removesuffix("/chat/completions"))

Or for older Python versions:

-        client = OpenAI(api_key=API_KEY, base_url=API_URL.rstrip("chat/completions"))
+        base_url = API_URL[:-len("/chat/completions")] if API_URL.endswith("/chat/completions") else API_URL
+        client = OpenAI(api_key=API_KEY, base_url=base_url)
🧰 Tools
🪛 Ruff (0.12.2)

85-85: Using .strip() with multi-character strings is misleading

(B005)



elif API_TYPE == "azure":
API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
headers = {
"api-key": API_KEY,
"Content-Type": "application/json",
}
API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")
client = AzureOpenAI(azure_endpoint=API_URL, api_version=API_VERSION, api_key=API_KEY)
gpt_model = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

def __init__(self, api_key, gpt_model="gpt-3.5-turbo", quick_extract=False):
self.api_key = api_key
self.gpt_model = gpt_model
def __init__(self, quick_extract=False):
self.quick_extract = quick_extract

def _post_request(self, payload):
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
response = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
response.raise_for_status()
return response.json()

def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=10000000, sleep_time=0):
def get_chat_response(self, prompt, temperature=0, max_tokens=256, n=1, patience=5, sleep_time=0):
messages = [
{"role": "user", "content": prompt},
]
payload = {"model": self.gpt_model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, "n": n}
payload = {"model": self.gpt_model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens}

Bug: API Type Mismatch Causes Undefined Attribute

The gpt_model attribute is no longer set in the __init__ method and is only defined at the class level for the Azure API type. Consequently, when using the OpenAI API, self.gpt_model is undefined, leading to an AttributeError when accessed in the get_chat_response method.
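
One way to address this, shown here only as an illustrative sketch rather than the PR's actual change, is to hoist the gpt_model lookup above the API_TYPE branch so both client setups define it; the sketch also folds in the removesuffix fix suggested in the earlier comment:

    import os
    from openai import AzureOpenAI, OpenAI

    class MathVerseEvaluator:
        API_TYPE = os.getenv("API_TYPE", "openai")
        # Resolve the judge model once, before the branch, so self.gpt_model
        # exists for both the OpenAI and Azure code paths.
        gpt_model = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

        if API_TYPE == "openai":
            API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
            API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
            client = OpenAI(api_key=API_KEY, base_url=API_URL.removesuffix("/chat/completions"))
        elif API_TYPE == "azure":
            API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
            API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
            API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")
            client = AzureOpenAI(azure_endpoint=API_URL, api_version=API_VERSION, api_key=API_KEY)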



while patience > 0:
patience -= 1
try:
response = self._post_request(payload)
response = self.client.chat.completions.create(**payload)
if n == 1:
prediction = response["choices"][0]["message"]["content"].strip()
prediction = response.choices[0].message.content.strip()
if prediction and prediction != "":
return prediction
else:
prediction = [choice["message"]["content"].strip() for choice in response["choices"]]
prediction = [choice.message.content.strip() for choice in response.choices]
if prediction and prediction[0] != "":
return prediction

except Exception as e:
# some model may output repetitive answer, which ChatGPT will throw an error.
if "repetitive patterns" in str(e):
print(str(e))
print("Continue with empty answer")
return ""
# some answer may contain some sensitive words, like 'test'
if "sensitive" in str(e) or "400" in str(e):
print(str(e))
print("Continue with empty answer")
return "0"

if "Rate limit" not in str(e):
eval_logger.error(e)

12 changes: 2 additions & 10 deletions lmms_eval/tasks/mathverse/mathverse_testmini.yaml
@@ -11,11 +11,6 @@ doc_to_text: !function utils.mathverse_doc_to_text
doc_to_target: "answer"
generation_kwargs:
max_new_tokens: 16384
temperature: 0.0
do_sample: true
top_p: 0.95
top_k: 50
repetition_penalty: 1.2
until:
- "</s>"
- "Q:"
@@ -25,14 +20,11 @@ metric_list:
- metric: gpt_eval_score
aggregation: !function utils.mathverse_aggregate_results_eval
higher_is_better: true
- metric: submission
aggregation: !function utils.mathverse_aggregate_results_submission
higher_is_better: true


lmms_eval_specific_kwargs:
default:
shot_type: "format-prompt" # can also be "custom-prompt"
query_type: "query_wo" # now only support query_wo
model_specific_generation_kwargs:
llava:
image_aspect_ratio: original
image_aspect_ratio: original
16 changes: 1 addition & 15 deletions lmms_eval/tasks/mathverse/utils.py
@@ -19,7 +19,7 @@

config = yaml.safe_load("".join(safe_data))

mathverse_evaluator = MathVerseEvaluator(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"), gpt_model=config["metadata"]["gpt_eval_model_name"])
mathverse_evaluator = MathVerseEvaluator()


def mathverse_doc_to_visual(doc):
@@ -82,21 +82,7 @@ def mathverse_aggregate_results_submission(results, args, *, calculate_gain=False
def mathverse_aggregate_results_eval(results, args, *, calculate_gain=False, random_scores=None):
split_flag = results[0]["metadata"]["split"]
problem_version = results[0]["problem_version"].lower().replace(" ", "_")
# save the result first, in case the gpt evaluation fails
path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
with open(path, "w") as f:
json.dump(results, f, indent=4)
# gpt evaluation
results_dict, scores = mathverse_evaluator.eval_results(results, config)
# save results
path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_results.json", args)
with open(path, "w") as f:
json.dump(results_dict, f, indent=4)
# save scores
path = generate_submission_file(f"mathverse_{split_flag}_{problem_version}_scores.json", args)
with open(path, "w") as f:
json.dump(scores, f, indent=4)
eval_logger.info(f"Saved scores to {path}")
if scores["average"]["accuracy"] == 0:
return None
return scores["average"]["accuracy"]