Commit e591de0

Luodiankcz358 authored and committed
Add llm_as_judge_eval metric to multiple tasks and integrate llm_judge API for evaluation
1 parent af1efae commit e591de0

9 files changed: +217 -309 lines changed
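
All of the task files below drop their hand-rolled OpenAI/Azure client code in favor of the shared lmms_eval.llm_judge interface. A minimal sketch of the call pattern, pieced together from the diffs that follow (it requires lmms_eval and judge API credentials; ServerConfig fields beyond model_name and the full return schema of evaluate_binary are assumptions):

import os

from lmms_eval.llm_judge import ServerConfig, get_server

# Backend and model are read from the environment, as the task utils below do.
API_TYPE = os.getenv("API_TYPE", "openai")
GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

server = get_server(server_name=API_TYPE, config=ServerConfig(model_name=GPT_MODEL))

# Ask the judge whether a prediction matches the ground-truth answer.
result = server.evaluate_binary(
    question="What is 0.5 m in centimeters?",  # hypothetical sample
    answer="50 cm",
    prediction="0.5 m equals 50 cm.",
    output_format="yes/no",
    custom_prompt='Return only "Yes" or "No" with no additional text or formatting.',
)

# The task utils read result["success"] and result["result"] ("Yes"/"No" or "1"/"0").
score = 1 if result["success"] and str(result["result"]).lower() == "yes" else 0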

lmms_eval/tasks/mathverse/mathverse_testmini.yaml

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ generation_kwargs:
     - "<|im_end|>"
 process_results: !function utils.mathverse_process_results
 metric_list:
+  - metric: llm_as_judge_eval
+    aggregation: mean
+    higher_is_better: true
   - metric: gpt_eval_score
     aggregation: !function utils.mathverse_aggregate_results_eval
     higher_is_better: true

lmms_eval/tasks/mathverse/utils.py

Lines changed: 45 additions & 0 deletions
@@ -6,6 +6,7 @@
 import yaml
 from loguru import logger as eval_logger
 
+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 from lmms_eval.tasks.mathverse.mathverse_evals import MathVerseEvaluator
 
@@ -19,6 +20,15 @@
 
     config = yaml.safe_load("".join(safe_data))
 
+# Initialize the judge server
+API_TYPE = os.getenv("API_TYPE", "openai")
+GPT_MODEL = os.getenv("MODEL_VERSION", config["metadata"]["gpt_eval_model_name"])
+
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
+
 mathverse_evaluator = MathVerseEvaluator(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"), gpt_model=config["metadata"]["gpt_eval_model_name"])
 
 
@@ -45,6 +55,39 @@ def mathverse_doc_to_text(doc, lmms_eval_specific_kwargs=None):
 
 def mathverse_process_results(doc, results):
     prediction = results[0].strip()
+    question = doc["question_for_eval"]
+    answer = doc["answer"] if "answer" in doc else None
+
+    # Define custom prompt for MathVerse evaluation
+    custom_prompt = """Below are two answers to a math question. Determine whether these two answers are consistent.
+Please note that only when the Model Answer completely matches the Standard Answer means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
+
+Return only "Yes" if they are consistent or "No" if they are different.
+Only return "Yes" or "No" with no additional text or formatting."""
+
+    judge_result = 0
+    if answer is not None:
+        try:
+            # Use the llm_judge API for binary evaluation
+            result = server.evaluate_binary(
+                question=question,
+                answer=str(answer),
+                prediction=prediction,
+                output_format="yes/no",
+                custom_prompt=custom_prompt
+            )
+
+            # Parse the result
+            if result["success"]:
+                judge_response = result["result"]
+                judge_result = 1 if judge_response and judge_response.lower() == "yes" else 0
+            else:
+                eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
+                judge_result = 0
+
+        except Exception as e:
+            eval_logger.error(f"Error getting judge response: {e}")
+            judge_result = 0
 
     result = {
         "sample_index": doc["sample_index"],
@@ -58,9 +101,11 @@ def mathverse_process_results(doc, results):
         "query_wo": doc["query_wo"],
         "query_cot": doc["query_cot"],
         "question_for_eval": doc["question_for_eval"],
+        "true_false": judge_result == 1,
     }
 
     return {
+        "llm_as_judge_eval": judge_result,
         "gpt_eval_score": result,
         "submission": result,
     }
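
With aggregation: mean in mathverse_testmini.yaml above, the per-sample 0/1 values returned under llm_as_judge_eval are simply averaged into the task score. A toy illustration with hypothetical judge results:

# Hypothetical per-sample judge results from mathverse_process_results (1 = consistent, 0 = not).
judge_results = [1, 0, 1, 1]

# aggregation: mean -> fraction of samples the judge accepted.
llm_as_judge_eval = sum(judge_results) / len(judge_results)
print(llm_as_judge_eval)  # 0.75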

lmms_eval/tasks/mathvision/mathvision_reason_test.yaml

Lines changed: 2 additions & 2 deletions
@@ -21,6 +21,6 @@ generation_kwargs:
     - "<|im_end|>"
 process_results: !function utils.mathvision_gpt_eval_process_results
 metric_list:
-  - metric: mathvision_gpt_eval_score
-    aggregation: !function utils.mathvision_aggregate_results_eval
+  - metric: llm_as_judge_eval
+    aggregation: mean
     higher_is_better: true

lmms_eval/tasks/mathvision/mathvision_reason_testmini.yaml

Lines changed: 2 additions & 2 deletions
@@ -21,6 +21,6 @@ generation_kwargs:
     - "<|im_end|>"
 process_results: !function utils.mathvision_gpt_eval_process_results
 metric_list:
-  - metric: mathvision_gpt_eval_score
-    aggregation: !function utils.mathvision_aggregate_results_eval
+  - metric: llm_as_judge_eval
+    aggregation: mean
     higher_is_better: true

lmms_eval/tasks/mathvision/utils.py

Lines changed: 44 additions & 81 deletions
@@ -7,85 +7,22 @@
 import requests
 import yaml
 from loguru import logger as eval_logger
-from openai import AzureOpenAI, OpenAI
 
+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks.mathvision.eval_utils import find_math_answer, is_equal, is_number
 
 NUM_SECONDS_TO_SLEEP = 5
-API_TYPE = os.getenv("API_TYPE", "openai")
-MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
-
-JUDGE_RULES = """You are a strict evaluator assessing answer correctness. You must output 1 for fully correct answers and 0 for any other case.
-# Input
-Question:
-```
-{question}
-```
-Ground Truth Answer:
-```
-{answer}
-```
-Model Prediction:
-```
-{pred}
-```
-
-# Evaluation Rules
-- The model prediction may contain the reasoning process, you should spot the final answer from it.
-- For multiple-choice questions: Score 1 if the predicted answer matches the ground truth answer, it can be directly in option letters or the content of the options.
-- For open-ended questions:
-  * Score 1 if the prediction matches the answer semantically, it can be in different format.
-  * Score 0 for partially correct answers or answers with extra incorrect information, even if the reasoning process is correct.
-- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
-- Treat numerical answers as correct if they match within reasonable precision
-- For questions requiring units, both value and unit must be correct
-
-# Strict Output format
-0/1"""
-
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    client = OpenAI(api_key=API_KEY)
-elif API_TYPE == "azure":
-    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
-    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-    client = AzureOpenAI(azure_endpoint=API_URL, api_version="2023-07-01-preview", api_key=API_KEY)
 
+# Initialize the judge server
+API_TYPE = os.getenv("API_TYPE", "openai")
+GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
 
-def get_chat_response(content: str, max_tokens: int, retries: int = 5):
-    global MODEL_VERSION
-    global client
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
 
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful and precise assistant for checking the correctness of the answer.",
-        },
-        {"role": "user", "content": content},
-    ]
-
-    payload = {
-        "model": MODEL_VERSION,
-        "messages": messages,
-        "temperature": 0.2,
-        "max_tokens": max_tokens,
-    }
 
-    for attempt in range(retries):
-        try:
-            response = client.chat.completions.create(**payload)
-            content = response.choices[0].message.content.strip()
-            return content
-        except requests.exceptions.RequestException as e:
-            eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
-            time.sleep(NUM_SECONDS_TO_SLEEP)
-            if attempt == retries - 1:
-                eval_logger.error(f"Failed to get response after {retries} attempts")
-                return 0
-        except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
-            return 0
 
 
 def mathvision_doc_to_visual(doc):
@@ -115,22 +52,48 @@ def mathvision_gpt_eval_process_results(doc, results):
     for pred in results:
         model_answer = pred.strip()
         gt_answer = str(doc["answer"])
-        gpt_response = get_chat_response(JUDGE_RULES.format(question=doc["question"], answer=gt_answer, pred=model_answer), 1024)
+        question = doc["question"]
+
+        # Define custom prompt for MathVision evaluation
+        custom_prompt = """You are a strict evaluator assessing answer correctness. You must output 1 for fully correct answers and 0 for any other case.
+
+# Evaluation Rules
+- The model prediction may contain the reasoning process, you should spot the final answer from it.
+- For multiple-choice questions: Score 1 if the predicted answer matches the ground truth answer, it can be directly in option letters or the content of the options.
+- For open-ended questions:
+  * Score 1 if the prediction matches the answer semantically, it can be in different format.
+  * Score 0 for partially correct answers or answers with extra incorrect information, even if the reasoning process is correct.
+- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
+- Treat numerical answers as correct if they match within reasonable precision
+- For questions requiring units, both value and unit must be correct
+
+Return only "1" or "0" with no additional text or formatting."""
+
         try:
-            if int(gpt_response) == 1:
-                correct_list.append(True)
+            # Use the llm_judge API for binary evaluation
+            result = server.evaluate_binary(
+                question=question,
+                answer=gt_answer,
+                prediction=model_answer,
+                output_format="1/0",
+                custom_prompt=custom_prompt
+            )
+
+            # Parse the result
+            if result["success"]:
+                judge_response = result["result"]
+                correct_list.append(judge_response == "1")
             else:
+                eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
                 correct_list.append(False)
+
         except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            eval_logger.error(f"Error getting judge response: {e}")
            correct_list.append(False)
 
-    return {
-        "mathvision_gpt_eval_score": {
-            "response": results,
-            "scores": correct_list,
-        },
-    }
+    # Calculate the average score for this document
+    avg_score = sum(1 if score else 0 for score in correct_list) / len(correct_list) if correct_list else 0
+    return {"llm_as_judge_eval": avg_score}
 
 
 def mathvision_process_results(doc, results):
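
The parsing logic above only relies on evaluate_binary returning a dict with "success", "result", and (on failure) "raw_response", so a stub with that shape is enough to dry-run the scoring offline. A self-contained sketch; the FakeJudgeServer and its exact-match rule are illustrative assumptions, not part of this commit:

class FakeJudgeServer:
    """Stub with the same evaluate_binary contract the task utils rely on."""

    def evaluate_binary(self, question, answer, prediction, output_format, custom_prompt):
        # Pretend the judge accepts only exact string matches.
        verdict = "1" if prediction.strip() == answer.strip() else "0"
        return {"success": True, "result": verdict, "raw_response": verdict}


# Mirrors the parsing in mathvision_gpt_eval_process_results.
server = FakeJudgeServer()
result = server.evaluate_binary(
    question="What is 2 + 2?",  # hypothetical sample
    answer="4",
    prediction="4",
    output_format="1/0",
    custom_prompt="Return only 1 or 0.",
)
correct = result["success"] and result["result"] == "1"
print(correct)  # True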

lmms_eval/tasks/mme_cot/utils.py

Lines changed: 36 additions & 66 deletions
@@ -8,69 +8,23 @@
 
 import pandas as pd
 from loguru import logger as eval_logger
-from openai import AzureOpenAI, OpenAI
 from PIL import Image
 
+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
 dir_name = os.path.dirname(os.path.abspath(__file__))
 
+# Initialize the judge server
 API_TYPE = os.getenv("API_TYPE", "openai")
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
-    client = OpenAI(api_key=API_KEY)
-    gpt_model = config["metadata"]["gpt_eval_model_name"]
-
-elif API_TYPE == "azure":
-    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
-    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-    API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")
-    client = AzureOpenAI(azure_endpoint=API_URL, api_version=API_VERSION, api_key=API_KEY)
-    gpt_model = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
-
-
-def get_chat_response(prompt, max_token=256, retry=5):
-    messages = [
-        {"role": "user", "content": prompt},
-    ]
-    for i in range(retry):
-        try:
-            completion = client.chat.completions.create(model=gpt_model, messages=messages, temperature=0.5 * i, max_tokens=max_token)
-            prediction = completion.choices[0].message.content.strip()
-            if prediction.lower() == "yes" or prediction.lower() == "no":
-                return prediction
-        except Exception as e:
-            eval_logger.error(e)
-    return "no"
-
-
-def build_mmecot_gpt4_prompt(question_data):
-    prompt = """You are given a question, the solution and the correct answer. Please determine if the solution matches the correct answer.
-Focus only on the mathematical or semantic correctness of the content. Ignore any differences in formatting, such as LaTeX syntax, symbols, styles, or additional wrappers (e.g., \boxed, $...$, or similar). Compare only the core mathematical or textual meaning of the solution and the correct answer.
-The process or reasoning leading to the Solution is irrelevant, ONLY the correctness of the result matters.
-Return only "Yes" if the solution is correct or "No" if it is incorrect.
-Only return "Yes" or "No" with no additional text or formatting.
-
-Question:
-{question}
---------------------------------
-Correct Answer:
-{answer}
---------------------------------
-Solution:
-{solution}
---------------------------------
-"""
-    question = question_data["question"]
-    answer = question_data["answer"]
-    response = str(question_data["response"])
-    prompt = prompt.format(question=question, answer=answer, solution=response)
-    return prompt
+GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
+
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
+
+
 
 
 def mmecot_doc_to_visual(doc):
@@ -119,20 +73,36 @@ def mmecot_process_results(doc, results):
         parsed_preds.append(pred)
 
     prediction = results[0].strip()
-    # Build the prompt for GPT-4o evaluation
-    question_data = {"index": doc.get("index", "unknown"), "question": doc["question"], "answer": doc["answer"], "response": prediction}
-
-    # Build the prompt and get GPT-4o's judgment
-    prompt = build_mmecot_gpt4_prompt(question_data)
+    question = doc["question"]
+    answer = doc["answer"]
+
+    # Define custom prompt for MME-CoT evaluation
+    custom_prompt = """You are given a question, the solution and the correct answer. Please determine if the solution matches the correct answer.
+Focus only on the mathematical or semantic correctness of the content. Ignore any differences in formatting, such as LaTeX syntax, symbols, styles, or additional wrappers (e.g., \boxed, $...$, or similar). Compare only the core mathematical or textual meaning of the solution and the correct answer.
+The process or reasoning leading to the Solution is irrelevant, ONLY the correctness of the result matters.
+Return only "Yes" if the solution is correct or "No" if it is incorrect.
+Only return "Yes" or "No" with no additional text or formatting."""
+
     try:
-        completion = get_chat_response(prompt)
-        if completion.lower() == "yes" or completion.lower() == "no":
-            judge_result = 1 if completion.lower() == "yes" else 0
+        # Use the llm_judge API for binary evaluation
+        result = server.evaluate_binary(
+            question=question,
+            answer=answer,
+            prediction=prediction,
+            output_format="yes/no",
+            custom_prompt=custom_prompt
+        )
+
+        # Parse the result
+        if result["success"]:
+            judge_response = result["result"]
+            judge_result = 1 if judge_response and judge_response.lower() == "yes" else 0
         else:
-            eval_logger.error(f"Invalid response: {completion}")
+            eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
             judge_result = 0
+
     except Exception as e:
-        eval_logger.error(f"Error getting chat response: {e}")
+        eval_logger.error(f"Error getting judge response: {e}")
         judge_result = 0
 
     return {"submission": {"index": doc["index"], "prediction": parsed_preds}, "llm_as_judge_eval": judge_result}
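
All of these utils pick the judge backend and model from the same two environment variables at import time, so switching judges should not require code changes; whether get_server accepts "azure" as a server_name is an assumption based on the Azure branch the deleted code handled explicitly:

import os

from lmms_eval.llm_judge import ServerConfig, get_server

# Hypothetical configuration step; the task utils read these variables at import
# time, so in a real run they would be set before lmms_eval starts.
os.environ["API_TYPE"] = "azure"  # "openai" is the default; "azure" support is assumed
os.environ["MODEL_VERSION"] = "gpt-4o-2024-11-20"

server = get_server(
    server_name=os.environ["API_TYPE"],
    config=ServerConfig(model_name=os.environ["MODEL_VERSION"]),
)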

lmms_eval/tasks/mmmu/mmmu_val_thinking.yaml

Lines changed: 2 additions & 2 deletions
@@ -20,8 +20,8 @@ process_results: !function utils.mmmu_reasoning_process_results
 # k: 8
 
 metric_list:
-  - metric: mmmu_judge_acc
-    aggregation: !function utils.mmmu_aggregate_judge_results
+  - metric: llm_as_judge_eval
+    aggregation: mean
     higher_is_better: true
 
 generation_kwargs:
