Commit 0e5a478

Add VL-RewardBench dataset (EvolvingLMMs-Lab#484)
* update vlreward bench
* remove
* pretty
* fix
* isort
1 parent b9a44f2 commit 0e5a478

File tree

3 files changed: +173 -0 lines changed
docs/current_tasks.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -148,6 +148,7 @@
 - [VizWizVQA](https://vizwiz.org/tasks-and-datasets/vqa/) (vizwiz_vqa)
   - VizWizVQA Validation (vizwiz_vqa_val)
   - VizWizVQA Test (vizwiz_vqa_test)
+- [VL-RewardBench](https://vl-rewardbench.github.io) (vl_rewardbench)
 - [VQAv2](https://visualqa.org/) (vqav2)
   - VQAv2 Validation (vqav2_val)
   - VQAv2 Test (vqav2_test)
```
Lines changed: 149 additions & 0 deletions
```python
import json
import os
import random
import re
from collections import defaultdict

import requests
from loguru import logger as eval_logger

LLM_PARSE_ANSWER_PROMPT = """
You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
Return the Answer X ONLY. e.g., Answer 1 or Answer 2.

Judgement: {judgement}
"""

API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }


def get_prompt(data_obj, random_number):
    answers = [data_obj["response"][0], data_obj["response"][1]] if random_number == 0 else [data_obj["response"][1], data_obj["response"][0]]
    prompt_str = f""" You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. Please analyze the following image and question, then determine which of the two provided answers is better.

Question: {data_obj["query"]}

Answer 1: {answers[0]}

Answer 2: {answers[1]}

Please evaluate both answers based on the following criteria:
1. Accuracy: How well does the answer align with the visual information in the image?
2. Completeness: Does the answer fully address all aspects of the question?
3. Clarity: Is the answer easy to understand and well-articulated?
4. Relevance: Does the answer directly relate to the question and the image?

After your evaluation, please:
1. Explain your reasoning for each criterion.
2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). For example: Overall Judgment: Answer X is better.

Your response should be structured and detailed, demonstrating your understanding of both the visual and textual elements of the task."""
    return prompt_str


def vlrewardbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def vlrewardbench_doc_to_text(doc):
    # To avoid positional bias, the order of the two answers is decided by the parity of the
    # summed response lengths. This "random number" is deterministic per sample, so the same
    # order can be reconstructed later in process_results.
    random_number = sum(len(res) for res in doc["response"]) % 2
    query_prompt = get_prompt(doc, random_number)
    return query_prompt


def parse_pred_ans(pred_ans):
    pred_ans = pred_ans.strip()
    pattern = r"(?:Overall Judgment|Therefore)\s*.*\s*-*\s*Answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
    match = re.search(pattern, pred_ans.replace("\n", "").replace("*", ""), re.IGNORECASE)
    flag_choice = -1
    if match:
        answer_number = int(match.group(1))
        flag_choice = answer_number
    else:
        # Fall back to parsing the judgement with an LLM
        parsed_response = parse_by_llm(pred_ans)
        if "answer 1" in parsed_response.lower():
            flag_choice = 1
        elif "answer 2" in parsed_response.lower():
            flag_choice = 2
        else:
            eval_logger.warning(f"Cannot parse the answer: {pred_ans}, we randomly choose a choice")
            flag_choice = random.choice([1, 2])

    return flag_choice  # which answer is better


def parse_by_llm(response, model="gpt-4o-mini", max_tokens=32):
    # Extract the preferred answer from the judgement using an LLM (gpt-4o-mini by default)
    data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1.0, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
    response = requests.post(API_URL, headers=headers, data=json.dumps(data).encode("utf-8"))
    result = response.content.decode("utf-8")
    dict_result = json.loads(result)
    llm_output = dict_result["choices"][0]["message"]["content"].strip()
    return llm_output


def vlrewardbench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case vlreward_score), value: metric value
    """
    pred = results[0]
    pred_ans = parse_pred_ans(pred)  # 1 or 2, indicating which answer is better
    random_number = sum(len(res) for res in doc["response"]) % 2  # same parity trick as in doc_to_text, recovering the order the answers were shown in
    # Note: human_ranking [0, 1] -> answer 1 is better, [1, 0] -> answer 2 is better
    gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1

    if pred_ans == gt_ans:
        score = 1.0
    else:
        score = 0.0
    category = doc["id"].split("-")[0].split("_")[0].lower()

    group_mapping = {
        "mathverse": "reasoning",
        "hallucination": "hallucination",
        "mmmu": "reasoning",
        "rlhf": "hallucination",
        "rlaif": "hallucination",
        "wildvision": "general",
        "vlfeedback": "general",
    }
    key_name = "vlreward_score"
    # Note: the key name here is very important. It decides which aggregation function will receive the results
    # We note down the question id/category to help us aggregate the results later
    return {key_name: {"question_id": doc["id"], "category": group_mapping.get(category, "general"), "score": score}}


def vlrewardbench_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        A score
    """
    category2score = defaultdict(list)
    for result in results:
        score = result["score"]
        category = result["category"]
        category2score[category].append(score)

    category2avg_score = {}
    for category, scores in category2score.items():
        category2avg_score[category] = sum(scores) / len(scores)
    for category, avg_score in category2avg_score.items():
        eval_logger.info(f"{category}: {avg_score:.2f}")
    total_score = sum(category2avg_score.values()) / len(category2avg_score)  # macro-average across categories
    return total_score
```
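To make the answer-ordering logic above easier to follow, here is a minimal sketch (not part of the commit) with a made-up `doc`, showing how the parity of the summed response lengths fixes the presentation order and how the ground-truth choice is recovered from `human_ranking`:

```python
# Hypothetical sample, for illustration only (field names follow the code above).
doc = {
    "id": "mmmu-0001",
    "query": "What is shown in the image?",
    "response": ["A cat.", "A small cat sitting on a mat."],
    "human_ranking": [1, 0],  # [1, 0] -> answer 2 (the second response) is better
}

# 0 -> answers shown in original order, 1 -> swapped; deterministic per sample.
random_number = sum(len(res) for res in doc["response"]) % 2

# With the original order, human_ranking.index(0) + 1 is the better answer's position
# in the prompt; with the swapped order, human_ranking.index(1) + 1 is.
gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1
print(random_number, gt_ans)  # -> 0 2
```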
Lines changed: 23 additions & 0 deletions
```yaml
dataset_path: MMInstruction/VL-RewardBench
dataset_kwargs:
  token: True
task: "vl_rewardbench"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.vlrewardbench_doc_to_visual
doc_to_text: !function utils.vlrewardbench_doc_to_text
doc_to_target: "human_ranking"
generation_kwargs:
  max_new_tokens: 1024
  temperature: 1.0
  top_p: 1.0
  num_beams: 1
  do_sample: true
# The return value of process_results will be used by metrics
process_results: !function utils.vlrewardbench_process_results
metric_list:
  - metric: vlreward_score
    aggregation: !function utils.vlrewardbench_aggregate_results
    higher_is_better: true
metadata:
  - version: 0.0
```
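For orientation, a minimal sketch (not part of the commit) of how the `metric_list` wiring behaves: every per-sample dict emitted by `vlrewardbench_process_results` under the `vlreward_score` key is collected and handed to `vlrewardbench_aggregate_results`, which takes a per-category mean and then an unweighted macro-average over categories. The question ids and scores below are made up for illustration.

```python
from collections import defaultdict

# Hypothetical per-sample results, shaped like the dicts process_results emits.
results = [
    {"question_id": "mmmu-0001", "category": "reasoning", "score": 1.0},
    {"question_id": "rlhf-0007", "category": "hallucination", "score": 0.0},
    {"question_id": "wildvision-0003", "category": "general", "score": 1.0},
]

# Same aggregation rule as vlrewardbench_aggregate_results:
# per-category accuracy, then macro-average across categories.
category2score = defaultdict(list)
for r in results:
    category2score[r["category"]].append(r["score"])
macro_avg = sum(sum(v) / len(v) for v in category2score.values()) / len(category2score)
print(f"{macro_avg:.2f}")  # (1.0 + 0.0 + 1.0) / 3 -> 0.67
```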
