import json
import os
import random
import re
from collections import defaultdict

import requests
from loguru import logger as eval_logger

LLM_PARSE_ANSWER_PROMPT = """
You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
Return the Answer X ONLY. e.g., Answer 1 or Answer 2.

Judgement: {judgement}
"""

API_TYPE = os.getenv("API_TYPE", "openai")

# Only the OpenAI-compatible API is wired up here; for any other API_TYPE value,
# API_URL and headers are left undefined and parse_by_llm will raise a NameError.
if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
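
# Usage sketch (assumed deployment detail, not part of the task config): point
# the judge at any OpenAI-compatible endpoint before running, for example:
#   export API_TYPE=openai
#   export OPENAI_API_URL=https://api.openai.com/v1/chat/completions
#   export OPENAI_API_KEY=<your key>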


def get_prompt(data_obj, random_number):
    # random_number == 0 keeps the dataset order; 1 swaps the two responses.
    answers = [data_obj["response"][0], data_obj["response"][1]] if random_number == 0 else [data_obj["response"][1], data_obj["response"][0]]
    prompt_str = f"""You are a highly capable multimodal AI assistant tasked with evaluating answers to visual questions. Please analyze the following image and question, then determine which of the two provided answers is better.

Question: {data_obj["query"]}

Answer 1: {answers[0]}

Answer 2: {answers[1]}

Please evaluate both answers based on the following criteria:
1. Accuracy: How well does the answer align with the visual information in the image?
2. Completeness: Does the answer fully address all aspects of the question?
3. Clarity: Is the answer easy to understand and well-articulated?
4. Relevance: Does the answer directly relate to the question and the image?

After your evaluation, please:
1. Explain your reasoning for each criterion.
2. Provide an overall judgment on which answer is better (Answer 1 or Answer 2). For example: Overall Judgment: Answer X is better.

Your response should be structured and detailed, demonstrating your understanding of both the visual and textual elements of the task."""
    return prompt_str


def vlrewardbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def vlrewardbench_doc_to_text(doc):
    # Swap the answer order on roughly half of the examples to avoid positional bias.
    # The parity of the summed response lengths acts as a deterministic pseudo-random
    # bit, so process_results can recompute it later to undo the swap.
    random_number = sum(len(res) for res in doc["response"]) % 2
    query_prompt = get_prompt(doc, random_number)
    return query_prompt
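

# Illustrative parity example (hypothetical lengths, not dataset values): responses
# of 17 and 24 characters give (17 + 24) % 2 == 1, so the pair is presented in
# swapped order; vlrewardbench_process_results recomputes the same bit to recover
# the ground-truth position.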


def parse_pred_ans(pred_ans):
    pred_ans = pred_ans.strip()
    pattern = r"(?:Overall Judgment|Therefore)\s*.*\s*-*\s*Answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
    match = re.search(pattern, pred_ans.replace("\n", "").replace("*", ""), re.IGNORECASE)
    if match:
        flag_choice = int(match.group(1))
    else:
        # The regex missed; fall back to an LLM-based parser.
        parsed_response = parse_by_llm(pred_ans)
        if "answer 1" in parsed_response.lower():
            flag_choice = 1
        elif "answer 2" in parsed_response.lower():
            flag_choice = 2
        else:
            eval_logger.warning(f"Cannot parse the answer: {pred_ans}; falling back to a random choice")
            flag_choice = random.choice([1, 2])

    return flag_choice  # which answer is better (1 or 2)
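

# A minimal sketch of judgement endings the regex in parse_pred_ans accepts
# (illustrative strings, not dataset output); both resolve to choice 2 without
# triggering the LLM fallback:
#   parse_pred_ans("Overall Judgment: Answer 2 is better.")
#   parse_pred_ans("Therefore, Answer 2 is slightly better.")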


def parse_by_llm(response, model="gpt-4o-mini", max_tokens=32):
    # Extract the final verdict from the judgement text with an external LLM
    # (gpt-4o-mini by default).
    data = {
        "max_tokens": max_tokens,
        "model": model,
        "temperature": 0.0,
        "top_p": 1.0,
        "presence_penalty": 1,
        "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}],
    }
    api_response = requests.post(API_URL, headers=headers, data=json.dumps(data).encode("utf-8"))
    dict_result = api_response.json()
    llm_output = dict_result["choices"][0]["message"]["content"].strip()
    return llm_output


def vlrewardbench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case vlreward_score), value: metric value
    """
    pred = results[0]
    pred_ans = parse_pred_ans(pred)  # 1 or 2, indicating which answer is better
    # Recompute the same deterministic parity bit used in vlrewardbench_doc_to_text to undo the swap.
    random_number = sum(len(res) for res in doc["response"]) % 2
    # Note: human_ranking [0, 1] -> answer 1 is better, [1, 0] -> answer 2 is better
    gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1

    score = 1.0 if pred_ans == gt_ans else 0.0
    category = doc["id"].split("-")[0].split("_")[0].lower()

    group_mapping = {
        "mathverse": "reasoning",
        "hallucination": "hallucination",
        "mmmu": "reasoning",
        "rlhf": "hallucination",
        "rlaif": "hallucination",
        "wildvision": "general",
        "vlfeedback": "general",
    }
    key_name = "vlreward_score"
    # Note: the key name here is very important; it decides which aggregation function will receive the results.
    # We note down the question id/category to help us aggregate the results later.
    return {key_name: {"question_id": doc["id"], "category": group_mapping.get(category, "general"), "score": score}}
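

# Worked example (hypothetical doc, for illustration): with human_ranking == [1, 0],
# response 2 is preferred. If random_number == 1 the answers were shown swapped, so
# the preferred response appeared as Answer 1: [1, 0].index(1) + 1 == 1 == gt_ans.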


def vlrewardbench_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        the macro-averaged score across categories
    """
    category2score = defaultdict(list)
    for result in results:
        category2score[result["category"]].append(result["score"])

    category2avg_score = {category: sum(scores) / len(scores) for category, scores in category2score.items()}
    for category, avg_score in category2avg_score.items():
        eval_logger.info(f"{category}: {avg_score:.2f}")
    total_score = sum(category2avg_score.values()) / len(category2avg_score)  # macro-average across task categories
    return total_score
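

# Minimal smoke test with fabricated results (illustrative only, not dataset
# output): two categories with accuracies 1.0 and 0.5 macro-average to 0.75.
if __name__ == "__main__":
    fake_results = [
        {"question_id": "mmmu-0", "category": "reasoning", "score": 1.0},
        {"question_id": "rlhf-0", "category": "hallucination", "score": 1.0},
        {"question_id": "rlhf-1", "category": "hallucination", "score": 0.0},
    ]
    assert vlrewardbench_aggregate_results(fake_results) == 0.75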