 import requests
 import yaml
 from loguru import logger as eval_logger
-from openai import AzureOpenAI, OpenAI

+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks.mathvision.eval_utils import find_math_answer, is_equal, is_number

 NUM_SECONDS_TO_SLEEP = 5
-API_TYPE = os.getenv("API_TYPE", "openai")
-MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
-
-JUDGE_RULES = """You are a strict evaluator assessing answer correctness. You must output 1 for fully correct answers and 0 for any other case.
-# Input
-Question:
-```
-{question}
-```
-Ground Truth Answer:
-```
-{answer}
-```
-Model Prediction:
-```
-{pred}
-```
-
-# Evaluation Rules
-- The model prediction may contain the reasoning process, you should spot the final answer from it.
-- For multiple-choice questions: Score 1 if the predicted answer matches the ground truth answer, it can be directly in option letters or the content of the options.
-- For open-ended questions:
-    * Score 1 if the prediction matches the answer semantically, it can be in different format.
-    * Score 0 for partially correct answers or answers with extra incorrect information, even if the reasoning process is correct.
-- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
-- Treat numerical answers as correct if they match within reasonable precision
-- For questions requiring units, both value and unit must be correct
-
-# Strict Output format
-0/1"""
-
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    client = OpenAI(api_key=API_KEY)
-elif API_TYPE == "azure":
-    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
-    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-    client = AzureOpenAI(azure_endpoint=API_URL, api_version="2023-07-01-preview", api_key=API_KEY)

+# Initialize the judge server
+API_TYPE = os.getenv("API_TYPE", "openai")
+GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

-def get_chat_response(content: str, max_tokens: int, retries: int = 5):
-    global MODEL_VERSION
-    global client
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)

-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful and precise assistant for checking the correctness of the answer.",
-        },
-        {"role": "user", "content": content},
-    ]
-
-    payload = {
-        "model": MODEL_VERSION,
-        "messages": messages,
-        "temperature": 0.2,
-        "max_tokens": max_tokens,
-    }

-    for attempt in range(retries):
-        try:
-            response = client.chat.completions.create(**payload)
-            content = response.choices[0].message.content.strip()
-            return content
-        except requests.exceptions.RequestException as e:
-            eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
-            time.sleep(NUM_SECONDS_TO_SLEEP)
-            if attempt == retries - 1:
-                eval_logger.error(f"Failed to get response after {retries} attempts")
-                return 0
-        except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
-            return 0


 def mathvision_doc_to_visual(doc):
@@ -115,22 +52,48 @@ def mathvision_gpt_eval_process_results(doc, results):
     for pred in results:
         model_answer = pred.strip()
         gt_answer = str(doc["answer"])
-        gpt_response = get_chat_response(JUDGE_RULES.format(question=doc["question"], answer=gt_answer, pred=model_answer), 1024)
+        question = doc["question"]
+
+        # Define custom prompt for MathVision evaluation
+        custom_prompt = """You are a strict evaluator assessing answer correctness. You must output 1 for fully correct answers and 0 for any other case.
+
+# Evaluation Rules
+- The model prediction may contain the reasoning process, you should spot the final answer from it.
+- For multiple-choice questions: Score 1 if the predicted answer matches the ground truth answer, it can be directly in option letters or the content of the options.
+- For open-ended questions:
+    * Score 1 if the prediction matches the answer semantically, it can be in different format.
+    * Score 0 for partially correct answers or answers with extra incorrect information, even if the reasoning process is correct.
+- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
+- Treat numerical answers as correct if they match within reasonable precision
+- For questions requiring units, both value and unit must be correct
+
+Return only "1" or "0" with no additional text or formatting."""
+
         try:
-            if int(gpt_response) == 1:
-                correct_list.append(True)
+            # Use the llm_judge API for binary evaluation
+            result = server.evaluate_binary(
+                question=question,
+                answer=gt_answer,
+                prediction=model_answer,
+                output_format="1/0",
+                custom_prompt=custom_prompt
+            )
+
+            # Parse the result
+            if result["success"]:
+                judge_response = result["result"]
+                correct_list.append(judge_response == "1")
             else:
+                eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
                 correct_list.append(False)
+
         except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            eval_logger.error(f"Error getting judge response: {e}")
             correct_list.append(False)

-    return {
-        "mathvision_gpt_eval_score": {
-            "response": results,
-            "scores": correct_list,
-        },
-    }
+    # Calculate the average score for this document
+    avg_score = sum(1 if score else 0 for score in correct_list) / len(correct_list) if correct_list else 0
+    return {"llm_as_judge_eval": avg_score}


 def mathvision_process_results(doc, results):
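Reviewer note: for anyone who wants to exercise the new judge path outside the harness, here is a minimal standalone sketch. It uses only the interfaces the diff introduces (ServerConfig, get_server, and server.evaluate_binary returning a dict with "success" and "result" keys); the toy question, prediction, and environment-variable defaults are illustrative assumptions, not part of this change.

# Standalone sketch of the new llm_judge flow; not part of the diff above.
import os

from lmms_eval.llm_judge import ServerConfig, get_server

# Same configuration pattern as the refactored utils.py.
config = ServerConfig(model_name=os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20"))
server = get_server(server_name=os.getenv("API_TYPE", "openai"), config=config)

# Ask the judge to compare a toy prediction against a ground-truth answer.
result = server.evaluate_binary(
    question="What is 2 + 2?",
    answer="4",
    prediction="The final answer is 4.",
    output_format="1/0",
)

# Mirror the scoring logic in mathvision_gpt_eval_process_results:
# count the sample as correct only when the judge succeeds and returns "1".
score = 1.0 if result["success"] and result["result"] == "1" else 0.0
print(score)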