Commit e591de0

Luodiankcz358 authored and committed
Add llm_as_judge_eval metric to multiple tasks and integrate llm_judge API for evaluation
1 parent af1efae commit e591de0

9 files changed: +217 -309 lines changed
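
All of the task files below drop their hand-rolled OpenAI/Azure client code in favor of the shared lmms_eval.llm_judge interface. A minimal sketch of the call pattern, pieced together from the diffs that follow (it requires lmms_eval and judge API credentials; ServerConfig fields beyond model_name and the full return schema of evaluate_binary are assumptions):

import os

from lmms_eval.llm_judge import ServerConfig, get_server

# Backend and model are read from the environment, as the task utils below do.
API_TYPE = os.getenv("API_TYPE", "openai")
GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

server = get_server(server_name=API_TYPE, config=ServerConfig(model_name=GPT_MODEL))

# Ask the judge whether a prediction matches the ground-truth answer.
result = server.evaluate_binary(
    question="What is 0.5 m in centimeters?",  # hypothetical sample
    answer="50 cm",
    prediction="0.5 m equals 50 cm.",
    output_format="yes/no",
    custom_prompt='Return only "Yes" or "No" with no additional text or formatting.',
)

# The task utils read result["success"] and result["result"] ("Yes"/"No" or "1"/"0").
score = 1 if result["success"] and str(result["result"]).lower() == "yes" else 0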

lmms_eval/tasks/mathverse/mathverse_testmini.yaml

Lines changed: 3 additions & 0 deletions
@@ -22,6 +22,9 @@ generation_kwargs:
     - "<|im_end|>"
 process_results: !function utils.mathverse_process_results
 metric_list:
+  - metric: llm_as_judge_eval
+    aggregation: mean
+    higher_is_better: true
   - metric: gpt_eval_score
     aggregation: !function utils.mathverse_aggregate_results_eval
     higher_is_better: true

lmms_eval/tasks/mathverse/utils.py

Lines changed: 45 additions & 0 deletions
@@ -6,6 +6,7 @@
 import yaml
 from loguru import logger as eval_logger
 
+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 from lmms_eval.tasks.mathverse.mathverse_evals import MathVerseEvaluator
 
@@ -19,6 +20,15 @@
 
     config = yaml.safe_load("".join(safe_data))
 
+# Initialize the judge server
+API_TYPE = os.getenv("API_TYPE", "openai")
+GPT_MODEL = os.getenv("MODEL_VERSION", config["metadata"]["gpt_eval_model_name"])
+
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
+
 mathverse_evaluator = MathVerseEvaluator(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"), gpt_model=config["metadata"]["gpt_eval_model_name"])
 
 
@@ -45,6 +55,39 @@ def mathverse_doc_to_text(doc, lmms_eval_specific_kwargs=None):
 
 def mathverse_process_results(doc, results):
     prediction = results[0].strip()
+    question = doc["question_for_eval"]
+    answer = doc["answer"] if "answer" in doc else None
+
+    # Define custom prompt for MathVerse evaluation
+    custom_prompt = """Below are two answers to a math question. Determine whether these two answers are consistent.
+Please note that only when the Model Answer completely matches the Standard Answer means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
+
+Return only "Yes" if they are consistent or "No" if they are different.
+Only return "Yes" or "No" with no additional text or formatting."""
+
+    judge_result = 0
+    if answer is not None:
+        try:
+            # Use the llm_judge API for binary evaluation
+            result = server.evaluate_binary(
+                question=question,
+                answer=str(answer),
+                prediction=prediction,
+                output_format="yes/no",
+                custom_prompt=custom_prompt
+            )
+
+            # Parse the result
+            if result["success"]:
+                judge_response = result["result"]
+                judge_result = 1 if judge_response and judge_response.lower() == "yes" else 0
+            else:
+                eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
+                judge_result = 0
+
+        except Exception as e:
+            eval_logger.error(f"Error getting judge response: {e}")
+            judge_result = 0
 
     result = {
         "sample_index": doc["sample_index"],
@@ -58,9 +101,11 @@ def mathverse_process_results(doc, results):
         "query_wo": doc["query_wo"],
         "query_cot": doc["query_cot"],
         "question_for_eval": doc["question_for_eval"],
+        "true_false": judge_result == 1,
     }
 
     return {
+        "llm_as_judge_eval": judge_result,
         "gpt_eval_score": result,
         "submission": result,
     }
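
With aggregation: mean in mathverse_testmini.yaml above, the per-sample 0/1 values returned under llm_as_judge_eval are simply averaged into the task score. A toy illustration with hypothetical judge results:

# Hypothetical per-sample judge results from mathverse_process_results (1 = consistent, 0 = not).
judge_results = [1, 0, 1, 1]

# aggregation: mean -> fraction of samples the judge accepted.
llm_as_judge_eval = sum(judge_results) / len(judge_results)
print(llm_as_judge_eval)  # 0.75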

lmms_eval/tasks/mathvision/mathvision_reason_test.yaml

Lines changed: 2 additions & 2 deletions
@@ -21,6 +21,6 @@ generation_kwargs:
     - "<|im_end|>"
 process_results: !function utils.mathvision_gpt_eval_process_results
 metric_list:
-  - metric: mathvision_gpt_eval_score
-    aggregation: !function utils.mathvision_aggregate_results_eval
+  - metric: llm_as_judge_eval
+    aggregation: mean
     higher_is_better: true

lmms_eval/tasks/mathvision/mathvision_reason_testmini.yaml

Lines changed: 2 additions & 2 deletions
@@ -21,6 +21,6 @@ generation_kwargs:
     - "<|im_end|>"
 process_results: !function utils.mathvision_gpt_eval_process_results
 metric_list:
-  - metric: mathvision_gpt_eval_score
-    aggregation: !function utils.mathvision_aggregate_results_eval
+  - metric: llm_as_judge_eval
+    aggregation: mean
     higher_is_better: true

lmms_eval/tasks/mathvision/utils.py

Lines changed: 44 additions & 81 deletions
@@ -7,85 +7,22 @@
 import requests
 import yaml
 from loguru import logger as eval_logger
-from openai import AzureOpenAI, OpenAI
 
+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks.mathvision.eval_utils import find_math_answer, is_equal, is_number
 
 NUM_SECONDS_TO_SLEEP = 5
-API_TYPE = os.getenv("API_TYPE", "openai")
-MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
-
-JUDGE_RULES = """You are a strict evaluator assessing answer correctness. You must output 1 for fully correct answers and 0 for any other case.
-# Input
-Question:
-```
-{question}
-```
-Ground Truth Answer:
-```
-{answer}
-```
-Model Prediction:
-```
-{pred}
-```
-
-# Evaluation Rules
-- The model prediction may contain the reasoning process, you should spot the final answer from it.
-- For multiple-choice questions: Score 1 if the predicted answer matches the ground truth answer, it can be directly in option letters or the content of the options.
-- For open-ended questions:
-  * Score 1 if the prediction matches the answer semantically, it can be in different format.
-  * Score 0 for partially correct answers or answers with extra incorrect information, even if the reasoning process is correct.
-- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
-- Treat numerical answers as correct if they match within reasonable precision
-- For questions requiring units, both value and unit must be correct
-
-# Strict Output format
-0/1"""
-
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    client = OpenAI(api_key=API_KEY)
-elif API_TYPE == "azure":
-    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
-    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-    client = AzureOpenAI(azure_endpoint=API_URL, api_version="2023-07-01-preview", api_key=API_KEY)
 
+# Initialize the judge server
+API_TYPE = os.getenv("API_TYPE", "openai")
+GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
 
-def get_chat_response(content: str, max_tokens: int, retries: int = 5):
-    global MODEL_VERSION
-    global client
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
 
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful and precise assistant for checking the correctness of the answer.",
-        },
-        {"role": "user", "content": content},
-    ]
-
-    payload = {
-        "model": MODEL_VERSION,
-        "messages": messages,
-        "temperature": 0.2,
-        "max_tokens": max_tokens,
-    }
 
-    for attempt in range(retries):
-        try:
-            response = client.chat.completions.create(**payload)
-            content = response.choices[0].message.content.strip()
-            return content
-        except requests.exceptions.RequestException as e:
-            eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
-            time.sleep(NUM_SECONDS_TO_SLEEP)
-            if attempt == retries - 1:
-                eval_logger.error(f"Failed to get response after {retries} attempts")
-                return 0
-        except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
-            return 0
 
 
 def mathvision_doc_to_visual(doc):
@@ -115,22 +52,48 @@ def mathvision_gpt_eval_process_results(doc, results):
     for pred in results:
         model_answer = pred.strip()
         gt_answer = str(doc["answer"])
-        gpt_response = get_chat_response(JUDGE_RULES.format(question=doc["question"], answer=gt_answer, pred=model_answer), 1024)
+        question = doc["question"]
+
+        # Define custom prompt for MathVision evaluation
+        custom_prompt = """You are a strict evaluator assessing answer correctness. You must output 1 for fully correct answers and 0 for any other case.
+
+# Evaluation Rules
+- The model prediction may contain the reasoning process, you should spot the final answer from it.
+- For multiple-choice questions: Score 1 if the predicted answer matches the ground truth answer, it can be directly in option letters or the content of the options.
+- For open-ended questions:
+  * Score 1 if the prediction matches the answer semantically, it can be in different format.
+  * Score 0 for partially correct answers or answers with extra incorrect information, even if the reasoning process is correct.
+- Ignore minor differences in formatting, capitalization, or spacing since the model may explain in a different way.
+- Treat numerical answers as correct if they match within reasonable precision
+- For questions requiring units, both value and unit must be correct
+
+Return only "1" or "0" with no additional text or formatting."""
+
         try:
-            if int(gpt_response) == 1:
-                correct_list.append(True)
+            # Use the llm_judge API for binary evaluation
+            result = server.evaluate_binary(
+                question=question,
+                answer=gt_answer,
+                prediction=model_answer,
+                output_format="1/0",
+                custom_prompt=custom_prompt
+            )
+
+            # Parse the result
+            if result["success"]:
+                judge_response = result["result"]
+                correct_list.append(judge_response == "1")
             else:
+                eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
                 correct_list.append(False)
+
         except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            eval_logger.error(f"Error getting judge response: {e}")
            correct_list.append(False)
 
-    return {
-        "mathvision_gpt_eval_score": {
-            "response": results,
-            "scores": correct_list,
-        },
-    }
+    # Calculate the average score for this document
+    avg_score = sum(1 if score else 0 for score in correct_list) / len(correct_list) if correct_list else 0
+    return {"llm_as_judge_eval": avg_score}
 
 
 def mathvision_process_results(doc, results):
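
The parsing logic above only relies on evaluate_binary returning a dict with "success", "result", and (on failure) "raw_response", so a stub with that shape is enough to dry-run the scoring offline. A self-contained sketch; the FakeJudgeServer and its exact-match rule are illustrative assumptions, not part of this commit:

class FakeJudgeServer:
    """Stub with the same evaluate_binary contract the task utils rely on."""

    def evaluate_binary(self, question, answer, prediction, output_format, custom_prompt):
        # Pretend the judge accepts only exact string matches.
        verdict = "1" if prediction.strip() == answer.strip() else "0"
        return {"success": True, "result": verdict, "raw_response": verdict}


# Mirrors the parsing in mathvision_gpt_eval_process_results.
server = FakeJudgeServer()
result = server.evaluate_binary(
    question="What is 2 + 2?",  # hypothetical sample
    answer="4",
    prediction="4",
    output_format="1/0",
    custom_prompt="Return only 1 or 0.",
)
correct = result["success"] and result["result"] == "1"
print(correct)  # True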

lmms_eval/tasks/mme_cot/utils.py

Lines changed: 36 additions & 66 deletions
@@ -8,69 +8,23 @@
 
 import pandas as pd
 from loguru import logger as eval_logger
-from openai import AzureOpenAI, OpenAI
 from PIL import Image
 
+from lmms_eval.llm_judge import ServerConfig, get_server
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
 dir_name = os.path.dirname(os.path.abspath(__file__))
 
+# Initialize the judge server
 API_TYPE = os.getenv("API_TYPE", "openai")
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
-    client = OpenAI(api_key=API_KEY)
-    gpt_model = config["metadata"]["gpt_eval_model_name"]
-
-elif API_TYPE == "azure":
-    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
-    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
-    API_VERSION = os.getenv("AZURE_API_VERSION", "2023-07-01-preview")
-    client = AzureOpenAI(azure_endpoint=API_URL, api_version=API_VERSION, api_key=API_KEY)
-    gpt_model = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
-
-
-def get_chat_response(prompt, max_token=256, retry=5):
-    messages = [
-        {"role": "user", "content": prompt},
-    ]
-    for i in range(retry):
-        try:
-            completion = client.chat.completions.create(model=gpt_model, messages=messages, temperature=0.5 * i, max_tokens=max_token)
-            prediction = completion.choices[0].message.content.strip()
-            if prediction.lower() == "yes" or prediction.lower() == "no":
-                return prediction
-        except Exception as e:
-            eval_logger.error(e)
-    return "no"
-
-
-def build_mmecot_gpt4_prompt(question_data):
-    prompt = """You are given a question, the solution and the correct answer. Please determine if the solution matches the correct answer.
-Focus only on the mathematical or semantic correctness of the content. Ignore any differences in formatting, such as LaTeX syntax, symbols, styles, or additional wrappers (e.g., \boxed, $...$, or similar). Compare only the core mathematical or textual meaning of the solution and the correct answer.
-The process or reasoning leading to the Solution is irrelevant, ONLY the correctness of the result matters.
-Return only "Yes" if the solution is correct or "No" if it is incorrect.
-Only return "Yes" or "No" with no additional text or formatting.
-
-Question:
-{question}
---------------------------------
-Correct Answer:
-{answer}
---------------------------------
-Solution:
-{solution}
---------------------------------
-"""
-    question = question_data["question"]
-    answer = question_data["answer"]
-    response = str(question_data["response"])
-    prompt = prompt.format(question=question, answer=answer, solution=response)
-    return prompt
+GPT_MODEL = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
+
+server_config = ServerConfig(
+    model_name=GPT_MODEL,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
+
+
 
 
 def mmecot_doc_to_visual(doc):
@@ -119,20 +73,36 @@ def mmecot_process_results(doc, results):
         parsed_preds.append(pred)
 
     prediction = results[0].strip()
-    # Build the prompt for GPT-4o evaluation
-    question_data = {"index": doc.get("index", "unknown"), "question": doc["question"], "answer": doc["answer"], "response": prediction}
-
-    # Build the prompt and get GPT-4o's judgment
-    prompt = build_mmecot_gpt4_prompt(question_data)
+    question = doc["question"]
+    answer = doc["answer"]
+
+    # Define custom prompt for MME-CoT evaluation
+    custom_prompt = """You are given a question, the solution and the correct answer. Please determine if the solution matches the correct answer.
+Focus only on the mathematical or semantic correctness of the content. Ignore any differences in formatting, such as LaTeX syntax, symbols, styles, or additional wrappers (e.g., \boxed, $...$, or similar). Compare only the core mathematical or textual meaning of the solution and the correct answer.
+The process or reasoning leading to the Solution is irrelevant, ONLY the correctness of the result matters.
+Return only "Yes" if the solution is correct or "No" if it is incorrect.
+Only return "Yes" or "No" with no additional text or formatting."""
+
     try:
-        completion = get_chat_response(prompt)
-        if completion.lower() == "yes" or completion.lower() == "no":
-            judge_result = 1 if completion.lower() == "yes" else 0
+        # Use the llm_judge API for binary evaluation
+        result = server.evaluate_binary(
+            question=question,
+            answer=answer,
+            prediction=prediction,
+            output_format="yes/no",
+            custom_prompt=custom_prompt
+        )
+
+        # Parse the result
+        if result["success"]:
+            judge_response = result["result"]
+            judge_result = 1 if judge_response and judge_response.lower() == "yes" else 0
         else:
-            eval_logger.error(f"Invalid response: {completion}")
+            eval_logger.error(f"Judge evaluation failed: {result.get('raw_response', 'Unknown error')}")
             judge_result = 0
+
     except Exception as e:
-        eval_logger.error(f"Error getting chat response: {e}")
+        eval_logger.error(f"Error getting judge response: {e}")
         judge_result = 0
 
     return {"submission": {"index": doc["index"], "prediction": parsed_preds}, "llm_as_judge_eval": judge_result}
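
All of these utils pick the judge backend and model from the same two environment variables at import time, so switching judges should not require code changes; whether get_server accepts "azure" as a server_name is an assumption based on the Azure branch the deleted code handled explicitly:

import os

from lmms_eval.llm_judge import ServerConfig, get_server

# Hypothetical configuration step; the task utils read these variables at import
# time, so in a real run they would be set before lmms_eval starts.
os.environ["API_TYPE"] = "azure"  # "openai" is the default; "azure" support is assumed
os.environ["MODEL_VERSION"] = "gpt-4o-2024-11-20"

server = get_server(
    server_name=os.environ["API_TYPE"],
    config=ServerConfig(model_name=os.environ["MODEL_VERSION"]),
)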

lmms_eval/tasks/mmmu/mmmu_val_thinking.yaml

Lines changed: 2 additions & 2 deletions
@@ -20,8 +20,8 @@ process_results: !function utils.mmmu_reasoning_process_results
 # k: 8
 
 metric_list:
-  - metric: mmmu_judge_acc
-    aggregation: !function utils.mmmu_aggregate_judge_results
+  - metric: llm_as_judge_eval
+    aggregation: mean
     higher_is_better: true
 
 generation_kwargs:
