from collections import defaultdict

import requests
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
from loguru import logger as eval_logger

-dir_name = os.path.dirname(os.path.abspath(__file__))
-

LLM_PARSE_ANSWER_PROMPT = """
You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
Judgement: {judgement}
"""

+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+    }
+


def get_prompt(data_obj, random_number):
    answers = [data_obj["response"][0], data_obj["response"][1]] if random_number == 0 else [data_obj["response"][1], data_obj["response"][0]]
@@ -47,32 +54,18 @@ def vlrewardbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


-def vlrewardbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+def vlrewardbench_doc_to_text(doc):
    # we randomly choose the order of the answers to avoid positional bias
    random_number = sum(len(res) for res in doc["response"]) % 2  # we use the length sum % 2 as a random number generator to decide the order of the answers
-    # doc["random_number"] = random_number  # save it for later use  Notes: This cannot be done as the doc iterator would be reset
    query_prompt = get_prompt(doc, random_number)
-
    return query_prompt


-API_TYPE = os.getenv("API_TYPE", "openai")
-
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
-
-
def parse_pred_ans(pred_ans):
-    pred_ans = pred_ans.lower().strip()
-    pattern = r"(?:overall judgment|therefore)\s*.*\s*-*\s*answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
+    pred_ans = pred_ans.strip()
+    pattern = r"(?:Overall Judgment|Therefore)\s*.*\s*-*\s*Answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
    match = re.search(pattern, pred_ans.replace("\n", "").replace("*", ""), re.IGNORECASE)
    flag_choice = -1
-
    if match:
        answer_number = int(match.group(1))
        flag_choice = answer_number
@@ -92,7 +85,7 @@ def parse_pred_ans(pred_ans):

def parse_by_llm(response, model="gpt-4o-mini", max_tokens=32):
    # get the judgement from response using gpt-4o
-    data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
+    data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1.0, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
    response = requests.post(API_URL, headers=headers, data=json.dumps(data).encode("utf-8"))
    result = response.content.decode("utf-8")
    dict_result = json.loads(result)
@@ -109,10 +102,11 @@ def vlrewardbench_process_results(doc, results):
        a dictionary with key: metric name (in this case mme score), value: metric value
    """
    pred = results[0]
-    pred_ans = parse_pred_ans(pred)
+    pred_ans = parse_pred_ans(pred)  # 1 or 2, indicating which answer is better
    random_number = sum(len(res) for res in doc["response"]) % 2  # we use the length sum % 2 as a random number generator to decide the order of the answers
+    # Note: human_ranking [0, 1] -> answer 1 is better, [1, 0] -> answer 2 is better
+    gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1

-    gt_ans = doc["human_ranking"].index(1 if random_number == 0 else 0) + 1
    if pred_ans == gt_ans:
        score = 1.0
    else:
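
A minimal sketch (not part of the diff) of how the corrected ground-truth mapping behaves, assuming, as the added comment states, that human_ranking marks the preferred response with 0 and that random_number == 1 means the two responses are presented in swapped order:

def gt_answer(human_ranking, random_number):
    # random_number == 0: Answer 1 = response[0], Answer 2 = response[1]
    # random_number == 1: Answer 1 = response[1], Answer 2 = response[0]
    return human_ranking.index(0 if random_number == 0 else 1) + 1

# human_ranking [0, 1] means response[0] is preferred by the annotators.
assert gt_answer([0, 1], random_number=0) == 1  # unswapped -> Answer 1 is correct
assert gt_answer([0, 1], random_number=1) == 2  # swapped   -> Answer 2 is correct
assert gt_answer([1, 0], random_number=0) == 2
assert gt_answer([1, 0], random_number=1) == 1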