EvolvingLMMs-Lab · Luodian · Jun 16, 2025 · Jun 16, 2025 · Jun 16, 2025
diff --git a/lmms_eval/tasks/mathvista/mathvista_test.yaml b/lmms_eval/tasks/mathvista/mathvista_test.yaml
@@ -21,4 +21,7 @@ metric_list:
 
 lmms_eval_specific_kwargs:
   default:
-    shot_type: "reason-first" # can be "reason-first", "solution", "step-by-step"
+    shot_type: "reason-first" # can be "reason-first", "solution", "step-by-step"
+    shot: 0
+    use_caption: False
+    use_ocr: False
diff --git a/lmms_eval/tasks/videomathqa/cot_postprocess.py b/lmms_eval/tasks/videomathqa/cot_postprocess.py
@@ -1,17 +1,19 @@
+import argparse
+import json
 import os
+import random
 import re
 import sys
-import json
-import random
-import argparse
+
 from tqdm import tqdm
-from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
-from videomathqa.utils import (extract_characters_regex,
-                            videomathqa_process_results,
-                            videomathqa_mcq_aggregate_results,
-                            videomathqa_multi_binary_aggregate_results)
-
+from videomathqa.utils import (
+    extract_characters_regex,
+    videomathqa_mcq_aggregate_results,
+    videomathqa_multi_binary_aggregate_results,
+    videomathqa_process_results,
+)
+from vllm import LLM, SamplingParams
 
 mcq_prompt = (
     "Given the original multiple-choice options and a model-generated answer containing reasoning and a final answer, identify the option that best matches the final answer and return only the corresponding letter (A, B, C, D, or E)."

diff --git a/lmms_eval/tasks/videomathqa/cot_step_evaluation.py b/lmms_eval/tasks/videomathqa/cot_step_evaluation.py
@@ -1,11 +1,12 @@
-import os
+import argparse
 import ast
 import json
-import argparse
+import os
+
 import pandas as pd
 from tqdm import tqdm
-from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
 
 system_prompt = """
 You are a intelligent assistant for grading math question solutions. You will be given:
@@ -110,7 +111,6 @@ def compute_score(gt_data, res_data, res_file, tokenizer, llm, sampling_params,
     batch = []
     scored_samples = []
     for sample in tqdm(gt_data, desc="Assigning scores with Qwen3"):
-
         qid = sample["question_id"]
         matched = [res for res in res_data if res["doc"]["question_id"] == qid]
         if not matched:

diff --git a/lmms_eval/tasks/videomathqa/utils.py b/lmms_eval/tasks/videomathqa/utils.py
@@ -1,13 +1,13 @@
 import os
 import re
-import cv2
 import sys
-import yaml
-import numpy as np
-
+from collections import defaultdict
 from pathlib import Path
 from typing import List
-from collections import defaultdict
+
+import cv2
+import numpy as np
+import yaml
 from loguru import logger as eval_logger
 
 VIDEO_LENGTH = ["short", "medium", "long"]
@@ -28,7 +28,6 @@ def decode_video(video_path: str) -> List[np.ndarray]:
 
 
 def load_video(video_path, max_frames, annot_sample_rate=1):
-
     def uniform_sample(m, n):
         assert n <= m
         stride = (m - 1) / (n - 1) if n > 1 else 0  # Calculate the stride
@@ -298,7 +297,6 @@ def videomathqa_mcq_aggregate_results(results):
 
 
 def videomathqa_multi_binary_aggregate_results(results):
-
     grouped = defaultdict(list)
     for result in results:
         grouped[result["question_id"]].append(result)

diff --git a/pyproject.toml b/pyproject.toml
@@ -46,7 +46,7 @@ dependencies = [
     "av",
     "hf_transfer",
     "nltk",
-    "sentencepiece==0.1.99",
+    "sentencepiece",
     "yt-dlp",
     "pycocoevalcap",
     "tqdm-multiprocess",