Skip to content

Commit 8feca23

Browse files
authored
[Fix] of "evaluation of llava_vid on mvbench" (EvolvingLMMs-Lab#541)
* Update README.md * Update README.md * [Fix] of "mvbench missing videos" Modify DATA_LIST so that it can find the corresponding videos * [Fix] of "evaluation of llava_vid on mvbench" "mvbench_video/tvqa/frames_fps3_hq/castle_s07e04_seg02_clip_14" is a directory of sampled video frames, not an original video file, and the current code logic cannot handle this subtask. * Fixing lmms_eval/models/llava_vid.py
1 parent 532ca07 commit 8feca23

File tree

2 files changed

+12
-8
lines changed

2 files changed

+12
-8
lines changed

lmms_eval/models/llava_vid.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import glob
12
import math
23
import os
34
from datetime import timedelta
@@ -416,6 +417,8 @@ def generate_until(self, requests) -> List[str]:
416417
visuals = doc_to_visual(self.task_dict[task][split][doc_id])
417418
# visuals = [visuals]
418419
# visuals = self.flatten(visuals)
420+
if os.path.isdir(visuals[0]):
421+
visuals = glob.glob(visuals[0] + "/*")
419422
videos = []
420423
try:
421424
# for visual in visuals:
@@ -440,7 +443,8 @@ def generate_until(self, requests) -> List[str]:
440443
frame_idx = sampled_indices.tolist()
441444
frame_time = [i / fps for i in frame_idx]
442445
frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
443-
video = [visuals[i] for i in frame_idx]
446+
# video = [visuals[i] for i in frame_idx]
447+
video = np.stack([np.array(Image.open(visuals[i])) for i in frame_idx], axis=0)
444448

445449
video = self._image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda()
446450
if self.torch_dtype == "bfloat16":

lmms_eval/tasks/mvbench/utils.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,22 @@
1616
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
1717

1818
DATA_LIST = {
19-
"object_interaction": "star/Charades_segment",
20-
"action_sequence": "star/Charades_segment",
21-
"action_prediction": "star/Charades_segment",
22-
"action_localization": "sta/sta_video_segment",
19+
"object_interaction": "star/Charades_v1_480",
20+
"action_sequence": "star/Charades_v1_480",
21+
"action_prediction": "star/Charades_v1_480",
22+
"action_localization": "sta_video",
2323
"moving_count": "clevrer/video_validation",
24-
"fine_grained_pose": "nturgbd_convert",
24+
"fine_grained_pose": "nturgbd",
2525
"character_order": "perception/videos",
2626
"object_shuffle": "perception/videos",
2727
"egocentric_navigation": "vlnqa",
2828
"moving_direction": "clevrer/video_validation",
29-
"episodic_reasoning": "tvqa/video_fps3_hq_segment",
29+
"episodic_reasoning": "tvqa/frames_fps3_hq",
3030
"fine_grained_action": "Moments_in_Time_Raw/videos",
3131
"scene_transition": "scene_qa/video",
3232
"state_change": "perception/videos",
3333
"moving_attribute": "clevrer/video_validation",
34-
"action_antonym": "ssv2_video_mp4",
34+
"action_antonym": "ssv2_video",
3535
"unexpected_action": "FunQA_test/test",
3636
"counterfactual_inference": "clevrer/video_validation",
3737
"object_existence": "clevrer/video_validation",

0 commit comments

Comments
 (0)