Skip to content

Commit 532ca07

Browse files
ZhangYuanhan-AI and john.zhang authored
Yhzhang/add charades sta (EvolvingLMMs-Lab#536)
* add charades_sta for temporal grounding * update * update formatting --------- Co-authored-by: john.zhang <[email protected]>
1 parent 25882dc commit 532ca07

File tree

5 files changed

+380
-0
lines changed

5 files changed

+380
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
dataset_path: lmms-lab/charades_sta
dataset_kwargs:
  token: True
  cache_dir: charades_sta
  video: True
task: temporal_grounding_charades
test_split: test

generation_kwargs:
  max_new_tokens: 50
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false

output_type: generate_until
doc_to_visual: !function utils.temporal_grounding_doc_to_visual
doc_to_text: !function utils.temporal_grounding_doc_to_text
doc_to_target: !function utils.temporal_grounding_doc_to_answer
process_results: !function utils.temporal_grounding_process_results_generation

metric_list:
  - metric: submission
    aggregation: !function utils.temporal_grounding_aggregate_charades
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    # FIX: "seonds" -> "seconds" in the in-context example shown to the model.
    pre_prompt: "Please find the visual event described by a sentence in the video, determining its starting and ending times. The format should be: 'The event happens in the start time - end time'. For example, The event 'person turn a light on' happens in the 24.3 - 30.4 seconds. Now I will give you the textual sentence: "
    post_prompt: "Please return its start time and end time."
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import argparse
2+
import json
3+
import os
4+
import pdb
5+
import re
6+
from copy import deepcopy
7+
from pathlib import Path
8+
9+
import numpy as np
10+
11+
12+
# read json files
def read_json(path):
    """Load and return the JSON content stored at ``path``."""
    with open(path, "r") as fin:
        return json.load(fin)
17+
18+
19+
def write_json(path, data):
    """Serialize ``data`` as JSON into ``path`` and report where it was saved."""
    with open(path, "w") as fout:
        json.dump(data, fout)
    print("The format file has been saved at:{}".format(path))
24+
25+
26+
def extract_time(paragraph):
    """Extract a single ``[start, end]`` time span (in seconds) from a model response.

    Tries, in order: (1) an explicit "m - n" range anywhere in the text
    (after replacing "to" with "-"), (2) pairs of bare numbers found in
    sentences that mention start/end keywords, (3) clock-style times such
    as 00:00:18 or 00:18 in those sentences. Only the first detected span
    is returned; an empty list means nothing was found.
    """
    # Strip the in-context example so its numbers are not mistaken for a prediction.
    prompt = "A specific example is : 20.8 - 30.0 seconds".lower()
    paragraph = paragraph.lower().replace(prompt, "").replace("to", "-")
    # Split text into sentences based on common delimiters
    sentences = re.split(r"[!?\n]", paragraph)

    # Keywords that might indicate the presence of time information
    keywords = ["starts", "ends", "happens in", "start time", "end time", "start", "end", "happen"]
    candidates = [sentence for sentence in sentences if any(keyword in sentence for keyword in keywords)]

    timestamps = []
    # Check for "The given query happens in m - n (seconds)"
    patterns = [r"(\d+\.*\d*)\s*-\s*(\d+\.*\d*)"]
    for time_pattern in patterns:
        time_matches = re.findall(time_pattern, paragraph)
        if time_matches:
            timestamps = [[float(start), float(end)] for start, end in time_matches]

    if len(sentences) == 0:
        return []
    # Fallback: bare numbers in keyword sentences, paired up as (start, end), e.g.:
    #   Starting time: 0.8 seconds / Ending time: 1.1 seconds
    if len(timestamps) == 0:
        times = []
        time_regex = re.compile(r"\b(\d+\.\d+\b|\b\d+)\b")  # time formats (e.g., 18, 18.5)
        for sentence in candidates:
            time = re.findall(time_regex, sentence)
            if time:
                times.append(float(time[0]))
        times = times[: len(times) // 2 * 2]  # drop a dangling unpaired value
        timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
    # Fallback: clock-style times, e.g. "... starts at 00:00:18 and ends at 00:00:23."
    if len(timestamps) == 0:
        times = []
        # BUG FIX: the original pattern r"\b((\d{1,2}:\d{2}:\d{2}))\b" had a doubled
        # capture group, so re.findall returned tuples; tuple.count(":") matched
        # neither branch and `time_in_sec` was unbound (NameError). A single group
        # with optional seconds also covers the MM:SS format the comment promised.
        time_regex = re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\b")
        for sentence in candidates:
            time = re.findall(time_regex, sentence)
            if not time:
                continue
            t = time[0]
            # If time is in HH:MM:SS format, convert to seconds
            if t.count(":") == 2:
                h, m, s = map(int, t.split(":"))
                time_in_sec = h * 3600 + m * 60 + s
            else:  # MM:SS
                m, s = map(int, t.split(":"))
                time_in_sec = m * 60 + s
            times.append(time_in_sec)
        times = times[: len(times) // 2 * 2]
        timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
    # Normalize so start <= end, then keep only the first predicted window.
    results = []
    for start, end in timestamps:
        if end > start:
            results.append([start, end])
        else:
            results.append([end, start])
    if len(results) > 1:
        results = results[:1]
    return results
96+
97+
98+
def iou(A, B):
    """Intersection-over-union of two 1-D intervals ``A=[a0, a1]`` and ``B=[b0, b1]``."""
    intersection = min(A[1], B[1]) - max(A[0], B[0])
    union = max(A[1], B[1]) - min(A[0], B[0])
    # Disjoint intervals have a negative "intersection"; clamp it to zero.
    return max(intersection, 0) / union
104+
105+
106+
if __name__ == "__main__":
    import ast

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", default="your_result.json")
    args = parser.parse_args()

    # Submission file: {"vid>>>caption>>>[start, end]": prediction_text, ...}
    datas = read_json(args.f)
    num = len(datas)

    # mean IoU over all (video, caption) pairs
    ious = []
    for k in datas.keys():
        vid, caption, gt = k.split(">>>")
        pred = datas[k]
        # SECURITY FIX: the key text comes from an external file; parse the
        # "[start, end]" literal with ast.literal_eval instead of eval().
        gt = ast.literal_eval(gt)
        timestamps = extract_time(pred)
        if len(timestamps) != 1:
            print(f"pred={pred},timestamps={timestamps}")
            # Penalize unparseable predictions with a window that cannot overlap GT.
            timestamps = [[gt[1] + 10, gt[1] + 20]]

        ious.append(iou(gt, timestamps[0]))

    Result = {0.3: 0, 0.5: 0, 0.7: 0}
    for c_iou in [0.3, 0.5, 0.7]:
        for cur_iou in ious:
            if cur_iou >= c_iou:
                Result[c_iou] = Result[c_iou] + 1

    # BUG FIX: the original format string ended with a bare "mIOU" and passed the
    # mean IoU as a separate print() argument; include it as placeholder {3}.
    print("IOU 0.3: {0}\nIOU 0.5: {1}\nIOU 0.7: {2}\nmIOU: {3}".format(Result[0.3] * 100 / num, Result[0.5] * 100 / num, Result[0.7] * 100 / num, sum(ious) * 100 / num))
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import datetime
2+
import json
3+
import os
4+
import random
5+
import sys
6+
from pathlib import Path
7+
8+
import numpy as np
9+
import yaml
10+
from decord import VideoReader, cpu
11+
from loguru import logger as eval_logger
12+
13+
import lmms_eval.tasks._task_utils.file_utils as file_utils
14+
15+
# with open(Path(__file__).parent / "_default_template.yaml", "r") as f:
16+
# raw_data = f.readlines()
17+
# safe_data = []
18+
# for i, line in enumerate(raw_data):
19+
# # remove function definition since yaml load cannot handle it
20+
# if "!function" not in line:
21+
# safe_data.append(line)
22+
23+
# config = yaml.safe_load("".join(safe_data))
24+
25+
26+
# Resolve the local HF cache root; the task-specific subdirectory name comes
# from charades.yaml itself so config and code stay in sync.
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)

# yaml.safe_load cannot parse the "!function" tags, so drop those lines first.
with open(Path(__file__).parent / "charades.yaml", "r") as f:
    safe_data = [line for line in f.readlines() if "!function" not in line]

cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
44+
# Pass in video path here
# Can only work correctly with video llm
def temporal_grounding_doc_to_visual(doc, lmms_eval_specific_kwargs=None):
    """Resolve ``doc["video"]`` to a path inside the cached Charades archive.

    Returns a single-element list, as the lmms-eval visual interface expects.
    Exits the process if the file is missing locally and is not an s3 URI.
    """
    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = os.path.join(cache_dir, "Charades_v1_480", doc["video"])
    # FIX: dropped the no-op `video_path = video_path` branch; a single guard
    # expresses the same logic (exists -> keep; s3 URI -> keep; else abort).
    if not os.path.exists(video_path) and "s3://" not in video_path:
        sys.exit(f"video path:{video_path} does not exist, please check")

    return [video_path]
56+
57+
58+
# This is the place where you format your question
59+
def temporal_grounding_doc_to_text(doc, lmms_eval_specific_kwargs=None):
60+
if lmms_eval_specific_kwargs is None:
61+
lmms_eval_specific_kwargs = {}
62+
63+
if "pre_prompt" in lmms_eval_specific_kwargs:
64+
pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
65+
if "post_prompt" in lmms_eval_specific_kwargs:
66+
post_prompt = lmms_eval_specific_kwargs["post_prompt"]
67+
68+
question = doc["caption"]
69+
70+
return f"{pre_prompt}{question}. {post_prompt}"
71+
72+
73+
def temporal_grounding_doc_to_answer(doc):
    """Return the ground-truth [start, end] timestamp for this sample."""
    answer = doc["timestamp"]
    return answer
75+
76+
77+
# Process result for mcq answer generation
def temporal_grounding_process_results_generation(doc, result):
    """Pack the first generation into a submission entry keyed video>>>caption>>>timestamp."""
    key = ">>>".join([str(doc["video"]), str(doc["caption"]), str(doc["timestamp"])])
    return {"submission": {key: result[0]}}
81+
82+
83+
def temporal_grounding_aggregate_charades(results, args):
    # Task-specific aggregation hook registered in charades.yaml; delegates to
    # the shared submission writer with the task name "charades".
    # NOTE(review): returns None — the "submission" metric only needs the file
    # side effect, so no aggregate value is propagated.
    temporal_grounding_aggregate_submissions(results, args, "charades")
85+
86+
87+
def temporal_grounding_aggregate_submissions(results, args, task):
    """Merge per-sample submission dicts and dump them into one timestamped JSON file."""
    stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    submission_file_name = f"inference_results_temporal_grounding_{task}_{stamp}.json"
    path = file_utils.generate_submission_file(submission_file_name, args)

    # `results` is a list of single-entry dicts (one per sample);
    # flatten them into one mapping keyed by video>>>caption>>>timestamp.
    combined_submission = {key: value for entry in results for key, value in entry.items()}

    with open(path, "w") as f:
        json.dump(combined_submission, f, indent=4)

    eval_logger.info(f"Submission file saved to {path}")

tools/get_split_zip.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import zipfile
3+
4+
5+
def split_zip(input_zip, output_dir, max_size=5 * 1024**3):  # 5GB default per part
    """Split ``input_zip`` into sequentially numbered zip parts under ``output_dir``.

    Each part holds at most ``max_size`` bytes of *uncompressed* content, so the
    part files on disk may be smaller; a single member larger than ``max_size``
    still gets a part of its own.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # FIX: input_zip.split(".")[0] broke for paths with directories or extra
    # dots (e.g. "data/a.b.zip" -> "data/a"); use the basename without extension.
    prefix_name = os.path.splitext(os.path.basename(input_zip))[0]

    part = 1
    current_size = 0
    output_zip = zipfile.ZipFile(os.path.join(output_dir, f"{prefix_name}_part_{part}.zip"), "w", zipfile.ZIP_DEFLATED)
    try:
        with zipfile.ZipFile(input_zip, "r") as zip_ref:
            for file in zip_ref.namelist():
                file_data = zip_ref.read(file)
                file_size = len(file_data)

                # Roll over to a new part before this member would overflow the current one.
                if current_size + file_size > max_size:
                    output_zip.close()
                    part += 1
                    current_size = 0
                    output_zip = zipfile.ZipFile(os.path.join(output_dir, f"{prefix_name}_part_{part}.zip"), "w", zipfile.ZIP_DEFLATED)

                output_zip.writestr(file, file_data)
                current_size += file_size
    finally:
        # FIX: guarantee the last part is flushed/closed even if reading fails mid-way.
        output_zip.close()
29+
30+
31+
# Usage
if __name__ == "__main__":
    # FIX: guard the example invocation so importing this module for its
    # split_zip helper no longer re-splits the archive as a side effect.
    split_zip("Charades_v1_480.zip", "split_zips")
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import json
2+
import os
3+
4+
import pandas as pd
5+
from datasets import Dataset, Features, Image, Sequence, Value
6+
from tqdm import tqdm
7+
8+
# Define the features for the dataset
# Schema of the HF dataset: one row per (video, caption, timestamp) triple.
features = Features(
    {
        "video": Value(dtype="string"),  # presumably a video filename relative to the archive — TODO confirm
        "caption": Value(dtype="string"),  # the query sentence to ground
        "timestamp": Sequence(Value(dtype="float16")),  # Use Sequence for lists; [start, end] in seconds
    }
)

# Column accumulators filled from the annotation JSON before the DataFrame build.
df_items = {
    "video": [],
    "caption": [],
    "timestamp": [],
}
22+
23+
# Load json file
json_path = "/opt/tiger/lmms-eval/lmms_eval/tasks/charades_sta/temporal_grounding_charades.json"
with open(json_path, "r") as f:
    data = json.load(f)


# Iterate over the rows of the data, collecting one column per feature.
for cur_meta in data:
    df_items["video"].append(cur_meta["video"])
    df_items["caption"].append(cur_meta["caption"])
    df_items["timestamp"].append(cur_meta["timestamp"])

# FIX: removed the leftover `import pdb; pdb.set_trace()` debug breakpoint,
# which stopped the script unconditionally before the upload step.
df_items = pd.DataFrame(df_items)

dataset = Dataset.from_pandas(df_items, features=features)

hub_dataset_path = "lmms-lab/charades_sta"
dataset.push_to_hub(repo_id=hub_dataset_path, split="test")
49+
50+
# # upload the *zip to huggingface
51+
# from huggingface_hub import HfApi
52+
53+
# def upload_zip_to_huggingface(repo_id, zip_path, commit_message="Upload ZIP file"):
54+
# """
55+
# Uploads a ZIP file to a Hugging Face dataset repository.
56+
57+
# Args:
58+
# repo_id (str): The dataset repository ID (e.g., "your-username/your-dataset").
59+
# zip_path (str): Path to the ZIP file to upload.
60+
# commit_message (str): Commit message for the upload.
61+
# """
62+
# api = HfApi()
63+
64+
# # Upload file to the dataset repo
65+
# api.upload_file(
66+
# path_or_fileobj=zip_path,
67+
# path_in_repo=zip_path.split("/")[-1], # Store with the same filename
68+
# repo_id=repo_id,
69+
# repo_type="dataset",
70+
# commit_message=commit_message
71+
# )
72+
# print(f"Successfully uploaded {zip_path} to {repo_id}")
73+
74+
# # Example Usage for upload all zip in directory
75+
# import os
76+
# directory_path = "/home/tiger/split_zips"
77+
# # Iterate over all files in the directory
78+
# for filename in os.listdir(directory_path):
79+
# if filename.endswith(".zip"):
80+
# file_path = os.path.join(directory_path, filename)
81+
# upload_zip_to_huggingface("lmms-lab/charades_sta", file_path)

0 commit comments

Comments
 (0)