Skip to content

Commit 532ca07

Browse files
ZhangYuanhan-AI and john.zhang authored
Yhzhang/add charades sta (EvolvingLMMs-Lab#536)
* add charades_sta for temporal grounding * update * update formatting --------- Co-authored-by: john.zhang <[email protected]>
1 parent 25882dc commit 532ca07

File tree

5 files changed

+380
-0
lines changed

5 files changed

+380
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
dataset_path: lmms-lab/charades_sta
dataset_kwargs:
  token: True
  cache_dir: charades_sta
  video: True
task: temporal_grounding_charades
test_split: test

generation_kwargs:
  max_new_tokens: 50
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false

output_type: generate_until
doc_to_visual: !function utils.temporal_grounding_doc_to_visual
doc_to_text: !function utils.temporal_grounding_doc_to_text
doc_to_target: !function utils.temporal_grounding_doc_to_answer
process_results: !function utils.temporal_grounding_process_results_generation

metric_list:
  - metric: submission
    aggregation: !function utils.temporal_grounding_aggregate_charades
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    # FIX: "seonds" -> "seconds" in the in-context example shown to the model.
    pre_prompt: "Please find the visual event described by a sentence in the video, determining its starting and ending times. The format should be: 'The event happens in the start time - end time'. For example, The event 'person turn a light on' happens in the 24.3 - 30.4 seconds. Now I will give you the textual sentence: "
    post_prompt: "Please return its start time and end time."
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import argparse
2+
import json
3+
import os
4+
import pdb
5+
import re
6+
from copy import deepcopy
7+
from pathlib import Path
8+
9+
import numpy as np
10+
11+
12+
# read json files
def read_json(path):
    """Load and return the JSON content stored at ``path``."""
    with open(path, "r") as fin:
        return json.load(fin)
17+
18+
19+
def write_json(path, data):
    """Serialize ``data`` as JSON into ``path`` and report where it was saved."""
    with open(path, "w") as fout:
        json.dump(data, fout)
    print("The format file has been saved at:{}".format(path))
24+
25+
26+
def extract_time(paragraph):
    """Extract a single ``[start, end]`` time span (in seconds) from a model response.

    Tries, in order: (1) an explicit "m - n" range anywhere in the text
    (after replacing "to" with "-"), (2) pairs of bare numbers found in
    sentences that mention start/end keywords, (3) clock-style times such
    as 00:00:18 or 00:18 in those sentences. Only the first detected span
    is returned; an empty list means nothing was found.
    """
    # Strip the in-context example so its numbers are not mistaken for a prediction.
    prompt = "A specific example is : 20.8 - 30.0 seconds".lower()
    paragraph = paragraph.lower().replace(prompt, "").replace("to", "-")
    # Split text into sentences based on common delimiters
    sentences = re.split(r"[!?\n]", paragraph)

    # Keywords that might indicate the presence of time information
    keywords = ["starts", "ends", "happens in", "start time", "end time", "start", "end", "happen"]
    candidates = [sentence for sentence in sentences if any(keyword in sentence for keyword in keywords)]

    timestamps = []
    # Check for "The given query happens in m - n (seconds)"
    patterns = [r"(\d+\.*\d*)\s*-\s*(\d+\.*\d*)"]
    for time_pattern in patterns:
        time_matches = re.findall(time_pattern, paragraph)
        if time_matches:
            timestamps = [[float(start), float(end)] for start, end in time_matches]

    if len(sentences) == 0:
        return []
    # Fallback: bare numbers in keyword sentences, paired up as (start, end), e.g.:
    #   Starting time: 0.8 seconds / Ending time: 1.1 seconds
    if len(timestamps) == 0:
        times = []
        time_regex = re.compile(r"\b(\d+\.\d+\b|\b\d+)\b")  # time formats (e.g., 18, 18.5)
        for sentence in candidates:
            time = re.findall(time_regex, sentence)
            if time:
                times.append(float(time[0]))
        times = times[: len(times) // 2 * 2]  # drop a dangling unpaired value
        timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
    # Fallback: clock-style times, e.g. "... starts at 00:00:18 and ends at 00:00:23."
    if len(timestamps) == 0:
        times = []
        # BUG FIX: the original pattern r"\b((\d{1,2}:\d{2}:\d{2}))\b" had a doubled
        # capture group, so re.findall returned tuples; tuple.count(":") matched
        # neither branch and `time_in_sec` was unbound (NameError). A single group
        # with optional seconds also covers the MM:SS format the comment promised.
        time_regex = re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\b")
        for sentence in candidates:
            time = re.findall(time_regex, sentence)
            if not time:
                continue
            t = time[0]
            # If time is in HH:MM:SS format, convert to seconds
            if t.count(":") == 2:
                h, m, s = map(int, t.split(":"))
                time_in_sec = h * 3600 + m * 60 + s
            else:  # MM:SS
                m, s = map(int, t.split(":"))
                time_in_sec = m * 60 + s
            times.append(time_in_sec)
        times = times[: len(times) // 2 * 2]
        timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
    # Normalize so start <= end, then keep only the first predicted window.
    results = []
    for start, end in timestamps:
        if end > start:
            results.append([start, end])
        else:
            results.append([end, start])
    if len(results) > 1:
        results = results[:1]
    return results
96+
97+
98+
def iou(A, B):
    """Intersection-over-union of two 1-D intervals ``A=[a0, a1]`` and ``B=[b0, b1]``."""
    intersection = min(A[1], B[1]) - max(A[0], B[0])
    union = max(A[1], B[1]) - min(A[0], B[0])
    # Disjoint intervals have a negative "intersection"; clamp it to zero.
    return max(intersection, 0) / union
104+
105+
106+
if __name__ == "__main__":
    import ast

    parser = argparse.ArgumentParser()
    parser.add_argument("-f", default="your_result.json")
    args = parser.parse_args()

    # Submission file: {"vid>>>caption>>>[start, end]": prediction_text, ...}
    datas = read_json(args.f)
    num = len(datas)

    # mean IoU over all (video, caption) pairs
    ious = []
    for k in datas.keys():
        vid, caption, gt = k.split(">>>")
        pred = datas[k]
        # SECURITY FIX: the key text comes from an external file; parse the
        # "[start, end]" literal with ast.literal_eval instead of eval().
        gt = ast.literal_eval(gt)
        timestamps = extract_time(pred)
        if len(timestamps) != 1:
            print(f"pred={pred},timestamps={timestamps}")
            # Penalize unparseable predictions with a window that cannot overlap GT.
            timestamps = [[gt[1] + 10, gt[1] + 20]]

        ious.append(iou(gt, timestamps[0]))

    Result = {0.3: 0, 0.5: 0, 0.7: 0}
    for c_iou in [0.3, 0.5, 0.7]:
        for cur_iou in ious:
            if cur_iou >= c_iou:
                Result[c_iou] = Result[c_iou] + 1

    # BUG FIX: the original format string ended with a bare "mIOU" and passed the
    # mean IoU as a separate print() argument; include it as placeholder {3}.
    print("IOU 0.3: {0}\nIOU 0.5: {1}\nIOU 0.7: {2}\nmIOU: {3}".format(Result[0.3] * 100 / num, Result[0.5] * 100 / num, Result[0.7] * 100 / num, sum(ious) * 100 / num))
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import datetime
2+
import json
3+
import os
4+
import random
5+
import sys
6+
from pathlib import Path
7+
8+
import numpy as np
9+
import yaml
10+
from decord import VideoReader, cpu
11+
from loguru import logger as eval_logger
12+
13+
import lmms_eval.tasks._task_utils.file_utils as file_utils
14+
15+
# with open(Path(__file__).parent / "_default_template.yaml", "r") as f:
16+
# raw_data = f.readlines()
17+
# safe_data = []
18+
# for i, line in enumerate(raw_data):
19+
# # remove function definition since yaml load cannot handle it
20+
# if "!function" not in line:
21+
# safe_data.append(line)
22+
23+
# config = yaml.safe_load("".join(safe_data))
24+
25+
26+
# Resolve the local HF cache root; the task-specific subdirectory name comes
# from charades.yaml itself so config and code stay in sync.
hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
base_cache_dir = os.path.expanduser(hf_home)

# yaml.safe_load cannot parse the "!function" tags, so drop those lines first.
with open(Path(__file__).parent / "charades.yaml", "r") as f:
    safe_data = [line for line in f.readlines() if "!function" not in line]

cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
44+
# Pass in video path here
# Can only work correctly with video llm
def temporal_grounding_doc_to_visual(doc, lmms_eval_specific_kwargs=None):
    """Resolve ``doc["video"]`` to a path inside the cached Charades archive.

    Returns a single-element list, as the lmms-eval visual interface expects.
    Exits the process if the file is missing locally and is not an s3 URI.
    """
    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = os.path.join(cache_dir, "Charades_v1_480", doc["video"])
    # FIX: dropped the no-op `video_path = video_path` branch; a single guard
    # expresses the same logic (exists -> keep; s3 URI -> keep; else abort).
    if not os.path.exists(video_path) and "s3://" not in video_path:
        sys.exit(f"video path:{video_path} does not exist, please check")

    return [video_path]
56+
57+
58+
# This is the place where you format your question
59+
def temporal_grounding_doc_to_text(doc, lmms_eval_specific_kwargs=None):
60+
if lmms_eval_specific_kwargs is None:
61+
lmms_eval_specific_kwargs = {}
62+
63+
if "pre_prompt" in lmms_eval_specific_kwargs:
64+
pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
65+
if "post_prompt" in lmms_eval_specific_kwargs:
66+
post_prompt = lmms_eval_specific_kwargs["post_prompt"]
67+
68+
question = doc["caption"]
69+
70+
return f"{pre_prompt}{question}. {post_prompt}"
71+
72+
73+
def temporal_grounding_doc_to_answer(doc):
    """Return the ground-truth [start, end] timestamp for this sample."""
    answer = doc["timestamp"]
    return answer
75+
76+
77+
# Process result for mcq answer generation
def temporal_grounding_process_results_generation(doc, result):
    """Pack the first generation into a submission entry keyed video>>>caption>>>timestamp."""
    key = ">>>".join([str(doc["video"]), str(doc["caption"]), str(doc["timestamp"])])
    return {"submission": {key: result[0]}}
81+
82+
83+
def temporal_grounding_aggregate_charades(results, args):
    # Task-specific aggregation hook registered in charades.yaml; delegates to
    # the shared submission writer with the task name "charades".
    # NOTE(review): returns None — the "submission" metric only needs the file
    # side effect, so no aggregate value is propagated.
    temporal_grounding_aggregate_submissions(results, args, "charades")
85+
86+
87+
def temporal_grounding_aggregate_submissions(results, args, task):
    """Merge per-sample submission dicts and dump them into one timestamped JSON file."""
    stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    submission_file_name = f"inference_results_temporal_grounding_{task}_{stamp}.json"
    path = file_utils.generate_submission_file(submission_file_name, args)

    # `results` is a list of single-entry dicts (one per sample);
    # flatten them into one mapping keyed by video>>>caption>>>timestamp.
    combined_submission = {key: value for entry in results for key, value in entry.items()}

    with open(path, "w") as f:
        json.dump(combined_submission, f, indent=4)

    eval_logger.info(f"Submission file saved to {path}")

tools/get_split_zip.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import zipfile
3+
4+
5+
def split_zip(input_zip, output_dir, max_size=5 * 1024**3):  # 5GB default per part
    """Split ``input_zip`` into sequentially numbered zip parts under ``output_dir``.

    Each part holds at most ``max_size`` bytes of *uncompressed* content, so the
    part files on disk may be smaller; a single member larger than ``max_size``
    still gets a part of its own.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # FIX: input_zip.split(".")[0] broke for paths with directories or extra
    # dots (e.g. "data/a.b.zip" -> "data/a"); use the basename without extension.
    prefix_name = os.path.splitext(os.path.basename(input_zip))[0]

    part = 1
    current_size = 0
    output_zip = zipfile.ZipFile(os.path.join(output_dir, f"{prefix_name}_part_{part}.zip"), "w", zipfile.ZIP_DEFLATED)
    try:
        with zipfile.ZipFile(input_zip, "r") as zip_ref:
            for file in zip_ref.namelist():
                file_data = zip_ref.read(file)
                file_size = len(file_data)

                # Roll over to a new part before this member would overflow the current one.
                if current_size + file_size > max_size:
                    output_zip.close()
                    part += 1
                    current_size = 0
                    output_zip = zipfile.ZipFile(os.path.join(output_dir, f"{prefix_name}_part_{part}.zip"), "w", zipfile.ZIP_DEFLATED)

                output_zip.writestr(file, file_data)
                current_size += file_size
    finally:
        # FIX: guarantee the last part is flushed/closed even if reading fails mid-way.
        output_zip.close()
29+
30+
31+
# Usage
if __name__ == "__main__":
    # FIX: guard the example invocation so importing this module for its
    # split_zip helper no longer re-splits the archive as a side effect.
    split_zip("Charades_v1_480.zip", "split_zips")
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import json
2+
import os
3+
4+
import pandas as pd
5+
from datasets import Dataset, Features, Image, Sequence, Value
6+
from tqdm import tqdm
7+
8+
# Define the features for the dataset
# Schema of the HF dataset: one row per (video, caption, timestamp) triple.
features = Features(
    {
        "video": Value(dtype="string"),  # presumably a video filename relative to the archive — TODO confirm
        "caption": Value(dtype="string"),  # the query sentence to ground
        "timestamp": Sequence(Value(dtype="float16")),  # Use Sequence for lists; [start, end] in seconds
    }
)

# Column accumulators filled from the annotation JSON before the DataFrame build.
df_items = {
    "video": [],
    "caption": [],
    "timestamp": [],
}
22+
23+
# Load json file
json_path = "/opt/tiger/lmms-eval/lmms_eval/tasks/charades_sta/temporal_grounding_charades.json"
with open(json_path, "r") as f:
    data = json.load(f)


# Iterate over the rows of the data, collecting one column per feature.
for cur_meta in data:
    df_items["video"].append(cur_meta["video"])
    df_items["caption"].append(cur_meta["caption"])
    df_items["timestamp"].append(cur_meta["timestamp"])

# FIX: removed the leftover `import pdb; pdb.set_trace()` debug breakpoint,
# which stopped the script unconditionally before the upload step.
df_items = pd.DataFrame(df_items)

dataset = Dataset.from_pandas(df_items, features=features)

hub_dataset_path = "lmms-lab/charades_sta"
dataset.push_to_hub(repo_id=hub_dataset_path, split="test")
49+
50+
# # upload the *zip to huggingface
51+
# from huggingface_hub import HfApi
52+
53+
# def upload_zip_to_huggingface(repo_id, zip_path, commit_message="Upload ZIP file"):
54+
# """
55+
# Uploads a ZIP file to a Hugging Face dataset repository.
56+
57+
# Args:
58+
# repo_id (str): The dataset repository ID (e.g., "your-username/your-dataset").
59+
# zip_path (str): Path to the ZIP file to upload.
60+
# commit_message (str): Commit message for the upload.
61+
# """
62+
# api = HfApi()
63+
64+
# # Upload file to the dataset repo
65+
# api.upload_file(
66+
# path_or_fileobj=zip_path,
67+
# path_in_repo=zip_path.split("/")[-1], # Store with the same filename
68+
# repo_id=repo_id,
69+
# repo_type="dataset",
70+
# commit_message=commit_message
71+
# )
72+
# print(f"Successfully uploaded {zip_path} to {repo_id}")
73+
74+
# # Example Usage for upload all zip in directory
75+
# import os
76+
# directory_path = "/home/tiger/split_zips"
77+
# # Iterate over all files in the directory
78+
# for filename in os.listdir(directory_path):
79+
# if filename.endswith(".zip"):
80+
# file_path = os.path.join(directory_path, filename)
81+
# upload_zip_to_huggingface("lmms-lab/charades_sta", file_path)

0 commit comments

Comments
 (0)