Skip to content

Commit ca905c5

Browse files
authored
[Benchmark] Support Video-MME-v2 (#1508)
* support videomme-v2 * fix lint
1 parent 9e3cc84 commit ca905c5

4 files changed

Lines changed: 592 additions & 3 deletions

File tree

vlmeval/dataset/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@
139139
from .video_holmes import Video_Holmes
140140
from .video_mmlu import Video_MMLU_CAP, Video_MMLU_QA
141141
from .videomme import VideoMME
142+
from .videommev2 import VideoMMEv2
142143
from .videommmu import VideoMMMU
143144
from .videott import VideoTT
144145
from .viewspatialbench import ViewSpatialBench
@@ -308,7 +309,8 @@ def evaluate(self, eval_file, **judge_kwargs):
308309
QBench_Video, QBench_Video_MCQ, QBench_Video_VQA,
309310
Video_MMLU_CAP, Video_MMLU_QA,
310311
Video_Holmes, VCRBench, CGAVCounting,
311-
EgoExoBench_MCQ, DREAM, VideoTT, VideoMMMU, MVUEval, OMTGBench, V2PBench, AVSpeakerBench
312+
EgoExoBench_MCQ, DREAM, VideoTT, VideoMMMU, MVUEval, OMTGBench, V2PBench, AVSpeakerBench,
313+
VideoMMEv2
312314
]
313315

314316
# added by the EASI team

vlmeval/dataset/utils/videomme.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,30 @@ def extract_characters_regex(s):
150150
if matches is None:
151151
return ''
152152
return matches[0]
153+
154+
155+
def extract_characters_regex_v2(s):
    """Return the first option letter (A-H) found in a model response.

    Video-MME-v2 variant of the answer extractor, covering up to eight
    options. Known answer lead-in phrases are deleted from the stripped
    response, then the first remaining uppercase character in ``A``-``H``
    is taken as the answer.

    Args:
        s: Raw response text.

    Returns:
        A single character in ``'A'``-``'H'``, or ``''`` when no such
        character remains after the lead-in phrases are removed.
    """
    # Removal is sequential and case-sensitive. Order matters: a longer
    # phrase must be removed before any shorter phrase it contains
    # (e.g. 'The answer is' before 'The answer').
    lead_ins = (
        'Final Answer:',
        'The best answer is',
        'The correct answer is',
        'The answer is',
        'The answer',
        'The best option is',
        'The correct option is',
        'Best answer:',
        'Best option:',
        'Answer:',
        'Option:',
    )
    cleaned = s.strip()
    for phrase in lead_ins:
        cleaned = cleaned.replace(phrase, '')

    # One search is enough: a response with no A-H letter yields '' whether
    # it is long or short.
    found = re.search(r'[A-H]', cleaned)
    return '' if found is None else found[0]

vlmeval/dataset/video_dataset_config.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,54 @@
5252
'Video-MME_0.5fps_subs': partial(VideoMME, dataset='Video-MME', fps=0.5, use_subtitle=True),
5353
}
5454

55+
# Video-MME-v2 evaluation presets, keyed by config name. Every preset is a
# partial of VideoMMEv2 bound to dataset='Video-MME-v2'; the table below lists
# only the keyword arguments that vary from one preset to the next.
videommev2_dataset = {
    preset_name: partial(VideoMMEv2, dataset='Video-MME-v2', **preset_kwargs)
    for preset_name, preset_kwargs in (
        # No subtitle
        ('Video-MME-v2_64frame', {'nframe': 64}),
        ('Video-MME-v2_1fps', {'fps': 1.0}),
        # Subtitle (non-interleave, concatenated as text block)
        ('Video-MME-v2_64frame_subs',
         {'nframe': 64, 'with_subtitle': True}),
        ('Video-MME-v2_1fps_subs',
         {'fps': 1.0, 'with_subtitle': True}),
        # Subtitle (interleave, timestamp-aligned between frames)
        ('Video-MME-v2_64frame_subs_interleave',
         {'nframe': 64, 'with_subtitle': True, 'subtitle_interleave': True}),
        ('Video-MME-v2_1fps_subs_interleave',
         {'fps': 1.0, 'with_subtitle': True, 'subtitle_interleave': True}),
        # Reasoning (no subtitle)
        ('Video-MME-v2_64frame_reasoning',
         {'nframe': 64, 'reasoning': True}),
        # Reasoning + subtitle (non-interleave)
        ('Video-MME-v2_64frame_reasoning_subs',
         {'nframe': 64, 'reasoning': True, 'with_subtitle': True}),
        # Reasoning + subtitle (interleave)
        ('Video-MME-v2_64frame_reasoning_subs_interleave',
         {'nframe': 64, 'reasoning': True, 'with_subtitle': True,
          'subtitle_interleave': True}),
        # Resize (no subtitle)
        ('Video-MME-v2_64frame_resize',
         {'nframe': 64, 'resize_target_area': 448 * 448}),
        ('Video-MME-v2_1fps_resize',
         {'fps': 1.0, 'resize_target_area': 448 * 448}),
        # Resize + subtitle
        ('Video-MME-v2_64frame_resize_subs',
         {'nframe': 64, 'resize_target_area': 448 * 448,
          'with_subtitle': True}),
        # Resize + subtitle interleave
        ('Video-MME-v2_64frame_resize_subs_interleave',
         {'nframe': 64, 'resize_target_area': 448 * 448,
          'with_subtitle': True, 'subtitle_interleave': True}),
        # Resize + reasoning
        ('Video-MME-v2_64frame_resize_reasoning',
         {'nframe': 64, 'resize_target_area': 448 * 448, 'reasoning': True}),
    )
}
102+
55103
videommmu_dataset = {
56104
'VideoMMMU_8frame': partial(VideoMMMU, dataset='VideoMMMU', nframe=8),
57105
'VideoMMMU_64frame': partial(VideoMMMU, dataset='VideoMMMU', nframe=64),
@@ -329,8 +377,8 @@ def _build_video_variants(subsets, cls, variants=VSI_FRAME_VARIANTS):
329377
supported_video_datasets = {}
330378

331379
dataset_groups = [
332-
mmbench_video_dataset, mvbench_dataset, videomme_dataset, videommmu_dataset, longvideobench_dataset,
333-
mlvu_dataset, tempcompass_dataset, cgbench_dataset, worldsense_dataset, tamperbench_dataset,
380+
mmbench_video_dataset, mvbench_dataset, videomme_dataset, videommev2_dataset, videommmu_dataset,
381+
longvideobench_dataset, mlvu_dataset, tempcompass_dataset, cgbench_dataset, worldsense_dataset, tamperbench_dataset,
334382
megabench_dataset, qbench_video_dataset, moviechat1k_dataset, vdc_dataset, video_holmes_dataset, vcrbench_dataset,
335383
cg_av_counting_dataset, video_mmlu_dataset, egoexobench_dataset, dream_1k_dataset, video_tt_dataset,
336384
video_vsi_dataset, mvu_eval_dataset, omtg_dataset, v2pbench_dataset, av_speakerbench_dataset

0 commit comments

Comments
 (0)