|
52 | 52 | 'Video-MME_0.5fps_subs': partial(VideoMME, dataset='Video-MME', fps=0.5, use_subtitle=True), |
53 | 53 | } |
54 | 54 |
|
# Video-MME-v2 benchmark configurations.
#
# Each registry key is a config name and each value is a ``partial`` of
# ``VideoMMEv2`` bound to dataset='Video-MME-v2' plus the options paired with
# that name below. Names containing "64frame" pass nframe=64; names containing
# "1fps" pass fps=1.0. Entry order is kept stable so the registry iterates in
# the same sequence as it is declared.
videommev2_dataset = {
    config_name: partial(VideoMMEv2, dataset='Video-MME-v2', **options)
    for config_name, options in (
        # ── No subtitle ──
        ('Video-MME-v2_64frame', {'nframe': 64}),
        ('Video-MME-v2_1fps', {'fps': 1.0}),
        # ── Subtitle (non-interleave, concatenated as text block) ──
        ('Video-MME-v2_64frame_subs', {'nframe': 64, 'with_subtitle': True}),
        ('Video-MME-v2_1fps_subs', {'fps': 1.0, 'with_subtitle': True}),
        # ── Subtitle (interleave, timestamp-aligned between frames) ──
        (
            'Video-MME-v2_64frame_subs_interleave',
            {'nframe': 64, 'with_subtitle': True, 'subtitle_interleave': True},
        ),
        (
            'Video-MME-v2_1fps_subs_interleave',
            {'fps': 1.0, 'with_subtitle': True, 'subtitle_interleave': True},
        ),
        # ── Reasoning (no subtitle) ──
        ('Video-MME-v2_64frame_reasoning', {'nframe': 64, 'reasoning': True}),
        # ── Reasoning + subtitle (non-interleave) ──
        (
            'Video-MME-v2_64frame_reasoning_subs',
            {'nframe': 64, 'reasoning': True, 'with_subtitle': True},
        ),
        # ── Reasoning + subtitle (interleave) ──
        (
            'Video-MME-v2_64frame_reasoning_subs_interleave',
            {'nframe': 64, 'reasoning': True, 'with_subtitle': True, 'subtitle_interleave': True},
        ),
        # ── Resize (no subtitle) ──
        (
            'Video-MME-v2_64frame_resize',
            {'nframe': 64, 'resize_target_area': 448 * 448},
        ),
        (
            'Video-MME-v2_1fps_resize',
            {'fps': 1.0, 'resize_target_area': 448 * 448},
        ),
        # ── Resize + subtitle ──
        (
            'Video-MME-v2_64frame_resize_subs',
            {'nframe': 64, 'resize_target_area': 448 * 448, 'with_subtitle': True},
        ),
        # ── Resize + subtitle interleave ──
        (
            'Video-MME-v2_64frame_resize_subs_interleave',
            {
                'nframe': 64,
                'resize_target_area': 448 * 448,
                'with_subtitle': True,
                'subtitle_interleave': True,
            },
        ),
        # ── Resize + reasoning ──
        (
            'Video-MME-v2_64frame_resize_reasoning',
            {'nframe': 64, 'resize_target_area': 448 * 448, 'reasoning': True},
        ),
    )
}
| 102 | + |
55 | 103 | videommmu_dataset = { |
56 | 104 | 'VideoMMMU_8frame': partial(VideoMMMU, dataset='VideoMMMU', nframe=8), |
57 | 105 | 'VideoMMMU_64frame': partial(VideoMMMU, dataset='VideoMMMU', nframe=64), |
@@ -329,8 +377,8 @@ def _build_video_variants(subsets, cls, variants=VSI_FRAME_VARIANTS): |
329 | 377 | supported_video_datasets = {} |
330 | 378 |
|
331 | 379 | dataset_groups = [ |
332 | | - mmbench_video_dataset, mvbench_dataset, videomme_dataset, videommmu_dataset, longvideobench_dataset, |
333 | | - mlvu_dataset, tempcompass_dataset, cgbench_dataset, worldsense_dataset, tamperbench_dataset, |
| 380 | + mmbench_video_dataset, mvbench_dataset, videomme_dataset, videommev2_dataset, videommmu_dataset, |
| 381 | + longvideobench_dataset, mlvu_dataset, tempcompass_dataset, cgbench_dataset, worldsense_dataset, tamperbench_dataset, |
334 | 382 | megabench_dataset, qbench_video_dataset, moviechat1k_dataset, vdc_dataset, video_holmes_dataset, vcrbench_dataset, |
335 | 383 | cg_av_counting_dataset, video_mmlu_dataset, egoexobench_dataset, dream_1k_dataset, video_tt_dataset, |
336 | 384 | video_vsi_dataset, mvu_eval_dataset, omtg_dataset, v2pbench_dataset, av_speakerbench_dataset |
|
0 commit comments