{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:10:17Z","timestamp":1765343417038,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"name":"National Research Foundation, Singapore","award":["AISG3-RP-2024-033"],"award-info":[{"award-number":["AISG3-RP-2024-033"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758295","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"13346-13353","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AEGIS: Authenticity Evaluation Benchmark for AI-Generated Video Sequences"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-5931-6443","authenticated-orcid":false,"given":"Jieyu","family":"Li","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6455-047X","authenticated-orcid":false,"given":"Xin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Centre for Frontier AI Research, Institute of High Performance Computing Agency for Science, Technology and Research, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4675-7055","authenticated-orcid":false,"given":"Joey Tianyi","family":"Zhou","sequence":"additional","affiliation":[{"name":"Centre for Frontier AI Research, Institute of High Performance Computing Agency for Science, Technology and Research, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2. 5-vl technical report. arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_2_1","volume-title":"Videophy: Evaluating physical commonsense for video generation. arXiv:2406.03520","author":"Bansal Hritik","year":"2024","unstructured":"Hritik Bansal, Zongyu Lin, Tianyi Xie, Zeshun Zong, Michal Yarom, Yonatan Bitton, Chenfanfu Jiang, Yizhou Sun, Kai-Wei Chang, and Aditya Grover. 2024. Videophy: Evaluating physical commonsense for video generation. arXiv:2406.03520 (2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et al. 2023. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv:2311.15127 (2023)."},{"key":"e_1_3_2_1_4_1","unstructured":"Nuria Alina Chandra Ryan Murtfeldt Lin Qiu Arnab Karmakar Hannah Lee Emmanuel Tanumihardja Kevin Farhat Ben Caffee Sejin Paik Changyeon Lee et al. 2025. Deepfake-Eval-2024: A Multi-Modal In-the-Wild Benchmark of Deepfakes Circulated in 2024. arXiv:2503.02857 (2025)."},{"key":"e_1_3_2_1_5_1","volume-title":"What Matters in Detecting AI-Generated Videos like Sora? arXiv:2406.19568","author":"Chang Chirui","year":"2024","unstructured":"Chirui Chang, Zhengzhe Liu, Xiaoyang Lyu, and Xiaojuan Qi. 2024. What Matters in Detecting AI-Generated Videos like Sora? arXiv:2406.19568 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Demamba: Ai-generated video detection on million-scale genvideo benchmark. 
arXiv:2405.19707","author":"Chen Haoxing","year":"2024","unstructured":"Haoxing Chen, Yan Hong, Zizheng Huang, Zhuoer Xu, Zhangxuan Gu, Yaohui Li, Jun Lan, Huijia Zhu, Jianfu Zhang, Weiqiang Wang, et al., 2024. Demamba: Ai-generated video detection on million-scale genvideo benchmark. arXiv:2405.19707 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Radu Tudor Ionescu, and Mubarak Shah","author":"Croitoru Florinel-Alin","year":"2023","unstructured":"Florinel-Alin Croitoru, Vlad Hondru, Radu Tudor Ionescu, and Mubarak Shah. 2023. Diffusion models in vision: A survey. IEEE Trans. Pattern Anal. Mach. Intell. (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Aigcbench: Comprehensive evaluation of image-to-video content generated by ai","author":"Fan Fanda","year":"2023","unstructured":"Fanda Fan, Chunjie Luo, Wanling Gao, and Jianfeng Zhan. 2023. Aigcbench: Comprehensive evaluation of image-to-video content generated by ai. BenchCouncil Trans. Benchmarks Stand. Eval. (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proc. NeurIPS.","author":"Gat Itai","year":"2024","unstructured":"Itai Gat, Tal Remez, Neta Shaul, Felix Kreuk, Ricky TQ Chen, Gabriel Synnaeve, Yossi Adi, and Yaron Lipman. 2024. Discrete flow matching. In Proc. NeurIPS."},{"key":"e_1_3_2_1_10_1","volume-title":"Proc. NeurIPS.","author":"Goodfellow Ian J","year":"2014","unstructured":"Ian J Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. In Proc. NeurIPS."},{"key":"e_1_3_2_1_11_1","volume-title":"VLForgery Face Triad: Detection, Localization and Attribution via Multimodal Large Language Models. arXiv:2503.06142","author":"He Xinan","year":"2025","unstructured":"Xinan He, Yue Zhou, Bing Fan, Bin Li, Guopu Zhu, and Feng Ding. 2025. VLForgery Face Triad: Detection, Localization and Attribution via Multimodal Large Language Models. arXiv:2503.06142 (2025)."},{"key":"e_1_3_2_1_12_1","volume-title":"ExDDV: A New Dataset for Explainable Deepfake Detection in Video. arXiv:2503.14421","author":"Hondru Vlad","year":"2025","unstructured":"Vlad Hondru, Eduard Hogea, Darian Onchis, and Radu Tudor Ionescu. 2025. ExDDV: A New Dataset for Explainable Deepfake Detection in Video. arXiv:2503.14421 (2025)."},{"key":"e_1_3_2_1_13_1","volume-title":"Proc. ICLR.","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al., 2022. Lora: Low-rank adaptation of large language models.. In Proc. ICLR."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_2_1_15_1","volume-title":"Ji Youn Ryu, and Se-Hoon Jeong","author":"Hwang Yoori","year":"2021","unstructured":"Yoori Hwang, Ji Youn Ryu, and Se-Hoon Jeong. 2021. Effects of disinformation using deepfake: The protective effect of media literacy education. Cyberpsychology, Behavior, and Social Networking (2021)."},{"key":"e_1_3_2_1_16_1","volume-title":"Assessing the perceived credibility of deepfakes: The impact of system-generated cues and video characteristics","author":"Jin Xinyi","year":"2025","unstructured":"Xinyi Jin, Zhuoyue Zhang, Bowen Gao, Shuqing Gao, Wenbo Zhou, Nenghai Yu, and Guoyan Wang. 2025. Assessing the perceived credibility of deepfakes: The impact of system-generated cues and video characteristics. 
New Media & Society (2025)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_2_1_18_1","volume-title":"Proc. NeurIPS.","author":"Klein Leon","year":"2023","unstructured":"Leon Klein, Andreas Kr\u00e4mer, and Frank No\u00e9. 2023. Equivariant flow matching. In Proc. NeurIPS."},{"volume-title":"Kling: AI Video Generation Model. https:\/\/https:\/\/klingai.kuaishou.com\/\/.","year":"2024","key":"e_1_3_2_1_19_1","unstructured":"Kuaishou. 2024. Kling: AI Video Generation Model. https:\/\/https:\/\/klingai.kuaishou.com\/\/."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00636"},{"key":"e_1_3_2_1_21_1","volume-title":"Agiqa-3k: An open database for ai-generated image quality assessment","author":"Li Chunyi","year":"2023","unstructured":"Chunyi Li, Zicheng Zhang, Haoning Wu, Wei Sun, Xiongkuo Min, Xiaohong Liu, Guangtao Zhai, and Weisi Lin. 2023. Agiqa-3k: An open database for ai-generated image quality assessment. IEEE Trans. Circuits Syst. Video Technol. (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00213"},{"key":"e_1_3_2_1_23_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Detecting multimedia generated by large ai models: A survey. arXiv:2402.00045","author":"Lin Li","year":"2024","unstructured":"Li Lin, Neeraj Gupta, Yue Zhang, Hainan Ren, Chun-Hao Liu, Feng Ding, Xin Wang, Xin Li, Luisa Verdoliva, and Shu Hu. 2024. Detecting multimedia generated by large ai models: A survey. arXiv:2402.00045 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"A survey of ai-generated video evaluation. arXiv:2410.19884","author":"Liu Xiao","year":"2024","unstructured":"Xiao Liu, Xinhao Xiang, Zizhong Li, Yongheng Wang, Zhuoheng Li, Zhuosheng Liu, Weidi Zhang, Weiqi Ye, and Jiawei Zhang. 2024b. A survey of ai-generated video evaluation. arXiv:2410.19884 (2024)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02090"},{"key":"e_1_3_2_1_27_1","volume-title":"Sora: A review on background, technology, limitations, and opportunities of large vision models. arXiv:2402.17177","author":"Liu Yixin","year":"2024","unstructured":"Yixin Liu, Kai Zhang, Yuan Li, Zhiling Yan, Chujie Gao, Ruoxi Chen, Zhengqing Yuan, Yue Huang, Hanchi Sun, Jianfeng Gao, et al., 2024c. Sora: A review on background, technology, limitations, and opportunities of large vision models. arXiv:2402.17177 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proc. NeurIPS.","author":"Miao Yibo","year":"2024","unstructured":"Yibo Miao, Yifan Zhu, Lijia Yu, Jun Zhu, Xiao-Shan Gao, and Yinpeng Dong. 2024. T2vsafetybench: Evaluating the safety of text-to-video generative models. In Proc. NeurIPS."},{"key":"e_1_3_2_1_29_1","volume-title":"GenVidBench: A Challenging Benchmark for Detecting AI-Generated Video. arXiv:2501.11340","author":"Ni Zhenliang","year":"2025","unstructured":"Zhenliang Ni, Qiangyu Yan, Mouxiao Huang, Tianning Yuan, Yehui Tang, Hailin Hu, Xinghao Chen, and Yunhe Wang. 2025. GenVidBench: A Challenging Benchmark for Detecting AI-Generated Video. 
arXiv:2501.11340 (2025)."},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2023. GPT-4V. https:\/\/openai.com\/index\/gpt-4v-system-card\/."},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2024. GPT-4o. https:\/\/platform.openai.com\/docs\/models\/gpt-4o."},{"key":"e_1_3_2_1_32_1","volume-title":"Sora: AI Video Generation Model. https:\/\/openai.com\/sora.","author":"AI.","year":"2024","unstructured":"OpenAI. 2024. Sora: AI Video Generation Model. https:\/\/openai.com\/sora."},{"key":"e_1_3_2_1_33_1","volume-title":"Pika: AI Video Generation Platform. https:\/\/www.pika.art\/.","author":"Labs Pika","year":"2024","unstructured":"Pika Labs. 2024. Pika: AI Video Generation Platform. https:\/\/www.pika.art\/."},{"key":"e_1_3_2_1_34_1","volume-title":"Proc. ICML.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In Proc. ICML."},{"key":"e_1_3_2_1_35_1","volume-title":"On learning multi-modal forgery representation for diffusion generated video detection. arXiv:2410.23623","author":"Song Xiufeng","year":"2024","unstructured":"Xiufeng Song, Xiao Guo, Jiache Zhang, Qirui Li, Lei Bai, Xiaoming Liu, Guangtao Zhai, and Xiaohong Liu. 2024. On learning multi-modal forgery representation for diffusion generated video detection. arXiv:2410.23623 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"FragFake: A Dataset for Fine-Grained Detection of Edited Images with Vision Language Models. arXiv:2505.15644","author":"Sun Zhen","year":"2025","unstructured":"Zhen Sun, Ziyi Zhang, Zeren Luo, Zeyang Sha, Tianshuo Cong, Zheng Li, Shiwen Cui, Weiqiang Wang, Jiaheng Wei, Xinlei He, Qi Li, and Qian Wang. 2025. FragFake: A Dataset for Fine-Grained Detection of Edited Images with Vision Language Models. arXiv:2505.15644 (2025)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"e_1_3_2_1_38_1","volume-title":"Deepfakes and disinformation: Exploring the impact of synthetic political video on deception, uncertainty, and trust in news","author":"Vaccari Cristian","year":"2020","unstructured":"Cristian Vaccari and Andrew Chadwick. 2020. Deepfakes and disinformation: Exploring the impact of synthetic political video on deception, uncertainty, and trust in news. Social Media Society (2020)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Jiarui Wang Huiyu Duan Jing Liu Shi Chen Xiongkuo Min and Guangtao Zhai. 2023a. Aigciqa2023: A large-scale image quality assessment database for ai generated images: from the perspectives of quality authenticity and correspondence. In CAAI ICAI.","DOI":"10.1007\/978-981-99-9119-8_5"},{"key":"e_1_3_2_1_40_1","volume-title":"VideoFactory: Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation. arXiv:2305.10874","author":"Wang Wenjing","year":"2023","unstructured":"Wenjing Wang, Huan Yang, Zixi Tuo, Huiguo He, Junchen Zhu, Jianlong Fu, and Jiaying Liu. 2023b. VideoFactory: Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation. arXiv:2305.10874 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"TIP-I2V: A Million-Scale Real Text and Image Prompt Dataset for Image-to-Video Generation. arXiv:2411.04709","author":"Wang Wenhao","year":"2024","unstructured":"Wenhao Wang and Yi Yang. 2024. 
TIP-I2V: A Million-Scale Real Text and Image Prompt Dataset for Image-to-Video Generation. arXiv:2411.04709 (2024)."},{"key":"e_1_3_2_1_42_1","unstructured":"Zhiyuan Yan Taiping Yao Shen Chen Yandan Zhao Xinghe Fu Junwei Zhu Donghao Luo Chengjie Wang Shouhong Ding Yunsheng Wu et al. 2024. Df40: Toward next-generation deepfake detection. arXiv:2406.13495 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Proc. NeurIPS.","author":"Yang Dongjie","year":"2024","unstructured":"Dongjie Yang, Suyuan Huang, Chengqiang Lu, Xiaodong Han, Haoxin Zhang, Yan Gao, Yao Hu, and Hai Zhao. 2024a. Vript: A video is worth thousands of words. In Proc. NeurIPS."},{"key":"e_1_3_2_1_44_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et al. 2024b. CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer. arXiv:2408.06072 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"PKU-AIGIQA-4K: A Perceptual Quality Assessment Database for Both Text-to-Image and Image-to-Image AI-Generated Images. arXiv:2404.18409","author":"Yuan Jiquan","year":"2024","unstructured":"Jiquan Yuan, Fanyi Yang, Jihe Li, Xinyan Cao, Jinming Che, Jinlong Lin, and Xixin Cao. 2024. PKU-AIGIQA-4K: A Perceptual Quality Assessment Database for Both Text-to-Image and Image-to-Image AI-Generated Images. arXiv:2404.18409 (2024)."},{"key":"e_1_3_2_1_46_1","volume-title":"Mingzhen Huang, Xianghao Kong, Nix Liu Xin, Shanshan Jiang, et al.","author":"Zhang Ruihan","year":"2025","unstructured":"Ruihan Zhang, Borou Yu, Jiajian Min, Yetong Xin, Zheng Wei, Juncheng Nemo Shi, Mingzhen Huang, Xianghao Kong, Nix Liu Xin, Shanshan Jiang, et al., 2025. Generative AI for Film Creation: A Survey of Recent Advances. arXiv:2504.08296 (2025)."},{"key":"e_1_3_2_1_47_1","volume-title":"I2vgen-xl: High-quality image-to-video synthesis via cascaded diffusion models. arXiv:2311.04145","author":"Zhang Shiwei","year":"2023","unstructured":"Shiwei Zhang, Jiayu Wang, Yingya Zhang, Kang Zhao, Hangjie Yuan, Zhiwu Qin, Xiang Wang, Deli Zhao, and Jingren Zhou. 2023. I2vgen-xl: High-quality image-to-video synthesis via cascaded diffusion models. arXiv:2311.04145 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Llamafactory: Unified efficient fine-tuning of 100 language models. arXiv:2403.13372","author":"Zheng Yaowei","year":"2024","unstructured":"Yaowei Zheng, Richong Zhang, Junhao Zhang, Yanhan Ye, Zheyan Luo, Zhangchi Feng, and Yongqiang Ma. 2024b. Llamafactory: Unified efficient fine-tuning of 100 language models. arXiv:2403.13372 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"Open-sora: Democratizing efficient video production for all. arXiv:2412.20404","author":"Zheng Zangwei","year":"2024","unstructured":"Zangwei Zheng, Xiangyu Peng, Tianji Yang, Chenhui Shen, Shenggui Li, Hongxin Liu, Yukun Zhou, Tianyi Li, and Yang You. 2024a. Open-sora: Democratizing efficient video production for all. 
arXiv:2412.20404 (2024)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00979"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758295","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:07:17Z","timestamp":1765343237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758295"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3758295","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758295","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}