{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:50:37Z","timestamp":1765309837567,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758305","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"13421-13427","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["BrokenVideos: A Benchmark Dataset for Fine-Grained Artifact Localization in AI-Generated Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6004-1325","authenticated-orcid":false,"given":"Jiahao","family":"Lin","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9123-5108","authenticated-orcid":false,"given":"Weixuan","family":"Peng","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6283-895X","authenticated-orcid":false,"given":"Bojia","family":"Zi","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5034-1928","authenticated-orcid":false,"given":"Yifeng","family":"Gao","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8493-1966","authenticated-orcid":false,"given":"Xianbiao","family":"Qi","sequence":"additional","affiliation":[{"name":"IntelliFusion Inc., Shenzhen City, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2099-4973","authenticated-orcid":false,"given":"Xingjun","family":"Ma","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1907-8567","authenticated-orcid":false,"given":"Yu-Gang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jianfa Bai Man Lin and Gang Cao. 2024b. AI-Generated Video Detection via Spatio-Temporal Anomaly Learning. (2024). arXiv:2403.16638"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Zechen Bai Tong He Haiyang Mei Pichao Wang Ziteng Gao Joya Chen Lei Liu Zheng Zhang and Mike Zheng Shou. 2024a. One token to seg them all: Language instructed reasoning segmentation in videos. In NeurIPS.","DOI":"10.52202\/079017-0219"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Sergi Caelles Kevis-Kokitsi Maninis Jordi Pont-Tuset Laura Leal-Taix\u00e9 Daniel Cremers and Luc Van Gool. 2017. One-Shot Video Object Segmentation. arXiv:1611.05198 [cs.CV] https:\/\/arxiv.org\/abs\/1611.05198","DOI":"10.1109\/CVPR.2017.565"},{"key":"e_1_3_2_1_4_1","volume-title":"Videocrafter2: Overcoming data limitations for high-quality video diffusion models. arXiv preprint arXiv:2401.09047","author":"Chen Haoxin","year":"2024","unstructured":"Haoxin Chen, Yong Zhang, Xiaodong Cun, Menghan Xia, Xintao Wang, Chao Weng, and Ying Shan. 2024. Videocrafter2: Overcoming data limitations for high-quality video diffusion models. arXiv preprint arXiv:2401.09047 (2024)."},{"key":"e_1_3_2_1_5_1","volume-title":"Carlo Masone, and Giuseppe Averta","author":"Cuttano Claudia","year":"2025","unstructured":"Claudia Cuttano, Gabriele Trivigno, Gabriele Rosi, Carlo Masone, and Giuseppe Averta. 2025a. SAMWISE: Infusing Wisdom in SAM2 for Text-Driven Video Segmentation. arXiv:2411.17646 [cs.CV] https:\/\/arxiv.org\/abs\/2411.17646"},{"key":"e_1_3_2_1_6_1","volume-title":"Carlo Masone, and Giuseppe Averta","author":"Cuttano Claudia","year":"2025","unstructured":"Claudia Cuttano, Gabriele Trivigno, Gabriele Rosi, Carlo Masone, and Giuseppe Averta. 2025b. SAMWISE: Infusing wisdom in SAM2 for Text-Driven Video Segmentation. In CVPR."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Chen Feng Duolikun Danier Fan Zhang Alex Mackin Andy Collins and David Bull. 2024. BVI-Artefact: An Artefact Detection Benchmark Dataset for Streamed Videos. arXiv:2312.08859 [cs.CV] https:\/\/arxiv.org\/abs\/2312.08859","DOI":"10.1109\/PCS60826.2024.10566356"},{"key":"e_1_3_2_1_9_1","unstructured":"Gen-3. 2024. Introducing Gen-3 Alpha: A New Frontier for Video Generation. https:\/\/runwayml.com\/research\/introducing-gen-3-alpha\/."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2889276"},{"key":"e_1_3_2_1_11_1","volume-title":"AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. arXiv preprint arXiv:2307.04725","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Yaohui Wang, Yu Qiao, Dahua Lin, and Bo Dai. 2023. AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning. arXiv preprint arXiv:2307.04725 (2023)."},{"key":"e_1_3_2_1_12_1","unstructured":"Keling. 2024. KLING VIDEO MODEL. https:\/\/kling.kuaishou.com\/en."},{"key":"e_1_3_2_1_13_1","volume-title":"Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125","author":"Kondratyuk Dan","year":"2023","unstructured":"Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jos\u00e9 Lezama, Jonathan Huang, Rachel Hornung, Hartwig Adam, Hassan Akbari, Yair Alon, Vighnesh Birodkar, et al., 2023. Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125 (2023)."},{"key":"e_1_3_2_1_14_1","unstructured":"Weijie Kong Qi Tian Zijian Zhang Rox Min Zuozhuo Dai Jin Zhou Jiangfeng Xiong Xin Li Bo Wu Jianwei Zhang Kathrina Wu Qin Lin Junkun Yuan Yanxin Long Aladdin Wang Andong Wang Changlin Li Duojun Huang Fang Yang Hao Tan Hongmei Wang Jacob Song Jiawang Bai Jianbing Wu Jinbao Xue Joey Wang Kai Wang Mengyang Liu Pengyu Li Shuai Li Weiyan Wang Wenqing Yu Xinchi Deng Yang Li Yi Chen Yutao Cui Yuanbo Peng Zhentao Yu Zhiyu He Zhiyong Xu Zixiang Zhou Zunnan Xu Yangyu Tao Qinglin Lu Songtao Liu Daquan Zhou Hongfa Wang Yong Yang Di Wang Yuhong Liu Jie Jiang and Caesar Zhong. 2025. HunyuanVideo: A Systematic Framework For Large Video Generative Models. arXiv:2412.03603 [cs.CV] https:\/\/arxiv.org\/abs\/2412.03603"},{"key":"e_1_3_2_1_15_1","volume-title":"GLUS: Global-Local Reasoning Unified into A Single Large Language Model for Video Segmentation. arXiv preprint arXiv:2504.07962","author":"Lin Lang","year":"2025","unstructured":"Lang Lin, Xueyang Yu, Ziqi Pang, and Yu-Xiong Wang. 2025. GLUS: Global-Local Reasoning Unified into A Single Large Language Model for Video Segmentation. arXiv preprint arXiv:2504.07962 (2025)."},{"key":"e_1_3_2_1_16_1","unstructured":"Qingyuan Liu Pengyuan Shi Yun-Yun Tsai Chengzhi Mao and Junfeng Yang. 2024. Turns Out I'm Not Real: Towards Robust Detection of AI-Generated Videos. (2024). arXiv:2406.09601"},{"key":"e_1_3_2_1_17_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_18_1","unstructured":"Guoqing Ma Haoyang Huang Kun Yan Liangyu Chen Nan Duan Shengming Yin Changyi Wan Ranchen Ming Xiaoniu Song Xing Chen et al. 2025. Step-video-t2v technical report: The practice challenges and future of video foundation model. arXiv preprint arXiv:2502.10248 (2025)."},{"key":"e_1_3_2_1_19_1","unstructured":"Mochi-1. 2024. Mochi-1. https:\/\/www.genmo.ai\/blog."},{"key":"e_1_3_2_1_20_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv:2303.08774 [cs.CL]"},{"key":"e_1_3_2_1_21_1","volume-title":"Sora: Creating video from text. https:\/\/openai.com\/index\/sora\/.","author":"AI.","year":"2024","unstructured":"OpenAI. 2024. Sora: Creating video from text. https:\/\/openai.com\/index\/sora\/."},{"key":"e_1_3_2_1_22_1","volume-title":"Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Doll\u00e1r, and Christoph Feichtenhofer.","author":"Ravi Nikhila","year":"2024","unstructured":"Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman R\u00e4dle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Doll\u00e1r, and Christoph Feichtenhofer. 2024. SAM 2: Segment Anything in Images and Videos. arXiv preprint arXiv:2408.00714 (2024). https:\/\/arxiv.org\/abs\/2408.00714"},{"key":"e_1_3_2_1_23_1","volume-title":"Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792","author":"Singer Uriel","year":"2022","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al., 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"CAMBI: Contrast-aware Multiscale Banding Index. arXiv:2102.00079 [eess.IV] https:\/\/arxiv.org\/abs\/2102.00079","author":"Tandon Pulkit","year":"2021","unstructured":"Pulkit Tandon, Mariana Afonso, Joel Sole, and Luk\u00e1\u0161 Krasula. 2021. CAMBI: Contrast-aware Multiscale Banding Index. arXiv:2102.00079 [eess.IV] https:\/\/arxiv.org\/abs\/2102.00079"},{"key":"e_1_3_2_1_25_1","volume-title":"Bovik","author":"Tu Zhengzhong","year":"2020","unstructured":"Zhengzhong Tu, Jessie Lin, Yilin Wang, Balu Adsumilli, and Alan C. Bovik. 2020. BBAND Index: A No-Reference Banding Artifact Predictor. arXiv:2002.11891 [eess.IV] https:\/\/arxiv.org\/abs\/2002.11891"},{"key":"e_1_3_2_1_26_1","volume-title":"Stamm","author":"Vahdati Danial Samadi","year":"2024","unstructured":"Danial Samadi Vahdati, Tai D. Nguyen, Aref Azizpour, and Matthew C. Stamm. 2024. Beyond Deepfake Images: Detecting AI-Generated Videos. (2024). arXiv:2404.15955"},{"key":"e_1_3_2_1_27_1","volume-title":"Wan: Open and Advanced Large-Scale Video Generative Models. arXiv:2503.20314 [cs.CV] https:\/\/arxiv.org\/abs\/2503.20314","author":"Wan Team","year":"2025","unstructured":"Team Wan, Ang Wang, Baole Ai, Bin Wen, Chaojie Mao, Chen-Wei Xie, Di Chen, Feiwu Yu, Haiming Zhao, Jianxiao Yang, Jianyuan Zeng, Jiayu Wang, Jingfeng Zhang, Jingren Zhou, Jinkai Wang, Jixuan Chen, Kai Zhu, Kang Zhao, Keyu Yan, Lianghua Huang, Mengyang Feng, Ningyi Zhang, Pandeng Li, Pingyu Wu, Ruihang Chu, Ruili Feng, Shiwei Zhang, Siyang Sun, Tao Fang, Tianxing Wang, Tianyi Gui, Tingyu Weng, Tong Shen, Wei Lin, Wei Wang, Wei Wang, Wenmeng Zhou, Wente Wang, Wenting Shen, Wenyuan Yu, Xianzhong Shi, Xiaoming Huang, Xin Xu, Yan Kou, Yangyu Lv, Yifei Li, Yijing Liu, Yiming Wang, Yingya Zhang, Yitong Huang, Yong Li, You Wu, Yu Liu, Yulin Pan, Yun Zheng, Yuntao Hong, Yupeng Shi, Yutong Feng, Zeyinzi Jiang, Zhen Han, Zhi-Fan Wu, and Ziyu Liu. 2025. Wan: Open and Advanced Large-Scale Video Generative Models. arXiv:2503.20314 [cs.CV] https:\/\/arxiv.org\/abs\/2503.20314"},{"key":"e_1_3_2_1_28_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023. ModelScope Text-to-Video Technical Report. arXiv:2308.06571 [cs.CV]"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Stephen Wolf. 2008. A No Reference (NR) and Reduced Reference (RR) Metric for Detecting Dropped Video Frames. https:\/\/api.semanticscholar.org\/CorpusID:42072990","DOI":"10.70220\/mkjlbtqp"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611737"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Peiqing Yang Shangchen Zhou Jixin Zhao Qingyi Tao and Chen Change Loy. 2025. MatAnyone: Stable Video Matting with Consistent Memory Propagation. arXiv:2501.14677 [cs.CV] https:\/\/arxiv.org\/abs\/2501.14677","DOI":"10.1109\/CVPR52734.2025.00684"},{"key":"e_1_3_2_1_32_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang JiaZheng Xu Yuanming Yang Xiaohan Zhang Xiaotao Gu Guanyu Feng Da Yin Wenyi Hong Weihan Wang Yean Cheng Yuxuan Zhang Ting Liu Bin Xu Yuxiao Dong and Jie Tang. 2024. CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer. (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Ailing Zeng Yuhang Yang Weidong Chen and Wei Liu. 2024. The Dawn of Video Generation: Preliminary Explorations with SORA-like Models. arXiv:2410.05227 [cs.CV] https:\/\/arxiv.org\/abs\/2410.05227"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3084101"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3248162"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758305","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:26Z","timestamp":1765309586000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758305"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":35,"alternative-id":["10.1145\/3746027.3758305","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758305","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}