{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:31:06Z","timestamp":1780057866323,"version":"3.54.0"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01773","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"19075-19084","source":"Crossref","is-referenced-by-count":1,"title":["BadVideo: Stealthy Backdoor Attack Against Text-to-Video Generation"],"prefix":"10.1109","author":[{"given":"Ruotong","family":"Wang","sequence":"first","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mingli","family":"Zhu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiarong","family":"Ou","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rui","family":"Chen","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xin","family":"Tao","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pengfei","family":"Wan","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou Technology"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Baoyuan","family":"Wu","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Qwen2.5-vl technical report","author":"Bai","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref2","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02190"},{"key":"ref5","article-title":"Panda-70m: Captioning 70 m videos with multiple cross-modality teachers","author":"Chen","year":"2024","journal-title":"CVPR"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00393"},{"key":"ref7","article-title":"Llama guard 3 vision: Safeguarding human-ai image understanding conversations","author":"Chi","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00391"},{"key":"ref9","article-title":"Villandiffusion: A unified backdoor attack framework for diffusion models","author":"Chou","year":"2023","journal-title":"NeurIPS"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00695"},{"key":"ref11","article-title":"Animatediff: Animate your personalized text-toimage diffusion models without specific tuning","author":"Guo","year":"2024","journal-title":"ICLR"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i19.30110"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.02073"},{"key":"ref14","article-title":"Videopoet: A large language model for zero-shot video generation","author":"Kondratyuk","year":"2024","journal-title":"ICML"},{"key":"ref15","article-title":"Hunyuanvideo: A systematic framework for large video generative models","author":"Kong","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref16","volume-title":"Flux","year":"2024"},{"key":"ref17","article-title":"Invisible backdoor attacks on diffusion models","author":"Li","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.01615"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02327"},{"key":"ref21","article-title":"Step-video-t2v technical report: The practice, challenges, and future of video foundation model","author":"Ma","year":"2025","journal-title":"arXiv e-prints"},{"key":"ref22","article-title":"T2vsafetybench: Evaluating the safety of text-to-video generative models","author":"Miao","year":"2024","journal-title":"NeurIPS"},{"key":"ref23","volume-title":"Gpt-4o system card","year":"2024"},{"key":"ref24","volume-title":"Upgrading the moderation api with our new multimodal moderation model","year":"2024"},{"key":"ref25","volume-title":"Sora: Creating Video from Text","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3127905"},{"key":"ref27","article-title":"Movie gen: A cast of media foundation models","author":"Polyak","year":"2025","journal-title":"arXiv preprint arXiv"},{"key":"ref28","article-title":"Smoothllm: Defending large language models against jailbreaking attacks","author":"Robey","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/SP54263.2024.00207"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/SP54263.2024.00207"},{"key":"ref32","article-title":"Denoising diffusion implicit models","author":"Song","year":"2021","journal-title":"ICLR"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00423"},{"key":"ref34","article-title":"A good image generator is what you need for high-resolution video synthesis","author":"Tian","year":"2021","journal-title":"ICLR"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680689"},{"key":"ref36","article-title":"Modelscope text-to-video technical report","author":"Wang","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref37","volume-title":"Versatile backdoor attack with visible, semantic, sample-specific, and compatible triggers","author":"Wang","year":"2024"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02295-1"},{"key":"ref39","article-title":"Internvid: A large-scale video-text dataset for multimodal understanding and generation","author":"Wang","year":"2024","journal-title":"ICLR"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02447-x"},{"key":"ref41","article-title":"Godiva: Generating open-domain videos from natural descriptions","author":"Wu","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref42","article-title":"A survey on video diffusion models","author":"Xing","year":"2024","journal-title":"ACM Computing Surveys"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref44","article-title":"Videogpt: Video generation using vq-vae and transformers","author":"Yan","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref45","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2025","journal-title":"ICLR"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00845"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612108"},{"key":"ref48","article-title":"Open-sora: Democratizing efficient video production for all","author":"Zheng","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref49","volume-title":"To think or not to think: Exploring the unthinking vulnerability in large reasoning models","author":"Zhu","year":"2025"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445459.pdf?arnumber=11445459","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:18:14Z","timestamp":1777612694000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445459\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01773","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}