{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:20:32Z","timestamp":1765340432299,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372482"],"award-info":[{"award-number":["62372482"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Science and Technology Program","award":["GJHZ20220913142600001"],"award-info":[{"award-number":["GJHZ20220913142600001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754943","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"9529-9538","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LaVieID: Local Autoregressive Diffusion Transformers for Identity-Preserving Video Creation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6410-5447","authenticated-orcid":false,"given":"Wenhui","family":"Song","sequence":"first","affiliation":[{"name":"Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9838-6532","authenticated-orcid":false,"given":"Hanhui","family":"Li","sequence":"additional","affiliation":[{"name":"Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3099-2886","authenticated-orcid":false,"given":"Jiehui","family":"Huang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology (HKUST), Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6183-6598","authenticated-orcid":false,"given":"Panwen","family":"Hu","sequence":"additional","affiliation":[{"name":"Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3772-9789","authenticated-orcid":false,"given":"Yuhao","family":"Cheng","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8083-0340","authenticated-orcid":false,"given":"Long","family":"Chen","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7709-0602","authenticated-orcid":false,"given":"Yiqiang","family":"Yan","sequence":"additional","affiliation":[{"name":"Lenovo Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3213-3062","authenticated-orcid":false,"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[{"name":"Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3687945"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0759"},{"key":"e_1_3_2_1_4_1","volume-title":"Delta-DiT: A Training-Free Acceleration Method Tailored for Diffusion Transformers. arXiv preprint arXiv:2406.01125","author":"Chen Pengtao","year":"2024","unstructured":"Pengtao Chen, Mingzhu Shen, Peng Ye, Jianjian Cao, Chongjun Tu, Christos-Savvas Bouganis, Yiren Zhao, and Tao Chen. 2024b. Delta-DiT: A Training-Free Acceleration Method Tailored for Diffusion Transformers. arXiv preprint arXiv:2406.01125 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"e_1_3_2_1_6_1","volume-title":"Hallo2: Long-duration and high-resolution audio-driven portrait image animation. arXiv preprint arXiv:2410.07718","author":"Cui Jiahao","year":"2024","unstructured":"Jiahao Cui, Hui Li, Yao Yao, Hao Zhu, Hanlin Shang, Kaihui Cheng, Hang Zhou, Siyu Zhu, and Jingdong Wang. 2024. Hallo2: Long-duration and high-resolution audio-driven portrait image animation. arXiv preprint arXiv:2410.07718 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Autoregressive Video Generation without Vector Quantization. arXiv preprint arXiv:2412.14169","author":"Deng Haoge","year":"2024","unstructured":"Haoge Deng, Ting Pan, Haiwen Diao, Zhengxiong Luo, Yufeng Cui, Huchuan Lu, Shiguang Shan, Yonggang Qi, and Xinlong Wang. 2024. Autoregressive Video Generation without Vector Quantization. arXiv preprint arXiv:2412.14169 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_1_9_1","volume-title":"Vincent Tao Hu, and Bj\u00f6rn Ommer","author":"Fuest Michael","year":"2025","unstructured":"Michael Fuest, Vincent Tao Hu, and Bj\u00f6rn Ommer. 2025. MaskFlow: Discrete Flows For Flexible and Efficient Long Video Generation. arXiv preprint arXiv:2502.11234 (2025)."},{"volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Gao Junyao","key":"e_1_3_2_1_10_1","unstructured":"Junyao Gao, SUN Yanan, Fei Shen, Xin Jiang, Zhening Xing, Kai Chen, and Cairong Zhao. [n.d.]. FaceShot: Bring Any Character into Life. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Learning Representations.","author":"Gong Shansan","year":"2022","unstructured":"Shansan Gong, Mukai Li, Jiangtao Feng, Zhiyong Wu, and Lingpeng Kong. 2022. DiffuSeq: Sequence to Sequence Text Generation with Diffusion Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","volume-title":"Long-Context Autoregressive Video Modeling with Next-Frame Prediction. arXiv preprint arXiv:2503.19325","author":"Gu Yuchao","year":"2025","unstructured":"Yuchao Gu, Weijia Mao, and Mike Zheng Shou. 2025. Long-Context Autoregressive Video Modeling with Next-Frame Prediction. arXiv preprint arXiv:2503.19325 (2025)."},{"key":"e_1_3_2_1_14_1","volume-title":"Liveportrait: Efficient portrait animation with stitching and retargeting control. arXiv preprint arXiv:2407.03168","author":"Guo Jianzhu","year":"2024","unstructured":"Jianzhu Guo, Dingyun Zhang, Xiaoqiang Liu, Zhizhou Zhong, Yuan Zhang, Pengfei Wan, and Di Zhang. 2024b. Liveportrait: Efficient portrait animation with stitching and retargeting control. arXiv preprint arXiv:2407.03168 (2024)."},{"key":"e_1_3_2_1_15_1","first-page":"36777","article-title":"Pulid: Pure and lightning id customization via contrastive alignment","volume":"37","author":"Guo Zinan","year":"2024","unstructured":"Zinan Guo, Yanze Wu, Chen Zhuowei, Peng Zhang, Qian He, et al., 2024a. Pulid: Pure and lightning id customization via contrastive alignment. Advances in Neural Information Processing Systems, Vol. 37 (2024), 36777-36804.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00360"},{"key":"e_1_3_2_1_17_1","volume-title":"Id-animator: Zero-shot identity-preserving human video generation. arXiv preprint arXiv:2404.15275","author":"He Xuanhua","year":"2024","unstructured":"Xuanhua He, Quande Liu, Shengju Qian, Xin Wang, Tao Hu, Ke Cao, Keyu Yan, and Jie Zhang. 2024. Id-animator: Zero-shot identity-preserving human video generation. arXiv preprint arXiv:2404.15275 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Neighboring autoregressive modeling for efficient visual generation. arXiv preprint arXiv:2503.10696","author":"He Yefei","year":"2025","unstructured":"Yefei He, Yuanyu He, Shaoxuan He, Feng Chen, Hong Zhou, Kaipeng Zhang, and Bohan Zhuang. 2025. Neighboring autoregressive modeling for efficient visual generation. arXiv preprint arXiv:2503.10696 (2025)."},{"key":"e_1_3_2_1_19_1","volume-title":"Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:2211.13221","author":"He Yingqing","year":"2022","unstructured":"Yingqing He, Tianyu Yang, Yong Zhang, Ying Shan, and Qifeng Chen. 2022. Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:2211.13221 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_21_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, Vol. 33 (2020), 6840-6851.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications.","author":"Ho Jonathan","key":"e_1_3_2_1_22_1","unstructured":"Jonathan Ho and Tim Salimans. [n.d.]. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_2_1_23_1","first-page":"8633","article-title":"Video diffusion models","volume":"35","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Tim Salimans, Alexey Gritsenko, William Chan, Mohammad Norouzi, and David J Fleet. 2022. Video diffusion models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 8633-8646.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868","author":"Hong Wenyi","year":"2022","unstructured":"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Liu, and Jie Tang. 2022. Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868 (2022)."},{"key":"e_1_3_2_1_25_1","volume-title":"ACDiT: Interpolating Autoregressive Conditional Modeling and Diffusion Transformer. arXiv preprint arXiv:2412.07720","author":"Hu Jinyi","year":"2024","unstructured":"Jinyi Hu, Shengding Hu, Yuxuan Song, Yufei Huang, Mingxuan Wang, Hao Zhou, Zhiyuan Liu, Wei-Ying Ma, and Maosong Sun. 2024. ACDiT: Interpolating Autoregressive Conditional Modeling and Diffusion Transformer. arXiv preprint arXiv:2412.07720 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Consistentid: Portrait generation with multimodal fine-grained identity preserving. arXiv preprint arXiv:2404.16771","author":"Huang Jiehui","year":"2024","unstructured":"Jiehui Huang, Xiao Dong, Wenhui Song, Zheng Chong, Zhenchao Tang, Jun Zhou, Yuhao Cheng, Long Chen, Hanhui Li, Yiqiang Yan, et al., 2024a. Consistentid: Portrait generation with multimodal fine-grained identity preserving. arXiv preprint arXiv:2404.16771 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680849"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_2_1_29_1","volume-title":"InfiniteYou: Flexible Photo Recrafting While Preserving Your Identity. arXiv preprint","author":"Jiang Liming","year":"2025","unstructured":"Liming Jiang, Qing Yan, Yumin Jia, Zichuan Liu, Hao Kang, and Xin Lu. 2025. InfiniteYou: Flexible Photo Recrafting While Preserving Your Identity. arXiv preprint, Vol. arXiv:2503.16418 (2025)."},{"key":"e_1_3_2_1_30_1","volume-title":"Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125","author":"Kondratyuk Dan","year":"2023","unstructured":"Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jos\u00e9 Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming-Chang Chiu, et al., 2023. Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125 (2023)."},{"key":"e_1_3_2_1_31_1","first-page":"25105","volume-title":"VideoPoet: A Large Language Model for Zero-Shot Video Generation. International Conference on Machine Learning","volume":"235","author":"Kondratyuk Dan","year":"2024","unstructured":"Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jos\u00e9 Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming Chang Chiu, et al., 2024. VideoPoet: A Large Language Model for Zero-Shot Video Generation. International Conference on Machine Learning, Vol. 235 (2024), 25105-25124."},{"key":"e_1_3_2_1_32_1","unstructured":"PKU-Yuan Lab Tuzhan AI et al. 2024. Open-sora-plan."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00255"},{"key":"e_1_3_2_1_34_1","unstructured":"Hengjia Li Lifan Jiang Xi Xiao Tianyang Wang Hongwei Yi Boxi Wu and Deng Cai. 2025. MagicID: Hybrid Preference Optimization for ID-Consistent and Dynamic-Preserved Video Customization. arXiv preprint arXiv:2503.12689 (2025)."},{"key":"e_1_3_2_1_35_1","volume-title":"PersonalVideo: High ID-Fidelity Video Customization without Dynamic and Semantic Degradation. arXiv preprint arXiv:2411.17048","author":"Li Hengjia","year":"2024","unstructured":"Hengjia Li, Haonan Qiu, Shiwei Zhang, Xiang Wang, Yujie Wei, Zekun Li, Yingya Zhang, Boxi Wu, and Deng Cai. 2024b. PersonalVideo: High ID-Fidelity Video Customization without Dynamic and Semantic Degradation. arXiv preprint arXiv:2411.17048 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"ControlVAR: Exploring Controllable Visual Autoregressive Modeling. arXiv preprint arXiv:2406.09750","author":"Li Xiang","year":"2024","unstructured":"Xiang Li, Kai Qiu, Hao Chen, Jason Kuen, Zhe Lin, Rita Singh, and Bhiksha Raj. 2024a. ControlVAR: Exploring Controllable Visual Autoregressive Modeling. arXiv preprint arXiv:2406.09750 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Latte: Latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048","author":"Ma Xin","year":"2024","unstructured":"Xin Ma, Yaohui Wang, Gengyun Jia, Xinyuan Chen, Ziwei Liu, Yuan-Fang Li, Cunjian Chen, and Yu Qiao. 2024a. Latte: Latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00725"},{"key":"e_1_3_2_1_39_1","volume-title":"Magic-me: Identity-specific video customized diffusion. arXiv preprint arXiv:2402.09368","author":"Ma Ze","year":"2024","unstructured":"Ze Ma, Daquan Zhou, Chun-Hsiao Yeh, Xue-She Wang, Xiuyu Li, Huanrui Yang, Zhen Dong, Kurt Keutzer, and Jiashi Feng. 2024b. Magic-me: Identity-specific video customized diffusion. arXiv preprint arXiv:2402.09368 (2024)."},{"key":"e_1_3_2_1_40_1","unstructured":"OpenAI. 2023. Video generation models as world simulators."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Machine Learning. 8821-8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. 8821-8831."},{"key":"e_1_3_2_1_43_1","volume-title":"Next Block Prediction: Video Generation via Semi-Auto-Regressive Modeling. arXiv preprint arXiv:2502.07737","author":"Ren Shuhuai","year":"2025","unstructured":"Shuhuai Ren, Shuming Ma, Xu Sun, and Furu Wei. 2025a. Next Block Prediction: Video Generation via Semi-Auto-Regressive Modeling. arXiv preprint arXiv:2502.07737 (2025)."},{"key":"e_1_3_2_1_44_1","volume-title":"VideoWorld: Exploring Knowledge Learning from Unlabeled Videos. arXiv preprint arXiv:2501.09781","author":"Ren Zhongwei","year":"2025","unstructured":"Zhongwei Ren, Yunchao Wei, Xun Guo, Yao Zhao, Bingyi Kang, Jiashi Feng, and Xiaojie Jin. 2025b. VideoWorld: Exploring Knowledge Learning from Unlabeled Videos. arXiv preprint arXiv:2501.09781 (2025)."},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Machine Learning. 42818-42835","author":"Ruhe David","year":"2024","unstructured":"David Ruhe, Jonathan Heek, Tim Salimans, and Emiel Hoogeboom. 2024. Rolling diffusion models. In International Conference on Machine Learning. 42818-42835."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530757"},{"key":"e_1_3_2_1_47_1","volume-title":"International Conference on Learning Representations.","author":"Singer Uriel","year":"2023","unstructured":"Uriel Singer, Adam Polyak, Thomas Hayes, Xi Yin, Jie An, Songyang Zhang, Qiyuan Hu, Harry Yang, Oron Ashual, Oran Gafni, et al., 2023. Make-A-Video: Text-to-Video Generation without Text-Video Data. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_48_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL-IJCNLP). 1-6.","author":"Su J","year":"2021","unstructured":"J Su, H Zhang, X Li, J Zhang, and Y RoFormer Li. 2021. Enhanced transformer with rotary position embedding. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL-IJCNLP). 1-6."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00690"},{"key":"e_1_3_2_1_51_1","first-page":"24804","article-title":"Csdi: Conditional score-based diffusion models for probabilistic time series imputation","volume":"34","author":"Tashiro Yusuke","year":"2021","unstructured":"Yusuke Tashiro, Jiaming Song, Yang Song, and Stefano Ermon. 2021. Csdi: Conditional score-based diffusion models for probabilistic time series imputation. Advances in Neural Information Processing Systems, Vol. 34 (2021), 24804-24816.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_52_1","first-page":"84839","article-title":"Visual autoregressive modeling: Scalable image generation via next-scale prediction","volume":"37","author":"Tian Keyu","year":"2024","unstructured":"Keyu Tian, Yi Jiang, Zehuan Yuan, Bingyue Peng, and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. Advances in Neural Information Processing Systems, Vol. 37 (2024), 84839-84865.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"e_1_3_2_1_54_1","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Den Oord Aaron Van","year":"2017","unstructured":"Aaron Van Den Oord, Oriol Vinyals, et al., 2017. Neural discrete representation learning. Advances in Neural Information Processing Systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_55_1","volume-title":"Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694","author":"Wei Huawei","year":"2024","unstructured":"Huawei Wei, Zejun Yang, and Zhisheng Wang. 2024a. Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 6537-6549","author":"Wei Yujie","year":"2024","unstructured":"Yujie Wei, Shiwei Zhang, Zhiwu Qing, Hangjie Yuan, Zhiheng Liu, Yu Liu, Yingya Zhang, Jingren Zhou, and Hongming Shan. 2024b. Dreamvideo: Composing your dream videos with customized subject and motion. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 6537-6549."},{"key":"e_1_3_2_1_57_1","volume-title":"A learning algorithm for continually running fully recurrent neural networks. Neural computation","author":"Williams Ronald J","year":"1989","unstructured":"Ronald J Williams and David Zipser. 1989. A learning algorithm for continually running fully recurrent neural networks. Neural computation, Vol. 1, 2 (1989), 270-280."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681391"},{"key":"e_1_3_2_1_59_1","volume-title":"Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157","author":"Yan Wilson","year":"2021","unstructured":"Wilson Yan, Yunzhi Zhang, Pieter Abbeel, and Aravind Srinivas. 2021. Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157 (2021)."},{"key":"e_1_3_2_1_60_1","volume-title":"Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072","author":"Yang Zhuoyi","year":"2024","unstructured":"Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, et al., 2024. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"From slow bidirectional to fast autoregressive video diffusion models. arXiv preprint arXiv:2412.07772","author":"Yin Tianwei","year":"2024","unstructured":"Tianwei Yin, Qiang Zhang, Richard Zhang, William T Freeman, Fredo Durand, Eli Shechtman, and Xun Huang. 2024. From slow bidirectional to fast autoregressive video diffusion models. arXiv preprint arXiv:2412.07772, Vol. 2 (2024)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"e_1_3_2_1_65_1","volume-title":"International Conference on Learning Representations.","author":"Yu Lijun","year":"2024","unstructured":"Lijun Yu, Jose Lezama, Nitesh Bharadwaj Gundavarapu, Luca Versari, Kihyuk Sohn, David Minnen, Yong Cheng, Agrim Gupta, Xiuye Gu, Alexander G Hauptmann, et al., 2024. Language Model Beats Diffusion-Tokenizer is key to visual generation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01211"},{"key":"e_1_3_2_1_67_1","volume-title":"Magic Mirror: ID-Preserved Video Generation in Video Diffusion Transformers. arXiv preprint arXiv:2501.03931","author":"Zhang Yuechen","year":"2025","unstructured":"Yuechen Zhang, Yaoyang Liu, Bin Xia, Bohao Peng, Zexin Yan, Eric Lo, and Jiaya Jia. 2025a. Magic Mirror: ID-Preserved Video Generation in Video Diffusion Transformers. arXiv preprint arXiv:2501.03931 (2025)."},{"key":"e_1_3_2_1_68_1","volume-title":"Fantasyid: Face knowledge enhanced id-preserving video generation. arXiv preprint arXiv:2502.13995","author":"Zhang Yunpeng","year":"2025","unstructured":"Yunpeng Zhang, Qiang Wang, Fan Jiang, Yaqi Fan, Mu Xu, and Yonggang Qi. 2025b. Fantasyid: Face knowledge enhanced id-preserving video generation. arXiv preprint arXiv:2502.13995 (2025)."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01963"},{"key":"e_1_3_2_1_70_1","unstructured":"Yong Zhong Zhuoyi Yang Jiayan Teng Xiaotao Gu and Chongxuan Li. 2025. Concat-ID: Towards Universal Identity-Preserving Video Synthesis. arXiv preprint arXiv:2503.14151 (2025)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754943","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:17:26Z","timestamp":1765340246000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754943"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":70,"alternative-id":["10.1145\/3746027.3754943","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754943","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}