{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:58:15Z","timestamp":1774022295517,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730744","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:42:43Z","timestamp":1753260163000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Mobius: Text to Seamless Looping Video Generation via Latent Shift"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3134-217X","authenticated-orcid":false,"given":"Xiuli","family":"Bi","sequence":"first","affiliation":[{"name":"Chongqing University of Post and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8862-1521","authenticated-orcid":false,"given":"Jianfei","family":"Yuan","sequence":"additional","affiliation":[{"name":"Chongqing University of Post and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3164-6299","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"additional","affiliation":[{"name":"Chongqing University of Post and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0066-3448","authenticated-orcid":false,"given":"Yong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Meituan, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3607-2236","authenticated-orcid":false,"given":"Xiaodong","family":"Cun","sequence":"additional","affiliation":[{"name":"GVC Lab, Great Bay University, Dongguan, China and Dongguan Key Laboratory for Intelligence and Information Technology, Dongguan, Guanggong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1788-3746","authenticated-orcid":false,"given":"Chi-Man","family":"Pun","sequence":"additional","affiliation":[{"name":"University of Macau, Macau, Macao"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8469-5302","authenticated-orcid":false,"given":"Bin","family":"Xiao","sequence":"additional","affiliation":[{"name":"Chongqing University of Post and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_3_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1186822.1073268"},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.12147"},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00052"},{"key":"e_1_3_3_3_5_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et\u00a0al. 2023. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_3_6_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman Clarence Ng Ricky Wang and Aditya Ramesh. 2024. Video generation models as world simulators. (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"e_1_3_3_3_7_1","unstructured":"Minghong Cai Xiaodong Cun Xiaoyu Li Wenze Liu Zhaoyang Zhang Yong Zhang Ying Shan and Xiangyu Yue. 2024. DiTCtrl: Exploring Attention Control in Multi-Modal Diffusion Transformer for Tuning-Free Multi-Prompt Longer Video Generation. arXiv:https:\/\/arXiv.org\/abs\/2412.18597 (2024)."},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_3_3_9_1","unstructured":"Haoxin Chen Menghan Xia Yingqing He Yong Zhang Xiaodong Cun Shaoshu Yang Jinbo Xing Yaofang Liu Qifeng Chen Xintao Wang et\u00a0al. 2023. Videocrafter1: Open diffusion models for high-quality video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.19512 (2023)."},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"e_1_3_3_3_11_1","unstructured":"Yuwei Guo Ceyuan Yang Anyi Rao Zhengyang Liang Yaohui Wang Yu Qiao Maneesh Agrawala Dahua Lin and Bo Dai. 2023. Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.04725 (2023)."},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"crossref","unstructured":"Tavi Halperin Hanit Hakim Orestis Vantzos Gershon Hochman Netai Benaim Lior Sassy Michael Kupchik Ofir Bibi and Ohad Fried. 2021. Endless loops: detecting and animating periodic patterns in still images. ACM Transactions on graphics (TOG) 40 4 (2021) 1\u201312.","DOI":"10.1145\/3450626.3459935"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","unstructured":"Mingming He Jing Liao Pedro\u00a0V. Sander and Hugues Hoppe. 2017. Gigapixel Panorama Video Loops. ACM Trans. Graph. 37 1 Article 3 (Nov. 2017) 15\u00a0pages. 10.1145\/3144455","DOI":"10.1145\/3144455"},{"key":"e_1_3_3_3_14_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01626 (2022)."},{"key":"e_1_3_3_3_15_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik\u00a0P Kingma Ben Poole Mohammad Norouzi David\u00a0J Fleet et\u00a0al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02303 (2022)."},{"key":"e_1_3_3_3_16_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00575"},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_3_3_19_1","unstructured":"Jihwan Kim Junoh Kang Jinyoung Choi and Bohyung Han. 2024. FIFO-Diffusion: Generating Infinite Videos from Text without Training. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.11473 (2024)."},{"key":"e_1_3_3_3_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02279"},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"crossref","unstructured":"Zicheng Liao Neel Joshi and Hugues Hoppe. 2013. Automated video looping with progressive dynamism. ACM Transactions on Graphics (TOG) 32 4 (2013) 1\u201310.","DOI":"10.1145\/2461912.2461950"},{"key":"e_1_3_3_3_22_1","unstructured":"Shanchuan Lin and Xiao Yang. 2024. Animatediff-lightning: Cross-model diffusion distillation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12706 (2024)."},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"crossref","unstructured":"Xiao Liu Yanan Zheng Zhengxiao Du Ming Ding Yujie Qian Zhilin Yang and Jie Tang. 2023. GPT understands too. AI Open (2023).","DOI":"10.1016\/j.aiopen.2023.08.012"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02090"},{"key":"e_1_3_3_3_25_1","unstructured":"Aniruddha Mahapatra Aliaksandr Siarohin Hsin-Ying Lee Sergey Tulyakov and Jun-Yan Zhu. 2023. Synthesizing Artistic Cinemagraphs from Text. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.03190 (2023)."},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"crossref","unstructured":"Muyao Niu Xiaodong Cun Xintao Wang Yong Zhang Ying Shan and Yinqiang Zheng. 2024. MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.20222 (2024).","DOI":"10.1007\/978-3-031-72655-2_7"},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_3_28_1","unstructured":"Bowen Peng Jeffrey Quesnelle Honglu Fan and Enrico Shippole. 2023. Yarn: Efficient context window extension of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.00071 (2023)."},{"key":"e_1_3_3_3_29_1","unstructured":"Chenyang Qi Xiaodong Cun Yong Zhang Chenyang Lei Xintao Wang Ying Shan and Qifeng Chen. 2023. FateZero: Fusing Attentions for Zero-shot Text-based Video Editing. arXiv:https:\/\/arXiv.org\/abs\/2303.09535 (2023)."},{"key":"e_1_3_3_3_30_1","unstructured":"Haonan Qiu Menghan Xia Yong Zhang Yingqing He Xintao Wang Ying Shan and Ziwei Liu. 2023. Freenoise: Tuning-free longer video diffusion via noise rescheduling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.15169 (2023)."},{"key":"e_1_3_3_3_31_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657497"},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00453"},{"key":"e_1_3_3_3_35_1","unstructured":"Uriel Singer Adam Polyak Thomas Hayes Xi Yin Jie An Songyang Zhang Qiyuan Hu Harry Yang Oron Ashual Oran Gafni et\u00a0al. 2022. Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.14792 (2022)."},{"key":"e_1_3_3_3_36_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.02502 (2020)."},{"key":"e_1_3_3_3_37_1","doi-asserted-by":"crossref","unstructured":"Jianlin Su Murtadha Ahmed Yu Lu Shengfeng Pan Wen Bo and Yunfeng Liu. 2024. Roformer: Enhanced transformer with rotary position embedding. Neurocomputing 568 (2024) 127063.","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_3_3_38_1","unstructured":"Zhenxiong Tan Xingyi Yang Songhua Liu and Xinchao Wang. 2024. Video-Infinity: Distributed Long Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.16260 (2024)."},{"key":"e_1_3_3_3_39_1","unstructured":"Maham Tanveer Yang Zhou Simon Niklaus Ali\u00a0Mahdavi Amiri Hao Zhang Krishna\u00a0Kumar Singh and Nanxuan Zhao. 2024. MotionBridge: Dynamic Video Inbetweening with Flexible Controls. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.13190 (2024)."},{"key":"e_1_3_3_3_40_1","unstructured":"Genmo Team. 2024a. Mochi 1. https:\/\/github.com\/genmoai\/models."},{"key":"e_1_3_3_3_41_1","unstructured":"Hunyuan\u00a0Video Team. 2024b. HunyuanVideo: A Systematic Framework For Large Video Generative Models. https:\/\/arxiv.org\/abs\/2412.03603"},{"key":"e_1_3_3_3_42_1","unstructured":"Thomas Unterthiner Sjoerd Van\u00a0Steenkiste Karol Kurach Raphael Marinier Marcin Michalski and Sylvain Gelly. 2018. Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1812.01717 (2018)."},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696409.3700181"},{"key":"e_1_3_3_3_44_1","unstructured":"Fu-Yun Wang Wenshuo Chen Guanglu Song Han-Jia Ye Yu Liu and Hongsheng Li. 2023a. Gen-l-video: Multi-text to long video generation via temporal co-denoising. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.18264 (2023)."},{"key":"e_1_3_3_3_45_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023b. Modelscope text-to-video technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06571 (2023)."},{"key":"e_1_3_3_3_46_1","unstructured":"Weimin Wang Jiawei Liu Zhijie Lin Jiangqiao Yan Shuo Chen Chetwin Low Tuyen Hoang Jie Wu Jun\u00a0Hao Liew Hanshu Yan et\u00a0al. 2024b. Magicvideo-v2: Multi-stage high-aesthetic video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.04468 (2024)."},{"key":"e_1_3_3_3_47_1","unstructured":"Xiaojuan Wang Boyang Zhou Brian Curless Ira Kemelmacher-Shlizerman Aleksander Holynski and Steven\u00a0M Seitz. 2024c. Generative inbetweening: Adapting image-to-video models for keyframe interpolation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.15239 (2024)."},{"key":"e_1_3_3_3_48_1","unstructured":"Tianxing Wu Chenyang Si Yuming Jiang Ziqi Huang and Ziwei Liu. 2023. Freeinit: Bridging initialization gap in video diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.07537 (2023)."},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"crossref","unstructured":"Jinbo Xing Hanyuan Liu Menghan Xia Yong Zhang Xintao Wang Ying Shan and Tien-Tsin Wong. 2024. Tooncrafter: Generative cartoon interpolation. ACM Transactions on Graphics (TOG) 43 6 (2024) 1\u201311.","DOI":"10.1145\/3687761"},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_23"},{"key":"e_1_3_3_3_51_1","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_3_52_1","unstructured":"Jiwen Yu Xiaodong Cun Chenyang Qi Yong Zhang Xintao Wang Ying Shan and Jian Zhang. 2023a. AnimateZero: Video Diffusion Models are Zero-Shot Image Animators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.03793 (2023)."},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02118"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"crossref","unstructured":"Wenxuan Zhang Xiaodong Cun Xuan Wang Yong Zhang Xi Shen Yu Guo Ying Shan and Fei Wang. 2022. SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation. arXiv:https:\/\/arXiv.org\/abs\/2211.12194","DOI":"10.1109\/CVPR52729.2023.00836"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730744","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:59:49Z","timestamp":1774018789000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730744"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":53,"alternative-id":["10.1145\/3721238.3730744","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730744","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}