{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:01:19Z","timestamp":1765310479323,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":78,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFA1014003"],"award-info":[{"award-number":["2024YFA1014003"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758205","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:39:06Z","timestamp":1761377946000},"page":"12674-12681","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["<scp>EditWorld:<\/scp>\n                    Simulating World Dynamics for Instruction-Following Image Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0999-6231","authenticated-orcid":false,"given":"Bohan","family":"Zeng","sequence":"first","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1905-8053","authenticated-orcid":false,"given":"Ling","family":"Yang","sequence":"additional","affiliation":[{"name":"Princeton University, Princeton, USA and Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4584-9388","authenticated-orcid":false,"given":"Jiaming","family":"Liu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China and Tiamat AI, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7468-8790","authenticated-orcid":false,"given":"Minghao","family":"Xu","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1460-8124","authenticated-orcid":false,"given":"Yuanxing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-565X","authenticated-orcid":false,"given":"Pengfei","family":"Wan","sequence":"additional","affiliation":[{"name":"Kling Team, Kuaishou, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7532-5550","authenticated-orcid":false,"given":"Wentao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8906-3777","authenticated-orcid":false,"given":"Shuicheng","family":"Yan","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mc-llava: Multi-concept personalized vision-language model. arXiv preprint arXiv:2411.11706","author":"An Ruichuan","year":"2024","unstructured":"Ruichuan An, Sihan Yang, Ming Lu, Renrui Zhang, Kai Zeng, Yulin Luo, Jiajun Cao, Hao Liang, Ying Chen, Qi She, et al., 2024. Mc-llava: Multi-concept personalized vision-language model. arXiv preprint arXiv:2411.11706 (2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Ruichuan An Sihan Yang Renrui Zhang Zijun Shen Ming Lu Gaole Dai Hao Liang Ziyu Guo Shilin Yan Yulin Luo et al. 2025. UniCTokens: Boosting Personalized Understanding and Generation via Unified Concept Tokens. arXiv preprint arXiv:2505.14671 (2025)."},{"key":"e_1_3_2_1_3_1","volume-title":"Blended latent diffusion. TOG","author":"Avrahami Omri","year":"2023","unstructured":"Omri Avrahami, Ohad Fried, and Dani Lischinski. 2023. Blended latent diffusion. TOG (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Omri Avrahami Dani Lischinski and Ohad Fried. 2022. Blended diffusion for text-driven editing of natural images. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_2_1_5_1","volume-title":"Multi-Step Visual Reasoning with Visual Tokens Scaling and Verification. arXiv preprint arXiv:2506.07235","author":"Bai Tianyi","year":"2025","unstructured":"Tianyi Bai, Zengjie Hu, Fupeng Sun, Jiantao Qiu, Yizhen Jiang, Guangxin He, Bohan Zeng, Conghui He, Binhang Yuan, and Wentao Zhang. 2025. Multi-Step Visual Reasoning with Visual Tokens Scaling and Verification. arXiv preprint arXiv:2506.07235 (2025)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Omer Bar-Tal Dolev Ofri-Amar Rafail Fridman Yoni Kasten and Tali Dekel. 2022. Text2live: Text-driven layered image and video editing. In ECCV.","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"e_1_3_2_1_7_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo et al. 2023. Improving image generation with better captions. Computer Science. https:\/\/cdn. openai. com\/papers\/dall-e-3. pdf Vol. 2 3 (2023) 8."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Tim Brooks Aleksander Holynski and Alexei A Efros. 2023. Instructpix2pix: Learning to follow image editing instructions. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_9_1","volume-title":"VersaVid-R1: A Versatile Video Understanding and Reasoning Model from Question Answering to Captioning Tasks. arXiv preprint arXiv:2506.09079","author":"Chen Xinlong","year":"2025","unstructured":"Xinlong Chen, Yuanxing Zhang, Yushuo Guan, Bohan Zeng, Yang Shi, Sihan Yang, Pengfei Wan, Qiang Liu, Liang Wang, and Tieniu Tan. 2025. VersaVid-R1: A Versatile Video Understanding and Reasoning Model from Question Answering to Captioning Tasks. arXiv preprint arXiv:2506.09079 (2025)."},{"key":"e_1_3_2_1_10_1","volume-title":"Diffedit: Diffusion-based semantic image editing with mask guidance. arXiv preprint arXiv:2210.11427","author":"Couairon Guillaume","year":"2022","unstructured":"Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord. 2022. Diffedit: Diffusion-based semantic image editing with mask guidance. arXiv preprint arXiv:2210.11427 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Vqgan-clip: Open domain image generation and editing with natural language guidance. In ECCV.","author":"Crowson Katherine","year":"2022","unstructured":"Katherine Crowson, Stella Biderman, Daniel Kornis, Dashiell Stander, Eric Hallahan, Louis Castricato, and Edward Raff. 2022. Vqgan-clip: Open domain image generation and editing with natural language guidance. In ECCV."},{"key":"e_1_3_2_1_12_1","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. In NeurIPS."},{"key":"e_1_3_2_1_13_1","volume-title":"Yinfei Yang, and Zhe Gan.","author":"Fu Tsu-Jui","year":"2023","unstructured":"Tsu-Jui Fu, Wenze Hu, Xianzhi Du, William Yang Wang, Yinfei Yang, and Zhe Gan. 2023. Guiding instruction-based image editing via multimodal large language models. arXiv preprint arXiv:2309.17102 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530164"},{"key":"e_1_3_2_1_15_1","volume-title":"Tokenflow: Consistent diffusion features for consistent video editing. In ICLR.","author":"Geyer Michal","year":"2023","unstructured":"Michal Geyer, Omer Bar-Tal, Shai Bagon, and Tali Dekel. 2023. Tokenflow: Consistent diffusion features for consistent video editing. In ICLR."},{"key":"e_1_3_2_1_16_1","volume-title":"Pair-diffusion: Object-level image editing with structure-and-appearance paired diffusion models. arXiv preprint arXiv:2303.17546","author":"Goel Vidit","year":"2023","unstructured":"Vidit Goel, Elia Peruzzo, Yifan Jiang, Dejia Xu, Nicu Sebe, Trevor Darrell, Zhangyang Wang, and Humphrey Shi. 2023. Pair-diffusion: Object-level image editing with structure-and-appearance paired diffusion models. arXiv preprint arXiv:2303.17546 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Any2anytryon: Leveraging adaptive position embeddings for versatile virtual clothing tasks. arXiv preprint arXiv:2501.15891","author":"Guo Hailong","year":"2025","unstructured":"Hailong Guo, Bohan Zeng, Yiren Song, Wentao Zhang, Chuang Zhang, and Jiaming Liu. 2025. Any2anytryon: Leveraging adaptive position embeddings for versatile virtual clothing tasks. arXiv preprint arXiv:2501.15891 (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"World models. arXiv preprint arXiv:1803.10122","author":"Ha David","year":"2018","unstructured":"David Ha and J\u00fcrgen Schmidhuber. 2018. World models. arXiv preprint arXiv:1803.10122 (2018)."},{"key":"e_1_3_2_1_19_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_1_20_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao et al. 2022a. Imagen video: High definition video generation with diffusion models. arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_21_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In NeurIPS."},{"key":"e_1_3_2_1_22_1","volume-title":"Cascaded diffusion models for high fidelity image generation. JMLR","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho, Chitwan Saharia, William Chan, David J Fleet, Mohammad Norouzi, and Tim Salimans. 2022b. Cascaded diffusion models for high fidelity image generation. JMLR (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_24_1","volume-title":"Diffusion Model-Based Image Editing: A Survey. arXiv preprint arXiv:2402.17525","author":"Huang Yi","year":"2024","unstructured":"Yi Huang, Jiancheng Huang, Yifan Liu, Mingfu Yan, Jiaxi Lv, Jianzhuang Liu, Wei Xiong, He Zhang, Shifeng Chen, and Liangliang Cao. 2024. Diffusion Model-Based Image Editing: A Survey. arXiv preprint arXiv:2402.17525 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Yuzhou Huang Liangbin Xie Xintao Wang Ziyang Yuan Xiaodong Cun Yixiao Ge Jiantao Zhou Chao Dong Rui Huang Ruimao Zhang et al. 2023. SmartEdit: Exploring Complex Instruction-based Image Editing with Multimodal Large Language Models. arXiv preprint arXiv:2312.06739 (2023).","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"e_1_3_2_1_26_1","volume-title":"ReasonPix2Pix: Instruction Reasoning Dataset for Advanced Image Editing. arXiv preprint arXiv:2405.11190","author":"Jin Ying","year":"2024","unstructured":"Ying Jin, Pengyang Ling, Xiaoyi Dong, Pan Zhang, Jiaqi Wang, and Dahua Lin. 2024. ReasonPix2Pix: Instruction Reasoning Dataset for Advanced Image Editing. arXiv preprint arXiv:2405.11190 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Imagic: Text-based real image editing with diffusion models. In CVPR.","author":"Kawar Bahjat","year":"2023","unstructured":"Bahjat Kawar, Shiran Zada, Oran Lang, Omer Tov, Huiwen Chang, Tali Dekel, Inbar Mosseri, and Michal Irani. 2023. Imagic: Text-based real image editing with diffusion models. In CVPR."},{"key":"e_1_3_2_1_28_1","volume-title":"Diffusionclip: Text-guided diffusion models for robust image manipulation. In CVPR.","author":"Kim Gwanghyun","year":"2022","unstructured":"Gwanghyun Kim, Taesung Kwon, and Jong Chul Ye. 2022. Diffusionclip: Text-guided diffusion models for robust image manipulation. In CVPR."},{"key":"e_1_3_2_1_29_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_31_1","volume-title":"Clipstyler: Image style transfer with a single text condition. In CVPR.","author":"Kwon Gihyun","year":"2022","unstructured":"Gihyun Kwon and Jong Chul Ye. 2022. Clipstyler: Image style transfer with a single text condition. In CVPR."},{"key":"e_1_3_2_1_32_1","volume-title":"Karsten Kreis, Sanja Fidler, and Antonio Torralba.","author":"Li Daiqing","year":"2022","unstructured":"Daiqing Li, Huan Ling, Seung Wook Kim, Karsten Kreis, Sanja Fidler, and Antonio Torralba. 2022. Bigdatasetgan: Synthesizing imagenet with pixel-wise annotations. In CVPR."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01007"},{"key":"e_1_3_2_1_34_1","volume-title":"Zone: Zero-shot instruction-guided local editing. arXiv preprint arXiv:2312.16794","author":"Li Shanglin","year":"2023","unstructured":"Shanglin Li, Bohan Zeng, Yutang Feng, Sicheng Gao, Xuhui Liu, Jiaming Liu, Li Lin, Xu Tang, Yao Hu, Jianzhuang Liu, et al., 2023. Zone: Zero-shot instruction-guided local editing. arXiv preprint arXiv:2312.16794 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023b. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_36_1","unstructured":"Chen-Hsuan Lin Jun Gao Luming Tang Towaki Takikawa Xiaohui Zeng Xun Huang Karsten Kreis Sanja Fidler Ming-Yu Liu and Tsung-Yi Lin. 2023a. Magic3d: High-resolution text-to-3d content creation. In CVPR."},{"key":"e_1_3_2_1_37_1","volume-title":"Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073","author":"Meng Chenlin","year":"2021","unstructured":"Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2021. Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073 (2021)."},{"key":"e_1_3_2_1_38_1","volume-title":"Null-text Inversion for Editing Real Images using Guided Diffusion Models. arXiv preprint arXiv:2211.09794","author":"Mokady Ron","year":"2022","unstructured":"Ron Mokady, Amir Hertz, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Null-text Inversion for Editing Real Images using Guided Diffusion Models. arXiv preprint arXiv:2211.09794 (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"Dragondiffusion: Enabling drag-style manipulation on diffusion models. In ICLR.","author":"Mou Chong","year":"2024","unstructured":"Chong Mou, Xintao Wang, Jiechong Song, Ying Shan, and Jian Zhang. 2024. Dragondiffusion: Enabling drag-style manipulation on diffusion models. In ICLR."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Jisu Nam Heesu Kim DongJae Lee Siyoon Jin Seungryong Kim and Seunggyu Chang. 2024. DreamMatcher: Appearance Matching Self-Attention for Semantically-Consistent Text-to-Image Personalization. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00774"},{"key":"e_1_3_2_1_41_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In ICML.","author":"Nichol Alexander Quinn","year":"2022","unstructured":"Alexander Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In ICML."},{"key":"e_1_3_2_1_42_1","unstructured":"Junbo Niu Yuanhong Zheng Ziyang Miao Hejun Dong Chunjiang Ge Hao Liang Ma Lu Bohan Zeng Qiahao Zheng Conghui He et al. 2025. Native Visual Understanding: Resolving Resolution Dilemmas in Vision-Language Models. arXiv preprint arXiv:2506.12776 (2025)."},{"key":"e_1_3_2_1_43_1","unstructured":"Xingang Pan Ayush Tewari Thomas Leimk\u00fchler Lingjie Liu Abhimitra Meka and Christian Theobalt. 2023. Drag your gan: Interactive point-based manipulation on the generative image manifold. In SIGGRAPH."},{"key":"e_1_3_2_1_44_1","volume-title":"Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu.","author":"Parmar Gaurav","year":"2023","unstructured":"Gaurav Parmar, Krishna Kumar Singh, Richard Zhang, Yijun Li, Jingwan Lu, and Jun-Yan Zhu. 2023. Zero-shot image-to-image translation. In SIGGRAPH."},{"key":"e_1_3_2_1_45_1","volume-title":"Scalable Diffusion Models with Transformers. arXiv:2212.09748","author":"Peebles William","year":"2022","unstructured":"William Peebles and Saining Xie. 2022. Scalable Diffusion Models with Transformers. arXiv:2212.09748 (2022)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"William Peebles Jun-Yan Zhu Richard Zhang Antonio Torralba Alexei A Efros and Eli Shechtman. 2022. Gan-supervised dense visual alignment. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01311"},{"key":"e_1_3_2_1_47_1","volume-title":"SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. In ICLR.","author":"Podell Dustin","year":"2024","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2024. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. In ICLR."},{"key":"e_1_3_2_1_48_1","volume-title":"Dreamfusion: Text-to-3d using 2d diffusion. In ICLR.","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T Barron, and Ben Mildenhall. 2022. Dreamfusion: Text-to-3d using 2d diffusion. In ICLR."},{"key":"e_1_3_2_1_49_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_2_1_50_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_52_1","volume-title":"Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In CVPR.","author":"Ruiz Nataniel","year":"2023","unstructured":"Nataniel Ruiz, Yuanzhen Li, Varun Jampani, Yael Pritch, Michael Rubinstein, and Kfir Aberman. 2023. Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In CVPR."},{"key":"e_1_3_2_1_53_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al., 2022. Photorealistic text-to-image diffusion models with deep language understanding. In NeurIPS."},{"key":"e_1_3_2_1_54_1","volume-title":"Runming He, Hao Liang, Meiyi Qiang, Zimo Meng, Zhengyang Zhao, Bohan Zeng, Zhengzhou Zhu, Bin Cui, et al.","author":"Shen Chengyu","year":"2025","unstructured":"Chengyu Shen, Zhen Hao Wong, Runming He, Hao Liang, Meiyi Qiang, Zimo Meng, Zhengyang Zhao, Bohan Zeng, Zhengzhou Zhu, Bin Cui, et al., 2025. Let's Verify Math Questions Step by Step. arXiv preprint arXiv:2505.13903 (2025)."},{"key":"e_1_3_2_1_55_1","volume-title":"Mavors: Multi-granularity video representation for multimodal large language model. arXiv preprint arXiv:2504.10068","author":"Shi Yang","year":"2025","unstructured":"Yang Shi, Jiaheng Liu, Yushuo Guan, Zhenhua Wu, Yuanxing Zhang, Zihao Wang, Weihong Lin, Jingyun Hua, Zekun Wang, Xinlong Chen, et al., 2025a. Mavors: Multi-granularity video representation for multimodal large language model. arXiv preprint arXiv:2504.10068 (2025)."},{"key":"e_1_3_2_1_56_1","unstructured":"Yang Shi Huanqian Wang Wulin Xie Huanyao Zhang Lijie Zhao Yi-Fan Zhang Xinfeng Li Chaoyou Fu Zhuoer Wen Wenting Liu et al. 2025b. MME-VideoOCR: Evaluating OCR-Based Capabilities of Multimodal LLMs in Video Scenarios. arXiv preprint arXiv:2505.21333 (2025)."},{"key":"e_1_3_2_1_57_1","volume-title":"Vincent YF Tan, and Song Bai","author":"Shi Yujun","year":"2023","unstructured":"Yujun Shi, Chuhui Xue, Jiachun Pan, Wenqing Zhang, Vincent YF Tan, and Song Bai. 2023. Dragdiffusion: Harnessing diffusion models for interactive point-based image editing. arXiv preprint arXiv:2306.14435 (2023)."},{"key":"e_1_3_2_1_58_1","volume-title":"Irina Blok, Huiwen Chang, Jarred Barber, Lu Jiang, Glenn Entis, Yuanzhen Li, et al.","author":"Sohn Kihyuk","year":"2023","unstructured":"Kihyuk Sohn, Nataniel Ruiz, Kimin Lee, Daniel Castro Chin, Irina Blok, Huiwen Chang, Jarred Barber, Lu Jiang, Glenn Entis, Yuanzhen Li, et al., 2023. StyleDrop: Text-to-Image Generation in Any Style. arXiv preprint arXiv:2306.00983 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020a. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_60_1","unstructured":"Yang Song Jascha Sohl-Dickstein Diederik P Kingma Abhishek Kumar Stefano Ermon and Ben Poole. 2020b. Score-based generative modeling through stochastic differential equations. In ICLR."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Nontawat Tritrong Pitchaporn Rewatbowornwong and Supasorn Suwajanakorn. 2021. Repurposing gans for one-shot semantic part segmentation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00445"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Narek Tumanyan Michal Geyer Shai Bagon and Tali Dekel. 2023. Plug-and-play diffusion features for text-driven image-to-image translation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_2_1_63_1","unstructured":"Arash Vahdat Karsten Kreis and Jan Kautz. 2021. Score-based generative modeling in latent space. In NeurIPS."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Yuri Viazovetskyi Vladimir Ivashkin and Evgeny Kashin. 2020. Stylegan2 distillation for feed-forward image manipulation. In ECCV.","DOI":"10.1007\/978-3-030-58542-6_11"},{"key":"e_1_3_2_1_65_1","volume-title":"Internvid: A large-scale video-text dataset for multimodal understanding and generation. In ICLR.","author":"Wang Yi","year":"2024","unstructured":"Yi Wang, Yinan He, Yizhuo Li, Kunchang Li, Jiashuo Yu, Xin Ma, Xinhao Li, Guo Chen, Xinyuan Chen, Yaohui Wang, et al., 2024. Internvid: A large-scale video-text dataset for multimodal understanding and generation. In ICLR."},{"key":"e_1_3_2_1_66_1","volume-title":"Yuchao Gu, Yufei Shi, Wynne Hsu, Ying Shan, Xiaohu Qie, and Mike Zheng Shou.","author":"Wu Jay Zhangjie","year":"2023","unstructured":"Jay Zhangjie Wu, Yixiao Ge, Xintao Wang, Stan Weixian Lei, Yuchao Gu, Yufei Shi, Wynne Hsu, Ying Shan, Xiaohu Qie, and Mike Zheng Shou. 2023. Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. In CVPR."},{"key":"e_1_3_2_1_67_1","volume-title":"Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs. ICML","author":"Yang Ling","year":"2024","unstructured":"Ling Yang, Zhaochen Yu, Chenlin Meng, Minkai Xu, Stefano Ermon, and Bin Cui. 2024. Mastering Text-to-Image Diffusion: Recaptioning, Planning, and Generating with Multimodal LLMs. ICML (2024)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3531390"},{"key":"e_1_3_2_1_69_1","volume-title":"WideRange4D: Enabling High-Quality 4D Reconstruction with Wide-Range Movements and Scenes. arXiv preprint arXiv:2503.13435","author":"Yang Ling","year":"2025","unstructured":"Ling Yang, Kaixin Zhu, Juanxi Tian, Bohan Zeng, Mingbao Lin, Hongjuan Pei, Wentao Zhang, and Shuicheng Yan. 2025b. WideRange4D: Enabling High-Quality 4D Reconstruction with Wide-Range Movements and Scenes. arXiv preprint arXiv:2503.13435 (2025)."},{"key":"e_1_3_2_1_70_1","volume-title":"IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models. arXiv preprint arXiv:2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models. arXiv preprint arXiv:2308.06721 (2023)."},{"key":"e_1_3_2_1_71_1","volume-title":"Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al.","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu Karagol Ayan, et al., 2022. Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)."},{"key":"e_1_3_2_1_72_1","volume-title":"IPDreamer: Appearance-Controllable 3D Object Generation with Image Prompts. arXiv preprint arXiv:2310.05375","author":"Zeng Bohan","year":"2023","unstructured":"Bohan Zeng, Shanglin Li, Yutang Feng, Hong Li, Sicheng Gao, Jiaming Liu, Huaxia Li, Xu Tang, Jianzhuang Liu, and Baochang Zhang. 2023. IPDreamer: Appearance-Controllable 3D Object Generation with Image Prompts. arXiv preprint arXiv:2310.05375 (2023)."},{"key":"e_1_3_2_1_73_1","unstructured":"Bohan Zeng Ling Yang Siyu Li Jiaming Liu Zixiang Zhang Juanxi Tian Kaixin Zhu Yongzhen Guo Fu-Yun Wang Minkai Xu et al. 2024. Trans4d: Realistic geometry-aware transition for compositional text-to-4d synthesis. arXiv preprint arXiv:2410.07155 (2024)."},{"key":"e_1_3_2_1_74_1","volume-title":"MagicBrush: A Manually Annotated Dataset for Instruction-Guided Image Editing. arXiv preprint arXiv:2306.10012","author":"Zhang Kai","year":"2023","unstructured":"Kai Zhang, Lingbo Mo, Wenhu Chen, Huan Sun, and Yu Su. 2023a. MagicBrush: A Manually Annotated Dataset for Instruction-Guided Image Editing. arXiv preprint arXiv:2306.10012 (2023)."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"crossref","unstructured":"Lvmin Zhang Anyi Rao and Maneesh Agrawala. 2023b. Adding conditional control to text-to-image diffusion models. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_76_1","volume-title":"Ssr-encoder: Encoding selective subject representation for subject-driven generation. In CVPR.","author":"Zhang Yuxuan","year":"2024","unstructured":"Yuxuan Zhang, Jiaming Liu, Yiren Song, Rui Wang, Hao Tang, Jinpeng Yu, Huaxia Li, Xu Tang, Yao Hu, Han Pan, et al., 2024a. Ssr-encoder: Encoding selective subject representation for subject-driven generation. In CVPR."},{"key":"e_1_3_2_1_77_1","unstructured":"Yuechen Zhang Jinbo Xing Eric Lo and Jiaya Jia. 2024b. Real-world image variation by aligning diffusion inversion chain. In NeurIPS."},{"key":"e_1_3_2_1_78_1","unstructured":"Jun-Yan Zhu Taesung Park Phillip Isola and Alexei A Efros. 2017. Unpaired image-to-image translation using cycle-consistent adversarial networks. In ICCV."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758205","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:59:43Z","timestamp":1765310383000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758205"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":78,"alternative-id":["10.1145\/3746027.3758205","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758205","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}