{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T01:46:42Z","timestamp":1781747202606,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Key Research and Development Projects in Zhejiang Province","award":["2024C01106"],"award-info":[{"award-number":["2024C01106"]}]},{"name":"National Key Research and Development Project of China","award":["2022ZD0160101"],"award-info":[{"award-number":["2022ZD0160101"]}]},{"name":"NSFC","award":["62272411"],"award-info":[{"award-number":["62272411"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681488","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"7346-7355","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":35,"title":["WorldGPT: Empowering LLM as Multimodal World Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-6590-365X","authenticated-orcid":false,"given":"Zhiqi","family":"Ge","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8800-1354","authenticated-orcid":false,"given":"Hongzhe","family":"Huang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3633-0413","authenticated-orcid":false,"given":"Mingze","family":"Zhou","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2258-1291","authenticated-orcid":false,"given":"Juncheng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3131-6916","authenticated-orcid":false,"given":"Guoming","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7356-9711","authenticated-orcid":false,"given":"Siliang","family":"Tang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9017-2508","authenticated-orcid":false,"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01767"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_1_4_1","volume-title":"MUVO: A Multimodal Generative World Model for Autonomous Driving with Geometric Representations. arXiv preprint arXiv:2311.11762","author":"Bogdoll Daniel","year":"2023","unstructured":"Daniel Bogdoll, Yitian Yang, and J Marius Z\u00f6llner. 2023. MUVO: A Multimodal Generative World Model for Autonomous Driving with Geometric Representations. arXiv preprint arXiv:2311.11762 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"InstructPix2Pix: Learning to Follow Image Editing Instructions. arXiv preprint arXiv:2211.09800","author":"Brooks Tim","year":"2022","unstructured":"Tim Brooks, Aleksander Holynski, and Alexei A Efros. 2022. InstructPix2Pix: Learning to Follow Image Editing Instructions. arXiv preprint arXiv:2211.09800 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"Genie: Generative Interactive Environments. arxiv: 2402.15391 [cs.LG]","author":"Bruce Jake","year":"2024","unstructured":"Jake Bruce, Michael Dennis, Ashley Edwards, Jack Parker-Holder, Yuge Shi, Edward Hughes, Matthew Lai, Aditi Mavalankar, Richie Steigerwald, Chris Apps, Yusuf Aytar, Sarah Bechtle, Feryal Behbahani, Stephanie Chan, Nicolas Heess, Lucy Gonzalez, Simon Osindero, Sherjil Ozair, Scott Reed, Jingwei Zhang, Konrad Zolna, Jeff Clune, Nando de Freitas, Satinder Singh, and Tim Rockt\u00e4schel. 2024. Genie: Generative Interactive Environments. arxiv: 2402.15391 [cs.LG]"},{"key":"e_1_3_2_1_7_1","volume-title":"StableVideo: Text-driven Consistency-aware Diffusion Video Editing. arXiv preprint arXiv:2308.09592","author":"Chai Wenhao","year":"2023","unstructured":"Wenhao Chai, Xun Guo, Gaoang Wang, and Yan Lu. 2023. StableVideo: Text-driven Consistency-aware Diffusion Video Editing. arXiv preprint arXiv:2308.09592 (2023)."},{"key":"e_1_3_2_1_8_1","unstructured":"Yi Chen Yuying Ge Yixiao Ge Mingyu Ding Bohao Li Rui Wang Ruifeng Xu Ying Shan and Xihui Liu. 2023. EgoPlan-Bench: Benchmarking Egocentric Embodied Planning with Multimodal Large Language Models. arxiv: 2312.06722 [cs.CV]"},{"key":"e_1_3_2_1_9_1","unstructured":"Lijie Fan Kaifeng Chen Dilip Krishnan Dina Katabi Phillip Isola and Yonglong Tian. 2023. Scaling Laws of Synthetic Images for Model Training. .. for Now. arxiv: 2312.04567 [cs.CV]"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"e_1_3_2_1_12_1","volume-title":"World models. arXiv preprint arXiv:1803.10122","author":"Ha David","year":"2018","unstructured":"David Ha and J\u00fcrgen Schmidhuber. 2018. World models. arXiv preprint arXiv:1803.10122 (2018)."},{"key":"e_1_3_2_1_13_1","volume-title":"Dream to Control: Learning Behaviors by Latent Imagination. arXiv preprint arXiv:1912.01603","author":"Hafner Danijar","year":"2019","unstructured":"Danijar Hafner, Timothy Lillicrap, Jimmy Ba, and Mohammad Norouzi. 2019. Dream to Control: Learning Behaviors by Latent Imagination. arXiv preprint arXiv:1912.01603 (2019)."},{"key":"e_1_3_2_1_14_1","volume-title":"Mastering Atari with Discrete World Models. In International Conference on Learning Representations.","author":"Hafner Danijar","year":"2020","unstructured":"Danijar Hafner, Timothy P Lillicrap, Mohammad Norouzi, and Jimmy Ba. 2020. Mastering Atari with Discrete World Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_15_1","volume-title":"Mastering diverse domains through world models. arXiv preprint arXiv:2301.04104","author":"Hafner Danijar","year":"2023","unstructured":"Danijar Hafner, Jurgis Pasukonis, Jimmy Ba, and Timothy Lillicrap. 2023. Mastering diverse domains through world models. arXiv preprint arXiv:2301.04104 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905","author":"Han Jiaming","year":"2023","unstructured":"Jiaming Han, Renrui Zhang, Wenqi Shao, Peng Gao, Peng Xu, Han Xiao, Kaipeng Zhang, Chris Liu, Song Wen, Ziyu Guo, et al. 2023. Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905","author":"Han Jiaming","year":"2023","unstructured":"Jiaming Han, Renrui Zhang, Wenqi Shao, Peng Gao, Peng Xu, Han Xiao, Kaipeng Zhang, Chris Liu, Song Wen, Ziyu Guo, et al. 2023. Imagebind-llm: Multi-modality instruction tuning. arXiv preprint arXiv:2309.03905 (2023)."},{"key":"e_1_3_2_1_18_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey Gritsenko Diederik P Kingma Ben Poole Mohammad Norouzi David J Fleet et al. 2022. Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303 (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"Gaia-1: A generative world model for autonomous driving. arXiv preprint arXiv:2309.17080","author":"Hu Anthony","year":"2023","unstructured":"Anthony Hu, Lloyd Russell, Hudson Yeo, Zak Murez, George Fedoseev, Alex Kendall, Jamie Shotton, and Gianluca Corrado. 2023. Gaia-1: A generative world model for autonomous driving. arXiv preprint arXiv:2309.17080 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","volume-title":"Soar: An architecture for general intelligence. Artificial intelligence","author":"Laird John E","year":"1987","unstructured":"John E Laird, Allen Newell, and Paul S Rosenbloom. 1987. Soar: An architecture for general intelligence. Artificial intelligence, Vol. 33, 1 (1987), 1--64."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4613-2277-1_16"},{"key":"e_1_3_2_1_23_1","article-title":"A path towards autonomous machine intelligence version 0.9. 2, 2022-06--27","volume":"62","author":"LeCun Yann","year":"2022","unstructured":"Yann LeCun. 2022. A path towards autonomous machine intelligence version 0.9. 2, 2022-06--27. Open Review, Vol. 62, 1 (2022).","journal-title":"Open Review"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202). PMLR, 19730--19742."},{"key":"e_1_3_2_1_25_1","volume-title":"Fine-tuning Multimodal LLMs to Follow Zero-shot Demonstrative Instructions. arXiv preprint arXiv:2308.04152","author":"Li Juncheng","year":"2023","unstructured":"Juncheng Li, Kaihang Pan, Zhiqi Ge, Minghe Gao, Hanwang Zhang, Wei Ji, Wenqiao Zhang, Tat-Seng Chua, Siliang Tang, and Yueting Zhuang. 2023. Fine-tuning Multimodal LLMs to Follow Zero-shot Demonstrative Instructions. arXiv preprint arXiv:2308.04152 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-LLaVA: Learning United Visual Representation by Alignment Before Projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Liu Haohe","year":"2023","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D Plumbley. 2023. AudioLDM: Text-to-Audio Generation with Latent Diffusion Models. Proceedings of the International Conference on Machine Learning (2023)."},{"key":"e_1_3_2_1_28_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_2_1_29_1","volume-title":"Language, Audio, and Action. arXiv preprint arXiv:2312.17172","author":"Lu Jiasen","year":"2023","unstructured":"Jiasen Lu, Christopher Clark, Sangho Lee, Zichen Zhang, Savya Khosla, Ryan Marten, Derek Hoiem, and Aniruddha Kembhavi. 2023. Unified-IO 2: Scaling Autoregressive Multimodal Models with Vision, Language, Audio, and Action. arXiv preprint arXiv:2312.17172 (2023)."},{"key":"e_1_3_2_1_30_1","unstructured":"Chenyang Lyu Minghao Wu Longyue Wang Xinting Huang Bingshuai Liu Zefeng Du Shuming Shi and Zhaopeng Tu. 2023. Macaw-LLM: Multi-Modal Language Modeling with Image Audio Video and Text Integration. arxiv: 2306.09093 [cs.CL]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_32_1","volume-title":"Instruction Tuning with GPT-4. arXiv preprint arXiv:2304.03277","author":"Peng Baolin","year":"2023","unstructured":"Baolin Peng, Chunyuan Li, Pengcheng He, Michel Galley, and Jianfeng Gao. 2023. Instruction Tuning with GPT-4. arXiv preprint arXiv:2304.03277 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the AAAI Symposium Series","volume":"2","author":"Romero Oscar J","year":"2023","unstructured":"Oscar J Romero, John Zimmerman, Aaron Steinfeld, and Anthony Tomasic. 2023. Synergistic integration of large language models and cognitive architectures for robust ai: An exploratory analysis. In Proceedings of the AAAI Symposium Series, Vol. 2. 396--405."},{"key":"e_1_3_2_1_36_1","unstructured":"Wenqi Shao Yutao Hu Peng Gao Meng Lei Kaipeng Zhang Fanqing Meng Peng Xu Siyuan Huang Hongsheng Li Yu Qiao and Ping Luo. 2023. Tiny LVLM-eHub: Early Multimodal Experiments with Bard. arxiv: 2308.03729 [cs.CV]"},{"key":"e_1_3_2_1_37_1","unstructured":"Sahand Sharifzadeh Christos Kaplanis Shreya Pathak Dharshan Kumaran Anastasija Ilic Jovana Mitrovic Charles Blundell and Andrea Banino. 2024. Synth^2: Boosting Visual-Language Models with Synthetic Captions and Image Embeddings. arxiv: 2403.07750 [cs.CV]"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings, Part I 14","author":"Sigurdsson Gunnar A","year":"2016","unstructured":"Gunnar A Sigurdsson, G\u00fcl Varol, Xiaolong Wang, Ali Farhadi, Ivan Laptev, and Abhinav Gupta. 2016. Hollywood in homes: Crowdsourcing data collection for activity understanding. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part I 14. Springer, 510--526."},{"key":"e_1_3_2_1_39_1","volume-title":"Griffiths","author":"Sumers Theodore","year":"2023","unstructured":"Theodore Sumers, Shunyu Yao, Karthik Narasimhan, and Thomas L. Griffiths. 2023. Cognitive Architectures for Language Agents. arxiv: 2309.02427 [cs.AI]"},{"key":"e_1_3_2_1_40_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Tang Zineng","year":"2023","unstructured":"Zineng Tang, Ziyi Yang, Chenguang Zhu, Michael Zeng, and Mohit Bansal. 2023. Any-to-Any Generation via Composable Diffusion. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_41_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Drivedreamer: Towards real-world-driven world models for autonomous driving. arXiv preprint arXiv:2309.09777","author":"Wang Xiaofeng","year":"2023","unstructured":"Xiaofeng Wang, Zheng Zhu, Guan Huang, Xinze Chen, and Jiwen Lu. 2023. Drivedreamer: Towards real-world-driven world models for autonomous driving. arXiv preprint arXiv:2309.09777 (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Xiaofeng Wang Zheng Zhu Guan Huang Boyuan Wang Xinze Chen and Jiwen Lu. 2024. WorldDreamer: Towards General World Models for Video Generation via Predicting Masked Tokens. arxiv: 2401.09985 [cs.CV]"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_46_1","volume-title":"Conference on Robot Learning. PMLR, 2226--2240","author":"Wu Philipp","year":"2023","unstructured":"Philipp Wu, Alejandro Escontrela, Danijar Hafner, Pieter Abbeel, and Ken Goldberg. 2023. Daydreamer: World models for physical robot learning. In Conference on Robot Learning. PMLR, 2226--2240."},{"key":"e_1_3_2_1_47_1","volume-title":"Next-gpt: Any-to-any multimodal llm. arXiv preprint arXiv:2309.05519","author":"Wu Shengqiong","year":"2023","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2023. Next-gpt: Any-to-any multimodal llm. arXiv preprint arXiv:2309.05519 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Language Models Meet World Models: Embodied Experiences Enhance Language Models. arXiv preprint arXiv:2305","author":"Xiang Jiannan","year":"2023","unstructured":"Jiannan Xiang, Tianhua Tao, Yi Gu, Tianmin Shu, Zirui Wang, Zichao Yang, and Zhiting Hu. 2023. Language Models Meet World Models: Embodied Experiences Enhance Language Models. arXiv preprint arXiv:2305.10626 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_50_1","volume-title":"Learning Interactive Real-World Simulators. arXiv preprint arXiv:2310.06114","author":"Yang Mengjiao","year":"2023","unstructured":"Mengjiao Yang, Yilun Du, Kamyar Ghasemipour, Jonathan Tompson, Dale Schuurmans, and Pieter Abbeel. 2023. Learning Interactive Real-World Simulators. arXiv preprint arXiv:2310.06114 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548291"},{"key":"e_1_3_2_1_52_1","volume-title":"Levine (Eds.)","volume":"36","author":"Yang Rui","year":"2023","unstructured":"Rui Yang, Lin Song, Yanwei Li, Sijie Zhao, Yixiao Ge, Xiu Li, and Ying Shan. 2023. GPT4Tools: Teaching Large Language Model to Use Tools via Self-instruction. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 71995--72007. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/e393677793767624f2821cec8bdd02f1-Paper-Conference.pdf"},{"key":"e_1_3_2_1_53_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qian Qi Ji Zhang and Fei Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arxiv: 2304.14178 [cs.CL]"},{"key":"e_1_3_2_1_54_1","first-page":"23634","article-title":"Merlot: Multimodal neural script knowledge models","volume":"34","author":"Zellers Rowan","year":"2021","unstructured":"Rowan Zellers, Ximing Lu, Jack Hessel, Youngjae Yu, Jae Sung Park, Jize Cao, Ali Farhadi, and Yejin Choi. 2021. Merlot: Multimodal neural script knowledge models. Advances in Neural Information Processing Systems, Vol. 34 (2021), 23634--23651.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_55_1","volume-title":"Minigpt-5: Interleaved vision-and-language generation via generative vokens. arXiv preprint arXiv:2310.02239","author":"Zheng Kaizhi","year":"2023","unstructured":"Kaizhi Zheng, Xuehai He, and Xin Eric Wang. 2023. Minigpt-5: Interleaved vision-and-language generation via generative vokens. arXiv preprint arXiv:2310.02239 (2023)."},{"key":"e_1_3_2_1_56_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric. P Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arxiv: 2306.05685 [cs.CL]"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"e_1_3_2_1_58_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852","author":"Zhu Bin","year":"2023","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al. 2023. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681488","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681488","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681488"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":59,"alternative-id":["10.1145\/3664647.3681488","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681488","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}