{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,5]],"date-time":"2025-11-05T11:28:57Z","timestamp":1762342137705,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":11,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["No.JQ20023"],"award-info":[{"award-number":["No.JQ20023"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61832002"],"award-info":[{"award-number":["61832002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612665","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"9365-9367","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["mPLUG-Octopus: The Versatile Assistant Empowered by A Modularized End-to-End Multimodal LLM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7977-5540","authenticated-orcid":false,"given":"Qinghao","family":"Ye","sequence":"first","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9442-5912","authenticated-orcid":false,"given":"Haiyang","family":"Xu","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4959-8878","authenticated-orcid":false,"given":"Ming","family":"Yan","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1532-9451","authenticated-orcid":false,"given":"Chenlin","family":"Zhao","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group &amp; Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3204-6607","authenticated-orcid":false,"given":"Junyang","family":"Wang","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group &amp; Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5453-9755","authenticated-orcid":false,"given":"Xiaoshan","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3835-7975","authenticated-orcid":false,"given":"Ji","family":"Zhang","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-5053","authenticated-orcid":false,"given":"Fei","family":"Huang","sequence":"additional","affiliation":[{"name":"DAMO Academy, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0699-3205","authenticated-orcid":false,"given":"Jitao","family":"Sang","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University &amp; Beijing Peng Cheng Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-9665","authenticated-orcid":false,"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. In NeurIPS."},{"key":"e_1_3_2_1_2_1","unstructured":"Edward J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR. OpenReview.net."},{"key":"e_1_3_2_1_3_1","unstructured":"OpenAI. 2022. Introducing chatgpt. https:\/\/openai.com\/blog\/chatgpt."},{"key":"e_1_3_2_1_4_1","volume-title":"High-Resolution Image Synthesis with Latent Diffusion Models. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Rombach Robin","year":"2021","unstructured":"Robin Rombach, A. Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 10674--10685."},{"key":"e_1_3_2_1_5_1","volume-title":"Multi-Task Pre-Training for Plug-and-Play Task-Oriented Dialogue System. In Annual Meeting of the Association for Computational Linguistics.","author":"Su Yixuan","year":"2021","unstructured":"Yixuan Su, Lei Shu, Elman Mansimov, Arshit Gupta, Deng Cai, Yi-An Lai, and Yi Zhang. 2021. Multi-Task Pre-Training for Plug-and-Play Task-Oriented Dialogue System. In Annual Meeting of the Association for Computational Linguistics."},{"key":"e_1_3_2_1_6_1","volume-title":"Chat-PLUG: Open-Domain Generative Dialogue System with Internet-Augmented Instruction Tuning for Digital Human. ArXiv abs\/2304.07849","author":"Tian Junfeng","year":"2023","unstructured":"Junfeng Tian, Hehong Chen, Guohai Xu, Mingshi Yan, Xing Gao, Jianhai Zhang, Chenliang Li, Jiayi Liu, Wenshen Xu, Haiyang Xu, Qiuchen Qian, Wei Wang, Qinghao Ye, Jie Zhang, Ji Zhang, Feiran Huang, and Jingren Zhou. 2023. Chat-PLUG: Open-Domain Generative Dialogue System with Internet-Augmented Instruction Tuning for Digital Human. ArXiv abs\/2304.07849 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. CoRR abs\/2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aur\u00e9lien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. CoRR abs\/2302.13971 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models. CoRR abs\/2303.04671","author":"Wu Chenfei","year":"2023","unstructured":"Chenfei Wu, Shengming Yin, Weizhen Qi, Xiaodong Wang, Zecheng Tang, and Nan Duan. 2023. Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models. CoRR abs\/2303.04671 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action. CoRR abs\/2303.11381","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Jianfeng Wang, Kevin Lin, Ehsan Azarnasab, Faisal Ahmed, Zicheng Liu, Ce Liu, Michael Zeng, and LijuanWang. 2023. MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action. CoRR abs\/2303.11381 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Ji Chao Zhang, and Feiyan Huang","author":"Ye Qinghao","year":"2023","unstructured":"Qinghao Ye, Haiyang Xu, Guohai Xu, Jiabo Ye, Ming Yan, Yi Zhou, Junyan Wang, Anwen Hu, Pengcheng Shi, Yaya Shi, Chenliang Li, Yuanhong Xu, Hehong Chen, Junfeng Tian, Qiang Qi, Ji Chao Zhang, and Feiyan Huang. 2023. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. ArXiv abs\/2304.14178 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"EVA: An Open-Domain Chinese Dialogue System with Large-Scale Generative Pre-Training. ArXiv abs\/2108.01547","author":"Zhou Hao","year":"2021","unstructured":"Hao Zhou, Pei Ke, Zheng Zhang, Yuxian Gu, Yinhe Zheng, Chujie Zheng, Yida Wang, Chen HenryWu, Hao Sun, Xiaocong Yang, BosiWen, Xiaoyan Zhu, Minlie Huang, and Jie Tang. 2021. EVA: An Open-Domain Chinese Dialogue System with Large-Scale Generative Pre-Training. ArXiv abs\/2108.01547 (2021)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612665","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612665","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:04:35Z","timestamp":1755821075000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612665"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":11,"alternative-id":["10.1145\/3581783.3612665","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612665","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}