{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T19:19:50Z","timestamp":1778267990949,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Research Foundation, Singapore","award":["Industry Alignment Fund ? Pre-positioning (IAF-PP)"],"award-info":[{"award-number":["Industry Alignment Fund ? Pre-positioning (IAF-PP)"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102070,62220106008,62306065"],"award-info":[{"award-number":["62102070,62220106008,62306065"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681656","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"7734-7743","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":28,"title":["GalleryGPT: Analyzing Paintings with Large Multimodal Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9714-8738","authenticated-orcid":false,"given":"Yi","family":"Bin","sequence":"first","affiliation":[{"name":"Tongji University &amp; National University of Singapore, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8881-5772","authenticated-orcid":false,"given":"Wenhao","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2945-1107","authenticated-orcid":false,"given":"Yujuan","family":"Ding","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Hong Kong SAR, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8430-4954","authenticated-orcid":false,"given":"Zhiqiang","family":"Hu","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9318-0084","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Tongji University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5070-4511","authenticated-orcid":false,"given":"Yang","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6565-7511","authenticated-orcid":false,"given":"See-Kiong","family":"Ng","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2999-2088","authenticated-orcid":false,"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"Tongji University &amp; University of Electronic Science and Technology of China, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473","author":"Bahdanau Dzmitry","year":"2014","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)."},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond.","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475173"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548184"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00716"},{"key":"e_1_3_2_1_6_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings, Part IV 12","author":"Carneiro Gustavo","year":"2012","unstructured":"Gustavo Carneiro, Nuno Pinho Da Silva, Alessio Del Bue, and Jo ao Paulo Costeira. 2012. Artistic image classification: An analysis on the printart database. In Computer Vision--ECCV 2012: 12th European Conference on Computer Vision, Florence, Italy, October 7--13, 2012, Proceedings, Part IV 12. Springer, 143--157."},{"key":"e_1_3_2_1_8_1","volume-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478","author":"Chen Jun","year":"2023","unstructured":"Jun Chen, Deyao Zhu, Xiaoqian Shen, Xiang Li, Zechun Liu, Pengchuan Zhang, Raghuraman Krishnamoorthi, Vikas Chandra, Yunyang Xiong, and Mohamed Elhoseiny. 2023. Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arXiv preprint arXiv:2306.15195 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793","author":"Chen Lin","year":"2023","unstructured":"Lin Chen, Jisong Li, Xiaoyi Dong, Pan Zhang, Conghui He, Jiaqi Wang, Feng Zhao, and Dahua Lin. 2023. Sharegpt4v: Improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)."},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research, Vol. 24, 240 (2023), 1--113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00444"},{"key":"e_1_3_2_1_13_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2024. Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_15_1","volume-title":"A survey on rag meets llms: Towards retrieval-augmented large language models. arXiv preprint arXiv:2405.06211","author":"Ding Yujuan","year":"2024","unstructured":"Yujuan Ding, Wenqi Fan, Liangbo Ning, Shijie Wang, Hengyun Li, Dawei Yin, Tat-Seng Chua, and Qing Li. 2024. A survey on rag meets llms: Towards retrieval-augmented large language models. arXiv preprint arXiv:2405.06211 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3651232"},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, et al. 2020. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0--0.","author":"Garcia Noa","year":"2018","unstructured":"Noa Garcia and George Vogiatzis. 2018. How to read paintings: semantic art understanding with multi-modal retrieval. In Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0--0."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-66096-3_8"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1167\/16.12.326"},{"key":"e_1_3_2_1_21_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_22_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. 2021. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1147"},{"key":"e_1_3_2_1_24_1","volume-title":"https:\/\/gemini.google.com [Accessed","author":"Google Inc. 2023. Gemini.","year":"2024","unstructured":"Google Inc. 2023. Gemini. https:\/\/gemini.google.com [Accessed: 4 Feb, 2024]."},{"key":"e_1_3_2_1_25_1","volume-title":"https:\/\/chat.openai.com [Accessed","author":"OpenAI Inc. 2022. ChatGPT.","year":"2024","unstructured":"OpenAI Inc. 2022. ChatGPT. https:\/\/chat.openai.com [Accessed: 4 Feb, 2024]."},{"key":"e_1_3_2_1_26_1","volume-title":"https:\/\/openai.com\/research\/gpt-4v-system-card [Accessed","author":"OpenAI Inc. 2023. GPT-4V(ision).","year":"2024","unstructured":"OpenAI Inc. 2023. GPT-4V(ision). https:\/\/openai.com\/research\/gpt-4v-system-card [Accessed: 4 Feb, 2024]."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2008.923513"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_1_30_1","volume-title":"Joost Van de Weijer, and Michael Felsberg","author":"Khan Fahad Shahbaz","year":"2014","unstructured":"Fahad Shahbaz Khan, Shida Beigpour, Joost Van de Weijer, and Michael Felsberg. 2014. Painting-91: a large scale database for computational painting categorization. Machine vision and applications, Vol. 25 (2014), 1385--1397."},{"key":"e_1_3_2_1_31_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, Vol. 25 (2012)."},{"key":"e_1_3_2_1_32_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_36_1","volume-title":"The artbench dataset: Benchmarking generative models with artworks. arXiv preprint arXiv:2206.11404","author":"Liao Peiyuan","year":"2022","unstructured":"Peiyuan Liao, Xiuyu Li, Xihui Liu, and Kurt Keutzer. 2022. The artbench dataset: Benchmarking generative models with artworks. arXiv preprint arXiv:2206.11404 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101811"},{"key":"e_1_3_2_1_38_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"Cong-Duy Nguyen, See-Kiong Ng, and Luu Anh Tuan.","author":"Nguyen Thong","year":"2024","unstructured":"Thong Nguyen, Yi Bin, Junbin Xiao, Leigang Qu, Yicong Li, Jay Zhangjie Wu, Cong-Duy Nguyen, See-Kiong Ng, and Luu Anh Tuan. 2024. Video-Language Understanding: A Survey from Model Architecture, Model Training, and Data Perspectives. arXiv preprint arXiv:2406.05615 (2024)."},{"key":"e_1_3_2_1_41_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems Vol. 35 (2022) 27730--27744."},{"key":"e_1_3_2_1_42_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_44_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_45_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_46_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_47_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/1670671.1670672"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350972"},{"key":"e_1_3_2_1_50_1","volume-title":"Math-LLaVA: Bootstrapping Mathematical Reasoning for Multimodal Large Language Models. arXiv preprint arXiv:2406.17294","author":"Shi Wenhao","year":"2024","unstructured":"Wenhao Shi, Zhiqiang Hu, Yi Bin, Junhua Liu, Yang Yang, See-Kiong Ng, Lidong Bing, and Roy Ka-Wei Lee. 2024. Math-LLaVA: Bootstrapping Mathematical Reasoning for Multimodal Large Language Models. arXiv preprint arXiv:2406.17294 (2024)."},{"key":"e_1_3_2_1_51_1","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/lmsys.org\/blog\/2023-03--30-vicuna [Accessed","author":"Team The Vicuna","year":"2023","unstructured":"The Vicuna Team. 2023. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/lmsys.org\/blog\/2023-03--30-vicuna [Accessed: 4 Feb, 2024]."},{"key":"e_1_3_2_1_52_1","volume-title":"Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, et al.","author":"Thoppilan Romal","year":"2022","unstructured":"Romal Thoppilan, Daniel De Freitas, Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, et al. 2022. Lamda: Language models for dialog applications. arXiv preprint arXiv:2201.08239 (2022)."},{"key":"e_1_3_2_1_53_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_54_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_56_1","unstructured":"Aiyuan Yang Bin Xiao Bingning Wang Borong Zhang Ce Bian Chao Yin Chenxu Lv Da Pan Dian Wang Dong Yan et al. 2023. Baichuan 2: Open large-scale language models. arXiv preprint arXiv:2309.10305 (2023)."},{"key":"e_1_3_2_1_57_1","volume-title":"Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ R Salakhutdinov, and Quoc V Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_58_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681656","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681656","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681656"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3681656","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681656","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}