{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:20:46Z","timestamp":1773246046056,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Project of China","award":["2022ZD0160101"],"award-info":[{"award-number":["2022ZD0160101"]}]},{"name":"NSFC","award":["No. 62272411"],"award-info":[{"award-number":["No. 62272411"]}]},{"name":"Research funding from FinVolution Group"},{"name":"Key Research and Development Projects in Zhejiang Province","award":["No. 2024C01106"],"award-info":[{"award-number":["No. 2024C01106"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681552","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"846-855","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Fact :Teaching MLLMs with &lt;u&gt;Fa&lt;\/u&gt;ithful, &lt;u&gt;C&lt;\/u&gt;oncise and &lt;u&gt;T&lt;\/u&gt;ransferable Rationales"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9705-5398","authenticated-orcid":false,"given":"Minghe","family":"Gao","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5750-6448","authenticated-orcid":false,"given":"Shuang","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1161-8546","authenticated-orcid":false,"given":"Liang","family":"Pang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8276-3620","authenticated-orcid":false,"given":"Yuan","family":"Yao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5513-911X","authenticated-orcid":false,"given":"Jisheng","family":"Dang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5988-7609","authenticated-orcid":false,"given":"Wenqiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2258-1291","authenticated-orcid":false,"given":"Juncheng","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7356-9711","authenticated-orcid":false,"given":"Siliang","family":"Tang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9017-2508","authenticated-orcid":false,"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TallyQA: Answering Complex Counting Questions. 
arXiv preprint arXiv:1810.12440","author":"Acharya Manoj","year":"2018","unstructured":"Manoj Acharya, Kushal Kafle, and Christopher Kanan. 2018. TallyQA: Answering Complex Counting Questions. arXiv preprint arXiv:1810.12440 (2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Oh (Eds.)","volume":"35","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob L Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikoaj Bikowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 23716--23736."},{"key":"e_1_3_2_1_3_1","unstructured":"Anas Awadalla Irena Gao Josh Gardner Jack Hessel Yusuf Hanafy Wanrong Zhu Kalyani Marathe Yonatan Bitton Samir Gadre Shiori Sagawa Jenia Jitsev Simon Kornblith PangWei Koh Gabriel Ilharco MitchellWortsman and Ludwig Schmidt. 2023. OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models. arXiv:2308.01390 [cs.CV]"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01844"},{"key":"e_1_3_2_1_5_1","unstructured":"Jun Chen Deyao Zhu Xiaoqian Shen Xiang Li Zechun Liu Pengchuan Zhang Raghuraman Krishnamoorthi Vikas Chandra Yunyang Xiong and Mohamed Elhoseiny. 2023. MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478 [cs.CV]"},{"key":"e_1_3_2_1_6_1","unstructured":"HyungWon Chung Le Hou S. 
Longpre Barret Zoph Yi Tay William Fedus Eric Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma Albert Webson Shixiang Shane Gu Zhuyun Dai Mirac Suzgun Xinyun Chen Aakanksha Chowdhery Dasha Valter Sharan Narang Gaurav Mishra Adams Wei Yu Vincent Zhao Yanping Huang Andrew M. Dai Hongkun Yu Slav Petrov Ed Huai hsin Chi Jeff Dean Jacob Devlin Adam Roberts Denny Zhou Quoc V. Le and JasonWei. 2022. Scaling Instruction-Finetuned Language Models. ArXiv abs\/2210.11416 (2022). https:\/\/api.semanticscholar.org\/CorpusID:253018554"},{"key":"e_1_3_2_1_7_1","volume-title":"Levine (Eds.)","volume":"36","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, DONGXU LI, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 49250-- 49267."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00261"},{"key":"e_1_3_2_1_9_1","volume-title":"DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs. arXiv:1903.00161 [cs.CL]","author":"Dua Dheeru","year":"2019","unstructured":"Dheeru Dua, Yizhong Wang, Pradeep Dasigi, Gabriel Stanovsky, Sameer Singh, and Matt Gardner. 2019. DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs. arXiv:1903.00161 [cs.CL]"},{"key":"e_1_3_2_1_10_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv:2306.13394 [cs.CV]","author":"Fu Chaoyou","year":"2024","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, YunshengWu, and Rongrong Ji. 2024. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. 
arXiv:2306.13394 [cs.CV]"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Roei Herzig Alon Mendelson Leonid Karlinsky Assaf Arbelle Rogerio Feris Trevor Darrell and Amir Globerson. 2023. Incorporating Structured Representations into Pretrained Vision & Language Models Using Scene Graphs. arXiv:2305.06343 [cs.CV]","DOI":"10.18653\/v1\/2023.emnlp-main.870"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Cheng-Yu Hsieh Chun-Liang Li Chih-Kuan Yeh Hootan Nakhost Yasuhisa Fujii Alexander Ratner Ranjay Krishna Chen-Yu Lee and Tomas Pfister. 2023. Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes. arXiv:2305.02301 [cs.CL]","DOI":"10.18653\/v1\/2023.findings-acl.507"},{"key":"e_1_3_2_1_15_1","unstructured":"Yushi Hu Otilia Stretcu Chun-Ta Lu Krishnamurthy Viswanathan Kenji Hata Enming Luo Ranjay Krishna and Ariel Fuxman. 2023. Visual Program Distillation: Distilling Tools and Programmatic Reasoning into Vision-Language Models. arXiv:2312.03052 [cs.CV]"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Robin Jia and Percy Liang. 2017. Adversarial Examples for Evaluating Reading Comprehension Systems. arXiv:1707.07328 [cs.CL]","DOI":"10.18653\/v1\/D17-1215"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Brihi Joshi Ziyi Liu Sahana Ramnath Aaron Chan Zhewei Tong Shaoliang Nie Qifan Wang Yejin Choi and Xiang Ren. 2023. Are Machine Rationales (Not) Useful to Humans? Measuring and Improving Human Utility of Free-Text Rationales. 
arXiv:2305.07095 [cs.CL]","DOI":"10.18653\/v1\/2023.acl-long.392"},{"key":"e_1_3_2_1_19_1","unstructured":"Bohao Li Rui Wang Guangzhi Wang Yuying Ge Yixiao Ge and Ying Shan. 2023. SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension. arXiv:2307.16125 [cs.CL] https:\/\/arxiv.org\/abs\/2307.16125"},{"key":"e_1_3_2_1_20_1","unstructured":"Bo Li Peiyuan Zhang Jingkang Yang Yuanhan Zhang Fanyi Pu and Ziwei Liu. 2023. OtterHD: A High-Resolution Multi-modality Model. arXiv:2311.04219 [cs.CV]"},{"key":"e_1_3_2_1_21_1","unstructured":"Jiazheng Li Lin Gui Yuxiang Zhou David West Cesare Aloisi and Yulan He. 2023. Distilling ChatGPT for Explainable Automated Student Answer Assessment. arXiv:2305.12962 [cs.CL]"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730--19742. https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_2_1_23_1","volume-title":"Evaluating Object Hallucination in Large Vision-Language Models. In Conference on Empirical Methods in Natural Language Processing. https:\/\/api.semanticscholar.org\/CorpusID:258740697","author":"Li Yifan","year":"2023","unstructured":"Yifan Li, Yifan Du, Kun Zhou, Jinpeng Wang, Wayne Xin Zhao, and Ji-Rong Wen. 2023. Evaluating Object Hallucination in Large Vision-Language Models. In Conference on Empirical Methods in Natural Language Processing. 
https:\/\/api.semanticscholar.org\/CorpusID:258740697"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00566"},{"key":"e_1_3_2_1_26_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. arXiv:2310.03744 [cs.CV]"},{"key":"e_1_3_2_1_27_1","volume-title":"Levine (Eds.)","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 34892--34916."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Yuan Liu Haodong Duan Yuanhan Zhang Bo Li Songyang Zhang Wangbo Zhao Yike Yuan Jiaqi Wang Conghui He Ziwei Liu Kai Chen and Dahua Lin. 2024. MMBench: Is Your Multi-modal Model an All-around Player? arXiv:2307.06281 [cs.CV] https:\/\/arxiv.org\/abs\/2307.06281","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01050"},{"key":"e_1_3_2_1_30_1","volume-title":"Peters","author":"Marasovic Ana","year":"2022","unstructured":"Ana Marasovic, Iz Beltagy, Doug Downey, and Matthew E. Peters. 2022. Few-Shot Self-Rationalization with Natural Language Prompts. arXiv:2111.08284 [cs.CL]"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_32_1","unstructured":"Arindam Mitra Luciano Del Corro Shweti Mahajan Andres Codas Clarisse Simoes Sahaj Agarwal Xuxi Chen Anastasia Razdaibiedina Erik Jones Kriti Aggarwal Hamid Palangi Guoqing Zheng Corby Rosset Hamed Khanpour and Ahmed Awadallah. 2023. Orca 2: Teaching Small Language Models How to Reason. 
arXiv:2311.11045 [cs.AI]"},{"key":"e_1_3_2_1_33_1","unstructured":"OpenAI. [n. d.]. ChatGPT. https:\/\/openai.com\/chatgpt."},{"key":"e_1_3_2_1_34_1","unstructured":"OpenAI Josh Achiam Steven Adler Sandhini Agarwal"},{"key":"e_1_3_2_1_35_1","volume-title":"Logic-LM: Empowering Large Language Models with Symbolic Solvers for Faithful Logical Reasoning. arXiv:2305.12295 [cs.CL]","author":"Pan Liangming","year":"2023","unstructured":"Liangming Pan, Alon Albalak, Xinyi Wang, and William Yang Wang. 2023. Logic-LM: Empowering Large Language Models with Symbolic Solvers for Faithful Logical Reasoning. arXiv:2305.12295 [cs.CL]"},{"key":"e_1_3_2_1_36_1","volume-title":"Paul Pu Liang, Ximing Lu, Peter West, Youngjae Yu, Qiuyuan Huang, Jianfeng Gao, Ali Farhadi, and Yejin Choi.","author":"Park Jae Sung","year":"2023","unstructured":"Jae Sung Park, Jack Hessel, Khyathi Raghavi Chandu, Paul Pu Liang, Ximing Lu, Peter West, Youngjae Yu, Qiuyuan Huang, Jianfeng Gao, Ali Farhadi, and Yejin Choi. 2023. Localized Symbolic Knowledge Distillation for Visual Commonsense Models. https:\/\/api.semanticscholar.org\/CorpusID:266149843"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research","volume":"31227","author":"Shi Freda","year":"2023","unstructured":"Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H. Chi, Nathanael Sch\u00e4rli, and Denny Zhou. 2023. Large Language Models Can Be Easily Distracted by Irrelevant Context. In Proceedings of the 40th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 31210--31227. 
https:\/\/proceedings.mlr.press\/v202\/shi23a.html"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_2_1_40_1","volume-title":"Vipergpt: Visual inference via python execution for reasoning. arXiv preprint arXiv:2303.08128","author":"Sur\u00eds D\u00eddac","year":"2023","unstructured":"D\u00eddac Sur\u00eds, Sachit Menon, and Carl Vondrick. 2023. Vipergpt: Visual inference via python execution for reasoning. arXiv preprint arXiv:2303.08128 (2023)."},{"key":"e_1_3_2_1_41_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 [cs.CL]"},{"key":"e_1_3_2_1_42_1","volume-title":"Advances in Neural Information Processing Systems","author":"Turpin Miles","unstructured":"Miles Turpin, Julian Michael, Ethan Perez, and Samuel Bowman. 2023. Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. 
Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 74952--74965."},{"key":"e_1_3_2_1_43_1","volume-title":"Jianfeng Wang, Kevin Lin, Zhengyuan Yang, Lijuan Wang, and Mike Zheng Shou.","author":"Wang Alex Jinpeng","year":"2024","unstructured":"Alex Jinpeng Wang, Linjie Li, Kevin Qinghong Lin, Jianfeng Wang, Kevin Lin, Zhengyuan Yang, Lijuan Wang, and Mike Zheng Shou. 2024. COSMO: COntrastive Streamlined MultimOdal Model with Interleaved Pre-Training. arXiv:2401.00849 [cs.CV]"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Siyuan Wang Zhongyu Wei Yejin Choi and Xiang Ren. 2024. Can LLMs Reason with Rules? Logic Scaffolding for Stress-Testing and Improving LLMs. arXiv:2402.11442 [cs.CL]","DOI":"10.18653\/v1\/2024.acl-long.406"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"e_1_3_2_1_46_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qi Qian Ji Zhang Fei Huang and Jingren Zhou. 2024. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arXiv:2304.14178 [cs.CL]"},{"key":"e_1_3_2_1_47_1","unstructured":"Qinghao Ye Haiyang Xu Jiabo Ye Ming Yan Anwen Hu Haowei Liu Qi Qian Ji Zhang Fei Huang and Jingren Zhou. 2023. mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration. arXiv:2311.04257 [cs.CL]"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612520"},{"key":"e_1_3_2_1_49_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. arXiv:2304.10592 [cs.CV]"},{"key":"e_1_3_2_1_50_1","unstructured":"Jinguo Zhu Xiaohan Ding Yixiao Ge Yuying Ge Sijie Zhao Hengshuang Zhao Xiaohua Wang and Ying Shan. 2023. 
VL-GPT: A Generative Pretrained Transformer for Vision and Language Understanding and Generation. arXiv:2312.09251 [cs.CV]"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681552","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681552","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681552"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":50,"alternative-id":["10.1145\/3664647.3681552","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681552","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}