{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:03:54Z","timestamp":1775815434789,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant 62402158, Grant 72188101 and Grant U22A2094"],"award-info":[{"award-number":["Grant 62402158, Grant 72188101 and Grant U22A2094"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Major Project of Anhui Province","award":["Grant 202203a05020011, Grant 202423k09020001, and Grant 2408085J040"],"award-info":[{"award-number":["Grant 202203a05020011, Grant 202423k09020001, and Grant 2408085J040"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["Grant JZ2024HGTG0309, Grant JZ2024AHST0337, Grant JZ2023YQTD0072"],"award-info":[{"award-number":["Grant JZ2024HGTG0309, Grant JZ2024AHST0337, Grant JZ2023YQTD0072"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755726","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"5814-5823","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Beyond Emotion Recognition: A Multi-Turn Multimodal Emotion Understanding and Reasoning Benchmark"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4090-7494","authenticated-orcid":false,"given":"Jinpeng","family":"Hu","sequence":"first","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5041-1576","authenticated-orcid":false,"given":"Hongchang","family":"Shi","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9873-6735","authenticated-orcid":false,"given":"Chongyuan","family":"Dai","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6451-4877","authenticated-orcid":false,"given":"Zhuo","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6764-3375","authenticated-orcid":false,"given":"Peipei","family":"Song","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China and Institute of Artificial Intelligence (IAI), Hefei Comprehensive National Science Center, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.38094\/jastt20291"},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. 
arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877-1901."},{"key":"e_1_3_2_1_4_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette N Chang, Sungbok Lee, and Shrikanth S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, Vol. 42 (2008), 335-359."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Lin Chen Xilin Wei Jinsong Li Xiaoyi Dong Pan Zhang Yuhang Zang Zehui Chen Haodong Duan Bin Lin Zhenyu Tang et al. 2024. Sharegpt4video: Improving video understanding and generation with better captions. arXiv preprint arXiv:2406.04325 (2024).","DOI":"10.52202\/079017-0614"},{"key":"e_1_3_2_1_6_1","volume-title":"Emotion-LLaMA: Multimodal Emotion Recognition and Reasoning with Instruction Tuning. arXiv preprint arXiv:2406.11161","author":"Cheng Zebang","year":"2024","unstructured":"Zebang Cheng, Zhi-Qi Cheng, Jun-Yan He, Jingdong Sun, Kai Wang, Yuxiang Lin, Zheng Lian, Xiaojiang Peng, and Alexander Hauptmann. 2024a. Emotion-LLaMA: Multimodal Emotion Recognition and Reasoning with Instruction Tuning. arXiv preprint arXiv:2406.11161 (2024)."},{"key":"e_1_3_2_1_7_1","unstructured":"Zesen Cheng Sicong Leng Hang Zhang Yifei Xin Xin Li Guanzheng Chen Yongxin Zhu Wenqi Zhang Ziyang Luo Deli Zhao et al. 2024b. VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs. arXiv preprint arXiv:2406.07476 (2024)."},{"key":"e_1_3_2_1_8_1","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al., 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research, Vol. 24, 240 (2023), 1-113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W15-1204"},{"key":"e_1_3_2_1_10_1","volume-title":"FAF: A novel multimodal emotion recognition approach integrating face, body and text. arXiv preprint arXiv:2211.15425","author":"Fang Zhongyu","year":"2022","unstructured":"Zhongyu Fang, Aoyun He, Qihui Yu, Baopeng Gao, Weiping Ding, Tong Zhang, and Lei Ma. 2022. FAF: A novel multimodal emotion recognition approach integrating face, body and text. arXiv preprint arXiv:2211.15425 (2022)."},{"key":"e_1_3_2_1_11_1","first-page":"82","article-title":"Llms accelerate annotation for medical information extraction. 
In machine learning for health (ML4H)","author":"Goel Akshay","year":"2023","unstructured":"Akshay Goel, Almog Gueta, Omry Gilon, Chang Liu, Sofia Erell, Lan Huong Nguyen, Xiaohong Hao, Bolous Jaber, Shashir Reddy, Rupesh Kartha, et al., 2023. Llms accelerate annotation for medical information extraction. In machine learning for health (ML4H). PMLR, 82-100.","journal-title":"PMLR"},{"key":"e_1_3_2_1_12_1","first-page":"3123","article-title":"The distress analysis interview corpus of human and computer interviews","author":"Gratch Jonathan","year":"2014","unstructured":"Jonathan Gratch, Ron Artstein, Gale M Lucas, Giota Stratou, Stefan Scherer, Angela Nazarian, Rachel Wood, Jill Boberg, David DeVault, Stacy Marsella, et al., 2014. The distress analysis interview corpus of human and computer interviews.. In LREC. Reykjavik, 3123-3128.","journal-title":"LREC. Reykjavik"},{"key":"e_1_3_2_1_13_1","volume-title":"Psycollm: Enhancing llm for psychological understanding and evaluation","author":"Hu Jinpeng","year":"2024","unstructured":"Jinpeng Hu, Tengteng Dong, Luo Gang, Hui Ma, Peng Zou, Xiao Sun, Dan Guo, Xun Yang, and Meng Wang. 2024a. Psycollm: Enhancing llm for psychological understanding and evaluation. IEEE Transactions on Computational Social Systems (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.441"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.320"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.140"},{"key":"e_1_3_2_1_17_1","volume-title":"Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395","author":"Hu Shengding","year":"2024","unstructured":"Shengding Hu, Yuge Tu, Xu Han, Chaoqun He, Ganqu Cui, Xiang Long, Zhi Zheng, Yewei Fang, Yuxiang Huang, Weilin Zhao, et al., 2024b. Minicpm: Unveiling the potential of small language models with scalable training strategies. arXiv preprint arXiv:2404.06395 (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"e_1_3_2_1_20_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024b. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large Multimodal Models. arXiv preprint arXiv:2407.07895","author":"Li Feng","year":"2024","unstructured":"Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, and Chunyuan Li. 2024c. LLaVA-NeXT-Interleave: Tackling Multi-image, Video, and 3D in Large Multimodal Models. arXiv preprint arXiv:2407.07895 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023. Videochat: Chat-centric video understanding. 
arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Self-instructed derived prompt generation meets in-context learning: Unlocking new potential of black-box llms. arXiv preprint arXiv:2409.01552","author":"Li Zhuo","year":"2024","unstructured":"Zhuo Li, Yuhao Du, Jinpeng Hu, Xiang Wan, and Anningzhe Gao. 2024a. Self-instructed derived prompt generation meets in-context learning: Unlocking new potential of black-box llms. arXiv preprint arXiv:2409.01552 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Add-One-In: Incremental Sample Selection for Large Language Models via a Choice-Based Greedy Paradigm. arXiv preprint arXiv:2503.02359","author":"Li Zhuo","year":"2025","unstructured":"Zhuo Li, Yuhao Du, Xiaoqi Jiao, Yiwen Guo, Yuege Feng, Xiang Wan, Anningzhe Gao, and Jinpeng Hu. 2025a. Add-One-In: Incremental Sample Selection for Large Language Models via a Choice-Based Greedy Paradigm. arXiv preprint arXiv:2503.02359 (2025)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3546031"},{"key":"e_1_3_2_1_26_1","volume-title":"Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Bin Zhu, Yang Ye, Munan Ning, Peng Jin, and Li Yuan. 2023. Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"key":"e_1_3_2_1_29_1","volume-title":"Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution. arXiv preprint arXiv:2409.12961","author":"Liu Zuyan","year":"2024","unstructured":"Zuyan Liu, Yuhao Dong, Ziwei Liu, Winston Hu, Jiwen Lu, and Yongming Rao. 2024a. Oryx MLLM: On-Demand Spatial-Temporal Understanding at Arbitrary Resolution. arXiv preprint arXiv:2409.12961 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Mixkit: Free Assets for Video, Music, and Sound Effects. https:\/\/mixkit.co.","year":"2023","unstructured":"Mixkit. 2023. Mixkit: Free Assets for Video, Music, and Sound Effects. https:\/\/mixkit.co."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2070481.2070509"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-020-00655-3"},{"key":"e_1_3_2_1_34_1","volume-title":"Pexels: Free Stock Photos and Videos. https:\/\/www.pexels.com.","year":"2023","unstructured":"Pexels. 2023. Pexels: Free Stock Photos and Videos. https:\/\/www.pexels.com."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1050"},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. 
PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_37_1","volume-title":"Towards empathetic open-domain conversation models: A new benchmark and dataset. arXiv","author":"Rashkin H","year":"2018","unstructured":"H Rashkin, EM Smith, M Li, and YL Boureau. [n.d.]. Towards empathetic open-domain conversation models: A new benchmark and dataset. arXiv 2018. arXiv preprint arXiv:1811.00207 ([n.d.])."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413909"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3183402"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3359045"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611726"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2021.3097002"},{"key":"e_1_3_2_1_43_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al., 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Dreaddit: A reddit dataset for stress analysis in social media. arXiv preprint arXiv:1911.00133","author":"Turcan Elsbeth","year":"2019","unstructured":"Elsbeth Turcan and Kathleen McKeown. 2019. Dreaddit: A reddit dataset for stress analysis in social media. arXiv preprint arXiv:1911.00133 (2019)."},{"key":"e_1_3_2_1_45_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_46_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le.","author":"Wei Jason","year":"2021","unstructured":"Jason Wei, Maarten Bosma, Vincent Y Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2021. Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 (2021)."},{"key":"e_1_3_2_1_48_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 
35 (2022), 24824-24837."},{"key":"e_1_3_2_1_49_1","volume-title":"Paul Pu Liang, and Louis-Philippe Morency","author":"Wilf Alex","year":"2023","unstructured":"Alex Wilf, Leena Mathur, Sheryl Mathew, Claire Ko, Youssouf Kebe, Paul Pu Liang, and Louis-Philippe Morency. 2023. Social-IQ 2.0 Challenge: Benchmarking Multimodal Social Understanding. https:\/\/github.com\/abwilf\/Social-IQ-2.0-Challenge."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2013.34"},{"key":"e_1_3_2_1_51_1","volume-title":"First Conference on Language Modeling.","author":"Wu Qingyun","year":"2024","unstructured":"Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Beibin Li, Erkang Zhu, Li Jiang, Xiaoyun Zhang, Shaokun Zhang, Jiale Liu, et al., 2024. Autogen: Enabling next-gen LLM applications via multi-agent conversations. In First Conference on Language Modeling."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-23585-6_8"},{"key":"e_1_3_2_1_53_1","unstructured":"Can Xu Qingfeng Sun Kai Zheng Xiubo Geng Pu Zhao Jiazhan Feng Chongyang Tao and Daxin Jiang. 2023. WizardLM: Empowering Large Language Models to Follow Complex Instructions. arXiv:2304.12244 [cs.CL] https:\/\/arxiv.org\/abs\/2304.12244"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301371"},{"key":"e_1_3_2_1_55_1","volume-title":"One: A New Data Source and Learning Paradigm for Multimodal LLMs. arXiv preprint arXiv:2404.16375","author":"Yan An","year":"2024","unstructured":"An Yan, Zhengyuan Yang, Junda Wu, Wanrong Zhu, Jianwei Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Julian McAuley, Jianfeng Gao, et al., 2024. List Items One by One: A New Data Source and Learning Paradigm for Multimodal LLMs. arXiv preprint arXiv:2404.16375 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=AgDICX1h50","author":"Yasunaga Michihiro","year":"2024","unstructured":"Michihiro Yasunaga, Xinyun Chen, Yujia Li, Panupong Pasupat, Jure Leskovec, Percy Liang, Ed H. Chi, and Denny Zhou. 2024. Large Language Models as Analogical Reasoners. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=AgDICX1h50"},{"key":"e_1_3_2_1_57_1","volume-title":"German and French. In Proceedings of the Conference on Empirical Methods in Natural Language Processing. Conference on Empirical Methods in Natural Language Processing","volume":"2020","author":"Zadeh Amir","year":"2020","unstructured":"Amir Zadeh, Yan Sheng Cao, Simon Hessner, Paul Pu Liang, Soujanya Poria, and Louis-Philippe Morency. 2020. CMU-MOSEAS: A multimodal language dataset for Spanish, Portuguese, German and French. In Proceedings of the Conference on Empirical Methods in Natural Language Processing. Conference on Empirical Methods in Natural Language Processing, Vol. 2020. NIH Public Access, 1801."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00901"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_2_1_61_1","volume-title":"Llavar: Enhanced visual instruction tuning for text-rich image understanding. arXiv preprint arXiv:2306.17107","author":"Zhang Yanzhe","year":"2023","unstructured":"Yanzhe Zhang, Ruiyi Zhang, Jiuxiang Gu, Yufan Zhou, Nedim Lipka, Diyi Yang, and Tong Sun. 2023. 
Llavar: Enhanced visual instruction tuning for text-rich image understanding. arXiv preprint arXiv:2306.17107 (2023)."},{"key":"e_1_3_2_1_62_1","volume-title":"Svit: Scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087","author":"Zhao Bo","year":"2023","unstructured":"Bo Zhao, Boya Wu, Muyang He, and Tiejun Huang. 2023. Svit: Scaling up visual instruction tuning. arXiv preprint arXiv:2307.04087 (2023)."},{"key":"e_1_3_2_1_63_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=3bq3jsvcQ1","author":"Zheng Huaixiu Steven","year":"2024","unstructured":"Huaixiu Steven Zheng, Swaroop Mishra, Xinyun Chen, Heng-Tze Cheng, Ed H. Chi, Quoc V Le, and Denny Zhou. 2024. Take a Step Back: Evoking Reasoning via Abstraction in Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=3bq3jsvcQ1"},{"key":"e_1_3_2_1_64_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755726","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:00:33Z","timestamp":1765339233000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755726"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":64,"alternative-id":["10.1145\/3746027.3755726","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755726","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
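
The record above is a single work object as returned by the Crossref REST API. Below is a minimal sketch of how such a record is typically retrieved and unpacked, assuming network access to api.crossref.org and the third-party requests package; the endpoint pattern https://api.crossref.org/works/{DOI} is Crossref's documented route, and every key read below (message, title, author, published, references-count) appears verbatim in the record above.

import requests

# Fetch the same Crossref work record shown above by its DOI.
DOI = "10.1145/3746027.3755726"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()

# The payload nests the work record under "message", mirroring the
# {"status":"ok","message-type":"work","message":{...}} envelope above.
msg = resp.json()["message"]

title = msg["title"][0]                         # paper title string
authors = [f'{a.get("given", "")} {a.get("family", "")}'.strip()
           for a in msg.get("author", [])]      # given/family name pairs
published = msg["published"]["date-parts"][0]   # e.g. [2025, 10, 27]
n_refs = msg.get("references-count", 0)         # 64 for this record

print(title)
print(", ".join(authors))
print(f"published: {published}; references: {n_refs}")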