{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:46Z","timestamp":1765342786370,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China","award":["No. 62106204"],"award-info":[{"award-number":["No. 62106204"]}]},{"name":"the Natural Science Foundation of Sichuan","award":["No. 2025YFHZ0124"],"award-info":[{"award-number":["No. 2025YFHZ0124"]}]},{"name":"the Frontier Cross Innovation Team Project of Southwest Jiaotong University","award":["YH1500112432297"],"award-info":[{"award-number":["YH1500112432297"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755653","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"4894-4903","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Why is a Bird's Caption a Good Demonstration? Towards Effective Multimodal In-Context Learning without Dedicated Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-8162-3201","authenticated-orcid":false,"given":"Junlin","family":"Fang","sequence":"first","affiliation":[{"name":"School of Computing and Artificial Intelligence, Southwest Jiaotong University, Chendu, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5612-7818","authenticated-orcid":false,"given":"Wenya","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computing and Artificial Intelligence, Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7148-353X","authenticated-orcid":false,"given":"Lingli","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computing and Artificial Intelligence, Southwest Jiaotong University, Cheng du, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1640-0992","authenticated-orcid":false,"given":"Fengmao","family":"Lv","sequence":"additional","affiliation":[{"name":"School of Computing and Artificial Intelligence, Southwest Jiaotong University, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00161"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.621"},{"key":"e_1_3_2_1_4_1","volume-title":"Understanding and Improving In-Context Learning on Vision-language Models. In ICLR 2024 Workshop.","author":"Chen Shuo","year":"2024","unstructured":"Shuo Chen, Zhen Han, Bailan He, Mark Buckley, Philip Torr, Volker Tresp, and Jindong Gu. 2024a. Understanding and Improving In-Context Learning on Vision-language Models. In ICLR 2024 Workshop."},{"key":"e_1_3_2_1_5_1","volume-title":"Understanding and Improving In-Context Learning on Vision-language Models. CoRR","author":"Chen Shuo","year":"1802","unstructured":"Shuo Chen, Zhen Han, Bailan He, Mark Buckley, Philip H. S. Torr, Volker Tresp, and Jindong Gu. 2023. Understanding and Improving In-Context Learning on Vision-language Models. CoRR, Vol. 
abs\/2311.18021 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"Microsoft COCO Captions: Data Collection and Evaluation Server. CoRR","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2015. Microsoft COCO Captions: Data Collection and Evaluation Server. CoRR, Vol. abs\/1504.00325 (2015)."},{"key":"e_1_3_2_1_7_1","first-page":"24185","article-title":"InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","author":"Chen Zhe","year":"2024","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, et al., 2024b. InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In CVPR. 24185-24198.","journal-title":"CVPR."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"e_1_3_2_1_9_1","volume-title":"AAAI","author":"Desai Poorav","year":"2022","unstructured":"Poorav Desai, Tanmoy Chakraborty, and Md. Shad Akhtar. 2022. Nice Perfume. How Long Did You Marinate in It? Multimodal Sarcasm Explanation. In AAAI 2022. AAAI Press, 10563-10571."},{"key":"e_1_3_2_1_10_1","volume-title":"e-SNLI-VE-2.0: Corrected Visual-Textual Entailment with Natural Language Explanations. CoRR","author":"Do Virginie","year":"2020","unstructured":"Virginie Do, Oana-Maria Camburu, Zeynep Akata, and Thomas Lukasiewicz. 2020. e-SNLI-VE-2.0: Corrected Visual-Textual Entailment with Natural Language Explanations. CoRR, Vol. abs\/2004.03744 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"Towards Multimodal In-Context Learning for Vision & Language Models. CoRR","author":"Doveh Sivan","year":"2024","unstructured":"Sivan Doveh, Shaked Perek, Muhammad Jehanzeb Mirza, Amit Alfassy, Assaf Arbelle, Shimon Ullman, and Leonid Karlinsky. 2024. Towards Multimodal In-Context Learning for Vision & Language Models. CoRR, Vol. abs\/2403.12736 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"ICML","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, and Wynne Hsu. 2024. Video-of-Thought: Step-by-Step Video Reasoning from Perception to Cognition. In ICML 2024. OpenReview.net."},{"key":"e_1_3_2_1_13_1","unstructured":"fhai50032. 2023. Poster Caption Dataset. https:\/\/huggingface.co\/datasets\/fhai50032\/poster_caption. Accessed: 2025-02-07."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01785"},{"key":"e_1_3_2_1_15_1","volume-title":"Tree-Planner: Efficient Close-loop Task Planning with Large Language Models. In ICLR","author":"Hu Mengkang","year":"2024","unstructured":"Mengkang Hu, Yao Mu, Xinmiao Yu, Mingyu Ding, Shiguang Wu, Wenqi Shao, Qiguang Chen, Bin Wang, Yu Qiao, and Ping Luo. 2024. Tree-Planner: Efficient Close-loop Task Planning with Large Language Models. In ICLR 2024. OpenReview.net."},{"key":"e_1_3_2_1_16_1","first-page":"1433","volume-title":"MemeCap: A Dataset for Captioning and Interpreting Memes. In EMNLP","author":"Hwang Eunjeong","year":"2023","unstructured":"Eunjeong Hwang and Vered Shwartz. 2023. MemeCap: A Dataset for Captioning and Interpreting Memes. In EMNLP 2023. Association for Computational Linguistics, 1433-1445."},{"key":"e_1_3_2_1_17_1","volume-title":"Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. 
In ICML 2021 (Proceedings of Machine Learning Research","volume":"4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In ICML 2021 (Proceedings of Machine Learning Research, Vol. 139). PMLR, 4904-4916."},{"key":"e_1_3_2_1_18_1","first-page":"1988","volume-title":"CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In CVPR","author":"Johnson Justin","year":"2017","unstructured":"Justin Johnson, Bharath Hariharan, Laurens van der Maaten, Li Fei-Fei, C. Lawrence Zitnick, and Ross B. Girshick. 2017. CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In CVPR 2017. IEEE Computer Society, 1988-1997."},{"key":"e_1_3_2_1_19_1","volume-title":"OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. arXiv:2306.16527 [cs.IR]","author":"Lauren\u00e7on Hugo","year":"2023","unstructured":"Hugo Lauren\u00e7on, Lucile Saulnier, L\u00e9o Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, and Victor Sanh. 2023. OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. arXiv:2306.16527 [cs.IR]"},{"key":"e_1_3_2_1_20_1","first-page":"13299","article-title":"SEED-Bench: Benchmarking Multimodal Large Language Models. In CVPR 2024","author":"Li Bohao","year":"2024","unstructured":"Bohao Li, Yuying Ge, Yixiao Ge, Guangzhi Wang, Rui Wang, Ruimao Zhang, and Ying Shan. 2024. SEED-Bench: Benchmarking Multimodal Large Language Models. In CVPR 2024. IEEE, 13299-13308.","journal-title":"IEEE"},{"key":"e_1_3_2_1_21_1","volume-title":"MIMIC-IT: Multi-Modal In-Context Instruction Tuning. CoRR","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Fanyi Pu, Jingkang Yang, Chunyuan Li, and Ziwei Liu. 2023b. MIMIC-IT: Multi-Modal In-Context Instruction Tuning. CoRR, Vol. abs\/2306.05425 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"A Survey on Benchmarks of Multimodal Large Language Models. CoRR","author":"Li Jian","year":"2024","unstructured":"Jian Li and Weiheng Lu. 2024. A Survey on Benchmarks of Multimodal Large Language Models. CoRR, Vol. abs\/2408.08632 (2024)."},{"key":"e_1_3_2_1_23_1","first-page":"4644","volume-title":"Unified Demonstration Retriever for In-Context Learning. In ACL","author":"Li Xiaonan","year":"2023","unstructured":"Xiaonan Li, Kai Lv, Hang Yan, Tianyang Lin, Wei Zhu, Yuan Ni, Guotong Xie, Xiaoling Wang, and Xipeng Qiu. 2023a. Unified Demonstration Retriever for In-Context Learning. In ACL 2023. Association for Computational Linguistics, 4644-4668."},{"key":"e_1_3_2_1_24_1","first-page":"195","volume-title":"Adapting Generative Pretrained Language Model for Open-domain Multimodal Sentence Summarization. In SIGIR","author":"Lin Dengtian","year":"2023","unstructured":"Dengtian Lin, Liqiang Jing, Xuemeng Song, Meng Liu, Teng Sun, and Liqiang Nie. 2023. Adapting Generative Pretrained Language Model for Open-domain Multimodal Sentence Summarization. In SIGIR 2023. ACM, 195-204."},{"volume-title":"A Survey of Opinion Mining and Sentiment Analysis","author":"Liu Bing","key":"e_1_3_2_1_25_1","unstructured":"Bing Liu and Lei Zhang. 2012. A Survey of Opinion Mining and Sentiment Analysis. In Mining Text Data, Charu C. 
Aggarwal and ChengXiang Zhai (Eds.). Springer, 415-463."},{"key":"e_1_3_2_1_26_1","volume-title":"In-context Vectors: Making In Context Learning More Effective and Controllable Through Latent Space Steering. In ICML","author":"Liu Sheng","year":"2024","unstructured":"Sheng Liu, Haotian Ye, Lei Xing, and James Y. Zou. 2024. In-context Vectors: Making In Context Learning More Effective and Controllable Through Latent Space Steering. In ICML 2024. OpenReview.net."},{"key":"e_1_3_2_1_27_1","volume-title":"First Conference on Language Modeling.","author":"Long Quanyu","year":"2024","unstructured":"Quanyu Long, Yin Wu, Wenya Wang, and Sinno Jialin Pan. 2024. Does In-Context Learning Really Learn? Rethinking How Large Language Models Respond and Solve Tasks via In-Context Learning. In First Conference on Language Modeling."},{"key":"e_1_3_2_1_28_1","volume-title":"The Twelfth ICLR","author":"Lu Pan","year":"2024","unstructured":"Pan Lu, Hritik Bansal, Tony Xia, Jiacheng Liu, Chunyuan Li, Hannaneh Hajishirzi, Hao Cheng, Kai-Wei Chang, Michel Galley, and Jianfeng Gao. 2024. MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts. In The Twelfth ICLR, 2024. OpenReview.net."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.305"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.759"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"e_1_3_2_1_32_1","first-page":"13362","volume-title":"CommVQA: Situating Visual Question Answering in Communicative Contexts. In EMNLP","author":"Naik Nandita","year":"2024","unstructured":"Nandita Naik, Christopher Potts, and Elisa Kreiss. 2024. CommVQA: Situating Visual Question Answering in Communicative Contexts. In EMNLP 2024. Association for Computational Linguistics, 13362-13377."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2009.191"},{"key":"e_1_3_2_1_34_1","first-page":"7441","volume-title":"In-Context Learning with Iterative Demonstration Selection. In Findings of EMNLP","author":"Qin Chengwei","year":"2024","unstructured":"Chengwei Qin, Aston Zhang, Chen Chen, Anirudh Dagar, and Wenming Ye. 2024b. In-Context Learning with Iterative Demonstration Selection. In Findings of EMNLP 2024. Association for Computational Linguistics, 7441-7455."},{"key":"e_1_3_2_1_35_1","volume-title":"What Factors Affect Multi-Modal In-Context Learning? An In-Depth Exploration. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Qin Libo","year":"2024","unstructured":"Libo Qin, Qiguang Chen, Hao Fei, Zhi Chen, Min Li, and Wanxiang Che. 2024a. What Factors Affect Multi-Modal In-Context Learning? An In-Depth Exploration. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.163"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.689"},{"key":"e_1_3_2_1_38_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In ICML 2021 (Proceedings of Machine Learning Research","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. 
In ICML 2021 (Proceedings of Machine Learning Research, Vol. 139). PMLR, 8748-8763."},{"key":"e_1_3_2_1_39_1","first-page":"2953","article-title":"Exploring Models and Data for Image Question Answering","volume":"2015","author":"Ren Mengye","year":"2015","unstructured":"Mengye Ren, Ryan Kiros, and Richard S. Zemel. 2015. Exploring Models and Data for Image Question Answering. In NeurIPS 2015. 2953-2961.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_40_1","first-page":"146","article-title":"A-OKVQA: A Benchmark for Visual Question Answering Using World Knowledge. In ECCV (Lecture Notes in Computer Science, Vol. 13668)","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-OKVQA: A Benchmark for Visual Question Answering Using World Knowledge. In ECCV (Lecture Notes in Computer Science, Vol. 13668). Springer, 146-162.","journal-title":"Springer"},{"key":"e_1_3_2_1_41_1","volume-title":"Selective Annotation Makes Language Models Better Few-Shot Learners. In ICLR","author":"Su Hongjin","year":"2023","unstructured":"Hongjin Su, Jungo Kasai, Chen Henry Wu, Weijia Shi, Tianlu Wang, Jiayi Xin, Rui Zhang, Mari Ostendorf, Luke Zettlemoyer, Noah A. Smith, and Tao Yu. 2023. Selective Annotation Makes Language Models Better Few-Shot Learners. In ICLR 2023. OpenReview.net."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"e_1_3_2_1_43_1","first-page":"200","article-title":"Multimodal Few-Shot Learning with Frozen Language Models","volume":"2021","author":"Tsimpoukelli Maria","year":"2021","unstructured":"Maria Tsimpoukelli, Jacob Menick, Serkan Cabi, S. M. Ali Eslami, Oriol Vinyals, and Felix Hill. 2021. Multimodal Few-Shot Learning with Frozen Language Models. In NeurIPS 2021. 200-212.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_44_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. CoRR","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. CoRR, Vol. abs\/2409.12191 (2024)."},{"key":"e_1_3_2_1_45_1","first-page":"10123","volume-title":"CVPR","author":"Wang Xinyu","year":"2020","unstructured":"Xinyu Wang, Yuliang Liu, Chunhua Shen, Chun Chet Ng, Canjie Luo, Lianwen Jin, Chee Seng Chan, Anton van den Hengel, and Liangwei Wang. 2020. On the General Value of Evidence, and Bilingual Scene-Text Visual Question Answering. In CVPR 2020. Computer Vision Foundation \/ IEEE, 10123-10132."},{"key":"e_1_3_2_1_46_1","first-page":"13484","volume-title":"Self-Instruct: Aligning Language Models with Self-Generated Instructions. In ACL","author":"Wang Yizhong","year":"2023","unstructured":"Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A. Smith, Daniel Khashabi, and Hannaneh Hajishirzi. 2023. Self-Instruct: Aligning Language Models with Self-Generated Instructions. In ACL 2023. Association for Computational Linguistics, 13484-13508."},{"key":"e_1_3_2_1_47_1","volume-title":"Towards Semantic Equivalence of Tokenization in Multimodal LLM. 
CoRR","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Xiangtai Li, Jiayi Ji, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024a. Towards Semantic Equivalence of Tokenization in Multimodal LLM. CoRR, Vol. abs\/2406.05127 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"NExT-GPT: Any-to-Any Multimodal LLM. In ICML","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024b. NExT-GPT: Any-to-Any Multimodal LLM. In ICML 2024. OpenReview.net."},{"key":"e_1_3_2_1_49_1","first-page":"2887","volume-title":"MET-Meme: A Multimodal Meme Dataset Rich in Metaphors. In SIGIR","author":"Xu Bo","year":"2022","unstructured":"Bo Xu, Tingting Li, Junzhe Zheng, Mehdi Naseriparsa, Zhehuan Zhao, Hongfei Lin, and Feng Xia. 2022. MET-Meme: A Multimodal Meme Dataset Rich in Metaphors. In SIGIR 2022. ACM, 2887-2899."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"e_1_3_2_1_51_1","volume-title":"A Survey on Multimodal Large Language Models. CoRR","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A Survey on Multimodal Large Language Models. CoRR, Vol. abs\/2306.13549 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.155"},{"key":"e_1_3_2_1_53_1","first-page":"9556","volume-title":"MMMU: A Massive Multi-Discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In IEEE\/CVF Conference on CVPR","author":"Yue Xiang","year":"2024","unstructured":"Xiang Yue, Yuansheng Ni, Tianyu Zheng, Kai Zhang, Ruoqi Liu, Ge Zhang, Samuel Stevens, Dongfu Jiang, Weiming Ren, Yuxuan Sun, Cong Wei, Botao Yu, Ruibin Yuan, Renliang Sun, Ming Yin, Boyuan Zheng, Zhenzhu Yang, Yibo Liu, Wenhao Huang, Huan Sun, Yu Su, and Wenhu Chen. 2024. MMMU: A Massive Multi-Discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI. In IEEE\/CVF Conference on CVPR, 2024. IEEE, 9556-9567."},{"key":"e_1_3_2_1_54_1","first-page":"12401","volume-title":"MM-LLMs: Recent Advances in MultiModal Large Language Models. In Findings of ACL","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024. MM-LLMs: Recent Advances in MultiModal Large Language Models. In Findings of ACL 2024. Association for Computational Linguistics, 12401-12430."},{"key":"e_1_3_2_1_55_1","first-page":"9134","volume-title":"Active Example Selection for In-Context Learning. In EMNLP","author":"Zhang Yiming","year":"2022","unstructured":"Yiming Zhang, Shi Feng, and Chenhao Tan. 2022. Active Example Selection for In-Context Learning. In EMNLP 2022. Association for Computational Linguistics, 9134-9148."},{"key":"e_1_3_2_1_56_1","volume-title":"NeurIPS","author":"Zhang Yuanhan","year":"2023","unstructured":"Yuanhan Zhang, Kaiyang Zhou, and Ziwei Liu. 2023. What Makes Good Examples for Visual In-Context Learning?. In NeurIPS 2023."},{"key":"e_1_3_2_1_57_1","volume-title":"MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning. In ICLR","author":"Zhao Haozhe","year":"2024","unstructured":"Haozhe Zhao, Zefan Cai, Shuzheng Si, Xiaojian Ma, Kaikai An, Liang Chen, Zixuan Liu, Sheng Wang, Wenjuan Han, and Baobao Chang. 2024. MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning. 
In ICLR 2024."},{"key":"e_1_3_2_1_58_1","first-page":"1345","article-title":"FastMMD","volume":"27","author":"Zhao Ji","year":"2015","unstructured":"Ji Zhao and Deyu Meng. 2015. FastMMD: Ensemble of Circular Discrepancy for Efficient Two-Sample Test. Neural Comput., Vol. 27, 6 (2015), 1345-1372.","journal-title":"Ensemble of Circular Discrepancy for Efficient Two-Sample Test. Neural Comput."},{"key":"e_1_3_2_1_59_1","volume-title":"A Survey of Large Language Models. CoRR","author":"Zhao Wayne Xin","year":"1822","unstructured":"Wayne Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, Yifan Du, Chen Yang, Yushuo Chen, Zhipeng Chen, Jinhao Jiang, Ruiyang Ren, Yifan Li, Xinyu Tang, Zikang Liu, Peiyu Liu, Jian-Yun Nie, and Ji-Rong Wen. 2023. A Survey of Large Language Models. CoRR, Vol. abs\/2303.18223 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Neruips","author":"Zheng Ge","year":"2023","unstructured":"Ge Zheng, Bin Yang, Jiajin Tang, Hong-Yu Zhou, and Sibei Yang. 2023. DDCoT: Duty-Distinct Chain-of-Thought Prompting for Multimodal Reasoning in Language Models. In Neruips, 2023."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755653","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:57:01Z","timestamp":1765342621000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755653"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":60,"alternative-id":["10.1145\/3746027.3755653","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755653","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}