{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T02:12:58Z","timestamp":1780539178126,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":72,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755235","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"392-401","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["From Semantics, Scene to Instance-awareness: Distilling Foundation Model for Open-vocabulary Grounded Situation Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7793-5261","authenticated-orcid":false,"given":"Chen","family":"Cai","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6705-7808","authenticated-orcid":false,"given":"Tianyi","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9137-2869","authenticated-orcid":false,"given":"Jianjun","family":"Gao","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3226-0920","authenticated-orcid":false,"given":"Wenyang","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9859-9573","authenticated-orcid":false,"given":"Kejun","family":"Wu","sequence":"additional","affiliation":[{"name":"EIC, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6749-0305","authenticated-orcid":false,"given":"Ruoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8659-4724","authenticated-orcid":false,"given":"Yi","family":"Wang","sequence":"additional","affiliation":[{"name":"EEE, The Hong Kong Polytechnic University, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8342-4682","authenticated-orcid":false,"given":"Soo Chin","family":"Liew","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274","author":"Bahng Hyojin","year":"2022","unstructured":"Hyojin Bahng, Ali Jahanian, Swami Sankaranarayanan, and Phillip Isola. 2022. Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 (2022)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02125"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.227"},{"key":"e_1_3_2_1_4_1","first-page":"739","article-title":"Detecting any human-object interaction relationship: Universal hoi detector with spatial prompt learning on foundation models","volume":"36","author":"Cao Yichao","year":"2023","unstructured":"Yichao Cao, Qingfei Tang, Xiu Su, Song Chen, Shan You, Xiaobo Lu, and Chang Xu. 2023. Detecting any human-object interaction relationship: Universal hoi detector with spatial prompt learning on foundation models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 739-751.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_6_1","first-page":"381","article-title":"Learning to detect human-object interactions. In 2018 ieee winter conference on applications of computer vision (wacv)","author":"Chao Yu-Wei","year":"2018","unstructured":"Yu-Wei Chao, Yunfan Liu, Xieyang Liu, Huayi Zeng, and Jia Deng. 2018. Learning to detect human-object interactions. In 2018 ieee winter conference on applications of computer vision (wacv). IEEE, 381-389.","journal-title":"IEEE"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01834"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547943"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01904"},{"key":"e_1_3_2_1_10_1","volume-title":"Grounded situation recognition with transformers. arXiv preprint arXiv:2111.10135","author":"Cho Junhyeong","year":"2021","unstructured":"Junhyeong Cho, Youngseok Yoon, Hyeonjun Lee, and Suha Kwak. 2021. Grounded situation recognition with transformers. arXiv preprint arXiv:2111.10135 (2021)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640457.3688118"},{"key":"e_1_3_2_1_12_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. Instructblip: Towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"CL-HOI: Cross-level human-object interaction distillation from multimodal large language models. Knowledge-Based Systems","author":"Gao Jianjun","year":"2025","unstructured":"Jianjun Gao, Chen Cai, Ruoyu Wang, Wenyang Liu, Kim-Hui Yap, Kratika Garg, and Boon Siew Han. 2025. CL-HOI: Cross-level human-object interaction distillation from multimodal large language models. Knowledge-Based Systems (2025), 113561."},{"key":"e_1_3_2_1_14_1","unstructured":"Peng Gao Jiaming Han Renrui Zhang Ziyi Lin Shijie Geng Aojun Zhou Wei Zhang Pan Lu Conghui He Xiangyu Yue et al. 2023. Llama-adapter v2: Parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)."},{"key":"e_1_3_2_1_15_1","unstructured":"Jiawei Gu Xuhui Jiang Zhichao Shi Hexiang Tan Xuehao Zhai Chengjin Xu Wei Li Yinghan Shen Shengjie Ma Honghao Liu et al. 2024b. A survey on llm-as-a-judge. arXiv preprint arXiv:2411.15594 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Gu Yuxian","year":"2024","unstructured":"Yuxian Gu, Li Dong, Furu Wei, and Minlie Huang. 2024a. MiniLLM: Knowledge distillation of large language models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_18_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25159"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_35"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00056"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01441"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01047"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.457"},{"key":"e_1_3_2_1_26_1","volume-title":"Negative label guided ood detection with pretrained vision-language models. arXiv preprint arXiv:2403.20078","author":"Jiang Xue","year":"2024","unstructured":"Xue Jiang, Feng Liu, Zhen Fang, Hong Chen, Tongliang Liu, Feng Zheng, and Bo Han. 2024. Negative label guided ood detection with pretrained vision-language models. arXiv preprint arXiv:2403.20078 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681036"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01576"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02513"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28121"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"e_1_3_2_1_33_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00200"},{"key":"e_1_3_2_1_35_1","volume-title":"PromptSR: Cascade Prompting for Lightweight Image Super-Resolution. arXiv preprint arXiv:2507.04118","author":"Liu Wenyang","year":"2025","unstructured":"Wenyang Liu, Chen Cai, Jianjun Gao, Kejun Wu, Yi Wang, Kim-Hui Yap, and Lap-Pui Chau. 2025. PromptSR: Cascade Prompting for Lightweight Image Super-Resolution. arXiv preprint arXiv:2507.04118 (2025)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.279"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01568"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-024-10877-1"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02251"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01398"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_19"},{"key":"e_1_3_2_1_42_1","volume-title":"Dynamic Scene Understanding from Vision-Language Representations. arXiv preprint arXiv:2501.11653","author":"Pruss Shahaf","year":"2025","unstructured":"Shahaf Pruss, Morris Alper, and Hadar Averbuch-Elor. 2025. Dynamic Scene Understanding from Vision-Language Representations. arXiv preprint arXiv:2501.11653 (2025)."},{"key":"e_1_3_2_1_43_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01540"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 444-453","author":"Roy Debaditya","year":"2024","unstructured":"Debaditya Roy, Dhruv Verma, and Basura Fernando. 2024. ClipSitu: Effectively Leveraging CLIP for Conditional Predictions in Situation Recognition. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 444-453."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01423"},{"key":"e_1_3_2_1_47_1","volume-title":"Ryan Burnell, Libin Bai, Anmol Gulati, et al.","author":"Team Gemini","year":"2024","unstructured":"Gemini Team, Petko Georgiev, Ving Ian Lei, Ryan Burnell, Libin Bai, Anmol Gulati, et al., 2024. Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context, 2024. URL https:\/\/arxiv.org\/abs\/2403.05530 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Tinyllm: Learning a small student from multiple large language models. arXiv e-prints","author":"Tian Yijun","year":"2024","unstructured":"Yijun Tian, Yikun Han, Xiusi Chen, Wei Wang, and Nitesh V Chawla. 2024. Tinyllm: Learning a small student from multiple large language models. arXiv e-prints (2024), arXiv-2402."},{"key":"e_1_3_2_1_49_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25359"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00101"},{"key":"e_1_3_2_1_52_1","volume-title":"Dataset Distillation with Neural Characteristic Function: A Minmax Perspective. arXiv preprint arXiv:2502.20653","author":"Wang Shaobo","year":"2025","unstructured":"Shaobo Wang, Yicun Yang, Zhiyuan Liu, Chenghao Sun, Xuming Hu, Conghui He, and Linfeng Zhang. 2025b. Dataset Distillation with Neural Characteristic Function: A Minmax Perspective. arXiv preprint arXiv:2502.20653 (2025)."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing. 3723-3740","author":"Wang Taowen","year":"2024","unstructured":"Taowen Wang, Yiyang Liu, James Chenhao Liang, Junhan Zhao, Yiming Cui, Yuning Mao, Shaoliang Nie, Jiahao Liu, Fuli Feng, Zenglin Xu, Cheng Han, Lifu Huang, Qifan Wang, and Dongfang Liu. 2024. M^2PT: Multimodal Prompt Tuning for Zero-shot Instruction Learning. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing. 3723-3740."},{"key":"e_1_3_2_1_54_1","volume-title":"EVA02-AT: Egocentric Video-Language Understanding with Spatial-Temporal Rotary Positional Embeddings and Symmetric Optimization. arXiv preprint arXiv:2506.14356","author":"Wang Xiaoqi","year":"2025","unstructured":"Xiaoqi Wang, Yi Wang, and Lap-Pui Chau. 2025a. EVA02-AT: Egocentric Video-Language Understanding with Spatial-Temporal Rotary Positional Embeddings and Symmetric Optimization. arXiv preprint arXiv:2506.14356 (2025)."},{"key":"e_1_3_2_1_55_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022b. Emergent abilities of large language models. arXiv preprint arXiv:2206.07682 (2022)."},{"key":"e_1_3_2_1_56_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022c. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20167"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645359"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28422"},{"key":"e_1_3_2_1_60_1","volume-title":"A survey on knowledge distillation of large language models. arXiv preprint arXiv:2402.13116","author":"Xu Xiaohan","year":"2024","unstructured":"Xiaohan Xu, Ming Li, Chongyang Tao, Tao Shen, Reynold Cheng, Jinyang Li, Can Xu, Dacheng Tao, and Tianyi Zhou. 2024. A survey on knowledge distillation of large language models. arXiv preprint arXiv:2402.13116 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652","author":"Young Alex","year":"2024","unstructured":"Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge Zhang, Guanwei Zhang, Guoyin Wang, Heng Li, Jiangcheng Zhu, Jianqun Chen, et al., 2024. Yi: Open foundation models by 01. ai. arXiv preprint arXiv:2403.04652 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"International Journal of Computer Vision","author":"Zang Yuhang","year":"2024","unstructured":"Yuhang Zang, Wei Li, Jun Han, Kaiyang Zhou, and Chen Change Loy. 2024. Contextual object detection with multimodal large language models. International Journal of Computer Vision (2024), 1-19."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01508"},{"key":"e_1_3_2_1_64_1","volume-title":"Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023. Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.766"},{"key":"e_1_3_2_1_66_1","volume-title":"Vialm: A survey and benchmark of visually impaired assistance with large models. arXiv preprint arXiv:2402.01735","author":"Zhao Yi","year":"2024","unstructured":"Yi Zhao, Yilin Zhang, Rong Xiang, Jing Li, and Hillming Li. 2024a. Vialm: A survey and benchmark of visually impaired assistance with large models. arXiv preprint arXiv:2402.01735 (2024)."},{"key":"e_1_3_2_1_67_1","first-page":"13026","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING","author":"Zhao Yichun","year":"2024","unstructured":"Yichun Zhao, Shuheng Zhou, and Huijia Zhu. 2024c. Probe then retrieve and reason: Distilling probing and reasoning capabilities into smaller language models. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). 13026-13032."},{"key":"e_1_3_2_1_68_1","first-page":"5168","article-title":"Ddcot: Duty-distinct chain-of-thought prompting for multimodal reasoning in language models","volume":"36","author":"Zheng Ge","year":"2023","unstructured":"Ge Zheng, Bin Yang, Jiajin Tang, Hong-Yu Zhou, and Sibei Yang. 2023b. Ddcot: Duty-distinct chain-of-thought prompting for multimodal reasoning in language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 5168-5191.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_69_1","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al., 2023a. Judging llm-as-a-judge with mt-bench and chatbot arena. Advances in Neural Information Processing Systems, Vol. 36 (2023), 46595-46623.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_70_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023a. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"e_1_3_2_1_72_1","volume-title":"Congrui Yin, Yuan Ni, Xiaoling Wang, and Guotong Xie.","author":"Zhu Wei","year":"2024","unstructured":"Wei Zhu, Aaron Xuxiang Tian, Congrui Yin, Yuan Ni, Xiaoling Wang, and Guotong Xie. 2024. IAPT: Instruction-Aware Prompt Tuning for Large Language Models. arXiv preprint arXiv:2405.18203 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755235","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:50:22Z","timestamp":1765309822000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755235"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":72,"alternative-id":["10.1145\/3746027.3755235","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755235","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}