{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T11:29:36Z","timestamp":1764588576735,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2023ZD0121101"],"award-info":[{"award-number":["2023ZD0121101"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3688995","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11435-11441","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Demonstrative Instruction Following in Multimodal LLMs via Integrating Low-Rank Adaptation with Ensemble Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0634-6152","authenticated-orcid":false,"given":"Jingyu","family":"Wei","sequence":"first","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8794-7376","authenticated-orcid":false,"given":"Yi","family":"Su","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5997-5169","authenticated-orcid":false,"given":"Kele","family":"Xu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4082-4798","authenticated-orcid":false,"given":"Lingbin","family":"Zeng","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9953-8438","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"additional","affiliation":[{"name":"Strategic Assessments and Consultation Institute, Academy of Military Science, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3245-1901","authenticated-orcid":false,"given":"Huaimin","family":"Wang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, Hunan, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Flamingo: A Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems 35 (NeurIPS","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob Menick, Sebastian Borgeaud, Andrew Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikolaj Binkowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Karen Simonyan. 2022. Flamingo: A Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems 35 (NeurIPS 2022), Vol. 35. Curran Associates, Inc., New Orleans,LA, USA, 23716--23736."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_3_1","volume-title":"Gabriel Ilharco, Mitchell Wortsman, and Ludwig Schmidt.","author":"Awadalla Anas","year":"2023","unstructured":"Anas Awadalla, Irena Gao, Josh Gardner, Jack Hessel, Yusuf Hanafy, Wanrong Zhu, Kalyani Marathe, Yonatan Bitton, Samir Gadre, Shiori Sagawa, Jenia Jitsev, Simon Kornblith, Pang Wei Koh, Gabriel Ilharco, Mitchell Wortsman, and Ludwig Schmidt. 2023. OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models. arxiv: 2308.01390 https:\/\/arxiv.org\/abs\/2308.01390"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF00058655"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612844"},{"key":"e_1_3_2_1_6_1","volume-title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arxiv: 2306.15195 https:\/\/arxiv.org\/abs\/2306.15195","author":"Chen Keqin","year":"2023","unstructured":"Keqin Chen, Zhao Zhang, Weili Zeng, Richong Zhang, Feng Zhu, and Rui Zhao. 2023. Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic. arxiv: 2306.15195 https:\/\/arxiv.org\/abs\/2306.15195"},{"key":"e_1_3_2_1_7_1","volume-title":"VoiceStyle: Voice-based Face Generation via Cross-modal Prototype Contrastive Learning. ACM Transactions on Multimedia Computing, Communications, and Applications (June","author":"Chen Wuyang","year":"2024","unstructured":"Wuyang Chen, Boqing Zhu, Kele Xu, Yong Dou, and Dawei Feng. 2024. VoiceStyle: Voice-based Face Generation via Cross-modal Prototype Contrastive Learning. ACM Transactions on Multimedia Computing, Communications, and Applications (June 2024), 3671002."},{"key":"e_1_3_2_1_8_1","volume-title":"Reproducible Scaling Laws for Contrastive Language-Image Learning. In The IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR","author":"Cherti Mehdi","year":"2023","unstructured":"Mehdi Cherti, Romain Beaumont, Ross Wightman, Mitchell Wortsman, Gabriel Ilharco, Cade Gordon, Christoph Schuhmann, Ludwig Schmidt, and Jenia Jitsev. 2023. Reproducible Scaling Laws for Contrastive Language-Image Learning. In The IEEE \/ CVF Computer Vision and Pattern Recognition Conference (CVPR 2023). IEEE, Vancouver, BC, Canada, 2818--2829."},{"key":"e_1_3_2_1_9_1","volume-title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In The 37th Annual Conference on Neural Information Processing Systems (NeurIPS","volume":"36","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven C. H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In The 37th Annual Conference on Neural Information Processing Systems (NeurIPS 2023), Vol. 36. Curran Associates, Inc., New Orleans, LA, USA, 49250--49267."},{"key":"e_1_3_2_1_10_1","volume-title":"The Twelfth International Conference on Learning Representations(ICLR","author":"Dao Tri","year":"2024","unstructured":"Tri Dao. 2024. FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning. In The Twelfth International Conference on Learning Representations(ICLR 2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (ICML","volume":"202","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi S. M. Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, Wenlong Huang, Yevgen Chebotar, Pierre Sermanet, Daniel Duckworth, Sergey Levine, Vincent Vanhoucke, Karol Hausman, Marc Toussaint, Klaus Greff, Andy Zeng, Igor Mordatch, and Pete Florence. 2023. PaLM-E: An Embodied Multimodal Language Model. In Proceedings of the 40th International Conference on Machine Learning (ICML 2023), Vol. 202. Honolulu, Hawaii, USA, 8469--8488."},{"key":"e_1_3_2_1_12_1","unstructured":"Peng Gao Jiaming Han Renrui Zhang Ziyi Lin Shijie Geng Aojun Zhou Wei Zhang Pan Lu Conghui He Xiangyu Yue Hongsheng Li and Yu Qiao. 2023. LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model. arxiv: 2304.15010 https:\/\/arxiv.org\/abs\/2304.15010"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3688993"},{"key":"e_1_3_2_1_14_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In The 10th International Conference on Learning Representations (ICLR","author":"Hu Edward","year":"2022","unstructured":"Edward Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In The 10th International Conference on Learning Representations (ICLR 2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"Barun Patra, Qiang Liu, Kriti Aggarwal, Zewen Chi, Nils Bjorck, Vishrav Chaudhary, Subhojit Som, Xia Song, and Furu Wei.","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Barun Patra, Qiang Liu, Kriti Aggarwal, Zewen Chi, Nils Bjorck, Vishrav Chaudhary, Subhojit Som, Xia Song, and Furu Wei. 2023. Language Is Not All You Need: Aligning Perception with Language Models. In Advances in Neural Information Processing Systems 36 (NeurIPS 2023). Curran Associates, Inc., New Orleans, LA, USA, 72096--72109."},{"key":"e_1_3_2_1_16_1","volume-title":"Pavan Kumar Anand, Ranjay Krishna, and Linda Shapiro.","author":"Ikezogwo Wisdom","year":"2023","unstructured":"Wisdom Ikezogwo, Saygin Seyfioglu, Fatemeh Ghezloo, Dylan Geva, Fatwir Sheikh Mohammed, Pavan Kumar Anand, Ranjay Krishna, and Linda Shapiro. 2023. Quilt-1M: One Million Image-Text Pairs for Histopathology. In Advances in Neural Information Processing Systems 36 (NeurIPS 2023), Vol. 36. Curran Associates, Inc., New Orleans, LA, USA, 37995--38017."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.792"},{"key":"e_1_3_2_1_18_1","volume-title":"Prismatic VLMs: Investigating the Design Space of Visually-Conditioned Language Models. In International Conference on Machine Learning (ICML","author":"Karamcheti Siddharth","year":"2024","unstructured":"Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, and Dorsa Sadigh. 2024. Prismatic VLMs: Investigating the Design Space of Visually-Conditioned Language Models. In International Conference on Machine Learning (ICML 2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (ICML","volume":"202","author":"Koh Jing Yu","year":"2023","unstructured":"Jing Yu Koh, Ruslan Salakhutdinov, and Daniel Fried. 2023. Grounding Language Models to Images for Multimodal Inputs and Outputs. In Proceedings of the 40th International Conference on Machine Learning (ICML 2023), Vol. 202. Honolulu, Hawaii, USA, 17283--17300."},{"key":"e_1_3_2_1_20_1","volume-title":"OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. In Advances in Neural Information Processing Systems 36 (NeurIPS","author":"Laurenccon Hugo","year":"2023","unstructured":"Hugo Laurenccon, Lucile Saulnier, Leo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander Rush, Douwe Kiela, Matthieu Cord, and Victor Sanh. 2023. OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents. In Advances in Neural Information Processing Systems 36 (NeurIPS 2023), Vol. 36. Curran Associates, Inc., New Orleans, LA, USA, 71683--71702."},{"key":"e_1_3_2_1_21_1","unstructured":"Hugo Lauren\u00e7on L\u00e9o Tronchon Matthieu Cord and Victor Sanh. 2024. What matters when building vision-language models-arxiv: 2405.02246 https:\/\/arxiv.org\/abs\/2405.02246"},{"key":"e_1_3_2_1_22_1","volume-title":"The 12th International Conference on Learning Representations, (ICLR","author":"Lermen Simon","year":"2024","unstructured":"Simon Lermen and Charlie Rogers-Smith. 2024. LoRA Fine-tuning Efficiently Undoes Safety Training in Llama 2-Chat 70B. In The 12th International Conference on Learning Representations, (ICLR 2024). Vienna, Austria."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (ICML 2023","volume":"202","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In Proceedings of the 40th International Conference on Machine Learning (ICML 2023), Vol. 202. Honolulu, Hawaii, USA, 19730--19742."},{"key":"e_1_3_2_1_24_1","volume-title":"The 12th International Conference on Learning Representations (ICLR","author":"Li Juncheng","year":"2024","unstructured":"Juncheng Li, Kaihang Pan, Zhiqi Ge, Minghe Gao, Wei Ji, Wenqiao Zhang, Tat-Seng Chua, Siliang Tang, Hanwang Zhang, and Yueting Zhuang. 2024. Fine-tuning Multimodal LLMs to Follow Zero-shot Demonstrative Instructions. In The 12th International Conference on Learning Representations (ICLR 2024). Vienna, Austria."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_26_1","volume-title":"Visual Instruction Tuning. In The 37th Annual Conference on Neural Information Processing Systems (NeurIPS","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In The 37th Annual Conference on Neural Information Processing Systems (NeurIPS 2023), Vol. 36. Curran Associates, Inc., New Orleans,LA, USA, 34892--34916."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning Transferable Visual Models from Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning (ICML 2021), Vol. 139. 8748--8763."},{"key":"e_1_3_2_1_28_1","volume-title":"The 36th Annual Conference on Neural Information Processing Systems (NeurIPS","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, Patrick Schramowski, Srivatsa Kundurthy, Katherine Crowson, Ludwig Schmidt, Robert Kaczmarczyk, and Jenia Jitsev. 2022. LAION-5B: An Open Large-scale Dataset for Training Next Generation Image-text Models. In The 36th Annual Conference on Neural Information Processing Systems (NeurIPS 2022), Vol. 35. Curran Associates, Inc., New Orleans, LA, USA, 25278--25294."},{"key":"e_1_3_2_1_29_1","volume-title":"OTTER: Effortless Label Distribution Adaptation of Zero-shot Models. In The 12th International Conference on Learning Representations (ICLR","author":"Shin Changho","year":"2024","unstructured":"Changho Shin, Jitian Zhao, Sonia Cromp, Harit Vishwakarma, and Frederic Sala. 2024. OTTER: Effortless Label Distribution Adaptation of Zero-shot Models. In The 12th International Conference on Learning Representations (ICLR 2024). Vienna Austria."},{"key":"e_1_3_2_1_30_1","volume-title":"SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In The 10th International Conference on Learning Representations (ICLR","author":"Wang Zirui","year":"2022","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2022. SimVLM: Simple Visual Language Model Pretraining with Weak Supervision. In The 10th International Conference on Learning Representations (ICLR 2022)."},{"key":"e_1_3_2_1_31_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qi Qian Ji Zhang Fei Huang and Jingren Zhou. 2024. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arxiv: 2304.14178 https:\/\/arxiv.org\/abs\/2304.14178"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations (EMNLP","author":"Zhang Hang","year":"2023","unstructured":"Hang Zhang, Xin Li, and Lidong Bing. 2023. Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations (EMNLP 2023). Association for Computational Linguistics, System Demonstrations, Singapore, 543--553."},{"key":"e_1_3_2_1_33_1","volume-title":"The 12th International Conference on Learning Representations (ICLR","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. In The 12th International Conference on Learning Representations (ICLR 2024). Vienna, Austria."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688995","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3688995","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:29Z","timestamp":1750295849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688995"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":33,"alternative-id":["10.1145\/3664647.3688995","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3688995","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}