{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T16:27:39Z","timestamp":1775665659679,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62037001, No.62307032"],"award-info":[{"award-number":["No.62037001, No.62307032"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100013105","name":"Shanghai Rising-Star Program","doi-asserted-by":"publisher","award":["23QA1409000"],"award-info":[{"award-number":["23QA1409000"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100013105","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Science Fund at Shanghai Institute for Advanced Study","award":["SN-ZJU-SIAS-0010"],"award-info":[{"award-number":["SN-ZJU-SIAS-0010"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681014","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"3489-3498","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Semantic Alignment for Multimodal Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7917-8849","authenticated-orcid":false,"given":"Tao","family":"Wu","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3482-234X","authenticated-orcid":false,"given":"Mengze","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0415-6937","authenticated-orcid":false,"given":"Jingyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8106-9768","authenticated-orcid":false,"given":"Wei","family":"Ji","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8353-2392","authenticated-orcid":false,"given":"Wang","family":"Lin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2094-3554","authenticated-orcid":false,"given":"Jinyang","family":"Gao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7024-9790","authenticated-orcid":false,"given":"Kun","family":"Kuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2139-8807","authenticated-orcid":false,"given":"Fei","family":"Wu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katherine Millican, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob L. Menick, Sebastian Borgeaud, Andy Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikolaj Binkowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Kar\u00e9n Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022, Sanmi Koyejo, S. Mohamed, A. Agarwal, Danielle Belgrave, K. Cho, and A. Oh (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2022\/hash\/960a172bc7fbf0177ccccbb411a7d800-Abstract-Conference.html"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M. Dai Anja Hauth Katie Millican David Silver Slav Petrov Melvin Johnson Ioannis Antonoglou Julian Schrittwieser Amelia Glaese Jilin Chen Emily Pitler Timothy P. Lillicrap Angeliki Lazaridou Orhan Firat James Molloy Michael Isard Paul Ronald Barham Tom Hennigan Benjamin Lee Fabio Viola Malcolm Reynolds Yuanzhong Xu Ryan Doherty Eli Collins Clemens Meyer Eliza Rutherford Erica Moreira Kareem Ayoub Megha Goel George Tucker Enrique Piqueras Maxim Krikun Iain Barr Nikolay Savinov Ivo Danihelka Becca Roelofs Ana\"is White Anders Andreassen Tamara von Glehn Lakshman Yagati Mehran Kazemi Lucas Gonzalez Misha Khalman Jakub Sygnowski and et al. 2023. Gemini: A Family of Highly Capable Multimodal Models. CoRR Vol. abs\/2312.11805 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2312.11805 showeprint[arXiv]2312.11805","DOI":"10.48550\/ARXIV.2312.11805"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2305.05947"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548161"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00146"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.CVIU.2023.103878"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2403.06764"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611987"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2305.06500"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1065"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2401.03105"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2020.107383"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","unstructured":"Jack Hessel Lillian Lee and David Mimno. 2019. Unsupervised Discovery of Multimodal Links in Multi-image Multi-sentence Documents. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing EMNLP-IJCNLP 2019 Hong Kong China November 3--7 2019 Kentaro Inui Jing Jiang Vincent Ng and Xiaojun Wan (Eds.). Association for Computational Linguistics 2034--2045. https:\/\/doi.org\/10.18653\/V1\/D19--1210","DOI":"10.18653\/V1"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1436"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611826"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.241"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2305.03726"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/V1"},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning, ICML 2023, 23--29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730--19742. https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_2_1_24_1","unstructured":"Juncheng Li Kaihang Pan Zhiqi Ge Minghe Gao Hanwang Zhang Wei Ji Wenqiao Zhang Tat-Seng Chua Siliang Tang and Yueting Zhuang. 2023. Fine-tuning Multimodal LLMs to Follow Zero-shot Demonstrative Instructions. arxiv: 2308.04152 [cs.CV]"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00350"},{"key":"e_1_3_2_1_26_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01402"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V37I11.26551"},{"key":"e_1_3_2_1_29_1","volume-title":"Visual Instruction Tuning. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=w0H2xGHlkw","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Thirty-seventh Conference on Neural Information Processing Systems. https:\/\/openreview.net\/forum?id=w0H2xGHlkw"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_1_31_1","volume-title":"Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May 6--9, 2019. OpenReview.net. https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"e_1_3_2_1_32_1","volume-title":"Semantic Codebook Learning for Dynamic Recommendation Models. CoRR","author":"Lv Zheqi","year":"2024","unstructured":"Zheqi Lv, Shaoxuan He, Tianyu Zhan, Shengyu Zhang, Wenqiao Zhang, Jingyuan Chen, Zhou Zhao, and Fei Wu. 2024. Semantic Codebook Learning for Dynamic Recommendation Models. CoRR, Vol. abs\/2408.00123 (2024). showeprint[arXiv]2408.00123 https:\/\/arxiv.org\/abs\/2408.00123"},{"key":"e_1_3_2_1_33_1","volume-title":"Beng Chin Ooi, and Fei Wu","author":"Lv Zheqi","year":"2023","unstructured":"Zheqi Lv, Wenqiao Zhang, Shengyu Zhang, Kun Kuang, Feng Wang, Yongwei Wang, Zhengyu Chen, Tao Shen, Hongxia Yang, Beng Chin Ooi, and Fei Wu. 2023. DUET: A Tuning-Free Device-Cloud Collaborative Parameters Generation Framework for Efficient Device Model Generalization. In WWW. ACM, 3077--3085."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2306.14259"},{"key":"e_1_3_2_1_36_1","volume-title":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F. Christiano, Jan Leike, and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. In Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022, Sanmi Koyejo, S. Mohamed, A. Agarwal, Danielle Belgrave, K. Cho, and A. Oh (Eds.). http:\/\/papers.nips.cc\/paper_files\/paper\/2022\/hash\/b1efde53be364a73914f58805a001731-Abstract-Conference.html"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00472"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V37I2.25300"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00206"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612516"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2402.12408"},{"key":"e_1_3_2_1_44_1","volume-title":"Multi-Image Summarization: Textual Summary from a Set of Cohesive Images. CoRR","author":"Trieu Nicholas","year":"2020","unstructured":"Nicholas Trieu, Sebastian Goodman, Pradyumna Narayana, Kazoo Sone, and Radu Soricut. 2020. Multi-Image Summarization: Textual Summary from a Set of Cohesive Images. CoRR, Vol. abs\/2006.08686 (2020). showeprint[arXiv]2006.08686 https:\/\/arxiv.org\/abs\/2006.08686"},{"key":"e_1_3_2_1_45_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4--9, 2017, Long Beach, CA, USA, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett (Eds.). 5998--6008. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612179"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548022"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2309.17421"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611830"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2309.07915"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.10592"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681014","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681014","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:36Z","timestamp":1750295856000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681014"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3681014","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681014","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}