{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:07:40Z","timestamp":1775815660713,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T00:00:00Z","timestamp":1752969600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"CCF-Tencent Open Fund, Tencent Rhino-Bird Focused Research Program"},{"name":"Collaborative Research Fund","award":["No.C1043-24GF"],"award-info":[{"award-number":["No.C1043-24GF"]}]},{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["No. 2023ZD0121104"],"award-info":[{"award-number":["No. 2023ZD0121104"]}],"id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Research Impact Fund","award":["No.R1015-23"],"award-info":[{"award-number":["No.R1015-23"]}]},{"name":"APRC - CityU New Research Initiatives","award":["No.9610565"],"award-info":[{"award-number":["No.9610565"]}]},{"name":"CityU - HKIDS Early Career Research Grant","award":["No.9360163"],"award-info":[{"award-number":["No.9360163"]}]},{"name":"Hong Kong ITC Innovation and Technology Fund Midstream Research Programme for Universities Project","award":["No.ITS\/034\/22MS"],"award-info":[{"award-number":["No.ITS\/034\/22MS"]}]},{"name":"CCF-Ant Research Fund, Ant Group Research Fund"},{"name":"Kuaishou"},{"name":"CCF-BaiChuan-Ebtech Foundation Model Fund"},{"name":"Hong Kong Environmental and Conservation Fund","award":["No. 88\/2022"],"award-info":[{"award-number":["No. 88\/2022"]}]},{"name":"SIRG - CityU Strategic Interdisciplinary Research Grant","award":["No.7020046"],"award-info":[{"award-number":["No.7020046"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62222213,62072423"],"award-info":[{"award-number":["No. 62222213,62072423"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Huawei Innovation Research Program"},{"name":"CCF-Alimama Tech Kangaroo Fund","award":["No. 2024002"],"award-info":[{"award-number":["No. 2024002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,20]]},"DOI":"10.1145\/3690624.3709440","type":"proceedings-article","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T18:48:32Z","timestamp":1743792512000},"page":"2815-2826","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["NoteLLM-2: Multimodal Large Representation Models for Recommendation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-2579-8783","authenticated-orcid":false,"given":"Chao","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China, City University of Hong Kong, Hong Kong, Hong Kong &amp; State Key Laboratory of Cognitive Intelligence, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1468-7539","authenticated-orcid":false,"given":"Haoxin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3206-6827","authenticated-orcid":false,"given":"Shiwei","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China &amp; State Key Laboratory of Cognitive Intelligence, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0708-5035","authenticated-orcid":false,"given":"Di","family":"Wu","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4246-5386","authenticated-orcid":false,"given":"Tong","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China &amp; State Key Laboratory of Cognitive Intelligence, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2926-4416","authenticated-orcid":false,"given":"Xiangyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5960-1684","authenticated-orcid":false,"given":"Yan","family":"Gao","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1274-7111","authenticated-orcid":false,"given":"Yao","family":"Hu","sequence":"additional","affiliation":[{"name":"Xiaohongshu Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4835-4102","authenticated-orcid":false,"given":"Enhong","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China &amp; State Key Laboratory of Cognitive Intelligence, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,7,20]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"[n. d.]. Llama3. https:\/\/llama.meta.com\/llama3\/."},{"key":"e_1_3_2_2_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_2_3_1","volume-title":"Qwen-vl: A frontier large visionlanguage model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, ShijieWang, Sinan Tan, PengWang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large visionlanguage model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_2_4_1","volume-title":"Llm2vec: Large language models are secretly powerful text encoders. arXiv preprint arXiv:2404.05961","author":"Ghader Parishad Behnam","year":"2024","unstructured":"Parishad Behnam Ghader, Vaibhav Adlakha, Marius Mosbach, Dzmitry Bahdanau, Nicolas Chapados, and Siva Reddy. 2024. Llm2vec: Large language models are secretly powerful text encoders. arXiv preprint arXiv:2404.05961 (2024)."},{"key":"e_1_3_2_2_5_1","volume-title":"TOMGPT: Reliable Text-Only Training Approach for Cost-Effective Multi-modal Large Language Model. TKDD","author":"Chen Yunkai","year":"2024","unstructured":"Yunkai Chen, Qimeng Wang, Shiwei Wu, Yan Gao, Tong Xu, and Yao Hu. 2024. TOMGPT: Reliable Text-Only Training Approach for Cost-Effective Multi-modal Large Language Model. TKDD (2024)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Zhihong Chen Guiming Chen Shizhe Diao XiangWan and BenyouWang. 2023. On the Difference of BERT-style and CLIP-style Text Encoders. In Findings of ACL. 13710--13721.","DOI":"10.18653\/v1\/2023.findings-acl.866"},{"key":"e_1_3_2_2_7_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"e_1_3_2_2_8_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_2_9_1","unstructured":"Together Computer. 2023. RedPajama: an Open Dataset for Training Large Language Models. https:\/\/github.com\/togethercomputer\/RedPajama-Data"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3124365"},{"key":"e_1_3_2_2_11_1","unstructured":"Nilotpal Das Aniket Joshi Promod Yenigalla and Gourav Agrwal. 2022. MAPS: multimodal attention for product similarity. In WACV. 3338--3346."},{"key":"e_1_3_2_2_12_1","unstructured":"DeepSeek-AI. 2024. DeepSeek-V2: A Strong Economical and Efficient Mixtureof- Experts Language Model. arXiv:2405.04434 [cs.CL]"},{"key":"e_1_3_2_2_13_1","volume-title":"A survey on in-context learning. arXiv preprint arXiv:2301.00234","author":"Dong Qingxiu","year":"2022","unstructured":"Qingxiu Dong, Lei Li, Damai Dai, Ce Zheng, Zhiyong Wu, Baobao Chang, Xu Sun, Jingjing Xu, and Zhifang Sui. 2022. A survey on in-context learning. arXiv preprint arXiv:2301.00234 (2022)."},{"key":"e_1_3_2_2_14_1","volume-title":"DreamLLM: Synergistic Multimodal Comprehension and Creation. In The Twelfth International Conference on Learning Representations.","author":"Dong Runpei","year":"2023","unstructured":"Runpei Dong, Chunrui Han, Yuang Peng, Zekun Qi, Zheng Ge, Jinrong Yang, Liang Zhao, Jianjian Sun, Hongyu Zhou, Haoran Wei, et al. 2023. DreamLLM: Synergistic Multimodal Comprehension and Creation. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Zi-Yi Dou Yichong Xu Zhe Gan Jianfeng Wang Shuohang Wang Lijuan Wang Chenguang Zhu Pengchuan Zhang Lu Yuan Nanyun Peng et al. 2022. An empirical study of training end-to-end vision-and-language transformers. In CVPR. 18166--18176.","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"e_1_3_2_2_16_1","unstructured":"Yuying Ge Sijie Zhao Ziyun Zeng Yixiao Ge Chen Li Xintao Wang and Ying Shan. 2023. Planting a SEED of Vision in Large Language Model. In ICLR."},{"key":"e_1_3_2_2_17_1","volume-title":"Fame-vil: Multi-tasking vision-language model for heterogeneous fashion tasks. In CVPR. 2669--2680.","author":"Han Xiao","year":"2023","unstructured":"Xiao Han, Xiatian Zhu, Licheng Yu, Li Zhang, Yi-Zhe Song, and Tao Xiang. 2023. Fame-vil: Multi-tasking vision-language model for heterogeneous fashion tasks. In CVPR. 2669--2680."},{"key":"e_1_3_2_2_18_1","volume-title":"Wanjuan: A comprehensive multimodal dataset for advancing english and chinese large models. arXiv preprint arXiv:2308.10755","author":"He Conghui","year":"2023","unstructured":"Conghui He, Zhenjiang Jin, Chao Xu, Jiantao Qiu, Bin Wang, Wei Li, Hang Yan, JiaqiWang, and Dahua Lin. 2023. Wanjuan: A comprehensive multimodal dataset for advancing english and chinese large models. arXiv preprint arXiv:2308.10755 (2023)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","unstructured":"Gabriel Ilharco Mitchell Wortsman Ross Wightman Cade Gordon Nicholas Carlini Rohan Taori Achal Dave Vaishaal Shankar Hongseok Namkoong John Miller Hannaneh Hajishirzi Ali Farhadi and Ludwig Schmidt. 2021. OpenCLIP. doi:10.5281\/zenodo.5143773 If you use this software please cite it as below..","DOI":"10.5281\/zenodo.5143773"},{"key":"e_1_3_2_2_20_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al.","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. 2023. Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)."},{"key":"e_1_3_2_2_21_1","volume-title":"Scaling sentence embeddings with large language models. arXiv preprint arXiv:2307.16645","author":"Jiang Ting","year":"2023","unstructured":"Ting Jiang, Shaohan Huang, Zhongzhi Luan, Deqing Wang, and Fuzhen Zhuang. 2023. Scaling sentence embeddings with large language models. arXiv preprint arXiv:2307.16645 (2023)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Yang Jin Yongzhi Li Zehuan Yuan and Yadong Mu. 2023. Learning Instance- Level Representation for Large-Scale Multi-Modal Pretraining in E-commerce. In CVPR. 11060--11069.","DOI":"10.1109\/CVPR52729.2023.01064"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_2_24_1","volume-title":"Prismatic vlms: Investigating the design space of visually-conditioned language models. arXiv preprint arXiv:2402.07865","author":"Karamcheti Siddharth","year":"2024","unstructured":"Siddharth Karamcheti, Suraj Nair, Ashwin Balakrishna, Percy Liang, Thomas Kollar, and Dorsa Sadigh. 2024. Prismatic vlms: Investigating the design space of visually-conditioned language models. arXiv preprint arXiv:2402.07865 (2024)."},{"key":"e_1_3_2_2_25_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL. 4171--4186.","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL. 4171--4186."},{"key":"e_1_3_2_2_26_1","volume-title":"What matters when building vision-language models? arXiv preprint arXiv:2405.02246","author":"Lauren\u00e7on Hugo","year":"2024","unstructured":"Hugo Lauren\u00e7on, L\u00e9o Tronchon, Matthieu Cord, and Victor Sanh. 2024. What matters when building vision-language models? arXiv preprint arXiv:2405.02246 (2024)."},{"key":"e_1_3_2_2_27_1","volume-title":"ICML. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. PMLR, 19730--19742."},{"key":"e_1_3_2_2_28_1","volume-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. PMLR, 12888--12900."},{"key":"e_1_3_2_2_29_1","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. NeurIPS 34 (2021), 9694--9705.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_30_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Multimodal recommender systems: A survey. arXiv preprint arXiv:2302.03883","author":"Liu Qidong","year":"2023","unstructured":"Qidong Liu, Jiaxi Hu, Yutian Xiao, Jingtong Gao, and Xiangyu Zhao. 2023. Multimodal recommender systems: A survey. arXiv preprint arXiv:2302.03883 (2023)."},{"key":"e_1_3_2_2_32_1","volume-title":"Finetuning llama for multi-stage text retrieval. arXiv preprint arXiv:2310.08319","author":"Ma Xueguang","year":"2023","unstructured":"Xueguang Ma, Liang Wang, Nan Yang, Furu Wei, and Jimmy Lin. 2023. Finetuning llama for multi-stage text retrieval. arXiv preprint arXiv:2310.08319 (2023)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"crossref","unstructured":"Brandon McKinzie Zhe Gan Jean-Philippe Fauconnier Sam Dodge Bowen Zhang Philipp Dufter Dhruti Shah Xianzhi Du Futang Peng Floris Weers et al. 2024. MM1: Methods Analysis & Insights from Multimodal LLM Pre-training. arXiv preprint arXiv:2403.09611 (2024).","DOI":"10.1007\/978-3-031-73397-0_18"},{"key":"e_1_3_2_2_34_1","volume-title":"Generative representational instruction tuning. arXiv preprint arXiv:2402.09906","author":"Muennighoff Niklas","year":"2024","unstructured":"Niklas Muennighoff, Hongjin Su, Liang Wang, Nan Yang, Furu Wei, Tao Yu, Amanpreet Singh, and Douwe Kiela. 2024. Generative representational instruction tuning. arXiv preprint arXiv:2402.09906 (2024)."},{"key":"e_1_3_2_2_35_1","volume-title":"Jerry Tworek, Qiming Yuan, Nikolas Tezak, Jong Wook Kim, Chris Hallacy, et al.","author":"Neelakantan Arvind","year":"2022","unstructured":"Arvind Neelakantan, Tao Xu, Raul Puri, Alec Radford, Jesse Michael Han, Jerry Tworek, Qiming Yuan, Nikolas Tezak, Jong Wook Kim, Chris Hallacy, et al. 2022. Text and code embeddings by contrastive pre-training. arXiv preprint arXiv:2201.10005 (2022)."},{"key":"e_1_3_2_2_36_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748--8763."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_2_38_1","volume-title":"USENIX Annual Technical Conference. 551--564","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. {Zero-offload}: Democratizing {billion-scale} model training. In USENIX Annual Technical Conference. 551--564."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Stephen Robertson Hugo Zaragoza et al. 2009. The probabilistic relevance framework: BM25 and beyond. Foundations and Trends\u00ae in Information Retrieval 3 4 (2009) 333--389.","DOI":"10.1561\/1500000019"},{"key":"e_1_3_2_2_40_1","volume-title":"Yong Jae Lee, and Yan Yan","author":"Shang Yuzhang","year":"2024","unstructured":"Yuzhang Shang, Mu Cai, Bingxin Xu, Yong Jae Lee, and Yan Yan. 2024. LLaVAPruMerge: Adaptive Token Reduction for Efficient Large Multimodal Models. arXiv preprint arXiv:2403.15388 (2024)."},{"key":"e_1_3_2_2_41_1","volume-title":"Repetition Improves Language Model Embeddings. arXiv preprint arXiv:2402.15449","author":"Springer Jacob Mitchell","year":"2024","unstructured":"Jacob Mitchell Springer, Suhas Kotha, Daniel Fried, Graham Neubig, and Aditi Raghunathan. 2024. Repetition Improves Language Model Embeddings. arXiv preprint arXiv:2402.15449 (2024)."},{"key":"e_1_3_2_2_42_1","volume-title":"Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355","author":"Su Yixuan","year":"2023","unstructured":"Yixuan Su, Tian Lan, Huayang Li, Jialu Xu, Yan Wang, and Deng Cai. 2023. Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355 (2023)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Lin Sun Jiquan Wang Yindu Su Fangsheng Weng Yuxuan Sun Zengwei Zheng and Yuanyi Chen. 2020. RIVA: a pre-trained tweet multimodal model based on text-image relation for multimodal NER. In COLING. 1852--1862.","DOI":"10.18653\/v1\/2020.coling-main.168"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i15.17633"},{"key":"e_1_3_2_2_45_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_2_46_1","volume-title":"EVA-CLIP-18B: Scaling CLIP to 18 Billion Parameters. arXiv preprint arXiv:2402.04252","author":"Sun Quan","year":"2024","unstructured":"Quan Sun, Jinsheng Wang, Qiying Yu, Yufeng Cui, Fan Zhang, Xiaosong Zhang, and Xinlong Wang. 2024. EVA-CLIP-18B: Scaling CLIP to 18 Billion Parameters. arXiv preprint arXiv:2402.04252 (2024)."},{"key":"e_1_3_2_2_47_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Lean Wang Lei Li Damai Dai Deli Chen Hao Zhou Fandong Meng Jie Zhou and Xu Sun. 2023. Label Words are Anchors: An Information Flow Perspective for Understanding In-Context Learning. In EMNLP. 9840--9855.","DOI":"10.18653\/v1\/2023.emnlp-main.609"},{"key":"e_1_3_2_2_49_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2023","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2023. Image as a foreign language: Beit pretraining for vision and vision-language tasks. In CVPR. 19175--19186."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_2_51_1","volume-title":"Mm-rec: Visiolinguistic model empowered multimodal news recommendation. In SIGIR. 2560--2564.","author":"Wu Chuhan","year":"2022","unstructured":"Chuhan Wu, Fangzhao Wu, Tao Qi, Chao Zhang, Yongfeng Huang, and Tong Xu. 2022. Mm-rec: Visiolinguistic model empowered multimodal news recommendation. In SIGIR. 2560--2564."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40555-y"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","unstructured":"Zhao Xue Hanyu Zhao Sha Yuan and YequanWang. 2022. WuDaoCorpora Text. doi:10.57760\/sciencedb.o00126.00004","DOI":"10.57760\/sciencedb.o00126.00004"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Jiahao Xun Shengyu Zhang Zhou Zhao Jieming Zhu Qi Zhang Jingjie Li Xiuqiang He Xiaofei He Tat-Seng Chua and Fei Wu. 2021. Why do we click: visual impression-aware news recommendation. In MM. 3881--3890.","DOI":"10.1145\/3474085.3475514"},{"key":"e_1_3_2_2_55_1","volume-title":"Large scale product graph construction for recommendation in e-commerce. arXiv preprint arXiv:2010.05525","author":"Yang Xiaoyong","year":"2020","unstructured":"Xiaoyong Yang, Yadong Zhu, Yi Zhang, Xiaobo Wang, and Quan Yuan. 2020. Large scale product graph construction for recommendation in e-commerce. arXiv preprint arXiv:2010.05525 (2020)."},{"key":"e_1_3_2_2_56_1","volume-title":"A survey on multimodal large language models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_2_57_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539151"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"crossref","unstructured":"Xiaohua Zhai Basil Mustafa Alexander Kolesnikov and Lucas Beyer. 2023. Sigmoid loss for language image pre-training. In ICCV. 11975--11986.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_2_60_1","volume-title":"Long-clip: Unlocking the long-text capability of clip. arXiv preprint arXiv:2403.15378","author":"Zhang Beichen","year":"2024","unstructured":"Beichen Zhang, Pan Zhang, Xiaoyi Dong, Yuhang Zang, and Jiaqi Wang. 2024. Long-clip: Unlocking the long-text capability of clip. arXiv preprint arXiv:2403.15378 (2024)."},{"key":"e_1_3_2_2_61_1","volume-title":"NoteLLM: A Retrievable Large Language Model for Note Recommendation. arXiv preprint arXiv:2403.01744","author":"Zhang Chao","year":"2024","unstructured":"Chao Zhang, Shiwei Wu, Haoxin Zhang, Tong Xu, Yan Gao, Yao Hu, and Enhong Chen. 2024. NoteLLM: A Retrievable Large Language Model for Note Recommendation. arXiv preprint arXiv:2403.01744 (2024)."},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2020.2987728"},{"key":"e_1_3_2_2_63_1","volume-title":"Debiasing Large Visual Language Models. arXiv preprint arXiv:2403.05262","author":"Zhang Yi-Fan","year":"2024","unstructured":"Yi-Fan Zhang, Weichen Yu, Qingsong Wen, Xue Wang, Zhang Zhang, Liang Wang, Rong Jin, and Tieniu Tan. 2024. Debiasing Large Visual Language Models. arXiv preprint arXiv:2403.05262 (2024)."},{"key":"e_1_3_2_2_64_1","volume-title":"Make: Vision-language pre-training based product retrieval in taobao search. In WWW. 356--360.","author":"Zheng Xiaoyang","year":"2023","unstructured":"Xiaoyang Zheng, Zilong Wang, Sen Li, Ke Xu, Tao Zhuang, Qingwen Liu, and Xiaoyi Zeng. 2023. Make: Vision-language pre-training based product retrieval in taobao search. In WWW. 356--360."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"crossref","unstructured":"Han Zhu Xiang Li Pengye Zhang Guozheng Li Jie He Han Li and Kun Gai. 2018. Learning tree-based deep model for recommender systems. In KDD. 1079--1088.","DOI":"10.1145\/3219819.3219826"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709440","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3690624.3709440","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T15:42:42Z","timestamp":1755358962000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709440"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,20]]},"references-count":65,"alternative-id":["10.1145\/3690624.3709440","10.1145\/3690624"],"URL":"https:\/\/doi.org\/10.1145\/3690624.3709440","relation":{},"subject":[],"published":{"date-parts":[[2025,7,20]]},"assertion":[{"value":"2025-07-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}