{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:45:38Z","timestamp":1777873538578,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","funder":[{"name":"Yonghua Foundation"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737275","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:03:27Z","timestamp":1754255007000},"page":"5029-5038","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Aligning and Balancing ID and Multimodal Representations for Recommendation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-9056-9543","authenticated-orcid":false,"given":"Binrui","family":"Wu","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4550-3950","authenticated-orcid":false,"given":"Shisong","family":"Tang","sequence":"additional","affiliation":[{"name":"KuaiShou Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7772-9523","authenticated-orcid":false,"given":"Fan","family":"Li","sequence":"additional","affiliation":[{"name":"KuaiShou Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5044-6368","authenticated-orcid":false,"given":"Bing","family":"Han","sequence":"additional","affiliation":[{"name":"KuaiShou Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2914-6527","authenticated-orcid":false,"given":"Chang","family":"Meng","sequence":"additional","affiliation":[{"name":"KuaiShou Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2394-2995","authenticated-orcid":false,"given":"Jingyu","family":"Xiao","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0628-1416","authenticated-orcid":false,"given":"Jiechao","family":"Gao","sequence":"additional","affiliation":[{"name":"Center for SDGC, Stanford University, Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","unstructured":"Rishi Bommasani Drew A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill et al. 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080797"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13740-016-0060-9"},{"key":"e_1_3_2_2_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_7_1","volume-title":"Zero-shot recommender systems. arXiv preprint arXiv:2105.08318","author":"Ding Hao","year":"2021","unstructured":"Hao Ding, Yifei Ma, Anoop Deoras, Yuyang Wang, and Hao Wang. 2021. Zero-shot recommender systems. arXiv preprint arXiv:2105.08318 (2021)."},{"key":"e_1_3_2_2_8_1","volume-title":"Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors). The annals of statistics","author":"Friedman Jerome","year":"2000","unstructured":"Jerome Friedman, Trevor Hastie, and Robert Tibshirani. 2000. Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors). The annals of statistics, Vol. 28, 2 (2000), 337-407."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.9973"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2005.50"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3347043"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/582415.582418"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Enkelejda Kasneci Kathrin Se\u00dfler Stefan K\u00fcchemann Maria Bannert Daryna Dementieva Frank Fischer Urs Gasser Georg Groh Stephan G\u00fcnnemann Eyke H\u00fcllermeier et al. 2023. ChatGPT for good? On opportunities and challenges of large language models for education. Learning and individual differences Vol. 103 (2023) 102274.","DOI":"10.1016\/j.lindif.2023.102274"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1401890.1401944"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2009.263"},{"key":"e_1_3_2_2_17_1","volume-title":"Deep learning. nature","author":"LeCun Yann","year":"2015","unstructured":"Yann LeCun, Yoshua Bengio, and Geoffrey Hinton. 2015. Deep learning. nature, Vol. 521, 7553 (2015), 436-444."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.121"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671514"},{"key":"e_1_3_2_2_20_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_2_21_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_2_22_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694-9705."},{"key":"e_1_3_2_2_23_1","volume-title":"AAAI 2025 Workshop on Artificial Intelligence with Causal Techniques. https:\/\/openreview.net\/forum?id=7btnaN4Std","author":"Li Meng","year":"2025","unstructured":"Meng Li and Haochen Sui. 2025. Causal Recommendation via Machine Unlearning with a Few Unbiased Data. In AAAI 2025 Workshop on Artificial Intelligence with Causal Techniques. https:\/\/openreview.net\/forum?id=7btnaN4Std"},{"key":"e_1_3_2_2_24_1","volume-title":"com recommendations: Item-to-item collaborative filtering","author":"Linden Greg","year":"2003","unstructured":"Greg Linden, Brent Smith, and Jeremy York. 2003. Amazon. com recommendations: Item-to-item collaborative filtering. IEEE Internet computing, Vol. 7, 1 (2003), 76-80."},{"key":"e_1_3_2_2_25_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024a. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3679626"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2766462.2767755"},{"key":"e_1_3_2_2_28_1","volume-title":"Twenty-Fourth International Joint Conference on Artificial Intelligence.","author":"Mo Kaixiang","year":"2015","unstructured":"Kaixiang Mo, Bo Liu, Lei Xiao, Yong Li, and Jie Jiang. 2015. Image feature learning for cold start problem in display advertising. In Twenty-Fourth International Joint Conference on Artificial Intelligence."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098108"},{"key":"e_1_3_2_2_30_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation. arXiv preprint arXiv:2010.16061","author":"Powers David MW","year":"2020","unstructured":"David MW Powers. 2020. Evaluation: from precision, recall and F-measure to ROC, informedness, markedness and correlation. arXiv preprint arXiv:2010.16061 (2020)."},{"key":"e_1_3_2_2_32_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2010.127"},{"key":"e_1_3_2_2_34_1","volume-title":"BPR: Bayesian personalized ranking from implicit feedback. arXiv preprint arXiv:1205.2618","author":"Rendle Steffen","year":"2012","unstructured":"Steffen Rendle, Christoph Freudenthaler, Zeno Gantner, and Lars Schmidt-Thieme. 2012. BPR: Bayesian personalized ranking from implicit feedback. arXiv preprint arXiv:1205.2618 (2012)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/245108.245121"},{"key":"e_1_3_2_2_36_1","volume-title":"Carlo Tomasi, and Leonidas J Guibas","author":"Rubner Yossi","year":"2000","unstructured":"Yossi Rubner, Carlo Tomasi, and Leonidas J Guibas. 2000. The earth mover's distance as a metric for image retrieval. International journal of computer vision, Vol. 40 (2000), 99-121."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/371920.372071"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/564376.564421"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640457.3688190"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357925"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511934"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599797"},{"key":"e_1_3_2_2_43_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_2_44_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3124749.3124754"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413556"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330665"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1671"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531896"},{"key":"e_1_3_2_2_51_1","volume-title":"Invariant Spatiotemporal Representation Learning for Cross-patient Seizure Classification. In The First Workshop on NeuroAI @ NeurIPS2024","author":"Wu Yuntian","year":"2024","unstructured":"Yuntian Wu, Yuntian Yang, Jiabao Sean Xiao, Chuan Zhou, Haochen Sui, and Haoxuan Li. 2024. Invariant Spatiotemporal Representation Learning for Cross-patient Seizure Classification. In The First Workshop on NeuroAI @ NeurIPS2024. https:\/\/openreview.net\/forum?id=Ex6wAivo7G"},{"key":"e_1_3_2_2_52_1","volume-title":"A survey on multimodal large language models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613915"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591932"},{"key":"e_1_3_2_2_55_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024b. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_2_56_1","volume-title":"DRepMRec: A Dual Representation Learning Framework for Multimodal Recommendation. arXiv preprint arXiv:2404.11119","author":"Zhang Kangning","year":"2024","unstructured":"Kangning Zhang, Yingjie Qin, Ruilong Su, Yifan Liu, Jiarui Jin, Weinan Zhang, and Yong Yu. 2024a. DRepMRec: A Dual Representation Learning Framework for Multimodal Recommendation. arXiv preprint arXiv:2404.11119 (2024)."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219823"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611943"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583251"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463010"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462843"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462948"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737275","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:01:53Z","timestamp":1777572113000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737275"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":62,"alternative-id":["10.1145\/3711896.3737275","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737275","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}