{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T04:20:10Z","timestamp":1778559610782,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,7,20]],"date-time":"2025-07-20T00:00:00Z","timestamp":1752969600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,20]]},"DOI":"10.1145\/3690624.3709255","type":"proceedings-article","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T18:44:43Z","timestamp":1743792283000},"page":"848-858","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Fine-tuning Multimodal Large Language Models for Product Bundling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6037-8580","authenticated-orcid":false,"given":"Xiaohao","family":"Liu","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1613-1641","authenticated-orcid":false,"given":"Jie","family":"Wu","sequence":"additional","affiliation":[{"name":"Communication University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9011-8464","authenticated-orcid":false,"given":"Zhulin","family":"Tao","sequence":"additional","affiliation":[{"name":"Communication University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3038-5389","authenticated-orcid":false,"given":"Yunshan","family":"Ma","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1791-3159","authenticated-orcid":false,"given":"Yinwei","family":"Wei","sequence":"additional","affiliation":[{"name":"Shandong University, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,20]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Jinze Bai Chang Zhou Junshuai Song Xiaoru Qu Weiting An Zhao Li and Jun Gao. 2019. Personalized bundle list recommendation. In WWW. 60--71.","DOI":"10.1145\/3308558.3313568"},{"key":"e_1_3_2_1_3_1","volume-title":"Bundling and Competition on the Internet. Marketing science","author":"Bakos Yannis","year":"2000","unstructured":"Yannis Bakos and Erik Brynjolfsson. 2000. Bundling and Competition on the Internet. Marketing science, Vol. 19, 1 (2000), 63--82."},{"key":"e_1_3_2_1_4_1","volume-title":"Tallrec: An effective and efficient tuning framework to align large language model with recommendation. In RecSys. 1007--1014.","author":"Bao Keqin","year":"2023","unstructured":"Keqin Bao, Jizhi Zhang, Yang Zhang, Wenjie Wang, Fuli Feng, and Xiangnan He. 2023. Tallrec: An effective and efficient tuning framework to align large language model with recommendation. In RecSys. 1007--1014."},{"key":"e_1_3_2_1_5_1","first-page":"2326","article-title":"Bundle recommendation and generation with graph neural networks","volume":"35","author":"Chang Jianxin","year":"2021","unstructured":"Jianxin Chang, Chen Gao, Xiangnan He, Depeng Jin, and Yong Li. 2021. Bundle recommendation and generation with graph neural networks. TKDE, Vol. 35, 3 (2021), 2326--2340.","journal-title":"TKDE"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Wen Chen Pipei Huang Jiaming Xu Xin Guo Cheng Guo Fei Sun Chao Li Andreas Pfadler Huan Zhao and Binqiang Zhao. 2019. POG: personalized outfit generation for fashion recommendation at Alibaba iFashion. In KDD. 2662--2670.","DOI":"10.1145\/3292500.3330652"},{"key":"e_1_3_2_1_7_1","volume-title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. CoRR","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. CoRR, Vol. abs\/2311.07919 (2023). showeprint[arXiv]2311.07919"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Qilin Deng Kai Wang Minghao Zhao Runze Wu Yu Ding Zhene Zou Yue Shang Jianrong Tao and Changjie Fan. 2021. Build your own bundle-a neural combinatorial optimization method. In ACM Multimedia. 2625--2633.","DOI":"10.1145\/3474085.3475440"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Qilin Deng Kai Wang Minghao Zhao Zhene Zou Runze Wu Jianrong Tao Changjie Fan and Liang Chen. 2020. Personalized bundle recommendation in online games. In CIKM. 2381--2388.","DOI":"10.1145\/3340531.3412734"},{"key":"e_1_3_2_1_10_1","volume-title":"Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al.","author":"Driess Danny","year":"2023","unstructured":"Danny Driess, Fei Xia, Mehdi SM Sajjadi, Corey Lynch, Aakanksha Chowdhery, Brian Ichter, Ayzaan Wahid, Jonathan Tompson, Quan Vuong, Tianhe Yu, et al. 2023. Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Mahmoud Al Ismail, and Huaming Wang","author":"Elizalde Benjamin","year":"2023","unstructured":"Benjamin Elizalde, Soham Deshmukh, Mahmoud Al Ismail, and Huaming Wang. 2023. Clap learning audio concepts from natural language supervision. In ICASSP. IEEE, 1--5."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Xintong Han Zuxuan Wu Yu-Gang Jiang and Larry S Davis. 2017. Learning fashion compatibility with bidirectional lstms. In ACM Multimedia. 1078--1086.","DOI":"10.1145\/3123266.3123394"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401063"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Yun He Yin Zhang Weiwen Liu and James Caverlee. 2020b. Consistency-aware recommendation for user-generated item list continuation. In WSDM. 250--258.","DOI":"10.1145\/3336191.3371776"},{"key":"e_1_3_2_1_15_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645627"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Brian Lester Rami Al-Rfou and Noah Constant. 2021. The Power of Scale for Parameter-Efficient Prompt Tuning. In EMNLP. 3045--3059.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_2_1_18_1","volume-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day. NeurIPS","author":"Li Chunyuan","year":"2024","unstructured":"Chunyuan Li, Cliff Wong, Sheng Zhang, Naoto Usuyama, Haotian Liu, Jianwei Yang, Tristan Naumann, Hoifung Poon, and Jianfeng Gao. 2024b. Llava-med: Training a large language-and-vision assistant for biomedicine in one day. NeurIPS (2024)."},{"key":"e_1_3_2_1_19_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. 19730--19742."},{"key":"e_1_3_2_1_20_1","unstructured":"Ming Li Lin Li Xiaohui Tao and Jimmy Xiangji Huang. 2024a. MealRec: A Meal Recommendation Dataset with Meal-Course Affiliation for Personalization and Healthiness. (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Xingchen Li Xiang Wang Xiangnan He Long Chen Jun Xiao and Tat-Seng Chua. 2020. Hierarchical fashion graph network for personalized outfit recommendation. In SIGIR. 159--168.","DOI":"10.1145\/3397271.3401080"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Dawen Liang Rahul G Krishnan Matthew D Hoffman and Tony Jebara. 2018. Variational autoencoders for collaborative filtering. In WWW. 689--698.","DOI":"10.1145\/3178876.3186150"},{"key":"e_1_3_2_1_23_1","volume-title":"Llara: Aligning large language models with sequential recommenders.","author":"Liao Jiayi","year":"2024","unstructured":"Jiayi Liao, Sihang Li, Zhengyi Yang, Jiancan Wu, Yancheng Yuan, Xiang Wang, and Xiangnan He. 2024. Llara: Aligning large language models with sequential recommenders. (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Yujie Lin Pengjie Ren Zhumin Chen Zhaochun Ren Jun Ma and Maarten De Rijke. 2019. Improving outfit recommendation with co-supervision of fashion generation. In WWW. 1095--1105.","DOI":"10.1145\/3308558.3313614"},{"key":"e_1_3_2_1_25_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_1_26_1","volume-title":"Preference Diffusion for Recommendation. arXiv preprint arXiv:2410.13117","author":"Liu Shuo","year":"2024","unstructured":"Shuo Liu, An Zhang, Guoqing Hu, Hong Qian, and Tat-seng Chua. 2024. Preference Diffusion for Recommendation. arXiv preprint arXiv:2410.13117 (2024)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Shuo Liu Zihan Zhou Yuanhao Liu Jing Zhang and Hong Qian. 2025. Language Representation Favored Zero-Shot Cross-Domain Cognitive Diagnosis. In KDD.","DOI":"10.1145\/3690624.3709281"},{"key":"e_1_3_2_1_28_1","volume-title":"Elimrec: Eliminating single-modal bias in multimedia recommendation. In ACM Multimedia. 687--695.","author":"Liu Xiaohao","year":"2022","unstructured":"Xiaohao Liu, Zhulin Tao, Jiahong Shao, Lifang Yang, and Xianglin Huang. 2022. Elimrec: Eliminating single-modal bias in multimedia recommendation. In ACM Multimedia. 687--695."},{"key":"e_1_3_2_1_29_1","volume-title":"Macaw-llm: Multi-modal language modeling with image, audio, video, and text integration. arXiv preprint arXiv:2306.09093","author":"Lyu Chenyang","year":"2023","unstructured":"Chenyang Lyu, Minghao Wu, Longyue Wang, Xinting Huang, Bingshuai Liu, Zefeng Du, Shuming Shi, and Zhaopeng Tu. 2023. Macaw-llm: Multi-modal language modeling with image, audio, video, and text integration. arXiv preprint arXiv:2306.09093 (2023)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640810"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Yunshan Ma Yingzhi He An Zhang Xiang Wang and Tat-Seng Chua. 2022. CrossCBR: Cross-view contrastive learning for bundle recommendation. In KDD. 1233--1241.","DOI":"10.1145\/3534678.3539229"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Yunshan Ma Xiaohao Liu Yinwei Wei Zhulin Tao Xiang Wang and Tat-Seng Chua. 2024b. Leveraging multimodal features and item-level user feedback for bundle construction. In WSDM. 510--519.","DOI":"10.1145\/3616855.3635854"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3352100"},{"key":"e_1_3_2_1_34_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Yuyang Ren Zhang Haonan Luoyi Fu Xinbing Wang and Chenghu Zhou. 2023. Distillation-Enhanced Graph Masked Autoencoders for Bundle Recommendation. In SIGIR. 1660--1669.","DOI":"10.1145\/3539618.3591666"},{"key":"e_1_3_2_1_36_1","volume-title":"LARP: Language Audio Relational Pre-training for Cold-Start Playlist Continuation. In KDD. ACM.","author":"Salganik Rebecca","year":"2024","unstructured":"Rebecca Salganik, Xiaohao Liu, Jian Kang, Yunshan Ma, and Tat-Seng Chua. 2024. LARP: Language Audio Relational Pre-training for Cold-Start Playlist Continuation. In KDD. ACM."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Oren Sar Shalom Noam Koenigstein Ulrich Paquet and Hastagiri P Vanchinathan. 2016. Beyond collaborative filtering: The list recommendation problem. In WWW. 63--72.","DOI":"10.1145\/2872427.2883057"},{"key":"e_1_3_2_1_38_1","volume-title":"Language Models Encode Collaborative Signals in Recommendation. arXiv preprint arXiv:2407.05441","author":"Sheng Leheng","year":"2024","unstructured":"Leheng Sheng, An Zhang, Yi Zhang, Yuxin Chen, Xiang Wang, and Tat-Seng Chua. 2024. Language Models Encode Collaborative Signals in Recommendation. arXiv preprint arXiv:2407.05441 (2024)."},{"key":"e_1_3_2_1_39_1","volume-title":"Dynamic In-Context Learning from Nearest Neighbors for Bundle Generation. arXiv preprint arXiv:2312.16262","author":"Sun Zhu","year":"2023","unstructured":"Zhu Sun, Kaidong Feng, Jie Yang, Xinghua Qu, Hui Fang, Yew-Soon Ong, and Wenyuan Liu. 2023. Dynamic In-Context Learning from Nearest Neighbors for Bundle Generation. arXiv preprint arXiv:2312.16262 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Zhu Sun Jie Yang Kaidong Feng Hui Fang Xinghua Qu and Yew Soon Ong. 2022. Revisiting bundle recommendation: Datasets tasks challenges and opportunities for intent-aware product bundling. In SIGIR. 2900--2911.","DOI":"10.1145\/3477495.3531904"},{"key":"e_1_3_2_1_41_1","unstructured":"Llama Team. 2024. The Llama 3 Herd of Models."},{"key":"e_1_3_2_1_42_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Yinwei Wei Xiaohao Liu Yunshan Ma Xiang Wang Liqiang Nie and Tat-Seng Chua. 2023. Strategy-aware bundle recommender system. In SIGIR. 1198--1207.","DOI":"10.1145\/3539618.3591771"},{"key":"e_1_3_2_1_45_1","volume-title":"Laks VS Lakshmanan, and Peter T Wood","author":"Xie Min","year":"2010","unstructured":"Min Xie, Laks VS Lakshmanan, and Peter T Wood. 2010. Breaking out of the box of recommendations: from items to packages. In RecSys. 151--158."},{"key":"e_1_3_2_1_46_1","volume-title":"A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109755"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3344257"},{"key":"e_1_3_2_1_49_1","volume-title":"SpeechGPT-Gen: Scaling Chain-of-Information Speech Generation. CoRR","author":"Zhang Dong","year":"2024","unstructured":"Dong Zhang, Xin Zhang, Jun Zhan, Shimin Li, Yaqian Zhou, and Xipeng Qiu. 2024b. SpeechGPT-Gen: Scaling Chain-of-Information Speech Generation. CoRR, Vol. abs\/2401.13527 (2024). showeprint[arXiv]2401.13527"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Mengmei Zhang Mingwei Sun Peng Wang Shen Fan Yanhu Mo Xiaoxiao Xu Hong Liu Cheng Yang and Chuan Shi. 2024a. GraphTranslator: Aligning Graph Model to Large Language Model for Open-ended Tasks. In WWW. 1003--1014.","DOI":"10.1145\/3589334.3645682"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Zizhuo Zhang and Bang Wang. 2023. Prompt learning for news recommendation. In SIGIR. 227--237.","DOI":"10.1145\/3539618.3591752"},{"key":"e_1_3_2_1_52_1","volume-title":"BLIP-2 Answers: Automatic Questioning Towards Enriched Visual Descriptions. arXiv preprint arXiv:2303.06594","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Kilichbek Haydarov, Xiaoqian Shen, Wenxuan Zhang, and Mohamed Elhoseiny. 2023a. ChatGPT Asks, BLIP-2 Answers: Automatic Questioning Towards Enriched Visual Descriptions. arXiv preprint arXiv:2303.06594 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023b. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Towards Hierarchical Intent Disentanglement for Bundle Recommendation. TKDE","author":"Zou Ding","year":"2023","unstructured":"Ding Zou, Sen Zhao, Wei Wei, Xian-ling Mao, Ruixuan Li, Dangyang Chen, Rui Fang, and Yuanyuan Fu. 2023. Towards Hierarchical Intent Disentanglement for Bundle Recommendation. TKDE (2023)."}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709255","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3690624.3709255","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T15:42:13Z","timestamp":1755358933000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3690624.3709255"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,20]]},"references-count":54,"alternative-id":["10.1145\/3690624.3709255","10.1145\/3690624"],"URL":"https:\/\/doi.org\/10.1145\/3690624.3709255","relation":{},"subject":[],"published":{"date-parts":[[2025,7,20]]},"assertion":[{"value":"2025-07-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}