{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T19:40:40Z","timestamp":1772826040623,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"This work is fully supported by the Advanced Research and Technology Innovation Centre (ARTIC), the National University of Singapore under Grant (project number: A-8000969-00-00). This research is also supported by the National Natural Science Foundation of China (9227010114) and the University Synergy Innovation Program of Anhui Province (GXXT-2022-040)."}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612091","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"955-965","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["Online Distillation-enhanced Multi-modal Transformer for Sequential Recommendation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8106-9768","authenticated-orcid":false,"given":"Wei","family":"Ji","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7804-6945","authenticated-orcid":false,"given":"Xiangyan","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1367-711X","authenticated-orcid":false,"given":"An","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1791-3159","authenticated-orcid":false,"given":"Yinwei","family":"Wei","sequence":"additional","affiliation":[{"name":"Monash University, Melboune, VIC, Austria"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7606-1475","authenticated-orcid":false,"given":"Yongxin","family":"Ni","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6148-6329","authenticated-orcid":false,"given":"Xiang","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Large scale distributed neural network training through online distillation. arXiv preprint arXiv:1804.03235","author":"Anil Rohan","year":"2018","unstructured":"Rohan Anil, Gabriel Pereyra, Alexandre Passos, Robert Ormandi, George E Dahl, and Geoffrey E Hinton. 2018. Large scale distributed neural network training through online distillation. arXiv preprint arXiv:1804.03235 (2018)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539170"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5746"},{"key":"e_1_3_2_1_4_1","volume-title":"Cache-Augmented Inbatch Importance Resampling for Training Recommender Retriever. arXiv preprint arXiv:2205.14859","author":"Chen Jin","year":"2022","unstructured":"Jin Chen, Defu Lian, Yucheng Li, Baoyun Wang, Kai Zheng, and Enhong Chen. 2022. Cache-Augmented Inbatch Importance Resampling for Training Recommender Retriever. arXiv preprint arXiv:2205.14859 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.58"},{"key":"e_1_3_2_1_6_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_7_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01103"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.9973"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401063"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_12_1","volume-title":"Session-based recommendations with recurrent neural networks. arXiv preprint arXiv:1511.06939","author":"Hidasi Bal\u00e1zs","year":"2015","unstructured":"Bal\u00e1zs Hidasi, Alexandros Karatzoglou, Linas Baltrunas, and Domonkos Tikk. 2015. Session-based recommendations with recurrent neural networks. arXiv preprint arXiv:1511.06939 (2015)."},{"key":"e_1_3_2_1_13_1","volume-title":"Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)."},{"key":"e_1_3_2_1_14_1","volume-title":"Learning Vector-Quantized Item Representation for Transferable Sequential Recommenders. arXiv preprint arXiv:2210.12316","author":"Hou Yupeng","year":"2022","unstructured":"Yupeng Hou, Zhankui He, Julian McAuley, and Wayne Xin Zhao. 2022a. Learning Vector-Quantized Item Representation for Transferable Sequential Recommenders. arXiv preprint arXiv:2210.12316 (2022)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539381"},{"key":"e_1_3_2_1_16_1","volume-title":"Mrtnet: Multi-resolution temporal network for video sentence grounding. arXiv preprint arXiv:2212.13163","author":"Ji Wei","year":"2022","unstructured":"Wei Ji, Long Chen, Yinwei Wei, Yiming Wu, and Tat-Seng Chua. 2022. Mrtnet: Multi-resolution temporal network for video sentence grounding. arXiv preprint arXiv:2212.13163 (2022)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3002083"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612088"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02204"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2018.00035"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412615"},{"key":"e_1_3_2_1_22_1","volume-title":"Temporal ensembling for semi-supervised learning. arXiv preprint arXiv:1610.02242","author":"Laine Samuli","year":"2016","unstructured":"Samuli Laine and Timo Aila. 2016. Temporal ensembling for semi-supervised learning. arXiv preprint arXiv:1610.02242 (2016)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.117"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3217449"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557101"},{"key":"e_1_3_2_1_26_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_27_1","volume-title":"CARCA: Context and Attribute-Aware Next-Item Recommendation via Cross-Attention. arXiv preprint arXiv:2204.06519","author":"Rashed Ahmed","year":"2022","unstructured":"Ahmed Rashed, Shereen Elsayed, and Lars Schmidt-Thieme. 2022. CARCA: Context and Attribute-Aware Next-Item Recommendation via Cross-Attention. arXiv preprint arXiv:2204.06519 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"e_1_3_2_1_29_1","volume-title":"TransRec: Learning Transferable Recommendation from Mixture-of-Modality Feedback. arXiv preprint arXiv:2206.06190","author":"Wang Jie","year":"2022","unstructured":"Jie Wang, Fajie Yuan, Mingyue Cheng, Joemon M Jose, Chenyun Yu, Beibei Kong, Zhijin Wang, Bo Hu, and Zang Li. 2022. TransRec: Learning Transferable Recommendation from Mixture-of-Modality Feedback. arXiv preprint arXiv:2206.06190 (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331267"},{"key":"e_1_3_2_1_31_1","volume-title":"2023 a. Multi-Modal Self-Supervised Learning for Recommendation. arXiv preprint arXiv:2302.10632","author":"Wei Wei","year":"2023","unstructured":"Wei Wei, Chao Huang, Lianghao Xia, and Chuxu Zhang. 2023 a. Multi-Modal Self-Supervised Learning for Recommendation. arXiv preprint arXiv:2302.10632 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591716"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3088307"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413556"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3231352"},{"key":"e_1_3_2_1_37_1","volume-title":"Mm-rec: multimodal news recommendation. arXiv preprint arXiv:2104.07407","author":"Wu Chuhan","year":"2021","unstructured":"Chuhan Wu, Fangzhao Wu, Tao Qi, and Yongfeng Huang. 2021. Mm-rec: multimodal news recommendation. arXiv preprint arXiv:2104.07407 (2021)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301346"},{"key":"e_1_3_2_1_39_1","volume-title":"Empirical evaluation of rectified activations in convolutional network. arXiv preprint arXiv:1505.00853","author":"Xu Bing","year":"2015","unstructured":"Bing Xu, Naiyan Wang, Tianqi Chen, and Mu Li. 2015. Empirical evaluation of rectified activations in convolutional network. arXiv preprint arXiv:1505.00853 (2015)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3547394"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Zhengyi Yang Xiangnan He Jizhi Zhang Jiancan Wu Xin Xin Jiawei Chen and Xiang Wang. 2023. A Generic Learning Framework for Sequential Recommendation with Distribution Shifts. In SIGIR. 331--340.","DOI":"10.1145\/3539618.3591624"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3346996"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20221"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3289600.3290975"},{"key":"e_1_3_2_1_45_1","volume-title":"Where to Go Next for Recommender Systems? ID-vs. Modality-based recommender models revisited. arXiv preprint arXiv:2303.13835","author":"Yuan Zheng","year":"2023","unstructured":"Zheng Yuan, Fajie Yuan, Yu Song, Youhua Li, Junchen Fu, Fei Yang, Yunzhu Pan, and Yongxin Ni. 2023. Where to Go Next for Recommender Systems? ID-vs. Modality-based recommender models revisited. arXiv preprint arXiv:2303.13835 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"2023 a. Transfer visual prompt generator across llms. arXiv preprint arXiv:2305.01278","author":"Zhang Ao","year":"2023","unstructured":"Ao Zhang, Hao Fei, Yuan Yao, Wei Ji, Li Li, Zhiyuan Liu, and Tat-Seng Chua. 2023 a. Transfer visual prompt generator across llms. arXiv preprint arXiv:2305.01278 (2023)."},{"key":"e_1_3_2_1_47_1","unstructured":"An Zhang Wenchang Ma Xiang Wang and Tat-Seng Chua. 2022a. Incorporating Bias-aware Margins into Contrastive Loss for Collaborative Filtering. In NeurIPS."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"An Zhang Jingnan Zheng Xiang Wang Yancheng Yuan and Tat seng Chua. 2023 b. Invariant Collaborative Filtering to Popularity Distribution Shift. In WWW.","DOI":"10.1145\/3543507.3583461"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3221949"},{"key":"e_1_3_2_1_50_1","volume-title":"2023 c. Multimodal Pre-training Framework for Sequential Recommendation via Contrastive Learning. arXiv preprint arXiv:2303.11879","author":"Zhang Lingzi","year":"2023","unstructured":"Lingzi Zhang, Xin Zhou, and Zhiqi Shen. 2023 c. Multimodal Pre-training Framework for Sequential Recommendation via Contrastive Learning. arXiv preprint arXiv:2303.11879 (2023)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Tingting Zhang Pengpeng Zhao Yanchi Liu Victor S Sheng Jiajie Xu Deqing Wang Guanfeng Liu Xiaofang Zhou et al. 2019. Feature-level Deeper Self-Attention Network for Sequential Recommendation.. In IJCAI. 4320--4326.","DOI":"10.24963\/ijcai.2019\/600"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00454"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482016"},{"key":"e_1_3_2_1_54_1","volume-title":"2023 a. Enhancing Dyadic Relations with Homogeneous Graphs for Multimodal Recommendation. arXiv preprint arXiv:2301.12097","author":"Zhou Hongyu","year":"2023","unstructured":"Hongyu Zhou, Xin Zhou, and Zhiqi Shen. 2023 a. Enhancing Dyadic Relations with Homogeneous Graphs for Multimodal Recommendation. arXiv preprint arXiv:2301.12097 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"2023 b. A Comprehensive Survey on Multimodal Recommender Systems: Taxonomy, Evaluation, and Future Directions. arXiv preprint arXiv:2302.04473","author":"Zhou Hongyu","year":"2023","unstructured":"Hongyu Zhou, Xin Zhou, Zhiwei Zeng, Lingzi Zhang, and Zhiqi Shen. 2023 b. A Comprehensive Survey on Multimodal Recommender Systems: Taxonomy, Evaluation, and Future Directions. arXiv preprint arXiv:2302.04473 (2023)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3411954"},{"key":"e_1_3_2_1_57_1","volume-title":"Bootstrap latent representations for multi-modal recommendation. arXiv preprint arXiv:2207.05969","author":"Zhou Xin","year":"2022","unstructured":"Xin Zhou, Hongyu Zhou, Yong Liu, Zhiwei Zeng, Chunyan Miao, Pengwei Wang, Yuan You, and Feijun Jiang 2022. Bootstrap latent representations for multi-modal recommendation. arXiv preprint arXiv:2207.05969 (2022)."},{"key":"e_1_3_2_1_58_1","unstructured":"Xiatian Zhu Shaogang Gong et al. 2018. Knowledge distillation by on-the-fly native ensemble. Advances in neural information processing systems Vol. 31 (2018)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612091","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612091","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:05:50Z","timestamp":1755821150000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612091"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3612091","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612091","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}