{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:19:45Z","timestamp":1759969185931,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["Grant No. 62372408, 62372057"],"award-info":[{"award-number":["Grant No. 62372408, 62372057"]}]},{"name":"Zhejiang Key Laboratory of Accessible Perception and Intelligent Systems"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717553","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:12:56Z","timestamp":1748016776000},"page":"2278-2287","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ModalSync: Synchronizing User Behavior with Multimodal Features for Multimodal Pre-training Recommendation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9139-0109","authenticated-orcid":false,"given":"Shiqin","family":"Liu","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8179-7503","authenticated-orcid":false,"given":"Chaozhuo","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Trustworthy Distributed Computing and Service (MoE), Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9167-3897","authenticated-orcid":false,"given":"Minjun","family":"Zhao","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6981-3873","authenticated-orcid":false,"given":"Litian","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1097-2044","authenticated-orcid":false,"given":"Jiajun","family":"Bu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltru\u0161aitis Tadas","year":"2018","unstructured":"Tadas Baltru\u0161aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 2 (2018), 423--443."},{"key":"e_1_3_2_2_2_1","article-title":"Large scale online learning of image similarity through ranking","volume":"11","author":"Chechik Gal","year":"2010","unstructured":"Gal Chechik, Varun Sharma, Uri Shalit, and Samy Bengio. 2010. Large scale online learning of image similarity through ranking. Journal of Machine Learning Research, Vol. 11, 3 (2010).","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_3_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_4_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_2_5_1","unstructured":"Ionut Cosmin Duta Li Liu Fan Zhu and Ling Shao. 2020. Improved residual networks for image and video recognition. In ICPR. 9415--9422."},{"key":"e_1_3_2_2_6_1","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics. JMLR Workshop and Conference Proceedings, 249--256","author":"Glorot Xavier","year":"2010","unstructured":"Xavier Glorot and Yoshua Bengio. 2010. Understanding the difficulty of training deep feedforward neural networks. In Proceedings of the thirteenth international conference on artificial intelligence and statistics. JMLR Workshop and Conference Proceedings, 249--256."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28688"},{"key":"e_1_3_2_2_8_1","volume-title":"Inductive representation learning on large graphs. Advances in neural information processing systems","author":"Hamilton Will","year":"2017","unstructured":"Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017. Inductive representation learning on large graphs. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Kai Han Yunhe Wang Hanting Chen Xinghao Chen Jianyuan Guo Zhenhua Liu Yehui Tang An Xiao Chunjing Xu Yixing Xu et al. 2022. A survey on vision transformer. IEEE transactions on pattern analysis and machine intelligence Vol. 45 1 (2022) 87--110.","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872427.2883037"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v30i1.9973"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401063"},{"key":"e_1_3_2_2_14_1","volume-title":"Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351","author":"Jiao Xiaoqi","year":"2019","unstructured":"Xiaoqi Jiao, Yichun Yin, Lifeng Shang, Xin Jiang, Xiao Chen, Linlin Li, Fang Wang, and Qun Liu. 2019. Tinybert: Distilling bert for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959165"},{"key":"e_1_3_2_2_16_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_17_1","volume-title":"Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3298689.3347003"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403088"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3656580"},{"key":"e_1_3_2_2_21_1","volume-title":"Multimodal recommender systems: A survey. Comput. Surveys","author":"Liu Qidong","year":"2023","unstructured":"Qidong Liu, Jiaxi Hu, Yutian Xiao, Xiangyu Zhao, Jingtong Gao, Wanyu Wang, Qing Li, and Jiliang Tang. 2023. Multimodal recommender systems: A survey. Comput. Surveys (2023)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01173"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.14569\/IJACSA.2020.0110975"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"Nils Reimers and Iryna Gurevych. 2019. Sentence-BERT: Sentence embeddings using siamese BERT-Networks. In EMNLP. 3980--3990.","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_2_26_1","volume-title":"BPR: Bayesian personalized ranking from implicit feedback. arXiv preprint arXiv:1205.2618","author":"Rendle Steffen","year":"2012","unstructured":"Steffen Rendle, Christoph Freudenthaler, Zeno Gantner, and Lars Schmidt-Thieme. 2012. BPR: Bayesian personalized ranking from implicit feedback. arXiv preprint arXiv:1205.2618 (2012)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_2_28_1","volume-title":"Faster, Cheaper and Lighter. arXiv preprint arXiv:1910.01108","author":"Sanh V","year":"2019","unstructured":"V Sanh. 2019. DistilBERT, A Distilled Version of BERT: Smaller, Faster, Cheaper and Lighter. arXiv preprint arXiv:1910.01108 (2019)."},{"key":"e_1_3_2_2_29_1","volume-title":"Markus Hagenbuchner, and Gabriele Monfardini.","author":"Scarselli Franco","year":"2008","unstructured":"Franco Scarselli, Marco Gori, Ah Chung Tsoi, Markus Hagenbuchner, and Gabriele Monfardini. 2008. The graph neural network model. IEEE transactions on neural networks, Vol. 20, 1 (2008), 61--80."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_2_31_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very deep convolutional networks for large-scale image recognition. In ICLR."},{"key":"e_1_3_2_2_32_1","volume-title":"Improved deep metric learning with multi-class n-pair loss objective. Advances in neural information processing systems","author":"Sohn Kihyuk","year":"2016","unstructured":"Kihyuk Sohn. 2016. Improved deep metric learning with multi-class n-pair loss objective. Advances in neural information processing systems, Vol. 29 (2016)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681207"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3187556"},{"key":"e_1_3_2_2_35_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437963.3441800"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Xiang Wang Xiangnan He Meng Wang Fuli Feng and Tat-Seng Chua. 2019. Neural graph collaborative filtering. In SIGIR. 165--174.","DOI":"10.1145\/3331184.3331267"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_2_39_1","volume-title":"Distance metric learning for large margin nearest neighbor classification. Advances in neural information processing systems","author":"Weinberger Kilian Q","year":"2005","unstructured":"Kilian Q Weinberger, John Blitzer, and Lawrence Saul. 2005. Distance metric learning for large margin nearest neighbor classification. Advances in neural information processing systems, Vol. 18 (2005)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00988"},{"volume-title":"Transfer learning","author":"Yang Qiang","key":"e_1_3_2_2_41_1","unstructured":"Qiang Yang, Yu Zhang, Wenyuan Dai, and Sinno Jialin Pan. 2020. Transfer learning. Cambridge University Press."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532027"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613915"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330961"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475259"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3221949"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3285029"},{"key":"e_1_3_2_2_48_1","volume-title":"A comprehensive survey on multimodal recommender systems: Taxonomy, evaluation, and future directions. arXiv preprint arXiv:2302.04473","author":"Zhou Hongyu","year":"2023","unstructured":"Hongyu Zhou, Xin Zhou, Zhiwei Zeng, Lingzi Zhang, and Zhiqi Shen. 2023b. A comprehensive survey on multimodal recommender systems: Taxonomy, evaluation, and future directions. arXiv preprint arXiv:2302.04473 (2023)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611943"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583251"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717553","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717553","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:02:24Z","timestamp":1759892544000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717553"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":50,"alternative-id":["10.1145\/3701716.3717553","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717553","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}