{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T07:56:07Z","timestamp":1778313367506,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T00:00:00Z","timestamp":1650844800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 61802029,61872278,U1536121,61370195"],"award-info":[{"award-number":["No. 61802029,61872278,U1536121,61370195"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,4,25]]},"DOI":"10.1145\/3485447.3512079","type":"proceedings-article","created":{"date-parts":[[2022,4,25]],"date-time":"2022-04-25T05:11:23Z","timestamp":1650863483000},"page":"2058-2066","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":33,"title":["Modality Matches Modality: Pretraining Modality-Disentangled Item Representations for Recommendation"],"prefix":"10.1145","author":[{"given":"Tengyue","family":"Han","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"given":"Pengfei","family":"Wang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"given":"Shaozhang","family":"Niu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, China"}]},{"given":"Chenliang","family":"Li","sequence":"additional","affiliation":[{"name":"Wuhan University, China"}]}],"member":"320","published-online":{"date-parts":[[2022,4,25]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v28i1.8715"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080797"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Xu Chen Hanxiong Chen Hongteng Xu Yongfeng Zhang Yixin Cao Zheng Qin and Hongyuan Zha. 2019. Personalized Fashion Recommendation with Visual Explanations based on Multimodal Attention Network: Towards Visually Explainable Recommendation. In SIGIR. ACM 765\u2013774.","DOI":"10.1145\/3331184.3331254"},{"key":"e_1_3_2_1_5_1","volume-title":"WWW","author":"Cheng Zhiyong","year":"2018","unstructured":"Zhiyong Cheng, Ying Ding, Lei Zhu, and Mohan\u00a0S. Kankanhalli. 2018. Aspect-Aware Latent Factor Model: Rating Prediction with Ratings and Reviews. In WWW 2018. ACM, 639\u2013648."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2018.2881260"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2134090"},{"key":"e_1_3_2_1_8_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT (1)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT (1). Association for Computational Linguistics, 4171\u20134186."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Fangxiang Feng Xiaojie Wang and Ruifan Li. 2014. Cross-modal Retrieval with Correspondence Autoencoder. In ACM Multimedia. ACM 7\u201316.","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Guibing Guo Shichang Ouyang Xiaodong He Fajie Yuan and Xiaohua Liu. 2019. Dynamic Item Block and Prediction Enhancing Block for Sequential Recommendation. In IJCAI. 1373\u20131379.","DOI":"10.24963\/ijcai.2019\/190"},{"key":"e_1_3_2_1_11_1","volume-title":"Sherlock: Sparse Hierarchical Embeddings for Visually-Aware One-Class Collaborative Filtering","author":"He Ruining","year":"2016","unstructured":"Ruining He, Chunbin Lin, Jianguo Wang, and Julian\u00a0J. McAuley. 2016. Sherlock: Sparse Hierarchical Embeddings for Visually-Aware One-Class Collaborative Filtering. In IJCAI. IJCAI\/AAAI Press, 3740\u20133746."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872427.2883037"},{"key":"e_1_3_2_1_13_1","volume-title":"VBPR: Visual Bayesian Personalized Ranking from Implicit Feedback. In AAAI","author":"He Ruining","year":"2016","unstructured":"Ruining He and Julian\u00a0J. McAuley. 2016. VBPR: Visual Bayesian Personalized Ranking from Implicit Feedback. In AAAI 2016. AAAI Press, 144\u2013150."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2806416.2806504"},{"key":"e_1_3_2_1_15_1","unstructured":"Xiangnan He and Tat-Seng Chua. 2017. Neural Factorization Machines for Sparse Predictive Analytics. In SIGIR. ACM 355\u2013364."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Xiangnan He Lizi Liao Hanwang Zhang Liqiang Nie Xia Hu and Tat-Seng Chua. 2017. Neural Collaborative Filtering. In WWW. ACM 173\u2013182.","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"G.\u00a0E. Hinton and R.\u00a0R. Salakhutdinov. 2006. Reducing the Dimensionality of Data with Neural Networks. Science 313(2006).","DOI":"10.1126\/science.1127647"},{"key":"e_1_3_2_1_18_1","volume-title":"ECCV (3)(Lecture Notes in Computer Science, Vol.\u00a011207)","author":"Huang Xun","unstructured":"Xun Huang, Ming-Yu Liu, Serge\u00a0J. Belongie, and Jan Kautz. 2018. Multimodal Unsupervised Image-to-Image Translation. In ECCV (3)(Lecture Notes in Computer Science, Vol.\u00a011207). Springer, 179\u2013196."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2670560"},{"key":"e_1_3_2_1_20_1","volume-title":"Visually-Aware Fashion Recommendation and Design with Generative Image Models. In ICDM","author":"Kang Wang-Cheng","year":"2017","unstructured":"Wang-Cheng Kang, Chen Fang, Zhaowen Wang, and Julian\u00a0J. McAuley. 2017. Visually-Aware Fashion Recommendation and Design with Generative Image Models. In ICDM 2017. IEEE Computer Society, 207\u2013216."},{"key":"e_1_3_2_1_21_1","volume-title":"Self-Attentive Sequential Recommendation. In ICDM","author":"Kang Wang-Cheng","year":"2018","unstructured":"Wang-Cheng Kang and Julian\u00a0J. McAuley. 2018. Self-Attentive Sequential Recommendation. In ICDM 2018. 197\u2013206."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1016"},{"key":"e_1_3_2_1_23_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension","author":"Lewis Mike","year":"2020","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Veselin Stoyanov, and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. In ACL. Association for Computational Linguistics, 7871\u20137880."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/397"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.14778\/3430915.3430924"},{"key":"e_1_3_2_1_26_1","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. CoRR abs\/1907.11692(2019)."},{"key":"e_1_3_2_1_27_1","unstructured":"Zhiwei Liu Ziwei Fan Yu Wang and Philip\u00a0S. Yu. 2021. Augmenting Sequential Recommendation with Pseudo-Prior Items via Reversely Pre-training Transformer. In SIGIR. ACM 1608\u20131612."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 28th International Conference on Machine Learning, ICML 2011","author":"Ngiam Jiquan","year":"2011","unstructured":"Jiquan Ngiam, Aditya Khosla, Mingyu Kim, Juhan Nam, Honglak Lee, and Andrew\u00a0Y. Ng. 2011. Multimodal Deep Learning. In Proceedings of the 28th International Conference on Machine Learning, ICML 2011, Bellevue, Washington, USA, June 28 - July 2, 2011. 689\u2013696."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403280"},{"key":"e_1_3_2_1_30_1","volume-title":"Glove: Global Vectors for Word Representation. In EMNLP. ACL, 1532\u20131543.","author":"Pennington Jeffrey","year":"2014","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher\u00a0D. Manning. 2014. Glove: Global Vectors for Word Representation. In EMNLP. ACL, 1532\u20131543."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Jinwei Qi and Yuxin Peng. 2018. Cross-modal Bidirectional Translation via Reinforcement Learning. In IJCAI. ijcai.org 2630\u20132636.","DOI":"10.24963\/ijcai.2018\/365"},{"key":"e_1_3_2_1_32_1","volume-title":"U-BERT: Pre-training User Representations for Improved Recommendation","author":"Qiu Zhaopeng","unstructured":"Zhaopeng Qiu, Xian Wu, Jingyue Gao, and Wei Fan. 2021. U-BERT: Pre-training User Representations for Improved Recommendation. In AAAI. AAAI Press, 4320\u20134327."},{"key":"e_1_3_2_1_33_1","volume-title":"BPR: Bayesian Personalized Ranking from Implicit Feedback","author":"Rendle Steffen","year":"2009","unstructured":"Steffen Rendle, Christoph Freudenthaler, Zeno Gantner, and Lars Schmidt-Thieme. 2009. BPR: Bayesian Personalized Ranking from Implicit Feedback. In UAI. AUAI Press, 452\u2013461."},{"key":"e_1_3_2_1_34_1","volume-title":"WWW","author":"Rendle Steffen","year":"2010","unstructured":"Steffen Rendle, Christoph Freudenthaler, and Lars Schmidt-Thieme. 2010. Factorizing personalized Markov chains for next-basket recommendation. In WWW 2010. ACM, 811\u2013820."},{"key":"e_1_3_2_1_35_1","volume-title":"UK","author":"Sariyildiz Mert\u00a0B\u00fclent","year":"2020","unstructured":"Mert\u00a0B\u00fclent Sariyildiz, Julien Perez, and Diane Larlus. 2020. Learning Visual Representations with Caption Annotations. In Computer Vision - ECCV 2020 - 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part VIII. 153\u2013170."},{"key":"e_1_3_2_1_36_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Fei Sun Jun Liu Jian Wu Changhua Pei Xiao Lin Wenwu Ou and Peng Jiang. 2019. BERT4Rec: Sequential Recommendation with Bidirectional Encoder Representations from Transformer. In CIKM. ACM 1441\u20131450.","DOI":"10.1145\/3357384.3357895"},{"key":"e_1_3_2_1_38_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In EMNLP\/IJCNLP (1)","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In EMNLP\/IJCNLP (1). Association for Computational Linguistics, 5099\u20135110."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_1_40_1","volume-title":"Zero-Shot Event Detection Using Multi-modal Fusion of Weakly Supervised Concepts","author":"Wu Shuang","unstructured":"Shuang Wu, Sravanthi Bondugula, Florian Luisier, Xiaodan Zhuang, and Pradeep Natarajan. 2014. Zero-Shot Event Detection Using Multi-modal Fusion of Weakly Supervised Concepts. In CVPR. IEEE Computer Society, 2665\u20132672."},{"key":"e_1_3_2_1_41_1","volume-title":"Attend and Tell: Neural Image Caption Generation with Visual Attention. In ICML(JMLR Workshop and Conference Proceedings, Vol.\u00a037)","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron\u00a0C. Courville, Ruslan Salakhutdinov, Richard\u00a0S. Zemel, and Yoshua Bengio. 2015. Show, Attend and Tell: Neural Image Caption Generation with Visual Attention. In ICML(JMLR Workshop and Conference Proceedings, Vol.\u00a037). JMLR.org, 2048\u20132057."},{"key":"e_1_3_2_1_42_1","volume-title":"Deep correlation for matching images and text","author":"Yan Fei","unstructured":"Fei Yan and Krystian Mikolajczyk. 2015. Deep correlation for matching images and text. In CVPR. IEEE Computer Society, 3441\u20133450."},{"key":"e_1_3_2_1_43_1","volume-title":"Learning Modality-Specific Representations with Self-Supervised Multi-Task Learning for Multimodal Sentiment Analysis","author":"Yu Wenmeng","unstructured":"Wenmeng Yu, Hua Xu, Ziqi Yuan, and Jiele Wu. 2021. Learning Modality-Specific Representations with Self-Supervised Multi-Task Learning for Multimodal Sentiment Analysis. In AAAI. AAAI Press, 10790\u201310797."},{"key":"e_1_3_2_1_44_1","volume-title":"Multimodal Contrastive Training for Visual Representation Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021","author":"Yuan Xin","year":"2021","unstructured":"Xin Yuan, Zhe Lin, Jason Kuen, Jianming Zhang, Yilin Wang, Michael Maire, Ajinkya Kale, and Baldo Faieta. 2021. Multimodal Contrastive Training for Visual Representation Learning. In IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19-25, 2021. 6995\u20137004."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.3389\/fdata.2021.602071"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3411954"}],"event":{"name":"WWW '22: The ACM Web Conference 2022","location":"Virtual Event, Lyon France","acronym":"WWW '22","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2022"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485447.3512079","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3485447.3512079","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:07Z","timestamp":1750188607000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3485447.3512079"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,4,25]]},"references-count":46,"alternative-id":["10.1145\/3485447.3512079","10.1145\/3485447"],"URL":"https:\/\/doi.org\/10.1145\/3485447.3512079","relation":{},"subject":[],"published":{"date-parts":[[2022,4,25]]},"assertion":[{"value":"2022-04-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}