{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T23:59:31Z","timestamp":1776470371835,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,2,22]]},"DOI":"10.1145\/3773966.3777958","type":"proceedings-article","created":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T17:50:01Z","timestamp":1771264201000},"page":"924-933","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MOON: Generative MLLM-based Multimodal Representation Learning for E-commerce Product Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-0149-4220","authenticated-orcid":false,"given":"Daoze","family":"Zhang","sequence":"first","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1889-7732","authenticated-orcid":false,"given":"Chenghan","family":"Fu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6313-3504","authenticated-orcid":false,"given":"Zhanheng","family":"Nie","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4772-6380","authenticated-orcid":false,"given":"Jianyu","family":"Liu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3774-886X","authenticated-orcid":false,"given":"Wanxian","family":"Guan","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7781-1498","authenticated-orcid":false,"given":"Yuan","family":"Gao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5778-6452","authenticated-orcid":false,"given":"Jun","family":"Song","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4285-5033","authenticated-orcid":false,"given":"Pengjie","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3111-1005","authenticated-orcid":false,"given":"Jian","family":"Xu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4037-6315","authenticated-orcid":false,"given":"Bo","family":"Zheng","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2026,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-00129-1_21"},{"key":"e_1_3_2_1_2_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025. Qwen2.5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482493"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612408"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-022-23052-9"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627673.3680098"},{"key":"e_1_3_2_1_7_1","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171-4186."},{"key":"e_1_3_2_1_8_1","volume-title":"Training and challenging models for text-guided fashion image retrieval. arXiv preprint arXiv:2204.11004","author":"Dodds Eric","year":"2022","unstructured":"Eric Dodds, Jack Culpepper, and Gaurav Srivastava. 2022. Training and challenging models for text-guided fashion image retrieval. arXiv preprint arXiv:2204.11004 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02057"},{"key":"e_1_3_2_1_10_1","volume-title":"MOON Embedding: Multimodal Representation Learning for E-commerce Search Advertising. arXiv preprint arXiv:2511.11305","author":"Fu Chenghan","year":"2025","unstructured":"Chenghan Fu, Daoze Zhang, Yukang Lin, Zhanheng Nie, Xiang Zhang, Jianyu Liu, Yueran Liu, Wanxian Guan, Pengjie Wang, Jian Xu, and Bo Zheng. 2025. MOON Embedding: Multimodal Representation Learning for E-commerce Search Advertising. arXiv preprint arXiv:2511.11305 (2025)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401430"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01064"},{"key":"e_1_3_2_1_14_1","volume-title":"Mm-embed: Universal multimodal retrieval with multimodal llms. arXiv preprint arXiv:2411.02571","author":"Lin Sheng-Chieh","year":"2024","unstructured":"Sheng-Chieh Lin, Chankyu Lee, Mohammad Shoeybi, Jimmy Lin, Bryan Catanzaro, and Wei Ping. 2024. Mm-embed: Universal multimodal retrieval with multimodal llms. arXiv preprint arXiv:2411.02571 (2024)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109519"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570423"},{"key":"e_1_3_2_1_18_1","first-page":"9564","article-title":"Multimodal contrastive learning with limoe: the language-image mixture of experts","volume":"35","author":"Mustafa Basil","year":"2022","unstructured":"Basil Mustafa, Carlos Riquelme, Joan Puigcerver, Rodolphe Jenatton, and Neil Houlsby. 2022. Multimodal contrastive learning with limoe: the language-image mixture of experts. Advances in Neural Information Processing Systems, Vol. 35 (2022), 9564-9576.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"Dynamic Modality-balanced Multimodal Representation Learning for E-commerce Product Understanding. arXiv preprint arXiv:2511.12449","author":"Nie Zhanheng","year":"2025","unstructured":"Zhanheng Nie, Chenghan Fu, Daoze Zhang, Junxian Wu, Wanxian Guan, Pengjie Wang, Jian Xu, and Bo Zheng. 2025. MOON2.0: Dynamic Modality-balanced Multimodal Representation Learning for E-commerce Product Understanding. arXiv preprint arXiv:2511.12449 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1080\/08874417.2021.1910880"},{"key":"e_1_3_2_1_21_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-0-387-85820-3_1"},{"key":"e_1_3_2_1_23_1","volume-title":"Fashion-gen: The generative fashion dataset and challenge. arXiv preprint arXiv:1806.08317","author":"Rostamzadeh Negar","year":"2018","unstructured":"Negar Rostamzadeh, Seyedarian Hosseini, Thomas Boquet, Wojciech Stokowiec, Ying Zhang, Christian Jauvin, and Chris Pal. 2018. Fashion-gen: The generative fashion dataset and challenge. arXiv preprint arXiv:1806.08317 (2018)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475303"},{"key":"e_1_3_2_1_25_1","volume-title":"Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, et al.","author":"Tschannen Michael","year":"2025","unstructured":"Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin, Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, et al., 2025. Siglip 2: Multilingual vision-language encoders with improved semantic understanding, localization, and dense features. arXiv preprint arXiv:2502.14786 (2025)."},{"key":"e_1_3_2_1_26_1","unstructured":"Aaron van den Oord Yazhe Li and Oriol Vinyals. 2019. Representation Learning with Contrastive Predictive Coding. arXiv:1807.03748 [cs.LG]"},{"key":"e_1_3_2_1_27_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_28_1","unstructured":"Peng Wang Shuai Bai Sinan Tan Shijie Wang Zhihao Fan Jinze Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge et al. 2024. Qwen2-vl: Enhancing vision-language model's perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313644"},{"key":"e_1_3_2_1_30_1","volume-title":"MIM: Multi-modal Content Interest Modeling Paradigm for User Behavior Modeling. arXiv preprint arXiv:2502.00321","author":"Yan Bencheng","year":"2025","unstructured":"Bencheng Yan, Si Chen, Shichang Jia, Jianyu Liu, Yueran Liu, Chenghan Fu, Wanxian Guan, Hui Zhao, Xiang Zhang, Kai Zhang, et al., 2025. MIM: Multi-modal Content Interest Modeling Paradigm for User Behavior Modeling. arXiv preprint arXiv:2502.00321 (2025)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3488560.3498377"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_1"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539151"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401156"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01157"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.222"},{"key":"e_1_3_2_1_37_1","volume-title":"GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:2412.16855","author":"Zhang Xin","year":"2024","unstructured":"Xin Zhang, Yanzhao Zhang, Wen Xie, Mingxin Li, Ziqi Dai, Dingkun Long, Pengjun Xie, Meishan Zhang, Wenjie Li, and Min Zhang. 2024. GME: Improving Universal Multimodal Retrieval by Multimodal LLMs. arXiv preprint arXiv:2412.16855 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240323.3240367"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i28.35383"},{"key":"e_1_3_2_1_40_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao et al. 2025. Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models. arXiv preprint arXiv:2504.10479 (2025)."}],"event":{"name":"WSDM '26:The Nineteenth ACM International Conference on Web Search and Data Mining","location":"Boise ID USA","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGIR ACM Special Interest Group on Information Retrieval","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the Nineteenth ACM International Conference on Web Search and Data Mining"],"original-title":[],"deposited":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T17:52:03Z","timestamp":1771264323000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773966.3777958"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,21]]},"references-count":40,"alternative-id":["10.1145\/3773966.3777958","10.1145\/3773966"],"URL":"https:\/\/doi.org\/10.1145\/3773966.3777958","relation":{},"subject":[],"published":{"date-parts":[[2026,2,21]]},"assertion":[{"value":"2026-02-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}