{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:16:45Z","timestamp":1775841405436,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China","award":["No.62502404"],"award-info":[{"award-number":["No.62502404"]}]},{"name":"Hong Kong Research Grants Council","award":["Research Impact Fund No.R1015-23"],"award-info":[{"award-number":["Research Impact Fund No.R1015-23"]}]},{"name":"Hong Kong Research Grants Council","award":["Collaborative Research Fund No.C1043-24GF"],"award-info":[{"award-number":["Collaborative Research Fund No.C1043-24GF"]}]},{"name":"Hong Kong Research Grants Council","award":["General Research Fund No.11218325"],"award-info":[{"award-number":["General Research Fund No.11218325"]}]},{"name":"Institute of Digital Medicine of City University of Hong Kong","award":["No.9229503"],"award-info":[{"award-number":["No.9229503"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792819","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:39Z","timestamp":1775771679000},"page":"7700-7711","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ARCHER: Shooting Straight in Multimodal E-Commerce Search at Alibaba with Progressive Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0073-0172","authenticated-orcid":false,"given":"Maolin","family":"Wang","sequence":"first","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1115-4649","authenticated-orcid":false,"given":"Lang","family":"Fu","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9643-8059","authenticated-orcid":false,"given":"Jun","family":"Chu","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0255-5640","authenticated-orcid":false,"given":"Kai","family":"Guo","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6024-5918","authenticated-orcid":false,"given":"Chenjie","family":"Qin","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3516-1344","authenticated-orcid":false,"given":"Xinxin","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4309-0037","authenticated-orcid":false,"given":"Siyu","family":"Wu","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8249-6879","authenticated-orcid":false,"given":"Wen","family":"Jiang","sequence":"additional","affiliation":[{"name":"Alibaba International Digital Commerce Group, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2926-4416","authenticated-orcid":false,"given":"Xiangyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv","author":"Alexey Dosovitskiy","year":"2010","unstructured":"Dosovitskiy Alexey. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929 (2020)."},{"key":"e_1_3_2_1_2_1","volume-title":"Fashion Image-to-Image Translation for Complementary Item Retrieval. arXiv preprint arXiv:2408.09847","author":"Attimonelli Matteo","year":"2024","unstructured":"Matteo Attimonelli, Claudio Pomo, Dietmar Jannach, and Tommaso Di Noia. 2024. Fashion Image-to-Image Translation for Complementary Item Retrieval. arXiv preprint arXiv:2408.09847 (2024)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617597"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_5_1","volume-title":"Survey of Semantic-Based Image-to-Image Retrieval. In 2024 5th International Conference on Artificial Intelligence and Computer Engineering (ICAICE). IEEE, 51-54","author":"Cao Danyang","year":"2024","unstructured":"Danyang Cao, Hongbo Zhou, and Huifang Yang. 2024. Survey of Semantic-Based Image-to-Image Retrieval. In 2024 5th International Conference on Artificial Intelligence and Computer Engineering (ICAICE). IEEE, 51-54."},{"key":"e_1_3_2_1_6_1","volume-title":"Image-text retrieval: A survey on recent research and development. arXiv preprint arXiv:2203.14713","author":"Cao Min","year":"2022","unstructured":"Min Cao, Shiping Li, Juntao Li, Liqiang Nie, and Min Zhang. 2022. Image-text retrieval: A survey on recent research and development. arXiv preprint arXiv:2203.14713 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01192"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_2_1_9_1","volume-title":"IPL: Leveraging multimodal large language models for intelligent product listing. arXiv preprint arXiv:2410.16977","author":"Chen Kang","year":"2024","unstructured":"Kang Chen, Qingheng Zhang, Chengbao Lian, Yixin Ji, Xuwei Liu, Shuguang Han, Guoqiang Wu, Fei Huang, and Jufeng Chen. 2024. IPL: Leveraging multimodal large language models for intelligent product listing. arXiv preprint arXiv:2410.16977 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga, and William W Cohen. 2022. Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_12_1","volume-title":"International conference on machine learning. PMLR, 4904-4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904-4916."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.220"},{"key":"e_1_3_2_1_14_1","volume-title":"Bushra Zafar, Saadat Hanif Dar, Muhammad Sajid, and Tehmina Khalil.","author":"Latif Afshan","year":"2019","unstructured":"Afshan Latif, Aqsa Rasheed, Umer Sajid, Jameel Ahmed, Nouman Ali, Naeem Iqbal Ratyal, Bushra Zafar, Saadat Hanif Dar, Muhammad Sajid, and Tehmina Khalil. 2019. Content-Based Image Retrieval and Feature Extraction: A Comprehensive Review. Mathematical problems in engineering, Vol. 2019, 1 (2019), 9658350."},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_16_1","volume-title":"A Survey of Multimodal Composite Editing and Retrieval. arXiv preprint arXiv:2409.05405","author":"Li Suyan","year":"2024","unstructured":"Suyan Li, Fuxiang Huang, and Lei Zhang. 2024. A Survey of Multimodal Composite Editing and Retrieval. arXiv preprint arXiv:2409.05405 (2024)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583378"},{"key":"e_1_3_2_1_18_1","first-page":"1","article-title":"Multimodal recommender systems: A survey","volume":"57","author":"Liu Qidong","year":"2024","unstructured":"Qidong Liu, Jiaxi Hu, Yutian Xiao, Xiangyu Zhao, Jingtong Gao, Wanyu Wang, Qing Li, and Jiliang Tang. 2024. Multimodal recommender systems: A survey. Comput. Surveys, Vol. 57, 2 (2024), 1-17.","journal-title":"Comput. Surveys"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_2_1_20_1","volume-title":"Universal vision-language dense retrieval: Learning a unified representation space for multi-modal retrieval. arXiv preprint arXiv:2209.00179","author":"Liu Zhenghao","year":"2022","unstructured":"Zhenghao Liu, Chenyan Xiong, Yuanhuiyi Lv, Zhiyuan Liu, and Ge Yu. 2022. Universal vision-language dense retrieval: Learning a unified representation space for multi-modal retrieval. arXiv preprint arXiv:2209.00179 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"End-to-end knowledge retrieval with multi-modal queries. arXiv preprint arXiv:2306.00424","author":"Luo Man","year":"2023","unstructured":"Man Luo, Zhiyuan Fang, Tejas Gokhale, Yezhou Yang, and Chitta Baral. 2023. End-to-end knowledge retrieval with multi-modal queries. arXiv preprint arXiv:2306.00424 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Unifying multimodal retrieval via document screenshot embedding. arXiv preprint arXiv:2406.11251","author":"Ma Xueguang","year":"2024","unstructured":"Xueguang Ma, Sheng-Chieh Lin, Minghan Li, Wenhu Chen, and Jimmy Lin. 2024. Unifying multimodal retrieval via document screenshot embedding. arXiv preprint arXiv:2406.11251 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_24_1","unstructured":"Bharadwaj Ravur. 2023. Sorting Clothes using Image Segmentation and Object Detection. Ph.D. Dissertation. Dublin National College of Ireland."},{"key":"e_1_3_2_1_25_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Ray Arijit","year":"2024","unstructured":"Arijit Ray, Filip Radenovic, Abhimanyu Dubey, Bryan Plummer, Ranjay Krishna, and Kate Saenko. 2024. Cola: A benchmark for compositional text-to-image retrieval. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang, and Yue Cao. 2023. Eva-clip: Improved training techniques for clip at scale. arXiv preprint arXiv:2303.15389 (2023)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3737257"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3711896.3736968"},{"key":"e_1_3_2_1_29_1","volume-title":"Large multimodal model compression via efficient pruning and distillation at AntGroup. arXiv preprint arXiv:2312.05795","author":"Wang Maolin","year":"2023","unstructured":"Maolin Wang, Yao Zhao, Jiajia Liu, Jingdong Chen, Chenyi Zhuang, Jinjie Gu, Ruocheng Guo, and Xiangyu Zhao. 2023. Large multimodal model compression via efficient pruning and distillation at AntGroup. arXiv preprint arXiv:2312.05795 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"European Conference on Computer Vision. Springer, 387-404","author":"Wei Cong","year":"2024","unstructured":"Cong Wei, Yang Chen, Haonan Chen, Hexiang Hu, Ge Zhang, Jie Fu, Alan Ritter, and Wenhu Chen. 2024. Uniir: Training and benchmarking universal multimodal information retrievers. In European Conference on Computer Vision. Springer, 387-404."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"e_1_3_2_1_32_1","volume-title":"Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval. In International Conference on Learning Representations.","author":"Xiong Lee","unstructured":"Lee Xiong, Chenyan Xiong, Ye Li, Kwok-Fung Tang, Jialin Liu, Paul N Bennett, Junaid Ahmed, and Arnold Overwijk. [n.d.]. Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_33_1","volume-title":"Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, et al., 2021. Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)."},{"key":"e_1_3_2_1_34_1","volume-title":"Volo: Vision outlooker for visual recognition","author":"Yuan Li","year":"2022","unstructured":"Li Yuan, Qibin Hou, Zihang Jiang, Jiashi Feng, and Shuicheng Yan. 2022. Volo: Vision outlooker for visual recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 5 (2022), 6575-6586."},{"key":"e_1_3_2_1_35_1","unstructured":"Shitao Yuan Jingrong Liu and Fangzhou Zheng. 2023. BGE-ViT: A Visual Tokenizer for Cross-modal Fine-tuning. https:\/\/huggingface.co\/BAAI\/bge-visualized."},{"key":"e_1_3_2_1_36_1","volume-title":"Notellm-2: Multimodal large representation models for recommendation. arXiv preprint arXiv:2405.16789","author":"Zhang Chao","year":"2024","unstructured":"Chao Zhang, Haoxin Zhang, Shiwei Wu, Di Wu, Tong Xu, Xiangyu Zhao, Yan Gao, Yao Hu, and Enhong Chen. 2024b. Notellm-2: Multimodal large representation models for recommendation. arXiv preprint arXiv:2405.16789 (2024)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567836"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3369699"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219820"},{"key":"e_1_3_2_1_41_1","volume-title":"Deep reinforcement learning for search, recommendation, and online advertising: a survey. ACM sigweb newsletter","author":"Zhao Xiangyu","year":"2019","unstructured":"Xiangyu Zhao, Long Xia, Jiliang Tang, and Dawei Yin. 2019. Deep reinforcement learning for search, recommendation, and online advertising: a survey. ACM sigweb newsletter, Vol. 2019, Spring (2019), 1-15."},{"key":"e_1_3_2_1_42_1","volume-title":"VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval. arXiv preprint arXiv:2406.04292","author":"Zhou Junjie","year":"2024","unstructured":"Junjie Zhou, Zheng Liu, Shitao Xiao, Bo Zhao, and Yongping Xiong. 2024. VISTA: Visualized Text Embedding For Universal Multi-Modal Retrieval. arXiv preprint arXiv:2406.04292 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Unlock Multi-Modal Capability of Dense Retrieval via Visual Module Plugin. arXiv preprint arXiv:2310.14037","author":"Zhou Tianshuo","year":"2023","unstructured":"Tianshuo Zhou, Sen Mei, Xinze Li, Zhenghao Liu, Chenyan Xiong, Zhiyuan Liu, Yu Gu, and Ge Yu. 2023. Unlock Multi-Modal Capability of Dense Retrieval via Visual Module Plugin. arXiv preprint arXiv:2310.14037 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658032"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:36:15Z","timestamp":1775838975000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792819"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":44,"alternative-id":["10.1145\/3774904.3792819","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792819","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}