{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T19:00:50Z","timestamp":1771268450261,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":21,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,2,22]]},"DOI":"10.1145\/3773966.3779367","type":"proceedings-article","created":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T17:50:01Z","timestamp":1771264201000},"page":"1105-1109","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Factorized Transport Alignment for Multimodal and Multiview E-commerce Representation Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8006-4383","authenticated-orcid":false,"given":"Xiwen","family":"Chen","sequence":"first","affiliation":[{"name":"Clemson University, Clemson, SC, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8547-3154","authenticated-orcid":false,"given":"Yen-Chieh","family":"Lien","sequence":"additional","affiliation":[{"name":"Etsy, Inc., Brooklyn, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5817-3993","authenticated-orcid":false,"given":"Susan","family":"Liu","sequence":"additional","affiliation":[{"name":"Etsy, Inc., Brooklyn, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5341-2002","authenticated-orcid":false,"given":"Mar\u00eda","family":"Casta\u00f1os","sequence":"additional","affiliation":[{"name":"Etsy, Inc., Brooklyn, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3330-6132","authenticated-orcid":false,"given":"Abolfazl","family":"Razi","sequence":"additional","affiliation":[{"name":"Clemson University, Clemson, SC, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9652-4326","authenticated-orcid":false,"given":"Xiaoting","family":"Zhao","sequence":"additional","affiliation":[{"name":"Etsy, Inc., Brooklyn, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4225-0056","authenticated-orcid":false,"given":"Congzhe","family":"Su","sequence":"additional","affiliation":[{"name":"Etsy, Inc., Brooklyn, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,2,21]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Prompt-OT: An Optimal Transport Regularization Paradigm for Knowledge Preservation in Vision-Language Model Adaptation. arXiv preprint arXiv:2503.08906","author":"Chen Xiwen","year":"2025","unstructured":"Xiwen Chen, Wenhui Zhu, Peijie Qiu, Hao Wang, Huayu Li, Haiyu Wu, Aristeidis Sotiras, Yalin Wang, and Abolfazl Razi. 2025. Prompt-OT: An Optimal Transport Regularization Paradigm for Knowledge Preservation in Vision-Language Model Adaptation. arXiv preprint arXiv:2503.08906 (2025)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1718487.1718531"},{"key":"e_1_3_2_1_4_1","volume-title":"VL-CLIP: Enhancing Multimodal Recommendations via Visual Grounding and LLM-Augmented CLIP Embeddings. arXiv preprint arXiv:2507.17080","author":"Giahi Ramin","year":"2025","unstructured":"Ramin Giahi, Kehui Yao, Sriram Kollipara, Kai Zhao, Vahid Mirjalili, Jianpeng Xu, Topojoy Biswas, Evren Korpeoglu, and Kannan Achan. 2025. VL-CLIP: Enhancing Multimodal Recommendations via Visual Grounding and LLM-Augmented CLIP Embeddings. arXiv preprint arXiv:2507.17080 (2025)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-28238-6_31"},{"key":"e_1_3_2_1_6_1","volume-title":"Praveen Kolli, Utsaw Kumar, and Mandar Rahurkar.","author":"Gurjar Omkar","year":"2025","unstructured":"Omkar Gurjar, Kin Sum Liu, Praveen Kolli, Utsaw Kumar, and Mandar Rahurkar. 2025. DashCLIP: Leveraging multimodal models for generating semantic embeddings for DoorDash. arXiv preprint arXiv:2504.07110 (2025)."},{"key":"e_1_3_2_1_7_1","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In ICML. PMLR, 4904-4916.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_8_1","volume-title":"Henrique Schechter Vera, et al","author":"Lee Jinhyuk","year":"2025","unstructured":"Jinhyuk Lee, Feiyang Chen, Sahil Dua, Daniel Cer, Madhuri Shanbhogue, Iftekhar Naim, Gustavo Hern\u00e1ndez \u00c1brego, Zhe Li, Kaifeng Chen, Henrique Schechter Vera, et al., 2025. Gemini embedding: Generalizable embeddings from gemini. arXiv preprint arXiv:2503.07891 (2025)."},{"key":"e_1_3_2_1_9_1","volume-title":"Doc2Token: Bridging Vocabulary Gap by Predicting Missing Tokens for E-commerce Search. arXiv preprint arXiv:2406.19647","author":"Li Kaihao","year":"2024","unstructured":"Kaihao Li, Juexin Lin, and Tony Lee. 2024. Doc2Token: Bridging Vocabulary Gap by Predicting Missing Tokens for E-commerce Search. arXiv preprint arXiv:2406.19647 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"key":"e_1_3_2_1_11_1","unstructured":"OpenAI. 2024. text-embedding-3. https:\/\/platform.openai.com\/docs\/guides\/embeddings. Accessed: 2025-08-06."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539156"},{"key":"e_1_3_2_1_13_1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML. PMLR, 8748-8763.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_14_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Shi Liangliang","year":"2024","unstructured":"Liangliang Shi, Jack Fan, and Junchi Yan. 2024. Ot-clip: Understanding and generalizing clip via optimal transport. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_15_1","volume-title":"Michael G\u00fcnther, Bo Wang, Markus Krimmel, Feng Wang, Georgios Mastrapas, Andreas Koukounas, Nan Wang, et al.","author":"Sturua Saba","year":"2024","unstructured":"Saba Sturua, Isabelle Mohr, Mohammad Kalim Akram, Michael G\u00fcnther, Bo Wang, Markus Krimmel, Feng Wang, Georgios Mastrapas, Andreas Koukounas, Nan Wang, et al., 2024. jina-embeddings-v3: Multilingual embeddings with task lora. arXiv preprint arXiv:2409.10173 (2024)."},{"key":"e_1_3_2_1_16_1","first-page":"47","volume-title":"Scaling User Modeling: Large-scale Online User Representations for Ads Personalization in Meta. In Companion Proceedings of the ACM on Web Conference","author":"Zhang Wei","year":"2024","unstructured":"Wei Zhang, Dai Li, Chen Liang, Fang Zhou, Zhongke Zhang, Xuewei Wang, Ru Li, Yi Zhou, Yaning Huang, Dong Liang, et al., 2024. Scaling User Modeling: Large-scale Online User Representations for Ads Personalization in Meta. In Companion Proceedings of the ACM on Web Conference 2024. 47-55."},{"key":"e_1_3_2_1_17_1","volume-title":"Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. arXiv preprint arXiv:2506.05176","author":"Zhang Yanzhao","year":"2025","unstructured":"Yanzhao Zhang, Mingxin Li, Dingkun Long, Xin Zhang, Huan Lin, Baosong Yang, Pengjun Xie, An Yang, Dayiheng Liu, Junyang Lin, Fei Huang, and Jingren Zhou. 2025. Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. arXiv preprint arXiv:2506.05176 (2025)."},{"key":"e_1_3_2_1_18_1","unstructured":"Tianyu Zhu and Jesse Clark. 2024a. Marqo Ecommerce Embeddings - Foundation Model for Product Embeddings. https:\/\/github.com\/marqo-ai\/marqo-ecommerce-embeddings\/"},{"key":"e_1_3_2_1_19_1","volume-title":"EVTP-IVS: Effective Visual Token Pruning For Unifying Instruction Visual Segmentation In Multi-Modal Large Language Models. arXiv preprint arXiv:2508.11886","author":"Zhu Wenhui","year":"2025","unstructured":"Wenhui Zhu, Xiwen Chen, Zhipeng Wang, Shao Tang, Sayan Ghosh, Xuanzhao Dong, Rajat Koner, and Yalin Wang. 2025. EVTP-IVS: Effective Visual Token Pruning For Unifying Instruction Visual Segmentation In Multi-Modal Large Language Models. arXiv preprint arXiv:2508.11886 (2025)."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 6390-6399","author":"Zhu Xinliang","year":"2024","unstructured":"Xinliang Zhu, Sheng-Wei Huang, Han Ding, Jinyu Yang, Kelvin Chen, Tao Zhou, Tal Neiman, Ouye Xie, Son Tran, Benjamin Yao, et al., 2024b. Bringing Multimodality to Amazon Visual Search System. In Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 6390-6399."},{"key":"e_1_3_2_1_21_1","volume-title":"Awt: Transferring vision-language models via augmentation, weighting, and transportation. arXiv preprint arXiv:2407.04603","author":"Zhu Yuhan","year":"2024","unstructured":"Yuhan Zhu, Yuyang Ji, Zhiyu Zhao, Gangshan Wu, and Limin Wang. 2024c. Awt: Transferring vision-language models via augmentation, weighting, and transportation. arXiv preprint arXiv:2407.04603 (2024)."}],"event":{"name":"WSDM '26:The Nineteenth ACM International Conference on Web Search and Data Mining","location":"Boise ID USA","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGIR ACM Special Interest Group on Information Retrieval","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the Nineteenth ACM International Conference on Web Search and Data Mining"],"original-title":[],"deposited":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T18:00:47Z","timestamp":1771264847000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773966.3779367"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,21]]},"references-count":21,"alternative-id":["10.1145\/3773966.3779367","10.1145\/3773966"],"URL":"https:\/\/doi.org\/10.1145\/3773966.3779367","relation":{},"subject":[],"published":{"date-parts":[[2026,2,21]]},"assertion":[{"value":"2026-02-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}