{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T21:16:06Z","timestamp":1773263766049,"version":"3.50.1"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T00:00:00Z","timestamp":1762905600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T00:00:00Z","timestamp":1762905600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,11,12]]},"DOI":"10.1109\/icdmw69685.2025.00177","type":"proceedings-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T19:50:39Z","timestamp":1773172239000},"page":"1480-1488","source":"Crossref","is-referenced-by-count":0,"title":["Bridging Modality Gaps in e-Commerce Products via Vision-Language Alignment"],"prefix":"10.1109","author":[{"given":"Yipeng","family":"Zhang","sequence":"first","affiliation":[{"name":"eBay Inc."}]},{"given":"Hongjun","family":"Yu","sequence":"additional","affiliation":[{"name":"eBay Inc."}]},{"given":"Aritra","family":"Mandal","sequence":"additional","affiliation":[{"name":"eBay Inc."}]},{"given":"Canran","family":"Xu","sequence":"additional","affiliation":[{"name":"eBay Inc."}]},{"given":"Qunzhi","family":"Zhou","sequence":"additional","affiliation":[{"name":"eBay Inc."}]},{"given":"Zhe","family":"Wu","sequence":"additional","affiliation":[{"name":"eBay Inc."}]}],"member":"263","reference":[{"key":"ref1","author":"Wang","year":"2024","journal-title":"Yolov10: Real-time end-to-end object detection"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref3","article-title":"Learning transferable visual models from natural language 
supervision","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Radford"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-industry.52"},{"key":"ref5","volume-title":"Semantic equivalence of e-commerce queries","author":"Mandal","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.580"},{"key":"ref10","author":"Devlin","year":"2018","journal-title":"Bert: Pre-training of deep bidirectional transformers for language understanding"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219839"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78090-5_4"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.knowledgenlp-1.6"},{"key":"ref14","article-title":"OpenAI","year":"2023","journal-title":"Gpt-4 technical report"},{"key":"ref15","author":"Chowdhery","year":"2022","journal-title":"Palm: Scaling language modeling with pathways"},{"key":"ref16","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-024-01276-1"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.383"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-industry.74"},{"key":"ref20","article-title":"Llasa: Large language and e-commerce shopping assistant","volume":"abs\/2408.02006","author":"Zhang","year":"2024","journal-title":"CoRR"},{"key":"ref21","article-title":"ecellm: Generalizing large language models for e-commerce from large-scale, high-quality instruction data","volume-title":"Forty-first International Conference on Machine 
Learning. OpenReview.net","author":"Peng"},{"key":"ref22","article-title":"Investigating LLM applications in e-commerce","volume":"abs\/2408.12779","author":"Palen-Michel","year":"2024","journal-title":"CoRR"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01745"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref25","author":"Wang","year":"2024","journal-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution"},{"key":"ref26","article-title":"Internvl: Advancing vision-language alignment with large-scale pre-training","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP)","author":"Chen"},{"key":"ref27","author":"Xue","year":"2023","journal-title":"Pumgpt: A large vision-language model for product understanding"},{"key":"ref28","author":"Zhu","year":"2023","journal-title":"Vl-gpt: A generative pre-trained transformer for vision and language understanding and generation"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00204"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"6787","DOI":"10.18653\/v1\/2021.emnlp-main.544","article-title":"VideoCLIP: Contrastive pre-training for zero-shot video-text understanding","volume-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","author":"Xu","year":"2021"},{"key":"ref31","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","author":"Liang","year":"2022","journal-title":"NeurIPS"},{"key":"ref32","first-page":"2","article-title":"Contrastive learning of medical visual representations from paired images and text","volume-title":"Machine Learning for Healthcare Conference. 
PMLR","author":"Zhang"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"ref34","author":"Dubey","year":"2024","journal-title":"The llama 3 herd of models"},{"key":"ref35","article-title":"Direct preference optimization: your language model is secretly a reward model","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems, ser. NIPS \u201923","author":"Rafailov"},{"key":"ref36","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.acl-demos.38","article-title":"Llamafactory: Unified efficient fine-tuning of 100+ language models","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)","author":"Zheng"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref38","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref39","author":"Chen","year":"2024","journal-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling"}],"event":{"name":"2025 IEEE International Conference on Data Mining Workshops (ICDMW)","location":"Washington, DC, USA","start":{"date-parts":[[2025,11,12]]},"end":{"date-parts":[[2025,11,15]]}},"container-title":["2025 IEEE International Conference on Data Mining Workshops 
(ICDMW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11415623\/11415713\/11415784.pdf?arnumber=11415784","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T05:15:57Z","timestamp":1773206157000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11415784\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,12]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/icdmw69685.2025.00177","relation":{},"subject":[],"published":{"date-parts":[[2025,11,12]]}}}