{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:09:35Z","timestamp":1757617775698,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":16,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,9,22]]},"DOI":"10.1145\/3705328.3748128","type":"proceedings-article","created":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T10:51:29Z","timestamp":1757155889000},"page":"975-978","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Improving Visual Recommendation on E-commerce Platforms Using Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2952-9909","authenticated-orcid":false,"given":"Yuki","family":"Yada","sequence":"first","affiliation":[{"name":"Mercari, Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0668-5182","authenticated-orcid":false,"given":"Sho","family":"Akiyama","sequence":"additional","affiliation":[{"name":"Mercari, Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5918-6586","authenticated-orcid":false,"given":"Ryo","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Mercari, Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0729-6971","authenticated-orcid":false,"given":"Yuta","family":"Ueno","sequence":"additional","affiliation":[{"name":"Mercari, Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1903-466X","authenticated-orcid":false,"given":"Yusuke","family":"Shido","sequence":"additional","affiliation":[{"name":"Mercari, Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7907-2394","authenticated-orcid":false,"given":"Andre","family":"Rusli","sequence":"additional","affiliation":[{"name":"Mercari, Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,9,7]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Xi Chen Xiao Wang Soravit Changpinyo AJ Piergiovanni Piotr Padlewski Daniel Salz Sebastian Goodman Adam Grycner Basil Mustafa Lucas Beyer et\u00a0al. 2022. Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2209.06794 (2022)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00444"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539071"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"e_1_3_3_2_7_2","unstructured":"Han Fang Pengfei Xiong Luhui Xu and Yu Chen. 2021. CLIP2Video: Mastering Video-Text Retrieval via Image CLIP. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.11097 (2021)."},{"key":"e_1_3_3_2_8_2","unstructured":"Google. 2017. Google Lens. https:\/\/lens.google.com\/."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_10_2","unstructured":"Andrew\u00a0G. Howard Menglong Zhu Bo Chen Dmitry Kalenichenko Weijun Wang Tobias Weyand Marco Andreetto and Hartwig Adam. 2017. MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1704.04861 (2017)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_3_2_12_2","unstructured":"Alec Radford Jong\u00a0Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2103.00020 (2021)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330696"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3097983.3098162"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330739"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"}],"event":{"name":"RecSys '25: Nineteenth ACM Conference on Recommender Systems","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGAI ACM Special Interest Group on Artificial Intelligence","SIGIR ACM Special Interest Group on Information Retrieval","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Prague Czech Republic","acronym":"RecSys '25"},"container-title":["Proceedings of the Nineteenth ACM Conference on Recommender Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3705328.3748128","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T11:48:01Z","timestamp":1757159281000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3705328.3748128"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,7]]},"references-count":16,"alternative-id":["10.1145\/3705328.3748128","10.1145\/3705328"],"URL":"https:\/\/doi.org\/10.1145\/3705328.3748128","relation":{},"subject":[],"published":{"date-parts":[[2025,9,7]]},"assertion":[{"value":"2025-09-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}