{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T18:47:42Z","timestamp":1755802062637,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Major Project of Science and Technology of Anhui Province","award":["2023z020008, 202203a05020050, 202103a07020011"],"award-info":[{"award-number":["2023z020008, 202203a05020050, 202103a07020011"]}]},{"name":"National Science Foundation of China","award":["92270205, 12301659, 12171453"],"award-info":[{"award-number":["92270205, 12301659, 12171453"]}]},{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFA1005201, 2022YFA1005202, 2022YFA1005203"],"award-info":[{"award-number":["2022YFA1005201, 2022YFA1005202, 2022YFA1005203"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3657608","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"1089-1093","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CLCP: Realtime Text-Image Retrieval for Retailing via Pre-trained Clustering and Priority Queue"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-9758-5861","authenticated-orcid":false,"given":"Shuyang","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8938-5586","authenticated-orcid":false,"given":"Liangwu","family":"Wei","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9535-6901","authenticated-orcid":false,"given":"Qingyu","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9568-7587","authenticated-orcid":false,"given":"Yuntao","family":"Wei","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7324-3797","authenticated-orcid":false,"given":"Yanzhi","family":"Song","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/INISTA52262.2021.9548414"},{"key":"e_1_3_2_1_2_1","volume-title":"Rethinking Object Detection in Retail Stores. In The 35th AAAI Conference on Artificial Intelligence (AAAI","author":"Cai Yuanqiang","year":"2021","unstructured":"Yuanqiang Cai, Longyin Wen, Libo Zhang, Dawei Du, and Weiqiang Wang. 2021. Rethinking Object Detection in Retail Stores. In The 35th AAAI Conference on Artificial Intelligence (AAAI 2021)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_5_1","volume-title":"Similarity Reasoning and Filtration for Image-Text Matching. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:230523667","author":"Diao Haiwen","year":"2021","unstructured":"Haiwen Diao, Ying Zhang, Lingyun Ma, and Huchuan Lu. 2021. Similarity Reasoning and Filtration for Image-Text Matching. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:230523667"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the Sixteenth ACM International Conference on Web Search and Data Mining","author":"Du Yali","year":"2022","unstructured":"Yali Du, Yin wei Wei, Wei Ji, Fan Liu, Xin Luo, and Liqiang Nie. 2022. Multiqueue Momentum Contrast for Microvideo-Product Retrieval. Proceedings of the Sixteenth ACM International Conference on Web Search and Data Mining (2022). https:\/\/api.semanticscholar.org\/CorpusID:254974560"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351053"},{"key":"e_1_3_2_1_8_1","unstructured":"Xintong Han. 2019. Fine-grained visual-categorization dataset. In iMaterialist Challenge on Product Recognition. https:\/\/kaggle.com\/competitions\/imaterialistproduct-2019"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.5555\/2566972.2566993"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_1_12_1","volume-title":"Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Jiang Ding","year":"2023","unstructured":"Ding Jiang and Mang Ye. 2023. Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023), 2787--2797. https:\/\/api.semanticscholar.org\/CorpusID:257663606"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00058"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_15_1","volume-title":"Decoupled Multimodal Distilling for Emotion Recognition. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Li Yong","year":"2023","unstructured":"Yong Li, Yuan-Zheng Wang, and Zhen Cui. 2023. Decoupled Multimodal Distilling for Emotion Recognition. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023), 6631--6640. https:\/\/api.semanticscholar.org\/CorpusID:257756905"},{"volume-title":"Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:14113767","author":"Lin Tsung-Yi","key":"e_1_3_2_1_16_1","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:14113767"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350869"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3286710"},{"key":"e_1_3_2_1_19_1","unstructured":"Haotian Liu Chunyuan Li QingyangWu and Yong Jae Lee. 2023. Visual Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_21_1","volume-title":"COTS: Collaborative Two-Stream Vision-Language Pre-Training Model for Cross-Modal Retrieval. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022","author":"Lu Haoyu","year":"2022","unstructured":"Haoyu Lu, Nanyi Fei, Yuqi Huo, Yizhao Gao, Zhiwu Lu, and Jiaxin Wen. 2022. COTS: Collaborative Two-Stream Vision-Language Pre-Training Model for Cross-Modal Retrieval. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022), 15671--15680. https:\/\/api.semanticscholar.org\/CorpusID: 248218570"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3257193"},{"key":"e_1_3_2_1_23_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:231591445"},{"key":"e_1_3_2_1_24_1","volume-title":"a distilled version of BERT: smaller, faster, cheaper and lighter. ArXiv abs\/1910.01108","author":"Sanh Victor","year":"2019","unstructured":"Victor Sanh, Lysandre Debut, Julien Chaumond, and Thomas Wolf. 2019. DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. ArXiv abs\/1910.01108 (2019). https:\/\/api.semanticscholar.org\/CorpusID:203626972"},{"key":"e_1_3_2_1_25_1","volume-title":"Knowledge Aware Semantic Concept Expansion for Image-Text Matching. In International Joint Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:199466137","author":"Shi Botian","year":"2019","unstructured":"Botian Shi, Lei Ji, Pan Lu, Zhendong Niu, and Nan Duan. 2019. Knowledge Aware Semantic Concept Expansion for Image-Text Matching. In International Joint Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:199466137"},{"volume-title":"Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","author":"Song Yale","key":"e_1_3_2_1_26_1","unstructured":"Yale Song and M. Soleymani. 2019. Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019), 1979--1988. https:\/\/api.semanticscholar.org\/CorpusID: 184488029"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3088863"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-022-3513-y"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_31_1","unstructured":"Wei Yu Yuxiang Chen Linfang Wang. 2020. Products-10k: Large Scale Product Recognition Dataset. In Large Scale Product Recognition Challenge. https:\/\/products-10k.github.io\/challenge.html#dataset"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2019.2954956"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3657608","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3657608","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:45:12Z","timestamp":1755765912000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3657608"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":33,"alternative-id":["10.1145\/3652583.3657608","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3657608","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}