{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:12:54Z","timestamp":1755839574507,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":73,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T00:00:00Z","timestamp":1689638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,7,19]]},"DOI":"10.1145\/3539618.3591705","type":"proceedings-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:22:59Z","timestamp":1689726179000},"page":"1262-1272","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Keyword-Based Diverse Image Retrieval by Semantics-aware Contrastive Learning and Transformer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7720-806X","authenticated-orcid":false,"given":"Minyi","family":"Zhao","sequence":"first","affiliation":[{"name":"Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4352-4897","authenticated-orcid":false,"given":"Jinpeng","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0709-9356","authenticated-orcid":false,"given":"Dongliang","family":"Liao","sequence":"additional","affiliation":[{"name":"Tencent Inc., Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2193-6686","authenticated-orcid":false,"given":"Yiru","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7815-6142","authenticated-orcid":false,"given":"Huanzhong","family":"Duan","sequence":"additional","affiliation":[{"name":"Tencent Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1949-2768","authenticated-orcid":false,"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2023,7,18]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.03.048"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Deng Cai Yan Wang Huayang Li Wai Lam and Lemao Liu. 2021. Neural Machine Translation with Monolingual Translation Memory. In ACL. ACL 7307--7318.","DOI":"10.18653\/v1\/2021.acl-long.567"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413841"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351055"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_2_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Chen Laming","year":"2018","unstructured":"Laming Chen, Guoxin Zhang, and Eric Zhou. 2018. Fast greedy map inference for determinantal point process to improve recommendation diversity. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_2_7_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 4171--4186."},{"key":"e_1_3_2_2_8_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"volume-title":"1996A density-based algorithm for discovering clusters in large spatial databases with noise.. In kdd","author":"Ester Martin","key":"e_1_3_2_2_9_1","unstructured":"Martin Ester, Hans-Peter Kriegel, J\u00f6rg Sander, Xiaowei Xu, et al. 1996A density-based algorithm for discovering clusters in large spatial databases with noise.. In kdd, Vol. 96. 226--231."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401052"},{"key":"e_1_3_2_2_11_1","first-page":"11309","article-title":"Self-paced contrastive learning with hybrid memory for domain adaptive object re-id","volume":"33","author":"Ge Yixiao","year":"2020","unstructured":"Yixiao Ge, Feng Zhu, Dapeng Chen, Rui Zhao, et al. 2020. Self-paced contrastive learning with hybrid memory for domain adaptive object re-id. Advances in Neural Information Processing Systems, Vol. 33 (2020), 11309--11321.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2980129"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475241"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_15_1","volume-title":"Dataset and Evaluation. In Working Notes Proceedings of the MediaEval 2016 Workshop","volume":"1739","author":"Ionescu Bogdan","year":"2016","unstructured":"Bogdan Ionescu, Alexandru-Lucian G\u00eensca, Maia Zaharieva, Bogdan Boteanu, Mihai Lupu, and Henning M\u00fcller. 2016a. Retrieving Diverse Social Images at MediaEval 2016: Challenge, Dataset and Evaluation. In Working Notes Proceedings of the MediaEval 2016 Workshop, Vol. 1739. CEUR-WS.org."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2713168.2713192"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-014-2369-4"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2557642.2563670"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2986579"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2986579"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2904991"},{"key":"e_1_3_2_2_22_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_2_23_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00591-010-0080-8"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331362"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of the ICML. JMLR.org, 1188--1196","author":"Quoc Le V.","year":"2014","unstructured":"V. Quoc Le and Tom\u00e1s Mikolov. 2014. Distributed Representations of Sentences and Documents. In Proceedings of the ICML. JMLR.org, 1188--1196."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301200"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.772"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2984676"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00213"},{"key":"e_1_3_2_2_31_1","volume-title":"Simple to complex cross-modal learning to rank. Computer Vision and Image Understanding","author":"Luo Minnan","year":"2017","unstructured":"Minnan Luo, Xiaojun Chang, Zhihui Li, Liqiang Nie, Alexander G. Hauptmann, and Qinghua Zheng. 2017. Simple to complex cross-modal learning to rank. Computer Vision and Image Understanding (2017), 67--77."},{"key":"e_1_3_2_2_32_1","volume-title":"Umap: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426","author":"McInnes Leland","year":"2018","unstructured":"Leland McInnes, John Healy, and James Melville. 2018. Umap: Uniform manifold approximation and projection for dimension reduction. arXiv preprint arXiv:1802.03426 (2018)."},{"key":"e_1_3_2_2_33_1","volume-title":"Proceedings of the Working Notes Proceedings of the MediaEval 2017 Workshop","volume":"1984","author":"Peng Liang","year":"2017","unstructured":"Liang Peng, Yi Bin, Xiyao Fu, Jie Zhou, Yang Yang, and Heng Tao Shen. 2017. CFM@MediaEval 2017 Retrieving Diverse Social Images Task via Re-ranking and Hierarchical Clustering. In Proceedings of the Working Notes Proceedings of the MediaEval 2017 Workshop, Vol. 1984."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2705068"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3411914"},{"key":"e_1_3_2_2_36_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_2_38_1","volume-title":"Proceedings of the Working Notes Proceedings of the MediaEval 2017 Workshop","volume":"1984","author":"Renders Jean-Michel","year":"2017","unstructured":"Jean-Michel Renders and Gabriela Csurka. 2017. NLE@MediaEval'17: Combining Cross-Media Similarity and Embeddings for Retrieving Diverse Social Images. In Proceedings of the Working Notes Proceedings of the MediaEval 2017 Workshop, Vol. 1984."},{"key":"e_1_3_2_2_39_1","volume-title":"Proceedings of the Working Notes Proceedings of the MediaEval 2014 Workshop","volume":"1263","author":"Sarac Mustafa Ilker","year":"2014","unstructured":"Mustafa Ilker Sarac and Pinar Duygulu. 2014. Bilkent-RETINA at Retrieving Diverse Social Images Task of MediaEval 2014. In Proceedings of the Working Notes Proceedings of the MediaEval 2014 Workshop, Vol. 1263."},{"key":"e_1_3_2_2_40_1","volume-title":"Proceedings of the Working Notes Proceedings of the MediaEval 2017 Workshop","volume":"1984","author":"Seddati Omar","year":"2017","unstructured":"Omar Seddati, Nada Ben-Lhachemi, St\u00e9phane Dupont, and Sa\u00efd Mahmoudi. 2017. UMONS @ MediaEval 2017: Diverse Social Images Retrieval. In Proceedings of the Working Notes Proceedings of the MediaEval 2017 Workshop, Vol. 1984."},{"key":"e_1_3_2_2_41_1","first-page":"35","article-title":"Modern information retrieval: A brief overview","volume":"24","author":"Amit Singhal","year":"2001","unstructured":"Amit Singhal et al. 2001. Modern information retrieval: A brief overview. IEEE Data Eng. Bull. , Vol. 24, 4 (2001), 35--43.","journal-title":"IEEE Data Eng. Bull."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462872"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/1101149.1101337"},{"key":"e_1_3_2_2_45_1","volume-title":"Proceedings of the NeurIPS. 5998--6008","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, N. Aidan Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Proceedings of the NeurIPS. 5998--6008."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_2_47_1","volume-title":"A Spatiotemporal Graph Neural Network for session-based recommendation. Expert Systems with Applications","author":"Wang Huanwen","year":"2022","unstructured":"Huanwen Wang, Yawen Zeng, Jianguo Chen, Zhouting Zhao, and Hao Chen. 2022. A Spatiotemporal Graph Neural Network for session-based recommendation. Expert Systems with Applications (2022)."},{"key":"e_1_3_2_2_48_1","volume-title":"A Comprehensive Survey on Cross-modal Retrieval. arXiv preprint arXiv:1607.06215","author":"Wang Kaiye","year":"2016","unstructured":"Kaiye Wang, Qiyue Yin, Wei Wang, Shu Wu, and Liang Wang. 2016b. A Comprehensive Survey on Cross-modal Retrieval. arXiv preprint arXiv:1607.06215 (2016)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00586"},{"key":"e_1_3_2_2_51_1","volume-title":"Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196","author":"Wei Jason","year":"2019","unstructured":"Jason Wei and Kai Zou. 2019. Eda: Easy data augmentation techniques for boosting performance on text classification tasks. arXiv preprint arXiv:1901.11196 (2019)."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413916"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2562670"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6941"},{"key":"e_1_3_2_2_56_1","volume-title":"Hauptmann","author":"Yan Caixia","year":"2020","unstructured":"Caixia Yan, Qinghua Zheng, Xiaojun Chang, Minnan Luo, Chung-Hsing Yeh, and Alexander G. Hauptmann. 2020. Semantics-Preserving Graph Propagation for Zero-Shot Object Detection. IEEE Transactions on Image Processing (2020), 8163--8176."},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jfranklin.2021.06.009"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.170"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539813.3545148"},{"key":"e_1_3_2_2_61_1","volume-title":"Proceedings of the Working Notes Proceedings of the MediaEval 2014 Workshop","volume":"1263","author":"Zaharieva Maia","year":"2014","unstructured":"Maia Zaharieva and Patrick Schwab. 2014. A Unified Framework for Retrieving Diverse Social Images. In Proceedings of the Working Notes Proceedings of the MediaEval 2014 Workshop, Vol. 1263."},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478025"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00225"},{"key":"e_1_3_2_2_64_1","volume-title":"Keyword-Based Diverse Image Retrieval With Variational Multiple Instance Graph","author":"Zeng Yawen","year":"2022","unstructured":"Yawen Zeng, Yiru Wang, Dongliang Liao, Gongfu Li, Weijie Huang, Jin Xu, Da Cao, and Hong Man. 2022b. Keyword-Based Diverse Image Retrieval With Variational Multiple Instance Graph. IEEE Transactions on Neural Networks and Learning Systems (2022)."},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-27355-1_30"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"e_1_3_2_2_67_1","volume-title":"mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412","author":"Zhang Hongyi","year":"2017","unstructured":"Hongyi Zhang, Moustapha Cisse, Yann N Dauphin, and David Lopez-Paz. 2017. mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412 (2017)."},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548412"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475710"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/490"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"},{"key":"e_1_3_2_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00440"},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2007.911822"}],"event":{"name":"SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Taipei Taiwan","acronym":"SIGIR '23"},"container-title":["Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591705","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539618.3591705","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:00Z","timestamp":1750178820000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3591705"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,18]]},"references-count":73,"alternative-id":["10.1145\/3539618.3591705","10.1145\/3539618"],"URL":"https:\/\/doi.org\/10.1145\/3539618.3591705","relation":{},"subject":[],"published":{"date-parts":[[2023,7,18]]},"assertion":[{"value":"2023-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}