{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:42:15Z","timestamp":1771958535845,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176178"],"award-info":[{"award-number":["62176178"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0160403"],"award-info":[{"award-number":["2022ZD0160403"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681270","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"1662-1671","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Eliminate Before Align: A Remote Sensing Image-Text Retrieval Framework with Keyword Explicit Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2197-3739","authenticated-orcid":false,"given":"Zhong","family":"Ji","sequence":"first","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1075-3157","authenticated-orcid":false,"given":"Changxu","family":"Meng","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2819-388X","authenticated-orcid":false,"given":"Yan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6098-4772","authenticated-orcid":false,"given":"Haoran","family":"Wang","sequence":"additional","affiliation":[{"name":"Baidu Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6670-3727","authenticated-orcid":false,"given":"Yanwei","family":"Pang","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4361-956X","authenticated-orcid":false,"given":"Jungong","family":"Han","sequence":"additional","affiliation":[{"name":"The University of Sheffield, Sheffield, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs12030405"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3215803"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467092"},{"key":"e_1_3_2_1_4_1","unstructured":"Xi Chen Xiao Wang Lucas Beyer Alexander Kolesnikov Jialin Wu Paul Voigtlaender Basil Mustafa Sebastian Goodman Ibrahim Alabdulmohsin Piotr Padlewski et al. 2023. Pali-3 vision language models: Smaller faster stronger. arXiv:2310.09199 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3105551"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_2_1_7_1","first-page":"49250","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","volume":"36","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2024. Instructblip: Towards general-purpose vision-language models with instruction tuning. Advances in Neural Information Processing Systems (NeurIPS) 36 (2024), 49250--49267.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_8_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of British Machine Vision Conference (BMVC).","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In Proceedings of British Machine Vision Conference (BMVC)."},{"key":"e_1_3_2_1_10_1","volume-title":"Hierarchical matching and reasoning for multi-query image retrieval. Neural Networks","author":"Ji Zhong","year":"2024","unstructured":"Zhong Ji, Zhihao Li, Yan Zhang, Haoran Wang, Yanwei Pang, and Xuelong Li. 2024. Hierarchical matching and reasoning for multi-query image retrieval. Neural Networks (2024), 106200."},{"key":"e_1_3_2_1_11_1","first-page":"1","article-title":"Knowledge-Aided Momentum Contrastive Learning for Remote-Sensing Image Text Retrieval","volume":"61","author":"Ji Zhong","year":"2023","unstructured":"Zhong Ji, Changxu Meng, Yan Zhang, Yanwei Pang, and Xuelong Li. 2023. Knowledge-Aided Momentum Contrastive Learning for Remote-Sensing Image Text Retrieval. IEEE Transactions on Geoscience and Remote Sensing 61 (2023), 1--13.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 27831--27840","author":"Kuckreja Kartik","year":"2024","unstructured":"Kartik Kuckreja, Muhammad Sohail Danish, Muzammal Naseer, Abhijit Das, Salman Khan, and Fahad Shahbaz Khan. 2024. Geochat: Grounded large visionlanguage model for remote sensing. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 27831--27840."},{"key":"e_1_3_2_1_15_1","volume-title":"International conference on machine learning (ICML). PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning (ICML). PMLR, 19730-- 19742."},{"key":"e_1_3_2_1_16_1","volume-title":"International conference on machine learning (ICML). PMLR, 12888-- 12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning (ICML). PMLR, 12888-- 12900."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs12071130"},{"key":"e_1_3_2_1_18_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems (NeurIPS) 34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems (NeurIPS) 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.10.008"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS52108.2023.10281578"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3478269"},{"key":"e_1_3_2_1_22_1","unstructured":"Xiang Long Kaipeng Deng Guanzhong Wang Yang Zhang Qingqing Dang Yuan Gao Hui Shen Jianguo Ren Shumin Han Errui Ding et al. 2020. PP-YOLO: An effective and efficient implementation of object detector. arXiv preprint arXiv:2007.12099 (2020)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Fusion-based correlation learning model for cross-modal remote sensing image retrieval","volume":"19","author":"Lv Yafei","year":"2021","unstructured":"Yafei Lv,Wei Xiong, Xiaohan Zhang, and Yaqi Cui. 2021. Fusion-based correlation learning model for cross-modal remote sensing image retrieval. IEEE Geoscience and Remote Sensing Letters 19 (2021), 1--5.","journal-title":"IEEE Geoscience and Remote Sensing Letters"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2024.3483283"},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the Second Workshop on Complex Data Challenges in Earth Observation (CDCEO","author":"Mi Li","year":"2022","unstructured":"Li Mi, Siran Li, Christel Chappuis, and Devis Tuia. 2022. Knowledge-Aware Cross-Modal Text-Image Retrieval for Remote Sensing Images. In Proceedings of the Second Workshop on Complex Data Challenges in Earth Observation (CDCEO 2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Representation learning with contrastive predictive coding. arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612374"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3591106.3592236"},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning (ICML). PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning (ICML). PMLR, 8748--8763."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2023.3280546","article-title":"Interacting-Enhancing Feature Transformer for Cross-modal Remote Sensing Image and Text Retrieval","volume":"61","author":"Tang Xu","year":"2023","unstructured":"Xu Tang, Yijing Wang, Jingjing Ma, Xiangrong Zhang, Fang Liu, and Licheng Jiao. 2023. Interacting-Enhancing Feature Transformer for Cross-modal Remote Sensing Image and Text Retrieval. IEEE Transactions on Geoscience and Remote Sensing 61 (2023), 1--15.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_32_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Attention is all you need. Advances in neural information processing systems (NeurIPS) 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems (NeurIPS) 30 (2017), 5998--6008."},{"key":"e_1_3_2_1_34_1","volume-title":"Remote sensing for agricultural applications: A meta-review. Remote sensing of environment 236","author":"Weiss Marie","year":"2020","unstructured":"Marie Weiss, Fr\u00e9d\u00e9ric Jacob, and Grgory Duveiller. 2020. Remote sensing for agricultural applications: A meta-review. Remote sensing of environment 236 (2020), 111402."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS52108.2023.10282896"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611709"},{"key":"e_1_3_2_1_37_1","first-page":"1","article-title":"Full-parameter vision navigation based on scene matching for aircrafts","volume":"57","author":"Yu QiFeng","year":"2014","unstructured":"QiFeng Yu, Yang Shang, XiaoChun Liu, ZhiHui Lei, Xiang Li, XianWei Zhu, XiaoLin Liu, Xia Yang, Ang Su, XiaoHu Zhang, et al. 2014. Full-parameter vision navigation based on scene matching for aircrafts. Science China Information Sciences 57 (2014), 1--10.","journal-title":"Science China Information Sciences"},{"key":"e_1_3_2_1_38_1","first-page":"1","article-title":"Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval","volume":"61","author":"Yuan Yuan","year":"2023","unstructured":"Yuan Yuan, Yang Zhan, and Zhitong Xiong. 2023. Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval. IEEE Transactions on Geoscience and Remote Sensing 61 (2023), 1--14.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_39_1","first-page":"1","article-title":"Exploring a Fine-Grained Multiscale Method for Cross-Modal Remote Sensing Image Retrieval","volume":"60","author":"Yuan Zhiqiang","year":"2021","unstructured":"Zhiqiang Yuan,Wenkai Zhang, Kun Fu, Xuan Li, Chubo Deng, HongqiWang, and Xian Sun. 2021. Exploring a Fine-Grained Multiscale Method for Cross-Modal Remote Sensing Image Retrieval. IEEE Transactions on Geoscience and Remote Sensing 60 (2021), 1--19.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_40_1","first-page":"1","article-title":"A lightweight multi-scale crossmodal text-image retrieval method in remote sensing","volume":"60","author":"Yuan Zhiqiang","year":"2021","unstructured":"Zhiqiang Yuan,Wenkai Zhang, Xuee Rong, Xuan Li, Jialiang Chen, HongqiWang, Kun Fu, and Xian Sun. 2021. A lightweight multi-scale crossmodal text-image retrieval method in remote sensing. IEEE Transactions on Geoscience and Remote Sensing 60 (2021), 1--19.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2022.103071"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2022.3163706","article-title":"Remote sensing cross-modal textimage retrieval based on global and local information","volume":"60","author":"Yuan Zhiqiang","year":"2022","unstructured":"Zhiqiang Yuan, Wenkai Zhang, Changyuan Tian, Xuee Rong, Zhengyuan Zhang, Hongqi Wang, Kun Fu, and Xian Sun. 2022. Remote sensing cross-modal textimage retrieval based on global and local information. IEEE Transactions on Geoscience and Remote Sensing 60 (2022), 1--16.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2023.3333375","article-title":"Hypersphere-based remote sensing cross-modal text-image retrieval via curriculum learning","volume":"61","author":"Zhang Weihang","year":"2023","unstructured":"Weihang Zhang, Jihao Li, Shuoke Li, Jialiang Chen, Wenkai Zhang, Xin Gao, and Xian Sun. 2023. Hypersphere-based remote sensing cross-modal text-image retrieval via curriculum learning. IEEE Transactions on Geoscience and Remote Sensing 61 (2023), 1--15.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3281507"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3348297"},{"key":"e_1_3_2_1_46_1","volume-title":"Rs5m: A large scale vision-language dataset for remote sensing vision-language foundation model. arXiv:2306.11300","author":"Zhang Zilun","year":"2023","unstructured":"Zilun Zhang, Tiancheng Zhao, Yulong Guo, and Jianwei Yin. 2023. Rs5m: A large scale vision-language dataset for remote sensing vision-language foundation model. arXiv:2306.11300 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.3390\/s23208437"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681270","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681270","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681270"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":47,"alternative-id":["10.1145\/3664647.3681270","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681270","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}