{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T14:52:58Z","timestamp":1784299978905,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LR21F020002"],"award-info":[{"award-number":["LR21F020002"]}]},{"name":"Natural Science Foundation of China","award":["61976192, 62102365"],"award-info":[{"award-number":["61976192, 62102365"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612374","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"611-620","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":41,"title":["A Prior Instruction Representation Framework for Remote Sensing Image-text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5968-5209","authenticated-orcid":false,"given":"Jiancheng","family":"Pan","sequence":"first","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1749-9509","authenticated-orcid":false,"given":"Qing","family":"Ma","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6177-3862","authenticated-orcid":false,"given":"Cong","family":"Bai","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.3390\/rs12030405"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.04.034"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00246"},{"key":"e_1_3_2_1_4_1","volume-title":"Yin-Wen Chang, and Chun-Sung Ferng.","author":"Chen Pu-Chin","year":"2021","unstructured":"Pu-Chin Chen, Henry Tsai, Srinadh Bhojanapalli, Hyung Won Chung, Yin-Wen Chang, and Chun-Sung Ferng. 2021. A simple and effective positional encoding for transformers. arXiv preprint arXiv:2104.08698 (2021)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3070872"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2016.2598228"},{"key":"e_1_3_2_1_7_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_1_8_1","volume-title":"Plug and play language models: A simple approach to controlled text generation. arXiv preprint arXiv:1912.02164","author":"Dathathri Sumanth","year":"2019","unstructured":"Sumanth Dathathri, Andrea Madotto, Janice Lan, Jane Hung, Eric Frank, Piero Molino, Jason Yosinski, and Rosanne Liu. 2019. Plug and play language models: A simple approach to controlled text generation. arXiv preprint arXiv:1912.02164 (2019)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","volume-title":"Topicrnn: A recurrent neural network with long-range semantic dependency. arXiv preprint arXiv:1611.01702","author":"Dieng Adji B","year":"2016","unstructured":"Adji B Dieng, Chong Wang, Jianfeng Gao, and John Paisley. 2016. Topicrnn: A recurrent neural network with long-range semantic dependency. arXiv preprint arXiv:1611.01702 (2016)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00296"},{"key":"e_1_3_2_1_13_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_14_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_15_1","volume-title":"MKVSE: Multimodal Knowledge Enhanced Visual-Semantic Embedding for Image-Text Retrieval. ACM Transactions on Multimedia Computing, Communications and Applications","author":"Feng Duoduo","year":"2023","unstructured":"Duoduo Feng, Xiangteng He, and Yuxin Peng. 2023. MKVSE: Multimodal Knowledge Enhanced Visual-Semantic Embedding for Image-Text Retrieval. ACM Transactions on Multimedia Computing, Communications and Applications (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_18_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_19_1","unstructured":"Andrew Jaegle Sebastian Borgeaud Jean-Baptiste Alayrac Carl Doersch Catalin Ionescu David Ding Skanda Koppula Daniel Zoran Andrew Brock Evan Shelhamer et al. 2021. Perceiver io: A general architecture for structured inputs & outputs. arXiv preprint arXiv:2107.14795 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"A review of the status of satellite remote sensing and image processing techniques for mapping natural hazards and disasters. Progress in physical geography","author":"Joyce Karen E","year":"2009","unstructured":"Karen E Joyce, Stella E Belliss, Sergey V Samsonov, Stephen J McNeill, and Phil J Glassey. 2009. A review of the status of satellite remote sensing and image processing techniques for mapping natural hazards and disasters. Progress in physical geography, Vol. 33, 2 (2009), 183--207."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2009"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i9.17018"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00210"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_31_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2776321"},{"key":"e_1_3_2_1_33_1","first-page":"1","article-title":"Fusion-based correlation learning model for cross-modal remote sensing image retrieval","volume":"19","author":"Lv Yafei","year":"2021","unstructured":"Yafei Lv, Wei Xiong, Xiaohan Zhang, and Yaqi Cui. 2021. Fusion-based correlation learning model for cross-modal remote sensing image retrieval. IEEE Geoscience and Remote Sensing Letters, Vol. 19 (2021), 1--5.","journal-title":"IEEE Geoscience and Remote Sensing Letters"},{"key":"e_1_3_2_1_34_1","unstructured":"Li Mi Siran Li Christel Chappuis and Devis Tuia. 2022. Knowledge-Aware Cross-Modal Text-Image Retrieval for Remote Sensing Images. (2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3591106.3592236"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413961"},{"key":"e_1_3_2_1_37_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3236837"},{"key":"e_1_3_2_1_39_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_41_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research, Vol. 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_42_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_43_1","unstructured":"Teng Wang Jinrui Zhang Junjie Fei Yixiao Ge Hao Zheng Yunlong Tang Zhe Li Mingqi Gao Shanshan Zhao Ying Shan et al. 2023. Caption anything: Interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00418"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2685945"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2022.10.013"},{"key":"e_1_3_2_1_47_1","volume-title":"Track anything: Segment anything meets videos. arXiv preprint arXiv:2304.11968","author":"Yang Jinyu","year":"2023","unstructured":"Jinyu Yang, Mingqi Gao, Zhe Li, Shang Gao, Fangjing Wang, and Feng Zheng. 2023. Track anything: Segment anything meets videos. arXiv preprint arXiv:2304.11968 (2023)."},{"key":"e_1_3_2_1_48_1","volume-title":"Exploring a fine-grained multiscale method for cross-modal remote sensing image retrieval. arXiv preprint arXiv:2204.09868","author":"Yuan Zhiqiang","year":"2022","unstructured":"Zhiqiang Yuan, Wenkai Zhang, Kun Fu, Xuan Li, Chubo Deng, Hongqi Wang, and Xian Sun. 2022a. Exploring a fine-grained multiscale method for cross-modal remote sensing image retrieval. arXiv preprint arXiv:2204.09868 (2022)."},{"key":"e_1_3_2_1_49_1","first-page":"1","article-title":"A lightweight multi-scale crossmodal text-image retrieval method in remote sensing","volume":"60","author":"Yuan Zhiqiang","year":"2021","unstructured":"Zhiqiang Yuan, Wenkai Zhang, Xuee Rong, Xuan Li, Jialiang Chen, Hongqi Wang, Kun Fu, and Xian Sun. 2021. A lightweight multi-scale crossmodal text-image retrieval method in remote sensing. IEEE Transactions on Geoscience and Remote Sensing, Vol. 60 (2021), 1--19.","journal-title":"IEEE Transactions on Geoscience and Remote Sensing"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2022.103071"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3163706"},{"key":"e_1_3_2_1_52_1","volume-title":"Multi-grained vision language pre-training: Aligning texts with visual concepts. arXiv preprint arXiv:2111.08276","author":"Zeng Yan","year":"2021","unstructured":"Yan Zeng, Xinsong Zhang, and Hang Li. 2021. Multi-grained vision language pre-training: Aligning texts with visual concepts. arXiv preprint arXiv:2111.08276 (2021)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS46834.2022.9883242"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2022.102355"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2015.2475299"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612374","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612374","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:54:20Z","timestamp":1755820460000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612374"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":55,"alternative-id":["10.1145\/3581783.3612374","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612374","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}