{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:46Z","timestamp":1781539006724,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810755","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"738-747","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LaViSE: Language-aware Vision Scale Enhancement for Referring Remote Sensing Image Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5564-5074","authenticated-orcid":false,"given":"Yan","family":"Li","sequence":"first","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8003-9107","authenticated-orcid":false,"given":"Junjie","family":"Zheng","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0082-541X","authenticated-orcid":false,"given":"Zhouchao","family":"Fu","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5759-6250","authenticated-orcid":false,"given":"Shengjie","family":"Yang","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8386-6601","authenticated-orcid":false,"given":"Junjie","family":"Liao","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6017-0552","authenticated-orcid":false,"given":"Jianwei","family":"Zheng","sequence":"additional","affiliation":[{"name":"Zhejiang University of Technology, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Keyan Chen Chenyang Liu Bowen Chen Jiafan Zhang Zhengxia Zou and Zhenwei Shi. 2025. RSRefSeg 2: Decoupling Referring Remote Sensing Image Segmentation with Foundation Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.06231 (2025).","DOI":"10.1109\/IGARSS55030.2025.11243338"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Keyan Chen Jiafan Zhang Chenyang Liu Zhengxia Zou and Zhenwei Shi. 2025. Rsrefseg: Referring remote sensing image segmentation with foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.06809 (2025).","DOI":"10.1109\/IGARSS55030.2025.11243338"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_3_1_7_2","first-page":"4171","volume-title":"Proceedings of NAACL-HLT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171\u20134186."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","unstructured":"Henghui Ding Chang Liu Suchen Wang and Xudong Jiang. 2023. VLT: Vision-Language Transformer and Query Generation for Referring Segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 45 6 (June 2023) 7900\u20137916. 10.1109\/TPAMI.2022.3217852","DOI":"10.1109\/TPAMI.2022.3217852"},{"key":"e_1_3_3_1_9_2","unstructured":"Zhen Dong Yao Sun Ting Liu Wangmeng Zuo and Yunsong Gu. 2024. Cross-modal bidirectional interaction model for referring remote sensing image segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.08613 (2024)."},{"key":"e_1_3_3_1_10_2","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations. Originally arXiv:https:\/\/arXiv.org\/abs\/2010.11929."},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Alex Graves. 2012. Long short-term memory. Supervised sequence labelling with recurrent neural networks (2012) 37\u201345.","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00376"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","unstructured":"Sen Lei Xinyu Xiao Tianlin Zhang Heng-Chao Li Zhenwei Shi and Qing Zhu. 2025. Exploring Fine-Grained Image-Text Alignment for Referring Remote Sensing Image Segmentation. IEEE Transactions on Geoscience and Remote Sensing 63 (2025) 1\u201311. 10.1109\/TGRS.2024.3522293","DOI":"10.1109\/TGRS.2024.3522293"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Kai Li George Vosselman and Michael\u00a0Ying Yang. 2025. Scale-wise bidirectional alignment network for referring remote sensing image segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.00851 (2025).","DOI":"10.1016\/j.isprsjprs.2025.05.014"},{"key":"e_1_3_3_1_21_2","unstructured":"Kun Li Zeyu Xin Le Pang Cheng Pang Yuntian Deng Jing Yao Gui-Song Xia Deyu Meng Zhen Wang and Xiaochun Cao. 2025. SegEarth-R1: Geospatial pixel reasoning via large language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.09644 (2025)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00602"},{"key":"e_1_3_3_1_23_2","first-page":"3620","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Liu Daqing","year":"2018","unstructured":"Daqing Liu, Hanwang Zhang, Ke Lin, Rui Yao, and Tat-Seng Chua. 2018. Pirc net: Proposal instance referring expression comprehension network. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3620\u20133628."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.143"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","unstructured":"Maofu Liu Xin Jiang and Xiaokang Zhang. 2025. CADFormer: Fine-Grained Cross-Modal Alignment and Decoding Transformer for Referring Remote Sensing Image Segmentation. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing 18 (2025) 14557\u201314569. 10.1109\/JSTARS.2025.3576595","DOI":"10.1109\/JSTARS.2025.3576595"},{"key":"e_1_3_3_1_26_2","unstructured":"Si Liu Tianrui Hui Shaofei Huang Yunchao Wei Bo Li and Guanbin Li. 2021. Cross-modal progressive comprehension for referring segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 9 (2021) 4761\u20134775."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02517"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_1_30_2","volume-title":"International Conference on Learning Representations","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681318"},{"key":"e_1_3_3_1_33_2","volume-title":"Advances in Neural Information Processing Systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et\u00a0al. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems , Vol.\u00a032."},{"key":"e_1_3_3_1_34_2","first-page":"520","volume-title":"Proceedings of the European Conference on Computer Vision","author":"Qiu Haoxuan","year":"2018","unstructured":"Haoxuan Qiu, Yuan Li, Yongfei Wu, and Qingming Huang. 2018. Cross-modal language and visual reasoning for referring image segmentation. In Proceedings of the European Conference on Computer Vision. Springer, 520\u2013535."},{"key":"e_1_3_3_1_35_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_36_2","unstructured":"Tal Ridnik Emanuel Ben-Baruch Asaf Noy and Lihi Zelnik-Manor. 2021. ImageNet-21K Pretraining for the Masses. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2104.10972 (2021)."},{"key":"e_1_3_3_1_37_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","unstructured":"Qin Ma;Lingling Li;Xiaoqiang Lu;Licheng Jiao;Fang Liu;Wenping Ma;Xu\u00a0Liu;Long Sun. 2025. LSCF: Long-Term Semantic-Guidance ConvFormer for Referring Remote Sensing Image Segmentation. IEEE Transactions on Geoscience and Remote Sensing 63 (2025) 1\u201313. 10.1109\/TGRS.2025.3578515","DOI":"10.1109\/TGRS.2025.3578515"},{"key":"e_1_3_3_1_39_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01075"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","unstructured":"Zhenghang Yuan Lichao Mou Yuansheng Hua and Xiao\u00a0Xiang Zhu. 2024. RRSIS: Referring Remote Sensing Image Segmentation. IEEE Transactions on Geoscience and Remote Sensing 62 (2024) 1\u201312. 10.1109\/TGRS.2024.3369720","DOI":"10.1109\/TGRS.2024.3369720"},{"key":"e_1_3_3_1_47_2","unstructured":"Tong Zhang Zhen Wen Bo Kong Kai Liu Yao Zhang Peng Zhuang and Jing Li. 2025. Referring remote sensing image segmentation via bidirectional alignment guided joint prediction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.08486 (2025)."},{"key":"e_1_3_3_1_48_2","unstructured":"Yuxuan Zhang Tianheng Cheng Lianghui Zhu Rui Hu Lei Liu Heng Liu Longjin Ran Xiaoxin Chen Wenyu Liu and Xinggang Wang. 2024. Evf-sam: Early vision-language fusion for text-prompted segment anything model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.20076 (2024)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:34:12Z","timestamp":1781537652000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810755"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":47,"alternative-id":["10.1145\/3805622.3810755","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810755","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}