{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T18:06:58Z","timestamp":1773511618030,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Nature Science Foundation of China","award":["62372155 and 62302149"],"award-info":[{"award-number":["62372155 and 62302149"]}]},{"name":"Aeronautical Science Fund","award":["2022Z071108001"],"award-info":[{"award-number":["2022Z071108001"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["B240201077"],"award-info":[{"award-number":["B240201077"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Joint Fund of Ministry of Education for Equipment Pre-research","award":["8091B022123"],"award-info":[{"award-number":["8091B022123"]}]},{"name":"Water Science and Technology Project of Jiangsu Province","award":["2021063"],"award-info":[{"award-number":["2021063"]}]},{"name":"Qinglan Project of Jiangsu Province, Changzhou science and technology project","award":["20231313"],"award-info":[{"award-number":["20231313"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3700410.3702132","type":"proceedings-article","created":{"date-parts":[[2024,12,26]],"date-time":"2024-12-26T09:27:33Z","timestamp":1735205253000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A Encoder-Decoder Framework for Foundation Model-based Remote Sensing Semantic Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6541-9703","authenticated-orcid":false,"given":"Jiale","family":"Zhu","sequence":"first","affiliation":[{"name":"Hohai University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4588-3658","authenticated-orcid":false,"given":"Liang","family":"Yao","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8746-9845","authenticated-orcid":false,"given":"Fan","family":"Liu","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8724-5796","authenticated-orcid":false,"given":"Chuanyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0406-0105","authenticated-orcid":false,"given":"Chunmei","family":"Shen","sequence":"additional","affiliation":[{"name":"Hohai University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5822-8233","authenticated-orcid":false,"given":"Jun","family":"Zhou","sequence":"additional","affiliation":[{"name":"Griffith University, Nathan, Australia"}]}],"member":"320","published-online":{"date-parts":[[2024,12,26]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Vijay Badrinarayanan Alex Kendall and Roberto Cipolla. 2017. Segnet: A deep convolutional encoder-decoder architecture for image segmentation. IEEE transactions on pattern analysis and machine intelligence 39 12 (2017) 2481\u20132495.","DOI":"10.1109\/TPAMI.2016.2644615"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01538"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Yakoub Bazi Mohamad\u00a0Mahmoud Al\u00a0Rahhal Mohamed\u00a0Lamine Mekhalfi Mansour\u00a0Abdulaziz Al\u00a0Zuair and Farid Melgani. 2022. Bi-modal transformer-based approach for visual question answering in remote sensing imagery. IEEE Transactions on Geoscience and Remote Sensing 60 (2022) 1\u201311.","DOI":"10.1109\/TGRS.2022.3192460"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_3_2_7_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_2_8_2","unstructured":"Sedigheh Eslami Gerard de Melo and Christoph Meinel. 2021. Does clip benefit visual question answering in the medical domain as much as it does in the general domain? arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2112.13906 (2021)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Shuchen Fan Yuhe Sun and Penglang Shui. 2020. Region-merging method with texture pattern attention for SAR image segmentation. IEEE Geoscience and Remote Sensing Letters 18 1 (2020) 112\u2013116.","DOI":"10.1109\/LGRS.2020.2969321"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"crossref","unstructured":"Ananya Gupta Simon Watson and Hujun Yin. 2021. Deep learning-based aerial image segmentation with open data for disaster impact assessment. Neurocomputing 439 (2021) 22\u201333.","DOI":"10.1016\/j.neucom.2020.02.139"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"crossref","unstructured":"Essam\u00a0H Houssein Kashif Hussain Laith Abualigah Mohamed Abd\u00a0Elaziz Waleed Alomoush Gaurav Dhiman Youcef Djenouri and Erik Cuevas. 2021. An improved opposition-based marine predators algorithm for global optimization and multilevel thresholding image segmentation. Knowledge-based systems 229 (2021) 107348.","DOI":"10.1016\/j.knosys.2021.107348"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00069"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Zhiqi Huang and Hongjian You. 2023. MFSFNet: Multi-Scale Feature Subtraction Fusion Network for Remote Sensing Image Change Detection. Remote Sensing 15 15 (2023) 3740.","DOI":"10.3390\/rs15153740"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Kaiyu Li Xiangyong Cao and Deyu Meng. 2024. A New Learning Paradigm for Foundation Model-Based Remote-Sensing Change Detection. IEEE Transactions on Geoscience and Remote Sensing 62 (2024) 1\u201312.","DOI":"10.1109\/TGRS.2024.3365825"},{"key":"e_1_3_3_2_16_2","first-page":"1","volume-title":"2009 17th International Conference on Geoinformatics","author":"Li Ying","year":"2009","unstructured":"Ying Li and Bo Cheng. 2009. An improved k-nearest neighbor algorithm and its application to high resolution remote sensing image classification. In 2009 17th International Conference on Geoinformatics. Ieee, 1\u20134."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"e_1_3_3_2_18_2","unstructured":"Yan Li Weiwei Guo Dunyun He Jiaqi Zhou Yuze Gao and Wenxian Yu. 2023. CastDet: Toward Open Vocabulary Aerial Object Detection with CLIP-Activated Student-Teacher Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.11646 (2023)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Fan Liu Delong Chen Zhangqingyun Guan Xiaocong Zhou Jiale Zhu Qiaolin Ye Liyong Fu and Jun Zhou. 2024. Remoteclip: A vision language foundation model for remote sensing. IEEE Transactions on Geoscience and Remote Sensing (2024).","DOI":"10.1109\/TGRS.2024.3390838"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Yang Long Gui-Song Xia Shengyang Li Wen Yang Michael\u00a0Ying Yang Xiao\u00a0Xiang Zhu Liangpei Zhang and Deren Li. 2021. On creating benchmark dataset for aerial image interpretation: Reviews guidances and million-aid. IEEE Journal of selected topics in applied earth observations and remote sensing 14 (2021) 4205\u20134230.","DOI":"10.1109\/JSTARS.2021.3070368"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Yihao Luo Xiang Cao Juntao Zhang Jingjuan Guo Haibo Shen Tianjiang Wang and Qi Feng. 2022. CE-FPN: enhancing channel information for object detection. Multimedia Tools and Applications 81 21 (2022) 30685\u201330704.","DOI":"10.1007\/s11042-022-11940-1"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Georgii Mikriukov Mahdyar Ravanbakhsh and Beg\u00fcm Demir. 2022. Deep unsupervised contrastive hashing for large-scale cross-modal text-image retrieval in remote sensing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2201.08125 (2022).","DOI":"10.1109\/ICASSP43922.2022.9746251"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"crossref","unstructured":"Mehrnaz Niazi Kambiz Rahbar Mansour Sheikhan and Maryam Khademi. 2022. Entropy-based kernel graph cut for textural image region segmentation. Multimedia Tools and Applications 81 9 (2022) 13003\u201313023.","DOI":"10.1007\/s11042-022-12005-z"},{"key":"e_1_3_3_2_26_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_2_27_2","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et\u00a0al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Indrajit Saha Ujjwal Maulik Sanghamitra Bandyopadhyay and Dariusz Plewczynski. 2011. SVMeFC: SVM ensemble fuzzy clustering for satellite image segmentation. IEEE Geoscience and remote sensing letters 9 1 (2011) 52\u201355.","DOI":"10.1109\/LGRS.2011.2160150"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2019.8900532"},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Xin-Yi Tong Gui-Song Xia Qikai Lu Huanfeng Shen Shengyang Li Shucheng You and Liangpei Zhang. 2020. Land-cover classification with high-resolution remote sensing images using transferable deep models. Remote Sensing of Environment 237 (2020) 111322.","DOI":"10.1016\/j.rse.2019.111322"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Di Wang Jing Zhang Bo Du Gui-Song Xia and Dacheng Tao. 2022. An empirical study of remote sensing pretraining. IEEE Transactions on Geoscience and Remote Sensing (2022).","DOI":"10.1109\/TGRS.2022.3176603"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"crossref","unstructured":"Yan Wang Lingjia Gu Tao Jiang and Fang Gao. 2023. MDE-UNet: A multitask deformable UNet combined enhancement network for farmland boundary segmentation. IEEE Geoscience and Remote Sensing Letters 20 (2023) 1\u20135.","DOI":"10.1109\/LGRS.2023.3252048"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"Yu Yang Xin Zhao Min Huang Xin Wang and Qibing Zhu. 2021. Multispectral image based germination detection of potato by using supervised multiple threshold segmentation model and Canny edge detector. Computers and Electronics in Agriculture 182 (2021) 106041.","DOI":"10.1016\/j.compag.2021.106041"},{"key":"e_1_3_3_2_36_2","unstructured":"Sheng Zhang Yanbo Xu Naoto Usuyama Jaspreet Bagga Robert Tinn Sam Preston Rajesh Rao Mu Wei Naveen Valluri Cliff Wong et\u00a0al. 2023. Large-scale domain-specific pretraining for biomedical vision-language processing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.00915 2 3 (2023) 6."},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.660"},{"key":"e_1_3_3_2_38_2","unstructured":"Xiaoqi Zhao Hongpeng Jia Youwei Pang Long Lv Feng Tian Lihe Zhang Weibing Sun and Huchuan Lu. 2023. M2SNet: Multi-scale in Multi-scale Subtraction Network for Medical Image Segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.10894 (2023)."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Yuting Zhu Lihong Long Jinjie Wang Jingwen Yan and Xiaoqing Wang. 2022. Road segmentation from high-fidelity remote sensing images using a context information capture network. Cognitive computation 14 2 (2022) 780\u2013793.","DOI":"10.1007\/s12559-021-09980-0"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00068"}],"event":{"name":"MMAsia'24: ACM Multimedia Asia Workshops","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia Workshops"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3700410.3702132","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3700410.3702132","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:38Z","timestamp":1750295858000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3700410.3702132"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":39,"alternative-id":["10.1145\/3700410.3702132","10.1145\/3700410"],"URL":"https:\/\/doi.org\/10.1145\/3700410.3702132","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}