{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T04:37:46Z","timestamp":1776314266808,"version":"3.50.1"},"reference-count":178,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2024,10,7]],"date-time":"2024-10-07T00:00:00Z","timestamp":1728259200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,7]],"date-time":"2024-10-07T00:00:00Z","timestamp":1728259200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1007\/s11263-024-02247-9","type":"journal-article","created":{"date-parts":[[2024,10,7]],"date-time":"2024-10-07T11:02:03Z","timestamp":1728298923000},"page":"1410-1431","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":120,"title":["LSKNet: A Foundation Lightweight Backbone for Remote Sensing"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0613-3969","authenticated-orcid":false,"given":"Yuxuan","family":"Li","sequence":"first","affiliation":[]},{"given":"Xiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yimain","family":"Dai","sequence":"additional","affiliation":[]},{"given":"Qibin","family":"Hou","sequence":"additional","affiliation":[]},{"given":"Li","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yongxiang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Ming-Ming","family":"Cheng","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,7]]},"reference":[{"key":"2247_CR1","doi-asserted-by":"crossref","unstructured":"Bandara, W.G.C., & Patel, V.M. (2022). A transformer-based siamese network for change detection. In IEEE International Geoscience and Remote Sensing Symposium","DOI":"10.1109\/IGARSS46834.2022.9883686"},{"issue":"3","key":"2247_CR2","doi-asserted-by":"crossref","first-page":"516","DOI":"10.3390\/rs13030516","volume":"13","author":"Y Bazi","year":"2021","unstructured":"Bazi, Y., Bashmal, L., Rahhal, M. M. A., Dayil, R. A., & Ajlan, N. A. (2021). Vision transformers for remote sensing image classification. Remote Sensing, 13(3), 516.","journal-title":"Remote Sensing"},{"key":"2247_CR3","first-page":"6498","volume":"30","author":"Q Bi","year":"2021","unstructured":"Bi, Q., Qin, K., Zhang, H., & Xia, G.-S. (2021). Local semantic enhanced convnet for aerial scene recognition. TIP, 30, 6498\u20136511.","journal-title":"TIP"},{"key":"2247_CR4","doi-asserted-by":"crossref","unstructured":"Cai, Z., & Vasconcelos, N. (2018). Cascade R-CNN: Delving into high quality object detection. In CVPR","DOI":"10.1109\/CVPR.2018.00644"},{"issue":"5","key":"2247_CR5","doi-asserted-by":"crossref","first-page":"1483","DOI":"10.1109\/TPAMI.2019.2956516","volume":"43","author":"Z Cai","year":"2019","unstructured":"Cai, Z., & Vasconcelos, N. (2019). Cascade R-CNN: High quality object detection and instance segmentation. TPAMI, 43(5), 1483\u20131498.","journal-title":"TPAMI"},{"key":"2247_CR6","doi-asserted-by":"crossref","unstructured":"Cao, Y., Xu, J., Lin, S., Wei, F., & Hu, H. (2019). GCNet: Non-local networks meet squeeze-excitation networks and beyond. In ICCVW","DOI":"10.1109\/ICCVW.2019.00246"},{"key":"2247_CR7","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In ECCV","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2247_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Liu, M., Chen, D., Yuan, L., & Liu, Z. (2020). Dynamic convolution: Attention over convolution kernels. In CVPR","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"2247_CR9","doi-asserted-by":"crossref","unstructured":"Chen, K., Liu, C., Chen, H., Zhang, H., Li, W., Zou, Z., & Shi, Z. (2024). Rsprompter: Learning to prompt for remote sensing instance segmentation based on visual foundation model. TGRS","DOI":"10.1109\/TGRS.2024.3356074"},{"key":"2247_CR10","unstructured":"Chen, J., Lu, Y., Yu, Q., Luo, X., Adeli, E., Wang, Y., Lu, L., Yuille, A.L., & Zhou, Y. (2021). Transunet: Transformers make strong encoders for medical image segmentation. arXiv"},{"key":"2247_CR11","doi-asserted-by":"crossref","unstructured":"Chen, S.-B., Wei, Q.-S., Wang, W.-Z., Tang, J., Luo, B., & Wang, Z.-Y. (2022). Remote sensing scene classification via multi-branch local attention network. TIP","DOI":"10.1109\/TIP.2021.3127851"},{"key":"2247_CR12","unstructured":"Chen, Y., Yuan, X., Wu, R., Wang, J., Hou, Q., & Cheng, M.-M. (2023). YOLO-MS: Rethinking multi-scale representation learning for real-time object detection. arXiv"},{"key":"2247_CR13","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Zhu, Y., Papandreou, G., Schroff, F., & Adam, H. (2018). Encoder-decoder with atrous separable convolution for semantic image segmentation. In ECCV","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"2247_CR14","doi-asserted-by":"crossref","unstructured":"Cheng, G., Han, J., & Lu, X. (2017). Remote sensing image scene classification: Benchmark and state of the art. Proceedings of the IEEE","DOI":"10.1109\/JPROC.2017.2675998"},{"key":"2247_CR15","doi-asserted-by":"crossref","unstructured":"Cheng, G., Yao, Y., Li, S., Li, K., Xie, X., Wang, J., Yao, X., & Han, J. (2022). Dual-aligned oriented detector. TGRS","DOI":"10.1109\/TGRS.2022.3149780"},{"key":"2247_CR16","first-page":"1","volume":"60","author":"G Cheng","year":"2022","unstructured":"Cheng, G., Wang, J., Li, K., Xie, X., Lang, C., Yao, Y., & Han, J. (2022). Anchor-free oriented proposal generator for object detection. TGRS, 60, 1\u201311.","journal-title":"TGRS"},{"key":"2247_CR17","first-page":"1","volume":"60","author":"H Chen","year":"2021","unstructured":"Chen, H., Li, W., & Shi, Z. (2021). Adversarial instance augmentation for building change detection in remote sensing images. TGRS, 60, 1\u201316.","journal-title":"TGRS"},{"key":"2247_CR18","first-page":"1","volume":"60","author":"H Chen","year":"2021","unstructured":"Chen, H., Qi, Z., & Shi, Z. (2021). Remote sensing image change detection with transformers. TGRS, 60, 1\u201314.","journal-title":"TGRS"},{"issue":"10","key":"2247_CR19","doi-asserted-by":"crossref","first-page":"1662","DOI":"10.3390\/rs12101662","volume":"12","author":"H Chen","year":"2020","unstructured":"Chen, H., & Shi, Z. (2020). A spatial-temporal attention-based method and a new dataset for remote sensing image change detection. Remote Sensing, 12(10), 1662.","journal-title":"Remote Sensing"},{"issue":"11","key":"2247_CR20","doi-asserted-by":"crossref","first-page":"8471","DOI":"10.1007\/s00521-022-08122-3","volume":"35","author":"A Codegoni","year":"2023","unstructured":"Codegoni, A., Lombardi, G., & Ferrari, A. (2023). Tinycd: A (not so) deep learning model for change detection. Neural Computing and Applications, 35(11), 8471\u20138486.","journal-title":"Neural Computing and Applications"},{"key":"2247_CR21","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H., & Wei, Y. (2017). Deformable convolutional networks. In ICCV","DOI":"10.1109\/ICCV.2017.89"},{"key":"2247_CR22","doi-asserted-by":"crossref","unstructured":"Dai, J., Qi, H., Xiong, Y., Li, Y., Zhang, G., Hu, H., & Wei, Y. (2017). Deformable convolutional networks. In ICCV","DOI":"10.1109\/ICCV.2017.89"},{"issue":"5","key":"2247_CR23","doi-asserted-by":"crossref","first-page":"2342","DOI":"10.1109\/TCSVT.2022.3222906","volume":"33","author":"L Dai","year":"2022","unstructured":"Dai, L., Liu, H., Tang, H., Wu, Z., & Song, P. (2022). AO2-DETR: Arbitrary-oriented object detection transformer. IEEE Transactions on Circuits and Systems for Video Technology, 33(5), 2342\u20132356.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2247_CR24","doi-asserted-by":"crossref","unstructured":"Daudt, R.C., Le\u00a0Saux, B., & Boulch, A. (2018). Fully convolutional siamese networks for change detection. In 2018 25th IEEE International Conference on Image Processing (ICIP). IEEE, pp. 4063\u20134067","DOI":"10.1109\/ICIP.2018.8451652"},{"key":"2247_CR25","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). ImageNet: A large-scale hierarchical image database. In: CVPR","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2247_CR26","doi-asserted-by":"crossref","unstructured":"Deng, P., Xu, K., & Huang, H. (2022). When CNNs meet vision transformer: A joint framework for remote sensing scene classification. TGRS Letters","DOI":"10.1109\/LGRS.2021.3109061"},{"key":"2247_CR27","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Long, Y., Xia, G.-S., & Lu, Q. (2019). Learning RoI transformer for oriented object detection in aerial images. In CVPR","DOI":"10.1109\/CVPR.2019.00296"},{"key":"2247_CR28","doi-asserted-by":"crossref","unstructured":"Ding, X., Zhang, X., Han, J., & Ding, G. (2022). Scaling up your kernels to 31x31: Revisiting large kernel design in CNNs. In CVPR","DOI":"10.1109\/CVPR52688.2022.01166"},{"key":"2247_CR29","doi-asserted-by":"crossref","unstructured":"Dong, Z., Gu, Y., & Liu, T. (2024). Upetu: A unified parameter-efficient fine-tuning framework for remote sensing foundation model. TGRS","DOI":"10.1109\/TGRS.2024.3382734"},{"key":"2247_CR30","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In ICLR"},{"key":"2247_CR31","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K.I., Winn, & Zisserman, A. (2012). The PASCAL visual object classes challenge 2012 (VOC2012) results"},{"key":"2247_CR32","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K.I., Winn, J., & Zisserman, A. (2007). The PASCAL visual object classes challenge 2007 (VOC2007) results"},{"key":"2247_CR33","first-page":"1","volume":"61","author":"S Fang","year":"2023","unstructured":"Fang, S., Li, K., & Li, Z. (2023). Changer: Feature interaction is what you need for change detection. TGRS, 61, 1\u201311.","journal-title":"TGRS"},{"key":"2247_CR34","first-page":"1","volume":"19","author":"S Fang","year":"2021","unstructured":"Fang, S., Li, K., Shao, J., & Li, Z. (2021). Snunet-cd: A densely connected siamese network for change detection of vhr images. IEEE Geoscience and Remote Sensing Letters, 19, 1\u20135.","journal-title":"IEEE Geoscience and Remote Sensing Letters"},{"issue":"1","key":"2247_CR35","doi-asserted-by":"crossref","first-page":"16","DOI":"10.1007\/s44267-023-00019-6","volume":"1","author":"D-P Fan","year":"2023","unstructured":"Fan, D.-P., Ji, G.-P., Xu, P., Cheng, M.-M., Sakaridis, C., & Gool, L. V. (2023). Advances in deep concealed scene understanding. Visual Intelligence, 1(1), 16.","journal-title":"Visual Intelligence"},{"key":"2247_CR36","doi-asserted-by":"crossref","unstructured":"Fu, J., Liu, J., Tian, H., Li, Y., Bao, Y., Fang, Z., & Lu, H. (2019). Dual attention network for scene segmentation. In CVPR","DOI":"10.1109\/CVPR.2019.00326"},{"key":"2247_CR37","doi-asserted-by":"crossref","unstructured":"Gao, S.-H., Cheng, M.-M., Zhao, K., Zhang, X.-Y., Yang, M.-H., & Torr, P. (2021). Res2Net: A new multi-scale backbone architecture. TPAMI","DOI":"10.1109\/TPAMI.2019.2938758"},{"key":"2247_CR38","doi-asserted-by":"crossref","unstructured":"Gao, S., Li, Z.-Y., Han, Q., Cheng, M.-M., & Wang, L. (2023). RF-Next: Efficient receptive field search for convolutional neural networks. TPAMI","DOI":"10.1109\/TPAMI.2022.3183829"},{"key":"2247_CR39","doi-asserted-by":"crossref","unstructured":"Guo, Z., Liu, C., Zhang, X., Jiao, J., Ji, X., & Ye, Q. (2021). Beyond bounding-box: Convex-hull feature adaptation for oriented and densely packed object detection. In CVPR","DOI":"10.1109\/CVPR46437.2021.00868"},{"key":"2247_CR40","doi-asserted-by":"crossref","unstructured":"Guo, M.-H., Lu, C., Liu, Z.-N., Cheng, M.-M., & Hu, S. (2022). Visual attention network. Computational Visual Media","DOI":"10.1007\/s41095-023-0364-2"},{"key":"2247_CR41","doi-asserted-by":"crossref","unstructured":"Guo, M.-H., Xu, T., Liu, J.-J., Liu, Z.-N., Jiang, P.-T., Mu, T.-J., Zhang, S.-H., Martin, R., Cheng, M.-M., & Hu, S.-M. (2021). Attention mechanisms in computer vision: A survey. Computational Visual Media","DOI":"10.1007\/s41095-022-0271-y"},{"key":"2247_CR42","first-page":"1140","volume":"35","author":"M-H Guo","year":"2022","unstructured":"Guo, M.-H., Lu, C.-Z., Hou, Q., Liu, Z.-N., Cheng, M.-M., & Hu, S.-M. (2022). SegNeXt: Rethinking convolutional attention design for semantic segmentation. NeurIPS, 35, 1140\u20131156.","journal-title":"NeurIPS"},{"key":"2247_CR43","unstructured":"Han, J., Ding, J., Li, J., & Xia, G.-S. (2020). Align deep features for oriented object detection. TGRS"},{"key":"2247_CR44","doi-asserted-by":"crossref","unstructured":"Han, J., Ding, J., Xue, N., & Xia, G.-S. (2021). ReDet: A rotation-equivariant detector for aerial object detection. In CVPR","DOI":"10.1109\/CVPR46437.2021.00281"},{"key":"2247_CR45","doi-asserted-by":"crossref","unstructured":"Han, C., Wu, C., & Du, B. (2023). Hcgmnet: A hierarchical change guiding map network for change detection. In IEEE International Geoscience and Remote Sensing Symposium","DOI":"10.1109\/IGARSS52108.2023.10283341"},{"key":"2247_CR46","doi-asserted-by":"crossref","unstructured":"Han, C., Wu, C., Guo, H., Hu, M., Li, J., & Chen, H. (2023). Change guiding network: Incorporating change prior to guide change detection in remote sensing imagery. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing","DOI":"10.1109\/JSTARS.2023.3310208"},{"key":"2247_CR47","doi-asserted-by":"crossref","unstructured":"Han, C., Wu, C., Hu, M., Li, J., & Chen, H. (2024). C2f-semicd: A coarse-to-fine semi-supervised change detection method based on consistency regularization in high-resolution remote-sensing images. TGRS","DOI":"10.1109\/TGRS.2024.3370568"},{"key":"2247_CR48","doi-asserted-by":"crossref","first-page":"3867","DOI":"10.1109\/JSTARS.2023.3264802","volume":"16","author":"C Han","year":"2023","unstructured":"Han, C., Wu, C., Guo, H., Hu, M., & Chen, H. (2023). Hanet: A hierarchical attention network for change detection with bi-temporal very-high-resolution remote sensing images. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 16, 3867\u20133878.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2247_CR49","doi-asserted-by":"crossref","unstructured":"He, N., Fang, L., Li, S., Plaza, J., & Plaza, A. (2020). Skip-connected covariance network for remote sensing scene classification. IEEE Transactions on Neural Networks and Learning Systems","DOI":"10.1109\/TNNLS.2019.2920374"},{"key":"2247_CR50","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In CVPR","DOI":"10.1109\/CVPR.2016.90"},{"key":"2247_CR51","unstructured":"Hendrycks, D., & Gimpel, K. (2016). Bridging nonlinearities and stochastic regularizers with gaussian error linear units. CoRR"},{"key":"2247_CR52","unstructured":"Hou, Q., Lu, C.-Z., Cheng, M.-M., & Feng, J. (2022). Conv2Former: A simple transformer-style ConvNet for visual recognition. ArXiv"},{"key":"2247_CR53","doi-asserted-by":"crossref","unstructured":"Hou, L., Lu, K., Xue, J., & Li, Y. (2022). Shape-adaptive selection and measurement for oriented object detection. In AAAI","DOI":"10.1609\/aaai.v36i1.19975"},{"key":"2247_CR54","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., & Sun, G. (2018). Squeeze-and-excitation networks. In CVPR","DOI":"10.1109\/CVPR.2018.00745"},{"key":"2247_CR55","unstructured":"Hu, J., Shen, L., Albanie, S., Sun, G., & Vedaldi, A. (2018). Gather-excite: Exploiting feature context in convolutional neural networks. In NeurPIS"},{"issue":"1","key":"2247_CR56","doi-asserted-by":"crossref","first-page":"263","DOI":"10.1109\/LRA.2020.3039744","volume":"6","author":"P Hu","year":"2020","unstructured":"Hu, P., Perazzi, F., Heilbron, F. C., Wang, O., Lin, Z., Saenko, K., & Sclaroff, S. (2020). Real-time semantic segmentation with fast attention. IEEE Robotics and Automation Letters, 6(1), 263\u2013270.","journal-title":"IEEE Robotics and Automation Letters"},{"key":"2247_CR57","unstructured":"ISPRS: 2D Semantic Labeling - Vaihingen. (2022). https:\/\/www.isprs.org\/education\/benchmarks\/UrbanSemLab\/2d-sem-label-vaihingen.aspx"},{"key":"2247_CR58","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Girshick, R., He, K., & Doll\u00e1r, P. (2019). Panoptic feature pyramid networks. InCVPR","DOI":"10.1109\/CVPR.2019.00656"},{"key":"2247_CR59","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.C., Lo, W.-Y., et al. (2023). Segment anything. In ICCV","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2247_CR60","doi-asserted-by":"crossref","unstructured":"Kuckreja, K., Danish, M.S., Naseer, M., Das, A., Khan, S., & Khan, F.S. (2023). Geochat: Grounded large vision-language model for remote sensing. arXiv","DOI":"10.1109\/CVPR52733.2024.02629"},{"key":"2247_CR61","unstructured":"Lang, S., Ventola, F., & Kersting, K. (2021). DAFNe: A one-stage anchor-free deep model for oriented object detection. CoRR"},{"key":"2247_CR62","doi-asserted-by":"crossref","unstructured":"Li, R., Duan, C., Zheng, S., Zhang, C., & Atkinson, P.M. (2022). Macu-net for semantic segmentation of fine-resolution remotely sensed images. IEEE Geoscience and Remote Sensing Letters,19","DOI":"10.1109\/LGRS.2021.3052886"},{"key":"2247_CR63","doi-asserted-by":"crossref","unstructured":"Li, Y., Hou, Q., Zheng, Z., Cheng, M.-M., Yang, J., & Li, X. (2023). Large selective kernel network for remote sensing object detection. In ICCV","DOI":"10.1109\/ICCV51070.2023.01540"},{"key":"2247_CR64","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, X., & Yang, J. (2022). Spatial group-wise enhance: Enhancing semantic feature learning in cnn. In ACCV","DOI":"10.1007\/978-3-031-26348-4_19"},{"key":"2247_CR65","unstructured":"Li, Y., Li, X., Li, W., Hou, Q., Liu, L., Cheng, M.-M., & Yang, J. (2024). Sardet-100k: Towards open-source benchmark and toolkit for large-scale sar object detection. arXiv"},{"key":"2247_CR66","doi-asserted-by":"crossref","unstructured":"Li, Y., Mao, H., Girshick, R., & He, K. (2022). Exploring plain vision transformer backbones for object detection. In ECCV","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"2247_CR67","doi-asserted-by":"crossref","unstructured":"Li, Z., Sun, Y., Zhang, L., & Tang, J. (2022). Ctnet: Context-based tandem network for semantic segmentation. TPAMI","DOI":"10.1109\/TPAMI.2021.3132068"},{"key":"2247_CR68","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, W., Hu, X., & Yang, J. (2019). Selective kernel networks. In CVPR","DOI":"10.1109\/CVPR.2019.00060"},{"key":"2247_CR69","unstructured":"Li, G., Yun, I., Kim, J., & Kim, J. (2019). DABNet: Depth-wise asymmetric bottleneck for real-time semantic segmentation"},{"issue":"11","key":"2247_CR70","first-page":"8077","volume":"58","author":"F Li","year":"2020","unstructured":"Li, F., Feng, R., Han, W., & Wang, L. (2020). High-resolution remote sensing image scene classification via key filter bank based on convolutional neural network. TGRS, 58(11), 8077\u20138092.","journal-title":"TGRS"},{"key":"2247_CR71","first-page":"1","volume":"60","author":"B Li","year":"2021","unstructured":"Li, B., Guo, Y., Yang, J., Wang, L., Wang, Y., & An, W. (2021). Gated recurrent multiattention network for VHR remote sensing image classification. TGRS, 60, 1\u201313.","journal-title":"TGRS"},{"key":"2247_CR72","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., & He, K. (2017). Doll\u00e1r. P. Focal loss for dense object detection. In ICCV","DOI":"10.1109\/ICCV.2017.324"},{"key":"2247_CR73","doi-asserted-by":"crossref","unstructured":"Lin, H., Hang, R., Wang, S., & Liu, Q. (2024). Diformer: A difference transformer network for remote sensing change detection. IEEE Geoscience and Remote Sensing Letters","DOI":"10.1109\/LGRS.2024.3359220"},{"key":"2247_CR74","unstructured":"Liu, S., Chen, T., Chen, X., Chen, X., Xiao, Q., Wu, B., Pechenizkiy, M., Mocanu, D., & Wang, Z. (2022). More convnets in the 2020s: Scaling up kernels beyond 51x51 using sparsity. ArXiv"},{"key":"2247_CR75","unstructured":"Liu, C., Dai, H., Wang, S., & Chen, J. (2023). Remote sensing image scene classification based on multidimensional attention and feature enhancement. IAENG International Journal of Computer Science"},{"key":"2247_CR76","doi-asserted-by":"crossref","unstructured":"Liu, J.-J., Hou, Q., Cheng, M.-M., Wang, C., & Feng, J. (2020). Improving convolutional networks with self-calibrated convolutions. In CVPR","DOI":"10.1109\/CVPR42600.2020.01011"},{"key":"2247_CR77","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2024). Visual instruction tuning. NeurIPS"},{"key":"2247_CR78","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In CVPR","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2247_CR79","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2247_CR80","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., & Xie, S. (2022). A convnet for the 2020s. In CVPR","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"2247_CR81","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.-Y., Feichtenhofer, C., Darrell, T., & Xie, S. (2022). A convnet for the 2020s. In CVPR","DOI":"10.1109\/CVPR52688.2022.01167"},{"issue":"5","key":"2247_CR82","doi-asserted-by":"crossref","first-page":"811","DOI":"10.1109\/LGRS.2020.2988032","volume":"18","author":"Y Liu","year":"2020","unstructured":"Liu, Y., Pang, C., Zhan, Z., Zhang, X., & Yang, X. (2020). Building change detection for remote sensing images using a dual-task constrained deep siamese convolutional network model. IEEE Geoscience and Remote Sensing Letters, 18(5), 811\u2013815.","journal-title":"IEEE Geoscience and Remote Sensing Letters"},{"issue":"8","key":"2247_CR83","first-page":"1074","volume":"13","author":"Z Liu","year":"2016","unstructured":"Liu, Z., Wang, H., Weng, L., & Yang, Y. (2016). Ship rotated bounding box space for ship extraction from high-resolution optical satellite images with complex backgrounds. TGRS Letters, 13(8), 1074\u2013107.","journal-title":"TGRS Letters"},{"key":"2247_CR84","first-page":"1","volume":"60","author":"R Li","year":"2021","unstructured":"Li, R., Zheng, S., Zhang, C., Duan, C., Su, J., Wang, L., & Atkinson, P. M. (2021). Multiattention network for semantic segmentation of fine-resolution remote sensing images. TGRS, 60, 1\u201313.","journal-title":"TGRS"},{"key":"2247_CR85","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1016\/j.isprsjprs.2021.09.005","volume":"181","author":"R Li","year":"2021","unstructured":"Li, R., Zheng, S., Zhang, C., Duan, C., Wang, L., & Atkinson, P. M. (2021). ABCNet: Attentive bilateral contextual network for efficient semantic segmentation of fine-resolution remotely sensed imagery. ISPRS Journal of Photogrammetry and Remote Sensing, 181, 84\u201398.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"2247_CR86","doi-asserted-by":"crossref","first-page":"4205","DOI":"10.1109\/JSTARS.2021.3070368","volume":"14","author":"Y Long","year":"2021","unstructured":"Long, Y., Xia, G.-S., Li, S., Yang, W., Yang, M. Y., Zhu, X. X., Zhang, L., & Li, D. (2021). On creating benchmark dataset for aerial image interpretation: Reviews, guidances, and million-aid. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 14, 4205\u20134230.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2247_CR87","unstructured":"Luo, W., Li, Y., Urtasun, R., & Zemel, R. (2016). Understanding the effective receptive field in deep convolutional neural networks. In NeurIPS"},{"key":"2247_CR88","unstructured":"Lyu, C., Zhang, W., Huang, H., Zhou, Y., Wang, Y., Liu, Y., Zhang, S., & Chen, K. (2022). RTMDet: An empirical study of designing real-time object detectors. CoRR"},{"key":"2247_CR89","doi-asserted-by":"crossref","first-page":"108","DOI":"10.1016\/j.isprsjprs.2020.05.009","volume":"165","author":"Y Lyu","year":"2020","unstructured":"Lyu, Y., Vosselman, G., Xia, G.-S., Yilmaz, A., & Yang, M. Y. (2020). UAVid: A semantic segmentation dataset for uav imagery. ISPRS Journal of Photogrammetry and Remote Sensing, 165, 108\u2013119.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"2247_CR90","first-page":"1","volume":"60","author":"A Ma","year":"2021","unstructured":"Ma, A., Wang, J., Zhong, Y., & Zheng, Z. (2021). FactSeg: Foreground activation-driven small object semantic segmentation in large-scale remote sensing imagery. TGRS, 60, 1\u201316.","journal-title":"TGRS"},{"issue":"3","key":"2247_CR91","doi-asserted-by":"crossref","first-page":"563","DOI":"10.1007\/s41095-022-0325-1","volume":"9","author":"J Mei","year":"2023","unstructured":"Mei, J., Zheng, Y.-B., & Cheng, M.-M. (2023). D2ANet: Difference-aware attention network for multi-level change detection from satellite imagery. Computational Visual Media, 9(3), 563\u2013579.","journal-title":"Computational Visual Media"},{"key":"2247_CR92","doi-asserted-by":"crossref","unstructured":"Ming, Q., Zhou, Z., Miao, L., Zhang, H., & Li, L. (2020). Dynamic anchor learning for arbitrary-oriented object detection. CoRR","DOI":"10.1609\/aaai.v35i3.16336"},{"key":"2247_CR93","doi-asserted-by":"crossref","unstructured":"Muhammad, M. B., & Yeasin, M. (2020). Eigen-CAM: Class activation map using principal components. CoRR","DOI":"10.1109\/IJCNN48605.2020.9206626"},{"key":"2247_CR94","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107611","volume":"110","author":"M Or\u0161i\u0107","year":"2021","unstructured":"Or\u0161i\u0107, M., & \u0161egvi\u0107, S. (2021). Efficient semantic segmentation with pyramidal fusion. Pattern Recognition, 110, 107611.","journal-title":"Pattern Recognition"},{"key":"2247_CR95","doi-asserted-by":"crossref","unstructured":"Pan, X., Ren, Y., Sheng, K., Dong, W., Yuan, H., Guo, X., Ma, C., & Xu, C. (2020). Dynamic refinement network for oriented and densely packed object detection. In CVPR","DOI":"10.1109\/CVPR42600.2020.01122"},{"issue":"24","key":"2247_CR96","doi-asserted-by":"crossref","first-page":"5100","DOI":"10.3390\/rs13245100","volume":"13","author":"T Panboonyuen","year":"2021","unstructured":"Panboonyuen, T., Jitkajornwanich, K., Lawawirojwong, S., Srestasathiern, P., & Vateekul, P. (2021). Transformer-based decoder designs for semantic segmentation on remotely sensed images. Remote Sensing, 13(24), 5100.","journal-title":"Remote Sensing"},{"key":"2247_CR97","unstructured":"Park, J., Woo, S., Lee, J.-Y., & Kweon, I.-S. (2018). BAM: Bottleneck attention module. In British Machine Vision Conference"},{"key":"2247_CR98","unstructured":"Photogrammetry, T.I.S., & (ISPRS), R.S. (2022). 2D Semantic Labeling Contest - Potsdam. https:\/\/www.isprs.org\/education\/benchmarks\/UrbanSemLab\/2d-sem-label-potsdam.aspx"},{"key":"2247_CR99","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., & Koltun, V. (2021). Vision transformers for dense prediction. In ICCV","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"2247_CR100","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster R-CNN: Towards real-time object detection with region proposal networks. In NeurIPS"},{"issue":"1","key":"2247_CR101","doi-asserted-by":"crossref","first-page":"263","DOI":"10.1109\/TITS.2017.2750080","volume":"19","author":"E Romera","year":"2017","unstructured":"Romera, E., Alvarez, J. M., Bergasa, L. M., & Arroyo, R. (2017). ERFNet: Efficient residual factorized convnet for real-time semantic segmentation. IEEE Transactions on Intelligent Transportation Systems, 19(1), 263\u2013272.","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"issue":"24","key":"2247_CR102","doi-asserted-by":"crossref","first-page":"5094","DOI":"10.3390\/rs13245094","volume":"13","author":"L Shen","year":"2021","unstructured":"Shen, L., Lu, Y., Chen, H., Wei, H., Xie, D., Yue, J., Chen, R., Lv, S., & Jiang, B. (2021). S2looking: A satellite side-looking dataset for building change detection. Remote Sensing, 13(24), 5094.","journal-title":"Remote Sensing"},{"key":"2247_CR103","doi-asserted-by":"crossref","unstructured":"Srinivas, A., Lin, T.-Y., Parmar, N., Shlens, J., Abbeel, P., & Vaswani, A. (2021). Bottleneck transformers for visual recognition. In CVPR","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"2247_CR104","doi-asserted-by":"crossref","unstructured":"Srivastava, S., & Sharma, G. (2024). Omnivec: Learning robust representations with cross modal sharing. In Winter Conference on Applications of Computer Vision","DOI":"10.1109\/WACV57701.2024.00127"},{"key":"2247_CR105","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., & Schmid, C. (2021). Segmenter: Transformer for semantic segmentation. In ICCV","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"2247_CR106","doi-asserted-by":"crossref","unstructured":"Su, Z., Zhang, J., Wang, L., Zhang, H., Liu, Z., Pietik\u00e4inen, M., & Liu, L. (2023). Lightweight pixel difference networks for efficient visual representation learning. TPAMI","DOI":"10.1109\/TPAMI.2023.3300513"},{"key":"2247_CR107","doi-asserted-by":"crossref","unstructured":"Sun, X., Wang, P., Lu, W., Zhu, Z., Lu, X., He, Q., Li, J., Rong, X., Yang, Z., Chang, H., He, Q., Yang, G., Wang, R., Lu, J., & Fu, K. (2023). Ringmo: A remote sensing foundation model with masked image modeling. TGRS","DOI":"10.1109\/TGRS.2022.3194732"},{"key":"2247_CR108","doi-asserted-by":"crossref","unstructured":"Sun, X., Wang, P., Yan, Z., Xu, F., Wang, R., Diao, W., Chen, J., Li, J., Feng, Y., Xu, T., Weinmann, M., Hinz, S., Wang, C., & Fu, K. (2022). FAIR1M: A benchmark dataset for fine-grained object recognition in high-resolution remote sensing imagery. ISPRS Journal of Photogrammetry and Remote Sensing","DOI":"10.1016\/j.isprsjprs.2021.12.004"},{"issue":"4","key":"2247_CR109","volume":"66","author":"X Sun","year":"2023","unstructured":"Sun, X., Tian, Y., Lu, W., Wang, P., Niu, R., Yu, H., & Fu, K. (2023). From single- to multi-modal remote sensing imagery interpretation: a survey and taxonomy. Science China Information Sciences, 66(4), 140301.","journal-title":"Science China Information Sciences"},{"issue":"10","key":"2247_CR110","doi-asserted-by":"crossref","first-page":"12562","DOI":"10.1109\/TPAMI.2023.3285009","volume":"45","author":"S Sun","year":"2023","unstructured":"Sun, S., Zhi, S., Liao, Q., Heikkil\u00e4, J., & Liu, L. (2023). Unbiased scene graph generation via two-stage causal modeling. TPAMI, 45(10), 12562\u201312580.","journal-title":"TPAMI"},{"issue":"6","key":"2247_CR111","doi-asserted-by":"crossref","first-page":"1598","DOI":"10.3390\/rs15061598","volume":"15","author":"L Sun","year":"2023","unstructured":"Sun, L., Zou, H., Wei, J., Cao, X., He, S., Li, M., & Liu, S. (2023). Semantic segmentation of high-resolution remote sensing images based on sparse self-attention and feature alignment. Remote Sensing, 15(6), 1598.","journal-title":"Remote Sensing"},{"key":"2247_CR112","volume":"237","author":"X-Y Tong","year":"2020","unstructured":"Tong, X.-Y., Xia, G.-S., Lu, Q., Shen, H., Li, S., You, S., & Zhang, L. (2020). Land-cover classification with high-resolution remote sensing images using transferable deep models. Remote Sensing of Environment, 237, 111322.","journal-title":"Remote Sensing of Environment"},{"key":"2247_CR113","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. NeurIPS"},{"key":"2247_CR114","unstructured":"Wang, J., Sun, K., Cheng, T., Jiang, B., Deng, C., Zhao, Y., Liu, D., Mu, Y., Tan, M., Wang, X., Liu, W., & Xiao, B. (2019). Deep high-resolution representation learning for visual recognition. TPAMI"},{"key":"2247_CR115","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2021). Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In ICCV","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"2247_CR116","doi-asserted-by":"crossref","unstructured":"Wang, J., Yang, W., Li, H.-C., Zhang, H., & Xia, G.-S. (2021). Learning center probability map for detecting objects in aerial images. TGRS","DOI":"10.1109\/TGRS.2020.3010051"},{"key":"2247_CR117","doi-asserted-by":"crossref","unstructured":"Wang, D., Zhang, J., Xu, M., Liu, L., Wang, D., Gao, E., Han, C., Guo, H., Du, B., Tao, D., et al. (2024). Mtp: Advancing remote sensing foundation model via multi-task pretraining. arXiv","DOI":"10.1109\/JSTARS.2024.3408154"},{"key":"2247_CR118","unstructured":"Wang, J., Zheng, Z., Ma, A., Lu, X., & Zhong, Y. (2021). LoveDA: A remote sensing land-cover dataset for domain adaptive semantic segmentation. arXiv"},{"issue":"4","key":"2247_CR119","doi-asserted-by":"crossref","first-page":"447","DOI":"10.1007\/s11633-022-1410-8","volume":"20","author":"X Wang","year":"2023","unstructured":"Wang, X., Chen, G., Qian, G., Gao, P., Wei, X.-Y., Wang, Y., Tian, Y., & Gao, W. (2023). Large-scale multi-modal pre-trained models: A comprehensive survey. Machine Intelligence Research, 20(4), 447\u2013482.","journal-title":"Machine Intelligence Research"},{"key":"2247_CR120","first-page":"5396","volume":"29","author":"S Wang","year":"2020","unstructured":"Wang, S., Guan, Y., & Shao, L. (2020). Multi-granularity canonical appearance pooling for remote sensing scene classification. TIP, 29, 5396\u20135407.","journal-title":"TIP"},{"issue":"16","key":"2247_CR121","doi-asserted-by":"crossref","first-page":"3065","DOI":"10.3390\/rs13163065","volume":"13","author":"L Wang","year":"2021","unstructured":"Wang, L., Li, R., Wang, D., Duan, C., Wang, T., & Meng, X. (2021). Transformer meets convolution: A bilateral awareness network for semantic segmentation of very fine resolution urban scene images. Remote Sensing, 13(16), 3065.","journal-title":"Remote Sensing"},{"key":"2247_CR122","doi-asserted-by":"crossref","first-page":"196","DOI":"10.1016\/j.isprsjprs.2022.06.008","volume":"190","author":"L Wang","year":"2022","unstructured":"Wang, L., Li, R., Zhang, C., Fang, S., Duan, C., Meng, X., & Atkinson, P. M. (2022). UNetFormer: A UNet-like transformer for efficient semantic segmentation of remote sensing urban scene imagery. ISPRS Journal of Photogrammetry and Remote Sensing, 190, 196\u2013214.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"2247_CR123","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.cobeha.2021.06.007","volume":"43","author":"L-L Wang","year":"2022","unstructured":"Wang, L.-L., Lui, S. S., & Chan, R. C. (2022). The past and future of mapping the biomarkers of psychosis. Current Opinion in Behavioral Sciences, 43, 1\u20135.","journal-title":"Current Opinion in Behavioral Sciences"},{"issue":"8","key":"2247_CR124","first-page":"6549","volume":"59","author":"S Wang","year":"2020","unstructured":"Wang, S., Ren, Y., Parr, G. P., Guan, Y., & Shao, L. (2020). Invariant deep compressible covariance pooling for aerial scene categorization. TGRS, 59(8), 6549\u20136561.","journal-title":"TGRS"},{"key":"2247_CR125","volume":"128","author":"W Wang","year":"2024","unstructured":"Wang, W., Sun, Y., Li, J., & Wang, X. (2024). Frequency and spatial based multi-layer context network (fscnet) for remote sensing scene classification. International Journal of Applied Earth Observation and Geoinformation, 128, 103781.","journal-title":"International Journal of Applied Earth Observation and Geoinformation"},{"issue":"3","key":"2247_CR126","doi-asserted-by":"crossref","first-page":"415","DOI":"10.1007\/s41095-022-0274-8","volume":"8","author":"W Wang","year":"2022","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2022). PVT v2: Improved baselines with pyramid vision transformer. Computational Visual Media, 8(3), 415\u2013424.","journal-title":"Computational Visual Media"},{"key":"2247_CR127","first-page":"1","volume":"61","author":"D Wang","year":"2022","unstructured":"Wang, D., Zhang, J., Du, B., Xia, G.-S., & Tao, D. (2022). An empirical study of remote sensing pretraining. TGRS, 61, 1\u201320.","journal-title":"TGRS"},{"key":"2247_CR128","first-page":"1","volume":"61","author":"D Wang","year":"2022","unstructured":"Wang, D., Zhang, Q., Xu, Y., Zhang, J., Du, B., Tao, D., & Zhang, L. (2022). Advancing plain vision transformer towards remote sensing foundation model. TGRS, 61, 1\u201315.","journal-title":"TGRS"},{"key":"2247_CR129","doi-asserted-by":"crossref","unstructured":"Woo, S., Debnath, S., Hu, R., Chen, X., Liu, Z., Kweon, I.-S., & Xie, S. (2023). ConvNeXt V2: Co-designing and scaling convnets with masked autoencoders. Arxiv","DOI":"10.1109\/CVPR52729.2023.01548"},{"key":"2247_CR130","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J.-Y., & Kweon, I.S. (2018). CBAM: Convolutional block attention module. In ECCV","DOI":"10.1007\/978-3-030-01234-2_1"},{"issue":"11","key":"2247_CR131","doi-asserted-by":"crossref","first-page":"12760","DOI":"10.1109\/TPAMI.2022.3202765","volume":"45","author":"Y-H Wu","year":"2022","unstructured":"Wu, Y.-H., Liu, Y., Zhan, X., & Cheng, M.-M. (2022). P2T: Pyramid pooling transformer for scene understanding. TPAMI, 45(11), 12760\u201312771.","journal-title":"TPAMI"},{"key":"2247_CR132","doi-asserted-by":"crossref","unstructured":"Xia, G.-S., Bai, X., Ding, J., Zhu, Z., Belongie, S., Luo, J., Datcu, M., Pelillo, M., & Zhang, L. (2018). DOTA: A large-scale dataset for object detection in aerial images. In CVPR","DOI":"10.1109\/CVPR.2018.00418"},{"issue":"7","key":"2247_CR133","first-page":"3965","volume":"55","author":"G-S Xia","year":"2017","unstructured":"Xia, G.-S., Hu, J., Hu, F., Shi, B., Bai, X., Zhong, Y., Zhang, L., & Lu, X. (2017). AID: A benchmark data set for performance evaluation of aerial scene classification. TGRS, 55(7), 3965\u20133981.","journal-title":"TGRS"},{"key":"2247_CR134","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., & Sun, J. (2018). Unified perceptual parsing for scene understanding. In ECCV","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"2247_CR135","doi-asserted-by":"crossref","unstructured":"Xie, X., Cheng, G., Wang, J., Yao, X., & Han, J. (2021). Oriented R-CNN for object detection. In ICCV","DOI":"10.1109\/ICCV48922.2021.00350"},{"key":"2247_CR136","first-page":"12077","volume":"34","author":"E Xie","year":"2021","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J. M., & Luo, P. (2021). SegFormer: Simple and efficient design for semantic segmentation with transformers. NeurIPS, 34, 12077\u201312090.","journal-title":"NeurIPS"},{"key":"2247_CR137","doi-asserted-by":"crossref","unstructured":"Xu, Y., Fu, M., Wang, Q., Wang, Y., Chen, K., Xia, G.-S., & Bai, X. (2021). Gliding vertex on the horizontal bounding box for multi-oriented object detection. TPAMI","DOI":"10.1109\/TPAMI.2020.2974745"},{"key":"2247_CR138","doi-asserted-by":"crossref","unstructured":"Xu, W., Xu, Y., Chang, T., & Tu, Z. (2021). Co-scale conv-attentional image transformers. In ICCV","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"2247_CR139","unstructured":"Xu, Y., Zhang, Q., Zhang, J., & Tao, D.(2021). Vitae: Vision transformer advanced by exploring intrinsic inductive bias. NeurIPS"},{"key":"2247_CR140","unstructured":"Yan, H., Li, Z., Li, W., Wang, C., Wu, M., & Zhang, C. (2021). ConTNet: Why not use convolution and transformer at the same time?. CoRR"},{"key":"2247_CR141","doi-asserted-by":"crossref","unstructured":"Yang, Y., & Newsam, S. (2010). Bag-of-visual-words and spatial extensions for land-use classification. In Proceedings of the International Conference on Advances in Geographic Information Systems","DOI":"10.1145\/1869790.1869829"},{"key":"2247_CR142","doi-asserted-by":"crossref","unstructured":"Yang, X., & Yan, J. (2020). Arbitrary-oriented object detection with circular smooth label. In ECCV","DOI":"10.1007\/978-3-030-58598-3_40"},{"key":"2247_CR143","unstructured":"Yang, B., Bender, G., Le, Q.V., & Ngiam, J. (2019). CondConv: Conditionally parameterized convolutions for efficient inference. NeurIPS"},{"key":"2247_CR144","unstructured":"Yang, X., Liu, Q., Yan, J., & Li, A. (2019). R3Det: Refined single-stage detector with feature refinement for rotating object. CoRR"},{"key":"2247_CR145","unstructured":"Yang, X., Yan, J., Ming, Q., Wang, W., Zhang, X., & Tian, Q. (2021). Rethinking rotated object detection with Gaussian Wasserstein distance loss. In ICML"},{"key":"2247_CR146","doi-asserted-by":"crossref","unstructured":"Yang, X., Yang, J., Yan, J., Zhang, Y., Zhang, T., Guo, Z., Sun, X., & Fu, K. (2019). SCRDet: Towards more robust detection for small, cluttered and rotated objects. In ICCV","DOI":"10.1109\/ICCV.2019.00832"},{"key":"2247_CR147","unstructured":"Yang, X., Zhou, Y., Zhang, G., Yang, J., Wang, W., Yan, J., Zhang, X., & Tian, Q. (2022). The KFIoU loss for rotated object detection. In ICLR"},{"key":"2247_CR148","doi-asserted-by":"crossref","first-page":"124","DOI":"10.1016\/j.isprsjprs.2021.06.006","volume":"178","author":"MY Yang","year":"2021","unstructured":"Yang, M. Y., Kumaar, S., Lyu, Y., & Nex, F. (2021). Real-time semantic segmentation with context aggregation network. ISPRS Journal of Photogrammetry and Remote Sensing, 178, 124\u2013134.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"2247_CR149","first-page":"18381","volume":"34","author":"X Yang","year":"2021","unstructured":"Yang, X., Yang, X., Yang, J., Ming, Q., Wang, W., Tian, Q., & Yan, J. (2021). Learning high-precision bounding box for rotated object detection via Kullback-Leibler divergence. NeurIPS, 34, 18381\u201318394.","journal-title":"NeurIPS"},{"key":"2247_CR150","doi-asserted-by":"crossref","unstructured":"Yu, W., Luo, M., Zhou, P., Si, C., Zhou, Y., Wang, X., Feng, J., & Yan, S. (2022). MetaFormer is actually what you need for vision. In CVPR","DOI":"10.1109\/CVPR52688.2022.01055"},{"key":"2247_CR151","doi-asserted-by":"crossref","unstructured":"Yu, H., Tian, Y., Ye, Q., & Liu, Y. (2024). Spatial transform decoupling for oriented object detection. In AAAI","DOI":"10.1609\/aaai.v38i7.28502"},{"key":"2247_CR152","doi-asserted-by":"crossref","unstructured":"Yu, C., Wang, J., Peng, C., Gao, C., Yu, G., & Sang, N. (2018). BiSeNet: Bilateral segmentation network for real-time semantic segmentation. In ECCV","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"2247_CR153","volume":"126","author":"SSA Zaidi","year":"2022","unstructured":"Zaidi, S. S. A., Ansari, M. S., Aslam, A., Kanwal, N., Asghar, M., & Lee, B. (2022). A survey of modern deep learning based object detection models. Digital Signal Processing, 126, 103514.","journal-title":"Digital Signal Processing"},{"key":"2247_CR154","doi-asserted-by":"crossref","unstructured":"Zhang, W., Deng, W., Cui, Z., Liu, J., & Jiao, L. (2024). Object knowledge distillation for joint detection and tracking in satellite videos. TGRS","DOI":"10.1109\/TGRS.2024.3355933"},{"key":"2247_CR155","doi-asserted-by":"crossref","unstructured":"Zhang, W., Jiao, L., Li, Y., Huang, Z., & Wang, H. (2022). Laplacian feature pyramid network for object detection in vhr optical remote sensing images. TGRS","DOI":"10.1109\/TGRS.2021.3072488"},{"key":"2247_CR156","doi-asserted-by":"crossref","unstructured":"Zhang, W., Jiao, L., Liu, F., Yang, S., & Liu, J. (2023). Dfat: Dynamic feature-adaptive tracking. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2022.3197145"},{"key":"2247_CR157","unstructured":"Zhang, X., Tian, Y., Xie, L., Huang, W., Dai, Q., Ye, Q., Tian, & Q. (2022). Hivit: A simpler and more efficient design of hierarchical vision transformer. In ICLR"},{"key":"2247_CR158","doi-asserted-by":"crossref","unstructured":"Zhang, H., Wu, C., Zhang, Z., Zhu, Y., Lin, H., Zhang, Z., Sun, Y., He, T., Mueller, J., Manmatha, R., Li, M., & Smola, A. (2022). ResNeSt: Split-attention networks. In CVPRW","DOI":"10.1109\/CVPRW56347.2022.00309"},{"key":"2247_CR159","doi-asserted-by":"crossref","first-page":"9768","DOI":"10.1109\/JSTARS.2021.3114404","volume":"14","author":"X Zhang","year":"2021","unstructured":"Zhang, X., An, W., Sun, J., Wu, H., Zhang, W., & Du, Y. (2021). Best representation branch model for remote sensing image scene classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 14, 9768\u20139780.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2247_CR160","volume":"133","author":"C-J Zhang","year":"2024","unstructured":"Zhang, C.-J., & Liu, J.-W. (2024). Change detection with incorporating multi-constraints and loss weights. Engineering Applications of Artificial Intelligence, 133, 108163.","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"2247_CR161","first-page":"1","volume":"60","author":"C Zhang","year":"2022","unstructured":"Zhang, C., Wang, L., Cheng, S., & Li, Y. (2022). SwinSUNet: Pure transformer network for remote sensing image change detection. TGRS, 60, 1\u201313.","journal-title":"TGRS"},{"issue":"5","key":"2247_CR162","doi-asserted-by":"crossref","first-page":"1141","DOI":"10.1007\/s11263-022-01739-w","volume":"131","author":"Q Zhang","year":"2023","unstructured":"Zhang, Q., Xu, Y., Zhang, J., & Tao, D. (2023). Vitaev2: Vision transformer advanced by exploring inductive bias for image recognition and beyond. IJCV, 131(5), 1141\u20131162.","journal-title":"IJCV"},{"key":"2247_CR163","doi-asserted-by":"crossref","first-page":"9530","DOI":"10.1109\/JSTARS.2021.3109661","volume":"14","author":"G Zhang","year":"2021","unstructured":"Zhang, G., Xu, W., Zhao, W., Huang, C., Yk, E. N., Chen, Y., & Su, J. (2021). A multiscale attention network for remote sensing scene images classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 14, 9530\u20139545.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2247_CR164","doi-asserted-by":"crossref","first-page":"183","DOI":"10.1016\/j.isprsjprs.2020.06.003","volume":"166","author":"C Zhang","year":"2020","unstructured":"Zhang, C., Yue, P., Tapete, D., Jiang, L., Shangguan, B., Huang, L., & Liu, G. (2020). A deeply supervised image fusion network for change detection in high resolution bi-temporal remote sensing images. ISPRS Journal of Photogrammetry and Remote Sensing, 166, 183\u2013200.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"key":"2247_CR165","first-page":"655","volume":"33","author":"D Zhang","year":"2020","unstructured":"Zhang, D., Zhang, H., Tang, J., Hua, X.-S., & Sun, Q. (2020). Causal intervention for weakly-supervised semantic segmentation. NeurIPS, 33, 655\u2013666.","journal-title":"NeurIPS"},{"key":"2247_CR166","doi-asserted-by":"crossref","unstructured":"Zhao, Q., Lyu, S., Li, Y., Ma, Y., & Chen, L. (2022a). Mgml: Multigranularity multilevel feature ensemble network for remote sensing scene classification. IEEE Transactions on Neural Networks and Learning Systems","DOI":"10.1109\/TNNLS.2021.3106391"},{"key":"2247_CR167","doi-asserted-by":"crossref","unstructured":"Zhao, Q., Ma, Y., Lyu, S., & Chen, L. (2022b). Embedded self-distillation in compact multibranch ensemble network for remote sensing scene classification. TGRS","DOI":"10.1109\/TGRS.2021.3126770"},{"key":"2247_CR168","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., & Jia, J. (2017). Pyramid scene parsing network. In CVPR","DOI":"10.1109\/CVPR.2017.660"},{"issue":"11","key":"2247_CR169","first-page":"1926","volume":"18","author":"Z Zhao","year":"2020","unstructured":"Zhao, Z., Li, J., Luo, Z., Li, J., & Chen, C. (2020). Remote sensing image scene classification based on an enhanced attention module. TGRS Letters, 18(11), 1926\u20131930.","journal-title":"TGRS Letters"},{"key":"2247_CR170","first-page":"1","volume":"61","author":"S Zhao","year":"2023","unstructured":"Zhao, S., Zhang, X., Xiao, P., & He, G. (2023). Exchanging dual-encoder-decoder: A new strategy for change detection with semantic guidance and spatial localization. TGRS, 61, 1\u201316.","journal-title":"TGRS"},{"key":"2247_CR171","doi-asserted-by":"crossref","unstructured":"Zheng, S., Lu, J., Zhao, H., Zhu, X., Luo, Z., Wang, Y., Fu, Y., Feng, J., Xiang, T., Torr, P.H.S., & Zhang, L. (2021). Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In CVPR","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"2247_CR172","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Zhong, Y., Wang, J., & Ma, A. (2020). Foreground-aware relation network for geospatial object segmentation in high spatial resolution remote sensing imagery. In CVPR","DOI":"10.1109\/CVPR42600.2020.00415"},{"key":"2247_CR173","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1016\/j.isprsjprs.2020.09.019","volume":"170","author":"X Zheng","year":"2020","unstructured":"Zheng, X., Huan, L., Xia, G.-S., & Gong, J. (2020). Parsing very high resolution urban scene images by learning deep convnets with edge-aware loss. ISPRS Journal of Photogrammetry and Remote Sensing, 170, 15\u201328.","journal-title":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"issue":"8","key":"2247_CR174","doi-asserted-by":"crossref","first-page":"10070","DOI":"10.1109\/TPAMI.2023.3248583","volume":"45","author":"Z Zheng","year":"2023","unstructured":"Zheng, Z., Ye, R., Hou, Q., Ren, D., Wang, P., Zuo, W., & Cheng, M.-M. (2023). Localization distillation for object detection. TPAMI, 45(8), 10070\u201310083.","journal-title":"TPAMI"},{"key":"2247_CR175","unstructured":"Zhirui, W., & Sun, X. (2023). SAR-AIRcraft-1.0: High-resolution SAR Aircraft Detection and Recognition Dataset. https:\/\/radars.ac.cn\/web\/data\/getData?dataType=SARDataset_en"},{"key":"2247_CR176","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Rahman\u00a0Siddiquee, M.M., Tajbakhsh, N., & Liang, J. (2018). UNet++: A nested U-Net architecture for medical image segmentation. In Deep learning in medical image analysis and multimodal learning for clinical decision support","DOI":"10.1007\/978-3-030-00889-5_1"},{"key":"2247_CR177","doi-asserted-by":"crossref","unstructured":"Zhu, X., Hu, H., Lin, S., & Dai, J. (2019). Deformable convnets v2: More deformable, better results. In CVPR","DOI":"10.1109\/CVPR.2019.00953"},{"key":"2247_CR178","doi-asserted-by":"crossref","unstructured":"Zhuang, J., Yang, J., Gu, L., & Dvornek, N. (2019). ShelfNet for fast semantic segmentation. In ICCVW","DOI":"10.1109\/ICCVW.2019.00113"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02247-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02247-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02247-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,24]],"date-time":"2025-02-24T19:31:48Z","timestamp":1740425508000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02247-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,7]]},"references-count":178,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["2247"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02247-9","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,7]]},"assertion":[{"value":"5 January 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 September 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 October 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}