{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T08:09:46Z","timestamp":1778486986451,"version":"3.51.4"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:00:00Z","timestamp":1773100800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:00:00Z","timestamp":1773100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U2341223"],"award-info":[{"award-number":["U2341223"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U2341223"],"award-info":[{"award-number":["U2341223"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92264105"],"award-info":[{"award-number":["92264105"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U2341223"],"award-info":[{"award-number":["U2341223"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Municipal Natural Science Foundation","award":["4232067"],"award-info":[{"award-number":["4232067"]}]},{"name":"Beijing Municipal Natural Science Foundation","award":["4232067"],"award-info":[{"award-number":["4232067"]}]},{"name":"Beijing Municipal Natural Science Foundation","award":["4232067"],"award-info":[{"award-number":["4232067"]}]},{"name":"Beijing Municipal Natural Science Foundation","award":["4232067"],"award-info":[{"award-number":["4232067"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Real-Time Image Proc"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s11554-025-01830-8","type":"journal-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T15:22:50Z","timestamp":1773156170000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["CAC-DETR: bridging heavy and light for real-time small object detection on UAVs"],"prefix":"10.1007","volume":"23","author":[{"given":"Zhixing","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Yudi","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Zhensong","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zhihai","family":"Zhuo","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Kai","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,10]]},"reference":[{"key":"1830_CR1","doi-asserted-by":"crossref","unstructured":"Cai, H., Li, J., Hu, M., Gan, C., Han, S.: Efficientvit: Lightweight multi-scale attention for high-resolution dense prediction. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 17302\u201317313 (2023)","DOI":"10.1109\/ICCV51070.2023.01587"},{"key":"1830_CR2","doi-asserted-by":"crossref","unstructured":"Chen, J., Kao, S.h., He, H., Zhuo, W., Wen, S., Lee, C.H., Chan, S.H.G.: Run, don\u2019t walk: chasing higher flops for faster neural networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 12021\u201312031 (2023)","DOI":"10.1109\/CVPR52729.2023.01157"},{"key":"1830_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Q., Wu, Q., Wang, J., Hu, Q., Hu, T., Ding, E., Cheng, J., Wang, J.: Mixformer: Mixing features across windows and dimensions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 5249\u20135259 (2022)","DOI":"10.1109\/CVPR52688.2022.00518"},{"key":"1830_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Q., Wu, Q., Wang, J., Hu, Q., Hu, T., Ding, E., Cheng, J., Wang, J.: Mixformer: Mixing features across windows and dimensions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 5249\u20135259 (2022)","DOI":"10.1109\/CVPR52688.2022.00518"},{"key":"1830_CR5","doi-asserted-by":"crossref","unstructured":"Chollet, F.: Xception: Deep learning with depthwise separable convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 1251\u20131258 (2017)","DOI":"10.1109\/CVPR.2017.195"},{"key":"1830_CR6","doi-asserted-by":"crossref","unstructured":"Ding, X., Zhang, X., Han, J., Ding, G.: Scaling up your kernels to 31x31: Revisiting large kernel design in cnns. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 11963\u201311975 (2022)","DOI":"10.1109\/CVPR52688.2022.01166"},{"key":"1830_CR7","doi-asserted-by":"crossref","unstructured":"Dong, X., Bao, J., Chen, D., Zhang, W., Yu, N., Yuan, L., Chen, D., Guo, B.: Cswin transformer: A general vision transformer backbone with cross-shaped windows. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 12124\u201312134 (2022)","DOI":"10.1109\/CVPR52688.2022.01181"},{"issue":"1","key":"1830_CR8","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1007\/s11554-025-01622-0","volume":"22","author":"Y Dong","year":"2025","unstructured":"Dong, Y., Xu, F., Guo, J.: Lkr-detr: Small object detection in remote sensing images based on multi-large kernel convolution. J. Real-Time Image Proc. 22(1), 46 (2025)","journal-title":"J. Real-Time Image Proc."},{"key":"1830_CR9","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"1830_CR10","unstructured":"Du, D., Zhu, P., Wen, L., Bian, X., Lin, H., Hu, Q., Peng, T., Zheng, J., Wang, X., Zhang, Y., et\u00a0al.: Visdrone-det2019: The vision meets drone object detection in image challenge results. In: Proceedings of the IEEE\/CVF international conference on computer vision workshops, pp. 0\u20130 (2019)"},{"key":"1830_CR11","doi-asserted-by":"crossref","unstructured":"Gong, Y., Yu, X., Ding, Y., Peng, X., Zhao, J., Han, Z.: Effective fusion factor in fpn for tiny object detection. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp. 1160\u20131168 (2021)","DOI":"10.1109\/WACV48630.2021.00120"},{"key":"1830_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111939","volume":"296","author":"N Hoanh","year":"2024","unstructured":"Hoanh, N., Pham, T.V.: Focus-attention approach in optimizing detr for object detection from high-resolution images. Knowl.-Based Syst. 296, 111939 (2024)","journal-title":"Knowl.-Based Syst."},{"key":"1830_CR13","doi-asserted-by":"crossref","unstructured":"Hou, X., Liu, M., Zhang, S., Wei, P., Chen, B.: Salience detr: Enhancing detection transformer with hierarchical salience filtering refinement. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 17574\u201317583 (2024)","DOI":"10.1109\/CVPR52733.2024.01664"},{"key":"1830_CR14","doi-asserted-by":"crossref","unstructured":"Jin, Y., Wang, J., Lin, D.: Multi-level logit distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24276\u201324285 (2023)","DOI":"10.1109\/CVPR52729.2023.02325"},{"issue":"10s","key":"1830_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3505244","volume":"54","author":"S Khan","year":"2022","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: A survey. ACM computing surveys (CSUR) 54(10s), 1\u201341 (2022)","journal-title":"ACM computing surveys (CSUR)"},{"key":"1830_CR16","doi-asserted-by":"crossref","unstructured":"Li, F., Zeng, A., Liu, S., Zhang, H., Li, H., Zhang, L., Ni, L.M.: Lite detr: An interleaved multi-scale encoder for efficient detr. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 18558\u201318567 (2023)","DOI":"10.1109\/CVPR52729.2023.01780"},{"key":"1830_CR17","doi-asserted-by":"crossref","unstructured":"Li, J., Wen, Y., He, L.: Scconv: Spatial and channel reconstruction convolution for feature redundancy. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6153\u20136162 (2023)","DOI":"10.1109\/CVPR52729.2023.00596"},{"key":"1830_CR18","doi-asserted-by":"publisher","first-page":"296","DOI":"10.1016\/j.isprsjprs.2019.11.023","volume":"159","author":"K Li","year":"2020","unstructured":"Li, K., Wan, G., Cheng, G., Meng, L., Han, J.: Object detection in optical remote sensing images: A survey and a new benchmark. ISPRS J. Photogramm. Remote. Sens. 159, 296\u2013307 (2020)","journal-title":"ISPRS J. Photogramm. Remote. Sens."},{"key":"1830_CR19","doi-asserted-by":"crossref","unstructured":"Liu, X., Peng, H., Zheng, N., Yang, Y., Hu, H., Yuan, Y.: Efficientvit: Memory efficient vision transformer with cascaded group attention. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 14420\u201314430 (2023)","DOI":"10.1109\/CVPR52729.2023.01386"},{"key":"1830_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1830_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 11976\u201311986 (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"1830_CR22","doi-asserted-by":"crossref","unstructured":"Lou, M., Zhang, S., Zhou, H.Y., Yang, S., Wu, C., Yu, Y.: Transxnet: learning both global and local dynamics with a dual dynamic token mixer for visual recognition. IEEE Transactions on Neural Networks and Learning Systems (2025)","DOI":"10.1109\/TNNLS.2025.3550979"},{"key":"1830_CR23","doi-asserted-by":"crossref","unstructured":"Lou, M., Zhang, S., Zhou, H.Y., Yang, S., Wu, C., Yu, Y.: Transxnet: learning both global and local dynamics with a dual dynamic token mixer for visual recognition. IEEE Transactions on Neural Networks and Learning Systems (2025)","DOI":"10.1109\/TNNLS.2025.3550979"},{"key":"1830_CR24","doi-asserted-by":"crossref","unstructured":"Ma, N., Zhang, X., Zheng, H.T., Sun, J.: Shufflenet v2: Practical guidelines for efficient cnn architecture design. In: Proceedings of the European conference on computer vision (ECCV), pp. 116\u2013131 (2018)","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"1830_CR25","first-page":"10353","volume":"35","author":"Y Rao","year":"2022","unstructured":"Rao, Y., Zhao, W., Tang, Y., Zhou, J., Lim, S.N., Lu, J.: Hornet: Efficient high-order spatial interactions with recursive gated convolutions. Adv. Neural. Inf. Process. Syst. 35, 10353\u201310366 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1830_CR26","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: Unified, real-time object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 779\u2013788 (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"1830_CR27","doi-asserted-by":"crossref","unstructured":"Shi, D.: Transnext: Robust foveal visual perception for vision transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 17773\u201317783 (2024)","DOI":"10.1109\/CVPR52733.2024.01683"},{"issue":"1","key":"1830_CR28","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1038\/s41597-023-02066-6","volume":"10","author":"J Suo","year":"2023","unstructured":"Suo, J., Wang, T., Zhang, X., Chen, H., Zhou, W., Shi, W.: Hit-uav: A high-altitude infrared thermal dataset for unmanned aerial vehicle-based object detection. Scientific Data 10(1), 227 (2023)","journal-title":"Scientific Data"},{"key":"1830_CR29","doi-asserted-by":"crossref","unstructured":"Tang, Q., Li, J., Shi, Z., Hu, Y.: Lightdet: A lightweight and accurate object detection network. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2243\u20132247. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9054101"},{"key":"1830_CR30","doi-asserted-by":"crossref","unstructured":"Wang, A., Chen, H., Lin, Z., Han, J., Ding, G.: Repvit: Revisiting mobile cnn from vit perspective. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15909\u201315920 (2024)","DOI":"10.1109\/CVPR52733.2024.01506"},{"issue":"6","key":"1830_CR31","doi-asserted-by":"publisher","first-page":"240","DOI":"10.3390\/drones8060240","volume":"8","author":"S Wang","year":"2024","unstructured":"Wang, S., Jiang, H., Li, Z., Yang, J., Ma, X., Chen, J., Tang, X.: Phsi-rtdetr: A lightweight infrared small target detection algorithm based on uav aerial photography. Drones 8(6), 240 (2024)","journal-title":"Drones"},{"key":"1830_CR32","doi-asserted-by":"crossref","unstructured":"Wang, T., Yuan, L., Zhang, X., Feng, J.: Distilling object detectors with fine-grained feature imitation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 4933\u20134942 (2019)","DOI":"10.1109\/CVPR.2019.00507"},{"key":"1830_CR33","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Xu, T., Xin, Y., Li, J.: Fbrt-yolo: Faster and better for real-time aerial image detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8673\u20138681 (2025)","DOI":"10.1609\/aaai.v39i8.32937"},{"key":"1830_CR34","unstructured":"Zagoruyko, S., Komodakis, N.: Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. arXiv preprint arXiv:1612.03928 (2016)"},{"key":"1830_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, H., Hu, W., Wang, X.: Parc-net: Position aware circular convolution with merits from convnets and transformer. In: European conference on computer vision, pp. 613\u2013630. Springer (2022)","DOI":"10.1007\/978-3-031-19809-0_35"},{"key":"1830_CR36","unstructured":"Zhang, T., Li, L., Zhou, Y., Liu, W., Qian, C., Hwang, J.N., Ji, X.: Cas-vit: Convolutional additive self-attention vision transformers for efficient mobile applications. arXiv preprint arXiv:2408.03703 (2024)"},{"key":"1830_CR37","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Lv, W., Xu, S., Wei, J., Wang, G., Dang, Q., Liu, Y., Chen, J.: Detrs beat yolos on real-time object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16965\u201316974 (2024)","DOI":"10.1109\/CVPR52733.2024.01605"},{"key":"1830_CR38","doi-asserted-by":"crossref","unstructured":"Zheng, D., Dong, W., Hu, H., Chen, X., Wang, Y.: Less is more: Focus attention for efficient detr. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 6674\u20136683 (2023)","DOI":"10.1109\/ICCV51070.2023.00614"},{"key":"1830_CR39","doi-asserted-by":"crossref","unstructured":"Zheng, Z., Ye, R., Wang, P., Ren, D., Zuo, W., Hou, Q., Cheng, M.M.: Localization distillation for dense object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9407\u20139416 (2022)","DOI":"10.1109\/CVPR52688.2022.00919"},{"issue":"11","key":"1830_CR40","doi-asserted-by":"publisher","first-page":"9528","DOI":"10.1109\/TNNLS.2022.3151138","volume":"34","author":"J Zhong","year":"2022","unstructured":"Zhong, J., Chen, J., Mian, A.: Dualconv: Dual convolutional kernels for lightweight deep neural networks. IEEE Transactions on Neural Networks and Learning Systems 34(11), 9528\u20139535 (2022)","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"1830_CR41","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"}],"container-title":["Journal of Real-Time Image Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11554-025-01830-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11554-025-01830-8","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11554-025-01830-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T07:31:57Z","timestamp":1778484717000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11554-025-01830-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,10]]},"references-count":41,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["1830"],"URL":"https:\/\/doi.org\/10.1007\/s11554-025-01830-8","relation":{},"ISSN":["1861-8200","1861-8219"],"issn-type":[{"value":"1861-8200","type":"print"},{"value":"1861-8219","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,10]]},"assertion":[{"value":"19 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"70"}}