{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T06:02:12Z","timestamp":1775714532465,"version":"3.50.1"},"reference-count":63,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62266051"],"award-info":[{"award-number":["62266051"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802337"],"award-info":[{"award-number":["61802337"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.imavis.2026.105939","type":"journal-article","created":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:19:59Z","timestamp":1771949999000},"page":"105939","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["SCAFTrack: Cross-layer spatial\u2013channel collaborative attention fusion for object tracking"],"prefix":"10.1016","volume":"168","author":[{"given":"Yuchao","family":"Lu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1711-3066","authenticated-orcid":false,"given":"Yun","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Yaowei","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2026.105939_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2022.103547","article-title":"Fully convolutional online tracking","volume":"224","author":"Cui","year":"2022","journal-title":"Comput. Vis. 
Image Underst."},{"key":"10.1016\/j.imavis.2026.105939_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104672","article-title":"Exploiting spatial and temporal context for online tracking with improved transformer","volume":"133","author":"Zhang","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105939_b3","series-title":"Beyond traditional single object tracking: A survey","author":"Abdelaziz","year":"2024"},{"key":"10.1016\/j.imavis.2026.105939_b4","doi-asserted-by":"crossref","first-page":"80297","DOI":"10.1109\/ACCESS.2023.3298440","article-title":"Transformers in single object tracking: An experimental survey","volume":"11","author":"Kugarajeevan","year":"2023","journal-title":"IEEE Access"},{"key":"10.1016\/j.imavis.2026.105939_b5","article-title":"Imagenet classification with deep convolutional neural networks","volume":"25","author":"Krizhevsky","year":"2012","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105939_b6","series-title":"European Conference on Computer Vision","first-page":"850","article-title":"Fully-convolutional siamese networks for object tracking","author":"Bertinetto","year":"2016"},{"key":"10.1016\/j.imavis.2026.105939_b7","doi-asserted-by":"crossref","unstructured":"B. Li, J. Yan, W. Wu, Z. Zhu, X. Hu, High performance visual tracking with siamese region proposal network, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 8971\u20138980.","DOI":"10.1109\/CVPR.2018.00935"},{"key":"10.1016\/j.imavis.2026.105939_b8","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104886","article-title":"Improved siamcar with ranking-based pruning and optimization for efficient UAV tracking","volume":"141","author":"Jin","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105939_b9","doi-asserted-by":"crossref","unstructured":"Q. Wang, L. Zhang, L. Bertinetto, W. Hu, P.H. Torr, Fast online object tracking and segmentation: A unifying approach, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 1328\u20131338.","DOI":"10.1109\/CVPR.2019.00142"},{"key":"10.1016\/j.imavis.2026.105939_b10","series-title":"Optimized information flow for transformer tracking","author":"Kugarajeevan","year":"2024"},{"key":"10.1016\/j.imavis.2026.105939_b11","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105939_b12","doi-asserted-by":"crossref","unstructured":"B. Yan, H. Peng, J. Fu, D. Wang, H. Lu, Learning spatio-temporal transformer for visual tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10448\u201310457.","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"10.1016\/j.imavis.2026.105939_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2024.120936","article-title":"Spatial-temporal graph transformer for object tracking against noise spoofing interference","volume":"678","author":"Li","year":"2024","journal-title":"Inform. Sci."},{"key":"10.1016\/j.imavis.2026.105939_b14","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104760","article-title":"Visual tracking using transformer with a combination of convolution and attention","volume":"137","author":"Wang","year":"2023","journal-title":"Image Vis. 
Comput."},{"key":"10.1016\/j.imavis.2026.105939_b15","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1549","article-title":"Global dilated attention and target focusing network for robust tracking","volume":"vol. 37","author":"Liang","year":"2023"},{"key":"10.1016\/j.imavis.2026.105939_b16","doi-asserted-by":"crossref","unstructured":"Y. Cui, C. Jiang, L. Wang, G. Wu, Mixformer: End-to-end tracking with iterative mixed attention, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 13608\u201313618.","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"10.1016\/j.imavis.2026.105939_b17","series-title":"European Conference on Computer Vision","first-page":"341","article-title":"Joint feature learning and relation modeling for tracking: A one-stream framework","author":"Ye","year":"2022"},{"key":"10.1016\/j.imavis.2026.105939_b18","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.imavis.2026.105939_b19","doi-asserted-by":"crossref","unstructured":"Y. Wu, X. Wang, X. Yang, M. Liu, D. Zeng, H. Ye, S. Li, Learning Occlusion-Robust Vision Transformers for Real-Time UAV Tracking, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 17103\u201317113.","DOI":"10.1109\/CVPR52734.2025.01594"},{"key":"10.1016\/j.imavis.2026.105939_b20","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105431","article-title":"Partitioned token fusion and pruning strategy for transformer tracking","volume":"154","author":"Zhang","year":"2025","journal-title":"Image Vis. Comput."},{"issue":"13","key":"10.1016\/j.imavis.2026.105939_b21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10489-025-06791-w","article-title":"Datrack: direction attention based transformer tracker","volume":"55","author":"Liu","year":"2025","journal-title":"Appl. Intell."},{"key":"10.1016\/j.imavis.2026.105939_b22","doi-asserted-by":"crossref","unstructured":"F. Ma, M.Z. Shou, L. Zhu, H. Fan, Y. Xu, Y. Yang, Z. Yan, Unified transformer tracker for object tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 8781\u20138790.","DOI":"10.1109\/CVPR52688.2022.00858"},{"key":"10.1016\/j.imavis.2026.105939_b23","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111278","article-title":"Adaptively bypassing vision transformer blocks for efficient visual tracking","volume":"161","author":"Yang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.imavis.2026.105939_b24","series-title":"Camouflaged object tracking: A benchmark","author":"Guo","year":"2024"},{"issue":"5","key":"10.1016\/j.imavis.2026.105939_b25","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","article-title":"Got-10k: A large high-diversity benchmark for generic object tracking in the wild","volume":"43","author":"Huang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105939_b26","doi-asserted-by":"crossref","unstructured":"M. Muller, A. Bibi, S. Giancola, S. Alsubaihi, B. Ghanem, Trackingnet: A large-scale dataset and benchmark for object tracking in the wild, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 300\u2013317.","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"10.1016\/j.imavis.2026.105939_b27","doi-asserted-by":"crossref","unstructured":"H. 
Fan, L. Lin, F. Yang, P. Chu, G. Deng, S. Yu, H. Bai, Y. Xu, C. Liao, H. Ling, Lasot: A high-quality benchmark for large-scale single object tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 5374\u20135383.","DOI":"10.1109\/CVPR.2019.00552"},{"key":"10.1016\/j.imavis.2026.105939_b28","unstructured":"U. Benchmark, A benchmark and simulator for UAV tracking, in: European Conference on Computer Vision, 2016."},{"key":"10.1016\/j.imavis.2026.105939_b29","doi-asserted-by":"crossref","unstructured":"H. Kiani Galoogahi, A. Fagg, C. Huang, D. Ramanan, S. Lucey, Need for speed: A benchmark for higher frame rate object tracking, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 1125\u20131134.","DOI":"10.1109\/ICCV.2017.128"},{"key":"10.1016\/j.imavis.2026.105939_b30","doi-asserted-by":"crossref","unstructured":"X. Wang, X. Shu, Z. Zhang, B. Jiang, Y. Wang, Y. Tian, F. Wu, Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 13763\u201313773.","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"10.1016\/j.imavis.2026.105939_b31","doi-asserted-by":"crossref","unstructured":"F. Xie, C. Wang, G. Wang, W. Yang, W. Zeng, Learning tracking representations via dual-branch fully transformer networks, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 2688\u20132697.","DOI":"10.1109\/ICCVW54120.2021.00303"},{"issue":"9","key":"10.1016\/j.imavis.2026.105939_b32","doi-asserted-by":"crossref","first-page":"5102","DOI":"10.1109\/TCSVT.2023.3249468","article-title":"Learning spatial-frequency transformer for visual object tracking","volume":"33","author":"Tang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105939_b33","first-page":"16743","article-title":"Swintrack: A simple and strong baseline for transformer tracking","volume":"35","author":"Lin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105939_b34","series-title":"European Conference on Computer Vision","first-page":"146","article-title":"Aiatrack: Attention in attention for transformer visual tracking","author":"Gao","year":"2022"},{"key":"10.1016\/j.imavis.2026.105939_b35","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1549","article-title":"Global dilated attention and target focusing network for robust tracking","volume":"vol. 37","author":"Liang","year":"2023"},{"key":"10.1016\/j.imavis.2026.105939_b36","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"7588","article-title":"Odtrack: Online dense temporal token learning for visual tracking","volume":"vol. 38","author":"Zheng","year":"2024"},{"key":"10.1016\/j.imavis.2026.105939_b37","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4986","article-title":"Mambalct: Boosting tracking via long-term context state space model","volume":"vol. 39","author":"Li","year":"2025"},{"key":"10.1016\/j.imavis.2026.105939_b38","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"927","article-title":"Bi-directional adapter for multimodal tracking","volume":"vol. 
38","author":"Cao","year":"2024"},{"key":"10.1016\/j.imavis.2026.105939_b39","series-title":"International Conference on Machine Learning","first-page":"10971","article-title":"Evolving attention with residual convolutions","author":"Wang","year":"2021"},{"key":"10.1016\/j.imavis.2026.105939_b40","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110853","article-title":"Revit: Enhancing vision transformers feature diversity with attention residual connections","volume":"156","author":"Diko","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.imavis.2026.105939_b41","series-title":"Deepvit: Towards deeper vision transformer","author":"Zhou","year":"2021"},{"key":"10.1016\/j.imavis.2026.105939_b42","series-title":"Why\u201d classic\u201d transformers are shallow and how to make them go deep","author":"Yu","year":"2023"},{"key":"10.1016\/j.imavis.2026.105939_b43","series-title":"Skip-layer attention: Bridging abstract and detailed dependencies in transformers","author":"Chen","year":"2024"},{"key":"10.1016\/j.imavis.2026.105939_b44","series-title":"Cross-layer retrospective retrieving via layer attention","author":"Fang","year":"2023"},{"key":"10.1016\/j.imavis.2026.105939_b45","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"6569","article-title":"Centernet: Keypoint triplets for object detection","author":"Duan","year":"2019"},{"key":"10.1016\/j.imavis.2026.105939_b46","doi-asserted-by":"crossref","unstructured":"H. Law, J. Deng, Cornernet: Detecting objects as paired keypoints, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 734\u2013750.","DOI":"10.1007\/978-3-030-01264-9_45"},{"key":"10.1016\/j.imavis.2026.105939_b47","doi-asserted-by":"crossref","unstructured":"K. He, X. Chen, S. Xie, Y. Li, P. Doll\u00e1r, R. Girshick, Masked autoencoders are scalable vision learners, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"10.1016\/j.imavis.2026.105939_b48","series-title":"European Conference on Computer Vision","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.imavis.2026.105939_b49","series-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"10.1016\/j.imavis.2026.105939_b50","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10959","article-title":"Two-stream beats one-stream: asymmetric siamese network for efficient visual tracking","volume":"vol. 39","author":"Zhu","year":"2025"},{"key":"10.1016\/j.imavis.2026.105939_b51","article-title":"Exploring dynamic transformer for efficient object tracking","author":"Zhu","year":"2025","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.imavis.2026.105939_b52","doi-asserted-by":"crossref","first-page":"58736","DOI":"10.52202\/075280-2561","article-title":"Mixformerv2: Efficient fully transformer tracking","volume":"36","author":"Cui","year":"2023","journal-title":"Adv. Neural Inf. Process. 
Syst."},{"key":"10.1016\/j.imavis.2026.105939_b53","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.111562","article-title":"ASAFormer: Visual tracking with convolutional vision transformer and asymmetric selective attention","volume":"291","author":"Gong","year":"2024","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.imavis.2026.105939_b54","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123716","article-title":"PPTtrack: Pyramid pooling based transformer backbone for visual tracking","volume":"249","author":"Wang","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.imavis.2026.105939_b55","doi-asserted-by":"crossref","unstructured":"G.Y. Gopal, M.A. Amer, Separable self and mixed attention transformers for efficient object tracking, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024, pp. 6708\u20136717.","DOI":"10.1109\/WACV57701.2024.00657"},{"key":"10.1016\/j.imavis.2026.105939_b56","series-title":"European Conference on Computer Vision","first-page":"319","article-title":"Diff-tracker: text-to-image diffusion models are unsupervised trackers","author":"Zhang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105939_b57","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2321","article-title":"Compact transformer tracker with correlative masked modeling","volume":"vol. 37","author":"Song","year":"2023"},{"key":"10.1016\/j.imavis.2026.105939_b58","series-title":"Exploiting lightweight hierarchical ViT and dynamic framework for efficient visual tracking","author":"Kang","year":"2025"},{"key":"10.1016\/j.imavis.2026.105939_b59","doi-asserted-by":"crossref","unstructured":"Z. Song, J. Yu, Y.-P.P. Chen, W. Yang, Transformer tracking with cyclic shifting window attention, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 8791\u20138800.","DOI":"10.1109\/CVPR52688.2022.00859"},{"key":"10.1016\/j.imavis.2026.105939_b60","series-title":"European Conference on Computer Vision","first-page":"771","article-title":"Ocean: Object-aware anchor-free tracking","author":"Zhang","year":"2020"},{"key":"10.1016\/j.imavis.2026.105939_b61","doi-asserted-by":"crossref","unstructured":"H. Zhao, D. Wang, H. Lu, Representation learning for visual object tracking by masked appearance transfer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 18696\u201318705.","DOI":"10.1109\/CVPR52729.2023.01793"},{"key":"10.1016\/j.imavis.2026.105939_b62","doi-asserted-by":"crossref","unstructured":"G. Bhat, M. Danelljan, L.V. Gool, R. Timofte, Learning discriminative model prediction for tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 6182\u20136191.","DOI":"10.1109\/ICCV.2019.00628"},{"key":"10.1016\/j.imavis.2026.105939_b63","doi-asserted-by":"crossref","unstructured":"Z. Zhang, Y. Liu, X. Wang, B. Li, W. Hu, Learn to match: Automatic matching network design for visual tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 
13339\u201313348.","DOI":"10.1109\/ICCV48922.2021.01309"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000454?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000454?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T05:36:23Z","timestamp":1775712983000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885626000454"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":63,"alternative-id":["S0262885626000454"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105939","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SCAFTrack: Cross-layer spatial\u2013channel collaborative attention fusion for object tracking","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105939","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"105939"}}