{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:16:46Z","timestamp":1778048206638,"version":"3.51.4"},"reference-count":80,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T00:00:00Z","timestamp":1778025600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T00:00:00Z","timestamp":1778025600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072370"],"award-info":[{"award-number":["62072370"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007128","name":"Natural Science Foundation of Shaanxi Province","doi-asserted-by":"publisher","award":["2023-JC-YB-598"],"award-info":[{"award-number":["2023-JC-YB-598"]}],"id":[{"id":"10.13039\/501100007128","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017596","name":"Natural Science Basic Research Program of Shaanxi Province","doi-asserted-by":"publisher","award":["2024JC-YBQN-0664"],"award-info":[{"award-number":["2024JC-YBQN-0664"]}],"id":[{"id":"10.13039\/501100017596","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["2024M754276"],"award-info":[{"award-number":["2024M754276"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1007\/s00530-026-02339-1","type":"journal-article","created":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T05:40:19Z","timestamp":1778046019000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["USGA: unified intra- and cross-scale features with global\u2013local aggregation for long-term tracking"],"prefix":"10.1007","volume":"32","author":[{"given":"Xianxin","family":"Jia","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiqiang","family":"Hou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Yue","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuai","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sugang","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaobao","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Pu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,5,6]]},"reference":[{"issue":"9","key":"2339_CR1","doi-asserted-by":"publisher","first-page":"6431","DOI":"10.1109\/TPAMI.2024.3379457","volume":"46","author":"T Yao","year":"2024","unstructured":"Yao, T., Li, Y., Pan, Y., Mei, T.: Hiri-vit: scaling vision transformer with high resolution inputs. IEEE Trans. Pattern Anal. Mach. Intell. 46(9), 6431\u20136442 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2339_CR2","doi-asserted-by":"crossref","unstructured":"Hatamizadeh, A., Kautz, J.: Mambavision: A hybrid mamba-transformer vision backbone. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 25261\u201325270 (2025)","DOI":"10.1109\/CVPR52734.2025.02352"},{"key":"2339_CR3","doi-asserted-by":"crossref","unstructured":"Pan, Y., Li, Y., Yao, T., Ngo, C.-W., Mei, T.: Stream-vit: Learning streamlined convolutions in vision transformer. IEEE Transactions on Multimedia (2025)","DOI":"10.1109\/TMM.2025.3535321"},{"issue":"3","key":"2339_CR4","doi-asserted-by":"publisher","first-page":"238","DOI":"10.1007\/s00530-025-01813-6","volume":"31","author":"SS Kuppusami Sakthivel","year":"2025","unstructured":"Kuppusami Sakthivel, S.S., Joo, Y.H., Jeong, J.H.: Learning disruptor-aware channel selection and reliability with target regularization for robust visual tracking. Multimedia Syst. 31(3), 238 (2025)","journal-title":"Multimedia Syst."},{"key":"2339_CR5","doi-asserted-by":"crossref","unstructured":"Jia, X., Hou, Z., Wang, Y., Yue, H., Hu, S., Ma, S., Yang, X., Pu, L.: Integrating multi-scale appearance and motion cues for visual tracking via spatio-temporal prompt. Knowledge-Based Syst., 114760 (2025)","DOI":"10.2139\/ssrn.5314805"},{"key":"2339_CR6","doi-asserted-by":"crossref","unstructured":"Ettinger, S., Cheng, S., Caine, B., Liu, C., Zhao, H., Pradhan, S., Chai, Y., Sapp, B., Qi, C.R., Zhou, Y., et al.: Large scale interactive motion forecasting for autonomous driving: The waymo open motion dataset. Proceedings of the IEEE\/CVF International Conference on Computer Vision, 9710\u20139719 (2021)","DOI":"10.1109\/ICCV48922.2021.00957"},{"issue":"4","key":"2339_CR7","doi-asserted-by":"publisher","first-page":"316","DOI":"10.1007\/s00530-025-01927-x","volume":"31","author":"B Yu","year":"2025","unstructured":"Yu, B., Wang, D., Cao, J., Zhu, P., Zhao, Y.: Vehiclesim: realistic and 3d-aware video editing with one image for autonomous driving. Multimedia Syst. 31(4), 316 (2025)","journal-title":"Multimedia Syst."},{"key":"2339_CR8","doi-asserted-by":"crossref","unstructured":"Xue, C., Zhong, B., Liang, Q., Zheng, Y., Li, N., Xue, Y., Song, S.: Similarity-guided layer-adaptive vision transformer for uav tracking. Proceedings of the Computer Vision and Pattern Recognition Conference, 6730\u20136740 (2025)","DOI":"10.1109\/CVPR52734.2025.00631"},{"key":"2339_CR9","doi-asserted-by":"crossref","unstructured":"Wu, Y., Wang, X., Yang, X., Liu, M., Zeng, D., Ye, H., Li, S.: Learning occlusion-robust vision transformers for real-time uav tracking. Proceedings of the Computer Vision and Pattern Recognition Conference, 17103\u201317113 (2025)","DOI":"10.1109\/CVPR52734.2025.01594"},{"issue":"3","key":"2339_CR10","doi-asserted-by":"publisher","first-page":"723","DOI":"10.1109\/TPDS.2021.3081254","volume":"33","author":"L Cheng","year":"2021","unstructured":"Cheng, L., Wang, J., Li, Y.: Vitrack: efficient tracking on the edge for commodity video surveillance systems. IEEE Trans. Parallel Distrib. Syst. 33(3), 723\u2013735 (2021)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"1","key":"2339_CR11","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/s44267-024-00068-5","volume":"2","author":"C Liu","year":"2024","unstructured":"Liu, C., Yuan, Y., Chen, X., Lu, H., Wang, D.: Spatial-temporal initialization dilemma: towards realistic visual tracking. Vis. Intell. 2(1), 35 (2024)","journal-title":"Vis. Intell."},{"key":"2339_CR12","unstructured":"Shi, J., Yu, Y., Hui, B., Shi, J., Luo, H.: Historical states modeling for visual tracking. Neural Comput. and Appl., 1\u201318 (2025)"},{"issue":"1","key":"2339_CR13","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/s44267-024-00068-5","volume":"2","author":"C Liu","year":"2024","unstructured":"Liu, C., Yuan, Y., Chen, X., Lu, H., Wang, D.: Spatial-temporal initialization dilemma: towards realistic visual tracking. Vis. Intell. 2(1), 35 (2024)","journal-title":"Vis. Intell."},{"key":"2339_CR14","doi-asserted-by":"crossref","unstructured":"Lin, L., Fan, H., Zhang, Z., Wang, Y., Xu, Y., Ling, H.: Tracking meets lora: Faster training, larger model, stronger performance. European Conference on Computer Vision, 300\u2013318 (2024). Springer","DOI":"10.1007\/978-3-031-73232-4_17"},{"issue":"5","key":"2339_CR15","doi-asserted-by":"publisher","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","volume":"43","author":"L Huang","year":"2019","unstructured":"Huang, L., Zhao, X., Huang, K.: Got-10k: a large high-diversity benchmark for generic object tracking in the wild. IEEE Trans. Pattern Anal. Mach. Intell. 43(5), 1562\u20131577 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2339_CR16","doi-asserted-by":"crossref","unstructured":"Wu, Y., Lim, J., Yang, M.-H.: Online object tracking: A benchmark. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2411\u20132418 (2013)","DOI":"10.1109\/CVPR.2013.312"},{"key":"2339_CR17","doi-asserted-by":"crossref","unstructured":"Fan, H., Lin, L., Yang, F., Chu, P., Deng, G., Yu, S., Bai, H., Xu, Y., Liao, C., Ling, H.: Lasot: A high-quality benchmark for large-scale single object tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 5374\u20135383 (2019)","DOI":"10.1109\/CVPR.2019.00552"},{"key":"2339_CR18","unstructured":"Luke\u017ei\u010d, A., Zajc, L.\u010c., Voj\u00ed\u0159, T., Matas, J., Kristan, M.: Now you see me: evaluating performance in long-term visual tracking. arXiv preprint arXiv:1804.07056 (2018)"},{"key":"2339_CR19","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Wang, Q., Li, B., Wu, W., Yan, J., Hu, W.: Distractor-aware siamese networks for visual object tracking. Proceedings of the European Conference on Computer Vision (ECCV), 101\u2013117 (2018)","DOI":"10.1007\/978-3-030-01240-3_7"},{"key":"2339_CR20","unstructured":"Zhang, Y., Wang, D., Wang, L., Qi, J., Lu, H.: Learning regression and verification networks for long-term visual tracking. arxiv 2018. arXiv preprint arXiv:1809.04320"},{"key":"2339_CR21","first-page":"11037","volume":"34","author":"L Huang","year":"2020","unstructured":"Huang, L., Zhao, X., Huang, K.: Globaltrack: a simple and strong baseline for long-term tracking. Proceed. AAAI Conf. Artif. Intell. 34, 11037\u201311044 (2020)","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"key":"2339_CR22","doi-asserted-by":"crossref","unstructured":"Voigtlaender, P., Luiten, J., Torr, P.H., Leibe, B.: Siam r-cnn: Visual tracking by re-detection. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6578\u20136588 (2020)","DOI":"10.1109\/CVPR42600.2020.00661"},{"key":"2339_CR23","doi-asserted-by":"crossref","unstructured":"Dai, K., Zhang, Y., Wang, D., Li, J., Lu, H., Yang, X.: High-performance long-term tracking with meta-updater. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6298\u20136307 (2020)","DOI":"10.1109\/CVPR42600.2020.00633"},{"issue":"1","key":"2339_CR24","doi-asserted-by":"publisher","first-page":"460","DOI":"10.1109\/TPAMI.2022.3153645","volume":"45","author":"H Zhao","year":"2022","unstructured":"Zhao, H., Yan, B., Wang, D., Qian, X., Yang, X., Lu, H.: Effective local and global search for fast long-term tracking. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 460\u2013474 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"6","key":"2339_CR25","doi-asserted-by":"publisher","first-page":"112","DOI":"10.1007\/s11554-023-01370-z","volume":"20","author":"Z Hou","year":"2023","unstructured":"Hou, Z., Han, R., Ma, J., Ma, S., Yu, W., Fan, J.: A global re-detection method based on siamese network in long-term visual tracking. J. Real-Time Image Proc. 20(6), 112 (2023)","journal-title":"J. Real-Time Image Proc."},{"issue":"7","key":"2339_CR26","doi-asserted-by":"publisher","first-page":"1409","DOI":"10.1109\/TPAMI.2011.239","volume":"34","author":"Z Kalal","year":"2011","unstructured":"Kalal, Z., Mikolajczyk, K., Matas, J.: Tracking-learning-detection. IEEE Trans. Pattern Anal. Mach. Intell. 34(7), 1409\u20131422 (2011)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2339_CR27","doi-asserted-by":"crossref","unstructured":"Le\u00a0Moing, G., Ponce, J., Schmid, C.: Dense optical tracking: Connecting the dots. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 19187\u201319197 (2024)","DOI":"10.1109\/CVPR52733.2024.01815"},{"key":"2339_CR28","doi-asserted-by":"crossref","unstructured":"Nebehay, G., Pflugfelder, R.: Clustering of static-adaptive correspondences for deformable object tracking. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2784\u20132791 (2015)","DOI":"10.1109\/CVPR.2015.7298895"},{"key":"2339_CR29","doi-asserted-by":"crossref","unstructured":"Hong, Z., Chen, Z., Wang, C., Mei, X., Prokhorov, D., Tao, D.: Multi-store tracker (muster): A cognitive psychology inspired approach to object tracking. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 749\u2013758 (2015)","DOI":"10.1109\/CVPR.2015.7298675"},{"key":"2339_CR30","doi-asserted-by":"crossref","unstructured":"Zhu, G., Porikli, F., Li, H.: Beyond local search: Tracking objects everywhere with instance-specific proposals. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 943\u2013951 (2016)","DOI":"10.1109\/CVPR.2016.108"},{"key":"2339_CR31","doi-asserted-by":"crossref","unstructured":"Luke\u017ei\u010d, A., Zajc, L.\u010c., Voj\u00ed\u0159, T., Matas, J., Kristan, M.: Fucolot\u2013a fully-correlational long-term tracker. Computer Vision\u2013ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, December 2\u20136, 2018, Revised Selected Papers, Part II 14, 595\u2013611. Springer (2019)","DOI":"10.1007\/978-3-030-20890-5_38"},{"key":"2339_CR32","doi-asserted-by":"crossref","unstructured":"Valmadre, J., Bertinetto, L., Henriques, J.F., Tao, R., Vedaldi, A., Smeulders, A.W., Torr, P.H., Gavves, E.: Long-term tracking in the wild: A benchmark. Proceedings of the European Conference on Computer Vision (ECCV), 670\u2013685 (2018)","DOI":"10.1007\/978-3-030-01219-9_41"},{"key":"2339_CR33","doi-asserted-by":"crossref","unstructured":"Supancic\u00a0III, J., Ramanan, D.: Tracking as online decision-making: Learning a policy from streaming videos with reinforcement learning. Proceedings of the IEEE International Conference on Computer Vision, 322\u2013331 (2017)","DOI":"10.1109\/ICCV.2017.43"},{"key":"2339_CR34","doi-asserted-by":"crossref","unstructured":"Yan, B., Zhao, H., Wang, D., Lu, H., Yang, X.: \u2019skimming-perusal\u2019tracking: A framework for real-time and robust long-term tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2385\u20132393 (2019)","DOI":"10.1109\/ICCV.2019.00247"},{"issue":"2","key":"2339_CR35","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s10044-025-01441-w","volume":"28","author":"H Zhang","year":"2025","unstructured":"Zhang, H., Fu, W., Yang, X., Qi, R., Wang, X., Zhang, C.: Dynamic metric memory network for long-term tracking with spatial-temporal region proposal method. Pattern Anal. Appl. 28(2), 60 (2025)","journal-title":"Pattern Anal. Appl."},{"key":"2339_CR36","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2025.105065","volume":"160","author":"Z Zhu","year":"2025","unstructured":"Zhu, Z., Zhao, X., Tang, K., Zhang, S., Yang, W., He, W.: Efficient long-term tracking with local-global similar object interference suppression. Digital Signal Process. 160, 105065 (2025)","journal-title":"Digital Signal Process."},{"key":"2339_CR37","doi-asserted-by":"crossref","unstructured":"Misra, D., Nalamada, T., Arasanipalai, A.U., Hou, Q.: Rotate to attend: Convolutional triplet attention module. Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 3139\u20133148 (2021)","DOI":"10.1109\/WACV48630.2021.00318"},{"key":"2339_CR38","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2025.129866","volume":"634","author":"Y Si","year":"2025","unstructured":"Si, Y., Xu, H., Zhu, X., Zhang, W., Dong, Y., Chen, Y., Li, H.: Scsa: exploring the synergistic effects between spatial and channel attention. Neurocomputing 634, 129866 (2025)","journal-title":"Neurocomputing"},{"key":"2339_CR39","doi-asserted-by":"crossref","unstructured":"Yang, J., Qiu, P., Zhang, Y., Marcus, D.S., Sotiras, A.: D-net: Dynamic large kernel with dynamic feature fusion for volumetric medical image segmentation. arXiv preprint arXiv:2403.10674 (2024)","DOI":"10.2139\/ssrn.5093171"},{"key":"2339_CR40","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.119132","volume":"214","author":"S Chen","year":"2023","unstructured":"Chen, S., Zhao, J., Zhou, Y., Wang, H., Yao, R., Zhang, L., Xue, Y.: Info-fpn: an informative feature pyramid network for object detection in remote sensing images. Expert Syst. Appl. 214, 119132 (2023)","journal-title":"Expert Syst. Appl."},{"key":"2339_CR41","doi-asserted-by":"crossref","unstructured":"Guo, C., Fan, B., Zhang, Q., Xiang, S., Pan, C.: Augfpn: Improving multi-scale feature learning for object detection. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 12595\u201312604 (2020)","DOI":"10.1109\/CVPR42600.2020.01261"},{"key":"2339_CR42","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)"},{"key":"2339_CR43","unstructured":"Benchmark, U.: A benchmark and simulator for uav tracking. European Conference on Computer Vision, 7 (2016)"},{"key":"2339_CR44","unstructured":"Kristan, M., Leonardis, A., Matas, J., Felsberg, M., Pflugfelder, R., K\u00e4m\u00e4r\u00e4inen, J.-K., Danelljan, M., Zajc, L.\u010c., Luke\u017ei\u010d, A., Drbohlav, O., et al.: The eighth visual object tracking vot2020 challenge results. Computer Vision\u2013ECCV 2020 Workshops: Glasgow, UK, August 23\u201328, 2020, Proceedings, Part V 16, 547\u2013601. Springer (2020)"},{"key":"2339_CR45","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. Computer vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2339_CR46","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2339_CR47","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vision 115, 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vision"},{"key":"2339_CR48","doi-asserted-by":"crossref","unstructured":"Chen, X., Yan, B., Zhu, J., Wang, D., Yang, X., Lu, H.: Transformer tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 8126\u20138135 (2021)","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"2339_CR49","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inform. Process. Syst. 30 (2017)"},{"key":"2339_CR50","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Bhat, G., Khan, F.S., Felsberg, M.: Atom: Accurate tracking by overlap maximization. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 4660\u20134669 (2019)","DOI":"10.1109\/CVPR.2019.00479"},{"key":"2339_CR51","doi-asserted-by":"crossref","unstructured":"Bhat, G., Danelljan, M., Gool, L.V., Timofte, R.: Learning discriminative model prediction for tracking. Proceedings of the IEEE\/CVF International Conference on Computer Vision, 6182\u20136191 (2019)","DOI":"10.1109\/ICCV.2019.00628"},{"key":"2339_CR52","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Gool, L.V., Timofte, R.: Probabilistic regression for visual tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 7183\u20137192 (2020)","DOI":"10.1109\/CVPR42600.2020.00721"},{"key":"2339_CR53","unstructured":"Danelljan, M., Bhat, G., Mayer, C., Paul, M.: Pytracking: Visual tracking library based on pytorch. GitHub (2020). https:\/\/github.com\/visionml\/pytracking"},{"key":"2339_CR54","doi-asserted-by":"crossref","unstructured":"Mayer, C., Danelljan, M., Bhat, G., Paul, M., Paudel, D.P., Yu, F., Van\u00a0Gool, L.: Transforming model prediction for tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 8731\u20138740 (2022)","DOI":"10.1109\/CVPR52688.2022.00853"},{"key":"2339_CR55","doi-asserted-by":"crossref","unstructured":"Bertinetto, L., Valmadre, J., Henriques, J.F., Vedaldi, A., Torr, P.H.: Fully-convolutional siamese networks for object tracking. Computer vision\u2013ECCV 2016 Workshops: Amsterdam, the Netherlands, October 8-10 and 15-16, 2016, Proceedings, Part II 14, 850\u2013865. Springer (2016)","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"2339_CR56","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Bhat, G., Shahbaz\u00a0Khan, F., Felsberg, M.: Eco: Efficient convolution operators for tracking. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 6638\u20136646 (2017)","DOI":"10.1109\/CVPR.2017.733"},{"key":"2339_CR57","doi-asserted-by":"crossref","unstructured":"Chen, Z., Zhong, B., Li, G., Zhang, S., Ji, R.: Siamese box adaptive network for visual tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6668\u20136677 (2020)","DOI":"10.1109\/CVPR42600.2020.00670"},{"key":"2339_CR58","doi-asserted-by":"crossref","unstructured":"Li, Y., Yu, J., Cai, Z., Pan, Y.: Cross-modal target retrieval for tracking by natural language. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 4931\u20134940 (2022)","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"2339_CR59","first-page":"2321","volume":"37","author":"Z Song","year":"2023","unstructured":"Song, Z., Luo, R., Yu, J., Chen, Y.-P.P., Yang, W.: Compact transformer tracker with correlative masked modeling. Proceed. AAAI Conf. Artif. Intell. 37, 2321\u20132329 (2023)","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"key":"2339_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Z., Mao, K., He, Z.: Joint visual grounding and tracking with natural language specification. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 23151\u201323160 (2023)","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"2339_CR61","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1016\/j.patrec.2023.02.023","volume":"168","author":"H Zhao","year":"2023","unstructured":"Zhao, H., Wang, X., Wang, D., Lu, H., Ruan, X.: Transformer vision-language tracking via proxy token guided cross-modal fusion. Pattern Recogn. Lett. 168, 10\u201316 (2023)","journal-title":"Pattern Recogn. Lett."},{"key":"2339_CR62","doi-asserted-by":"crossref","unstructured":"Shao, Y., He, S., Ye, Q., Feng, Y., Luo, W., Chen, J.: Context-aware integration of language and visual references for natural language tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 19208\u201319217 (2024)","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"2339_CR63","first-page":"2629","volume":"37","author":"D Zhang","year":"2024","unstructured":"Zhang, D., Hu, S., Feng, X., Li, X., Zhang, J., Huang, K., et al.: Beyond accuracy: tracking more like human via visual search. Adv. Neural. Inf. Process. Syst. 37, 2629\u20132662 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2339_CR64","doi-asserted-by":"crossref","unstructured":"Shao, Y., He, S., Ye, Q., Feng, Y., Luo, W., Chen, J.: Context-aware integration of language and visual references for natural language tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 19208\u201319217 (2024)","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"2339_CR65","doi-asserted-by":"crossref","unstructured":"Zhu, J., Chen, X., Diao, H., Li, S., He, J.-Y., Li, C., Luo, B., Wang, D., Lu, H.: Exploring dynamic transformer for efficient object tracking. IEEE Trans. Neural Netw. Learn. Syst. (2025)","DOI":"10.1109\/TNNLS.2025.3545752"},{"key":"2339_CR66","doi-asserted-by":"crossref","unstructured":"Zhu, J., Chen, X., Diao, H., Li, S., He, J.-Y., Li, C., Luo, B., Wang, D., Lu, H.: Exploring dynamic transformer for efficient object tracking. IEEE Trans. Neural Netw. and Learn. Syst. (2025)","DOI":"10.1109\/TNNLS.2025.3545752"},{"key":"2339_CR67","doi-asserted-by":"crossref","unstructured":"Wang, N., Zhou, W., Wang, J., Li, H.: Transformer meets tracker: Exploiting temporal context for robust visual tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 1571\u20131580 (2021)","DOI":"10.1109\/CVPR46437.2021.00162"},{"issue":"11","key":"2339_CR68","doi-asserted-by":"publisher","first-page":"6931","DOI":"10.1109\/TNNLS.2021.3083933","volume":"33","author":"X Wang","year":"2021","unstructured":"Wang, X., Tang, J., Luo, B., Wang, Y., Tian, Y., Wu, F.: Tracking by joint local and global search: a target-aware attention-based approach. IEEE Trans. Neural Netw. and Learn. Syst. 33(11), 6931\u20136945 (2021)","journal-title":"IEEE Trans. Neural Netw. and Learn. Syst."},{"key":"2339_CR69","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2022.104374","volume":"119","author":"L Yu","year":"2022","unstructured":"Yu, L., Qiao, B., Zhang, H., Yu, J., He, X.: Ltst: long-term segmentation tracker with memory attention network. Image Vis. Comput. 119, 104374 (2022)","journal-title":"Image Vis. Comput."},{"key":"2339_CR70","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.119890","volume":"223","author":"Z Gao","year":"2023","unstructured":"Gao, Z., Zhuang, Y., Gu, J., Yang, B., Nie, Z.: A joint local-global search mechanism for long-term tracking with dynamic memory network. Expert Syst. Appl. 223, 119890 (2023)","journal-title":"Expert Syst. Appl."},{"key":"2339_CR71","doi-asserted-by":"crossref","unstructured":"Gopal, G.Y., Amer, M.A.: Separable self and mixed attention transformers for efficient object tracking. Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 6708\u20136717 (2024)","DOI":"10.1109\/WACV57701.2024.00657"},{"issue":"2","key":"2339_CR72","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s10044-025-01441-w","volume":"28","author":"H Zhang","year":"2025","unstructured":"Zhang, H., Fu, W., Yang, X., Qi, R., Wang, X., Zhang, C.: Dynamic metric memory network for long-term tracking with spatial-temporal region proposal method. Pattern Anal. Appl. 28(2), 60 (2025)","journal-title":"Pattern Anal. Appl."},{"key":"2339_CR73","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2025.105065","volume":"160","author":"Z Zhu","year":"2025","unstructured":"Zhu, Z., Zhao, X., Tang, K., Zhang, S., Yang, W., He, W.: Efficient long-term tracking with local-global similar object interference suppression. Digital Signal Processi. 160, 105065 (2025)","journal-title":"Digital Signal Processi."},{"key":"2339_CR74","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Hager, G., Shahbaz\u00a0Khan, F., Felsberg, M.: Learning spatially regularized correlation filters for visual tracking. Proceedings of the IEEE International Conference on Computer Vision, 4310\u20134318 (2015)","DOI":"10.1109\/ICCV.2015.490"},{"key":"2339_CR75","doi-asserted-by":"crossref","unstructured":"Li, B., Yan, J., Wu, W., Zhu, Z., Hu, X.: High performance visual tracking with siamese region proposal network. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 8971\u20138980 (2018)","DOI":"10.1109\/CVPR.2018.00935"},{"key":"2339_CR76","doi-asserted-by":"crossref","unstructured":"Cheng, S., Zhong, B., Li, G., Liu, X., Tang, Z., Li, X., Wang, J.: Learning to filter: Siamese relation network for robust tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 4421\u20134431 (2021)","DOI":"10.1109\/CVPR46437.2021.00440"},{"key":"2339_CR77","doi-asserted-by":"crossref","unstructured":"Wang, Q., Zhang, L., Bertinetto, L., Hu, W., Torr, P.H.: Fast online object tracking and segmentation: A unifying approach. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 1328\u20131338 (2019)","DOI":"10.1109\/CVPR.2019.00142"},{"key":"2339_CR78","doi-asserted-by":"crossref","unstructured":"Guo, D., Wang, J., Cui, Y., Wang, Z., Chen, S.: Siamcar: Siamese fully convolutional classification and regression for visual tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 6269\u20136277 (2020)","DOI":"10.1109\/CVPR42600.2020.00630"},{"key":"2339_CR79","doi-asserted-by":"crossref","unstructured":"Guo, D., Shao, Y., Cui, Y., Wang, Z., Zhang, L., Shen, C.: Graph attention tracking. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 9543\u20139552 (2021)","DOI":"10.1109\/CVPR46437.2021.00942"},{"key":"2339_CR80","doi-asserted-by":"crossref","unstructured":"Fu, Z., Liu, Q., Fu, Z., Wang, Y.: Stmtrack: Template-free visual tracking with space-time memory networks. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 13774\u201313783 (2021)","DOI":"10.1109\/CVPR46437.2021.01356"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-026-02339-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-026-02339-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-026-02339-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T05:41:07Z","timestamp":1778046067000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-026-02339-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,6]]},"references-count":80,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,8]]}},"alternative-id":["2339"],"URL":"https:\/\/doi.org\/10.1007\/s00530-026-02339-1","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5,6]]},"assertion":[{"value":"10 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"256"}}