{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T12:03:15Z","timestamp":1781006595605,"version":"3.54.1"},"reference-count":101,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007128","name":"Shaanxi Province Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007128","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100017596","name":"Natural Science Basic Research Program of Shaanxi Province","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100017596","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.knosys.2026.116116","type":"journal-article","created":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T06:51:56Z","timestamp":1778309516000},"page":"116116","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Unified Spatio-Temporal Tracking via Adaptive Embedding and Temporal Context Modeling"],"prefix":"10.1016","volume":"345","author":[{"given":"Xianxin","family":"Jia","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8284-1898","authenticated-orcid":false,"given":"Zhiqiang","family":"Hou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sugang","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4397-2155","authenticated-orcid":false,"given":"Lei","family":"Pu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116116_b1","series-title":"European Conference on Computer Vision","first-page":"300","article-title":"Tracking meets lora: Faster training, larger model, stronger performance","author":"Lin","year":"2024"},{"key":"10.1016\/j.knosys.2026.116116_b2","doi-asserted-by":"crossref","first-page":"58736","DOI":"10.52202\/075280-2561","article-title":"Mixformerv2: Efficient fully transformer tracking","volume":"36","author":"Cui","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b3","series-title":"European Conference on Computer Vision","first-page":"146","article-title":"Aiatrack: Attention in attention for transformer visual tracking","author":"Gao","year":"2022"},{"issue":"2","key":"10.1016\/j.knosys.2026.116116_b4","doi-asserted-by":"crossref","first-page":"1020","DOI":"10.1109\/TCSVT.2023.3289624","article-title":"Transformer tracking via frequency fusion","volume":"34","author":"Hu","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116116_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.122836","article-title":"Autonomous driving system: A comprehensive survey","volume":"242","author":"Zhao","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116116_b6","doi-asserted-by":"crossref","unstructured":"S. Ettinger, S. Cheng, B. Caine, C. Liu, H. Zhao, S. Pradhan, Y. Chai, B. Sapp, C.R. Qi, Y. Zhou, et al., Large scale interactive motion forecasting for autonomous driving: The waymo open motion dataset, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 9710\u20139719.","DOI":"10.1109\/ICCV48922.2021.00957"},{"key":"10.1016\/j.knosys.2026.116116_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.126817","article-title":"Analysis of convolutional-based variational autoencoders for privacy protection in realtime video surveillance","volume":"274","author":"Sivalakshmi","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116116_b8","doi-asserted-by":"crossref","DOI":"10.1016\/j.oceaneng.2024.118367","article-title":"Underwater visual surveillance: A comprehensive survey","volume":"309","author":"Rout","year":"2024","journal-title":"Ocean Eng."},{"issue":"13s","key":"10.1016\/j.knosys.2026.116116_b9","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3582272","article-title":"A survey on measuring cognitive workload in human-computer interaction","volume":"55","author":"Kosch","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.knosys.2026.116116_b10","series-title":"2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition","first-page":"2544","article-title":"Visual object tracking using adaptive correlation filters","author":"Bolme","year":"2010"},{"key":"10.1016\/j.knosys.2026.116116_b11","doi-asserted-by":"crossref","unstructured":"M. Danelljan, G. Bhat, F. Shahbaz Khan, M. Felsberg, Eco: Efficient convolution operators for tracking, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 6638\u20136646.","DOI":"10.1109\/CVPR.2017.733"},{"key":"10.1016\/j.knosys.2026.116116_b12","series-title":"Computer Vision\u2013ECCV 2016 Workshops: Amsterdam, the Netherlands, October 8-10 and 15-16, 2016, Proceedings, Part II 14","first-page":"850","article-title":"Fully-convolutional siamese networks for object tracking","author":"Bertinetto","year":"2016"},{"key":"10.1016\/j.knosys.2026.116116_b13","doi-asserted-by":"crossref","unstructured":"B. Li, W. Wu, Q. Wang, F. Zhang, J. Xing, J. Yan, Siamrpn++: Evolution of siamese visual tracking with very deep networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4282\u20134291.","DOI":"10.1109\/CVPR.2019.00441"},{"key":"10.1016\/j.knosys.2026.116116_b14","doi-asserted-by":"crossref","unstructured":"X. Chen, B. Yan, J. Zhu, D. Wang, X. Yang, H. Lu, Transformer tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 8126\u20138135.","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"10.1016\/j.knosys.2026.116116_b15","doi-asserted-by":"crossref","unstructured":"B. Yan, H. Peng, J. Fu, D. Wang, H. Lu, Learning spatio-temporal transformer for visual tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10448\u201310457.","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"10.1016\/j.knosys.2026.116116_b16","doi-asserted-by":"crossref","unstructured":"Y. Cui, C. Jiang, L. Wang, G. Wu, Mixformer: End-to-end tracking with iterative mixed attention, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 13608\u201313618.","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"10.1016\/j.knosys.2026.116116_b17","series-title":"European Conference on Computer Vision","first-page":"341","article-title":"Joint feature learning and relation modeling for tracking: A one-stream framework","author":"Ye","year":"2022"},{"key":"10.1016\/j.knosys.2026.116116_b18","series-title":"European Conference on Computer Vision","first-page":"375","article-title":"Backbone is all your need: A simplified architecture for visual object tracking","author":"Chen","year":"2022"},{"key":"10.1016\/j.knosys.2026.116116_b19","doi-asserted-by":"crossref","unstructured":"S. Gao, C. Zhou, J. Zhang, Generalized relation modeling for transformer tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 18686\u201318695.","DOI":"10.1109\/CVPR52729.2023.01792"},{"key":"10.1016\/j.knosys.2026.116116_b20","doi-asserted-by":"crossref","unstructured":"J. Xie, B. Zhong, Z. Mo, S. Zhang, L. Shi, S. Song, R. Ji, Autoregressive queries for adaptive tracking with spatio-temporal transformers, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 19300\u201319309.","DOI":"10.1109\/CVPR52733.2024.01826"},{"key":"10.1016\/j.knosys.2026.116116_b21","first-page":"7979","article-title":"MIMTrack: In-context tracking via masked image modeling","volume":"vol. 39","author":"Wang","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b22","first-page":"8824","article-title":"Less is more: Token context-aware learning for object tracking","volume":"vol. 39","author":"Xu","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b23","article-title":"Exploring dynamic transformer for efficient object tracking","author":"Zhu","year":"2025","journal-title":"IEEE Trans. Neural Networks Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b24","doi-asserted-by":"crossref","unstructured":"X. Wei, Y. Bai, Y. Zheng, D. Shi, Y. Gong, Autoregressive visual tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 9697\u20139706.","DOI":"10.1109\/CVPR52729.2023.00935"},{"key":"10.1016\/j.knosys.2026.116116_b25","doi-asserted-by":"crossref","unstructured":"W. Cai, Q. Liu, Y. Wang, Hiptrack: Visual tracking with historical prompts, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 19258\u201319267.","DOI":"10.1109\/CVPR52733.2024.01822"},{"key":"10.1016\/j.knosys.2026.116116_b26","first-page":"10635","article-title":"Decoupled spatio-temporal consistency learning for self-supervised tracking","volume":"vol. 39","author":"Zheng","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b27","first-page":"4838","article-title":"Explicit visual prompts for visual object tracking","volume":"vol. 38","author":"Shi","year":"2024"},{"key":"10.1016\/j.knosys.2026.116116_b28","first-page":"7588","article-title":"Odtrack: Online dense temporal token learning for visual tracking","volume":"vol. 38","author":"Zheng","year":"2024"},{"key":"10.1016\/j.knosys.2026.116116_b29","doi-asserted-by":"crossref","unstructured":"X. Liu, L. Zhou, Z. Zhou, J. Chen, Z. He, Mambavlt: Time-evolving multimodal state space model for vision-language tracking, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 8731\u20138741.","DOI":"10.1109\/CVPR52734.2025.00816"},{"key":"10.1016\/j.knosys.2026.116116_b30","series-title":"Smtrack: State-aware mamba for efficient temporal modeling in visual tracking","author":"Ma","year":"2026"},{"key":"10.1016\/j.knosys.2026.116116_b31","series-title":"Transformers are ssms: Generalized models and efficient algorithms through structured state space duality","author":"Dao","year":"2024"},{"key":"10.1016\/j.knosys.2026.116116_b32","doi-asserted-by":"crossref","unstructured":"S. Liang, Y. Bai, Y. Gong, X. Wei, Autoregressive Sequential Pretraining for Visual Tracking, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 7254\u20137264.","DOI":"10.1109\/CVPR52734.2025.00680"},{"key":"10.1016\/j.knosys.2026.116116_b33","doi-asserted-by":"crossref","unstructured":"Y. Huang, X. Li, Z. Zhou, Y. Wang, Z. He, M.-H. Yang, Rtracker: Recoverable tracking via pn tree structured memory, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 19038\u201319047.","DOI":"10.1109\/CVPR52733.2024.01801"},{"key":"10.1016\/j.knosys.2026.116116_b34","doi-asserted-by":"crossref","unstructured":"W. Cai, Q. Liu, Y. Wang, SPMTrack: Spatio-Temporal Parameter-Efficient Fine-Tuning with Mixture of Experts for Scalable Visual Tracking, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 16871\u201316881.","DOI":"10.1109\/CVPR52734.2025.01572"},{"key":"10.1016\/j.knosys.2026.116116_b35","doi-asserted-by":"crossref","unstructured":"D. Guo, J. Wang, Y. Cui, Z. Wang, S. Chen, SiamCAR: Siamese fully convolutional classification and regression for visual tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6269\u20136277.","DOI":"10.1109\/CVPR42600.2020.00630"},{"key":"10.1016\/j.knosys.2026.116116_b36","doi-asserted-by":"crossref","unstructured":"Z. Chen, B. Zhong, G. Li, S. Zhang, R. Ji, Siamese box adaptive network for visual tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6668\u20136677.","DOI":"10.1109\/CVPR42600.2020.00670"},{"key":"10.1016\/j.knosys.2026.116116_b37","doi-asserted-by":"crossref","unstructured":"Z. Zhang, H. Peng, Deeper and wider siamese networks for real-time visual tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4591\u20134600.","DOI":"10.1109\/CVPR.2019.00472"},{"key":"10.1016\/j.knosys.2026.116116_b38","first-page":"16743","article-title":"Swintrack: A simple and strong baseline for transformer tracking","volume":"35","author":"Lin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b39","first-page":"4194","article-title":"Exploring enhanced contextual information for video-level object tracking","volume":"vol. 39","author":"Kang","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b40","article-title":"Unifying motion and appearance cues for visual tracking via shared queries","author":"Xue","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116116_b41","first-page":"3581","article-title":"Exploiting multimodal spatial-temporal patterns for video object tracking","volume":"vol. 39","author":"Hu","year":"2025"},{"issue":"3","key":"10.1016\/j.knosys.2026.116116_b42","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1109\/TPAMI.2014.2345390","article-title":"High-speed tracking with kernelized correlation filters","volume":"37","author":"Henriques","year":"2014","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116116_b43","series-title":"Dcfnet: Discriminant correlation filters network for visual tracking","author":"Wang","year":"2017"},{"key":"10.1016\/j.knosys.2026.116116_b44","doi-asserted-by":"crossref","unstructured":"L. Bertinetto, J. Valmadre, S. Golodetz, O. Miksik, P.H. Torr, Staple: Complementary learners for real-time tracking, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 1401\u20131409.","DOI":"10.1109\/CVPR.2016.156"},{"key":"10.1016\/j.knosys.2026.116116_b45","doi-asserted-by":"crossref","unstructured":"B. Li, J. Yan, W. Wu, Z. Zhu, X. Hu, High performance visual tracking with siamese region proposal network, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 8971\u20138980.","DOI":"10.1109\/CVPR.2018.00935"},{"key":"10.1016\/j.knosys.2026.116116_b46","first-page":"773","article-title":"Target-aware tracking with long-term context attention","volume":"vol. 37","author":"He","year":"2023"},{"key":"10.1016\/j.knosys.2026.116116_b47","doi-asserted-by":"crossref","unstructured":"Z. Cao, Z. Huang, L. Pan, S. Zhang, Z. Liu, C. Fu, TCTrack: Temporal contexts for aerial tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 14798\u201314808.","DOI":"10.1109\/CVPR52688.2022.01438"},{"key":"10.1016\/j.knosys.2026.116116_b48","article-title":"Adaptive perception for unified visual multi-modal object tracking","author":"Hu","year":"2025","journal-title":"IEEE Trans. Artif. Intell."},{"issue":"10","key":"10.1016\/j.knosys.2026.116116_b49","doi-asserted-by":"crossref","first-page":"9102","DOI":"10.1109\/TCSVT.2024.3396289","article-title":"Toward modalities correlation for RGB-T tracking","volume":"34","author":"Hu","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"4","key":"10.1016\/j.knosys.2026.116116_b50","doi-asserted-by":"crossref","first-page":"2789","DOI":"10.1109\/TCSVT.2023.3312321","article-title":"Local-global temporal difference learning for satellite video super-resolution","volume":"34","author":"Xiao","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116116_b51","doi-asserted-by":"crossref","unstructured":"C. Mayer, M. Danelljan, G. Bhat, M. Paul, D.P. Paudel, F. Yu, L. Van Gool, Transforming model prediction for tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 8731\u20138740.","DOI":"10.1109\/CVPR52688.2022.00853"},{"key":"10.1016\/j.knosys.2026.116116_b52","unstructured":"L. Lin, H. Fan, Z. Zhang, Y. Huang, Y. Wang, Y. Xu, H. Ling, LoRATv2: Enabling Low-Cost Temporal Modeling in One-Stream Trackers, in: The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025."},{"key":"10.1016\/j.knosys.2026.116116_b53","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.125451","article-title":"AMTrack: Transformer tracking via action information and mix-frequency features","volume":"261","author":"Ma","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.116116_b54","doi-asserted-by":"crossref","unstructured":"C. Xue, B. Zhong, Q. Liang, Y. Zheng, N. Li, Y. Xue, S. Song, Similarity-guided layer-adaptive vision transformer for UAV tracking, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 6730\u20136740.","DOI":"10.1109\/CVPR52734.2025.00631"},{"key":"10.1016\/j.knosys.2026.116116_b55","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.111206","article-title":"Joint spatio-temporal modeling for visual tracking","volume":"283","author":"Sun","year":"2024","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116116_b56","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111787","article-title":"Motion deep association for spatio-temporal object tracking","volume":"168","author":"Guo","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.116116_b57","doi-asserted-by":"crossref","unstructured":"X. Chen, H. Peng, D. Wang, H. Lu, H. Hu, Seqtrack: Sequence to sequence learning for visual object tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14572\u201314581.","DOI":"10.1109\/CVPR52729.2023.01400"},{"key":"10.1016\/j.knosys.2026.116116_b58","doi-asserted-by":"crossref","unstructured":"F. Xie, Z. Wang, C. Ma, Diffusiontrack: Point set diffusion model for visual object tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 19113\u201319124.","DOI":"10.1109\/CVPR52733.2024.01808"},{"key":"10.1016\/j.knosys.2026.116116_b59","unstructured":"A. Gu, T. Dao, Mamba: Linear-time sequence modeling with selective state spaces, in: First Conference on Language Modeling, 2024."},{"key":"10.1016\/j.knosys.2026.116116_b60","first-page":"103031","article-title":"Vmamba: Visual state space model","volume":"37","author":"Liu","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b61","series-title":"U-mamba: Enhancing long-range dependency for biomedical image segmentation","author":"Ma","year":"2024"},{"key":"10.1016\/j.knosys.2026.116116_b62","doi-asserted-by":"crossref","first-page":"1783","DOI":"10.1109\/TMM.2024.3521798","article-title":"Frequency-assisted mamba for remote sensing image super-resolution","volume":"27","author":"Xiao","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2026.116116_b63","first-page":"4986","article-title":"Mambalct: Boosting tracking via long-term context state space model","volume":"vol. 39","author":"Li","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b64","doi-asserted-by":"crossref","unstructured":"D. Misra, T. Nalamada, A.U. Arasanipalai, Q. Hou, Rotate to attend: Convolutional triplet attention module, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2021, pp. 3139\u20133148.","DOI":"10.1109\/WACV48630.2021.00318"},{"key":"10.1016\/j.knosys.2026.116116_b65","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2023.107079","article-title":"MCA: Multidimensional collaborative attention in deep convolutional neural networks for image recognition","volume":"126","author":"Yu","year":"2023","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.116116_b66","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Efficient multi-scale attention module with cross-spatial learning","author":"Ouyang","year":"2023"},{"key":"10.1016\/j.knosys.2026.116116_b67","unstructured":"Y.-L. Boureau, J. Ponce, Y. LeCun, A theoretical analysis of feature pooling in visual recognition, in: Proceedings of the 27th International Conference on Machine Learning (ICML-10), 2010, pp. 111\u2013118."},{"key":"10.1016\/j.knosys.2026.116116_b68","doi-asserted-by":"crossref","unstructured":"J. Hu, L. Shen, G. Sun, Squeeze-and-excitation networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 7132\u20137141.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.knosys.2026.116116_b69","doi-asserted-by":"crossref","unstructured":"T. Karras, S. Laine, T. Aila, A style-based generator architecture for generative adversarial networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 4401\u20134410.","DOI":"10.1109\/CVPR.2019.00453"},{"key":"10.1016\/j.knosys.2026.116116_b70","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.127063","article-title":"Roformer: Enhanced transformer with rotary position embedding","volume":"568","author":"Su","year":"2024","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.116116_b71","series-title":"Rectifying magnitude neglect in linear attention","author":"Fan","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b72","first-page":"I","article-title":"Attention is all you need","volume":"30","author":"Ashish","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b73","unstructured":"T.-Y. Ross, G. Doll\u00e1r, Focal loss for dense object detection, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 2980\u20132988."},{"key":"10.1016\/j.knosys.2026.116116_b74","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","volume":"28","author":"Ren","year":"2015","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b75","doi-asserted-by":"crossref","unstructured":"H. Rezatofighi, N. Tsoi, J. Gwak, A. Sadeghian, I. Reid, S. Savarese, Generalized intersection over union: A metric and a loss for bounding box regression, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 658\u2013666.","DOI":"10.1109\/CVPR.2019.00075"},{"key":"10.1016\/j.knosys.2026.116116_b76","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2025.3557053","article-title":"Learning language prompt for vision-language tracking","author":"Zong","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116116_b77","article-title":"Enhancing the two-stream framework for efficient visual tracking","author":"Zong","year":"2025","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.116116_b78","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.112183","article-title":"Continuous spatio temporal prompts for visual tracking","volume":"161","author":"Sun","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.116116_b79","doi-asserted-by":"crossref","unstructured":"Y. Wu, X. Wang, X. Yang, M. Liu, D. Zeng, H. Ye, S. Li, Learning Occlusion-Robust Vision Transformers for Real-Time UAV Tracking, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 17103\u201317113.","DOI":"10.1109\/CVPR52734.2025.01594"},{"key":"10.1016\/j.knosys.2026.116116_b80","doi-asserted-by":"crossref","unstructured":"G.Y. Gopal, M.A. Amer, Separable self and mixed attention transformers for efficient object tracking, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024, pp. 6708\u20136717.","DOI":"10.1109\/WACV57701.2024.00657"},{"key":"10.1016\/j.knosys.2026.116116_b81","doi-asserted-by":"crossref","unstructured":"Y. Cai, J. Liu, J. Tang, G. Wu, Robust object modeling for visual tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 9589\u20139600.","DOI":"10.1109\/ICCV51070.2023.00879"},{"key":"10.1016\/j.knosys.2026.116116_b82","doi-asserted-by":"crossref","unstructured":"D. Yang, J. He, Y. Ma, Q. Yu, T. Zhang, Foreground-background distribution modeling transformer for visual object tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 10117\u201310127.","DOI":"10.1109\/ICCV51070.2023.00928"},{"key":"10.1016\/j.knosys.2026.116116_b83","series-title":"European Conference on Computer Vision","first-page":"771","article-title":"Ocean: Object-aware anchor-free tracking","author":"Zhang","year":"2020"},{"key":"10.1016\/j.knosys.2026.116116_b84","first-page":"2321","article-title":"Compact transformer tracker with correlative masked modeling","volume":"vol. 37","author":"Song","year":"2023"},{"key":"10.1016\/j.knosys.2026.116116_b85","first-page":"10959","article-title":"Two-stream beats one-stream: Asymmetric siamese network for efficient visual tracking","volume":"vol. 39","author":"Zhu","year":"2025"},{"key":"10.1016\/j.knosys.2026.116116_b86","doi-asserted-by":"crossref","unstructured":"B. Kang, X. Chen, D. Wang, H. Peng, H. Lu, Exploring lightweight hierarchical vision transformers for efficient visual tracking, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 9612\u20139621.","DOI":"10.1109\/ICCV51070.2023.00881"},{"key":"10.1016\/j.knosys.2026.116116_b87","doi-asserted-by":"crossref","unstructured":"B. Yan, H. Peng, K. Wu, D. Wang, J. Fu, H. Lu, Lighttrack: Finding lightweight neural networks for object tracking via one-shot architecture search, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 15180\u201315189.","DOI":"10.1109\/CVPR46437.2021.01493"},{"key":"10.1016\/j.knosys.2026.116116_b88","unstructured":"X. Zhang, Y. Tian, L. Xie, W. Huang, Q. Dai, Q. Ye, Q. Tian, Hivit: A simpler and more efficient design of hierarchical vision transformer, in: The Eleventh International Conference on Learning Representations, 2023."},{"issue":"5","key":"10.1016\/j.knosys.2026.116116_b89","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","article-title":"Got-10k: A large high-diversity benchmark for generic object tracking in the wild","volume":"43","author":"Huang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116116_b90","doi-asserted-by":"crossref","unstructured":"M. Muller, A. Bibi, S. Giancola, S. Alsubaihi, B. Ghanem, Trackingnet: A large-scale dataset and benchmark for object tracking in the wild, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 300\u2013317.","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"10.1016\/j.knosys.2026.116116_b91","doi-asserted-by":"crossref","unstructured":"H. Fan, L. Lin, F. Yang, P. Chu, G. Deng, S. Yu, H. Bai, Y. Xu, C. Liao, H. Ling, Lasot: A high-quality benchmark for large-scale single object tracking, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 5374\u20135383.","DOI":"10.1109\/CVPR.2019.00552"},{"key":"10.1016\/j.knosys.2026.116116_b92","first-page":"130797","article-title":"Vasttrack: Vast category visual object tracking","volume":"37","author":"Peng","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116116_b93","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.knosys.2026.116116_b94","doi-asserted-by":"crossref","first-page":"439","DOI":"10.1007\/s11263-020-01387-y","article-title":"Lasot: A high-quality large-scale single object tracking benchmark","volume":"129","author":"Fan","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.116116_b95","doi-asserted-by":"crossref","unstructured":"X. Wang, X. Shu, Z. Zhang, B. Jiang, Y. Wang, Y. Tian, F. Wu, Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 13763\u201313773.","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"10.1016\/j.knosys.2026.116116_b96","article-title":"A benchmark and simulator for uav tracking","volume":"vol. 7","author":"Benchmark","year":"2016"},{"key":"10.1016\/j.knosys.2026.116116_b97","doi-asserted-by":"crossref","unstructured":"Y. Wu, J. Lim, M.-H. Yang, Online object tracking: A benchmark, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2013, pp. 2411\u20132418.","DOI":"10.1109\/CVPR.2013.312"},{"key":"10.1016\/j.knosys.2026.116116_b98","doi-asserted-by":"crossref","unstructured":"Q. Hou, D. Zhou, J. Feng, Coordinate attention for efficient mobile network design, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 13713\u201313722.","DOI":"10.1109\/CVPR46437.2021.01350"},{"key":"10.1016\/j.knosys.2026.116116_b99","doi-asserted-by":"crossref","unstructured":"S. Woo, J. Park, J.-Y. Lee, I.S. Kweon, Cbam: Convolutional block attention module, in: Proceedings of the European Conference on Computer Vision, ECCV, 2018, pp. 3\u201319.","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"10.1016\/j.knosys.2026.116116_b100","doi-asserted-by":"crossref","unstructured":"J. Hu, L. Shen, G. Sun, Squeeze-and-excitation networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 7132\u20137141.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"10.1016\/j.knosys.2026.116116_b101","first-page":"37","article-title":"Long short-term memory","author":"Graves","year":"2012","journal-title":"Supervised Seq. Label. Recurr. Neural Networks"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008427?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008427?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T11:28:06Z","timestamp":1781004486000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126008427"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":101,"alternative-id":["S0950705126008427"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116116","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Unified Spatio-Temporal Tracking via Adaptive Embedding and Temporal Context Modeling","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116116","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116116"}}