{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T11:05:18Z","timestamp":1779793518525,"version":"3.53.1"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T00:00:00Z","timestamp":1774915200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T00:00:00Z","timestamp":1774915200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Nature Science Foundation of China","doi-asserted-by":"crossref","award":["62441231"],"award-info":[{"award-number":["62441231"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Key R&D projects of Liaoning Province, China","award":["2024JH2\/102500015"],"award-info":[{"award-number":["2024JH2\/102500015"]}]},{"name":"Fundamental Research Funds for the Central Universities of Ministry of Education","award":["N25BSS034"],"award-info":[{"award-number":["N25BSS034"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s10489-026-07218-w","type":"journal-article","created":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T13:17:51Z","timestamp":1774963071000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ESMT: Context-adaptive vision-language tracking with episodic-semantic memory"],"prefix":"10.1007","volume":"56","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-3728-7577","authenticated-orcid":false,"given":"Jiapeng","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ying","family":"Wei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yongfeng","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gang","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,3,31]]},"reference":[{"issue":"6","key":"7218_CR1","doi-asserted-by":"publisher","first-page":"459","DOI":"10.1007\/s10489-025-06350-3","volume":"55","author":"Q Xu","year":"2025","unstructured":"Xu Q, Xu Z, Chen Z et al (2025) Learning sparse filters-based convolutional networks without offline training for robust visual tracking. Appl Intell 55(6):459","journal-title":"Appl Intell"},{"issue":"7","key":"7218_CR2","first-page":"1","volume":"55","author":"P Gao","year":"2025","unstructured":"Gao P, Zhang XY, Yu T (2025) Learning nested attentional feature fusion network for high performance visual tracking. Appl Intell 55(7):1\u201314","journal-title":"Appl Intell"},{"issue":"13","key":"7218_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10489-025-06791-w","volume":"55","author":"J Liu","year":"2025","unstructured":"Liu J, Gao Y, Shi Z et al (2025) Datrack: direction attention based transformer tracker. Appl Intell 55(13):1\u201317","journal-title":"Appl Intell"},{"key":"7218_CR4","doi-asserted-by":"crossref","unstructured":"Scott A, Uchida I, Ding N, et al. (2024) Teamtrack: A dataset for multi-sport multi-object tracking in full-pitch videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3357\u20133366","DOI":"10.1109\/CVPRW63382.2024.00340"},{"key":"7218_CR5","doi-asserted-by":"crossref","unstructured":"Cai Y, Liu J, Tang J, et al. (2023) Robust object modeling for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 9589\u20139600","DOI":"10.1109\/ICCV51070.2023.00879"},{"key":"7218_CR6","doi-asserted-by":"crossref","unstructured":"Blatter P, Kanakis M, Danelljan M, et al. (2023) Efficient visual tracking with exemplar transformers. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 1571\u20131581","DOI":"10.1109\/WACV56688.2023.00162"},{"key":"7218_CR7","unstructured":"Yu H, Zhu P, Zhang K, et al. (2022) Learning dynamic compact memory embedding for deformable visual object tracking. IEEE Transactions on Neural Networks and Learning Systems"},{"key":"7218_CR8","doi-asserted-by":"crossref","unstructured":"Zhang H, Wang J, Zhang J, et al. (2023a) One-stream vision-language memory network for object tracking. IEEE Transactions on Multimedia","DOI":"10.1109\/TMM.2023.3285441"},{"key":"7218_CR9","doi-asserted-by":"crossref","unstructured":"Zhang C, Sun X, Yang Y, et al. (2023b) All in one: Exploring unified vision-language tracking with multi-modal alignment. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 5552\u20135561","DOI":"10.1145\/3581783.3611803"},{"key":"7218_CR10","doi-asserted-by":"crossref","unstructured":"Liu X, Xu T, Wang Y, et al. (2023) Bactrack: Building appearance collection for aerial tracking. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2023.3340372"},{"key":"7218_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.110286","volume":"263","author":"B Wei","year":"2023","unstructured":"Wei B, Chen H, Ding Q et al (2023) SiamSTC: Updatable Siamese tracking network via spatio-temporal context. Knowl-Based Syst 263:110286","journal-title":"Knowl-Based Syst"},{"key":"7218_CR12","doi-asserted-by":"crossref","unstructured":"Chen Y, Zhang D, Yang Zx, et al. (2023) Robust and efficient memory network for video object segmentation. arXiv:2304.11840","DOI":"10.1109\/ICME55011.2023.00304"},{"key":"7218_CR13","doi-asserted-by":"crossref","unstructured":"Fu Z, Liu Q, Fu Z, et al. (2021) STMTrack: Template-free visual tracking with space-time memory networks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13774\u201313783","DOI":"10.1109\/CVPR46437.2021.01356"},{"key":"7218_CR14","doi-asserted-by":"crossref","unstructured":"Gao J, Zhong B, Chen Y (2023) Unambiguous object tracking by exploiting target cues. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 1997\u20132005","DOI":"10.1145\/3581783.3612240"},{"key":"7218_CR15","doi-asserted-by":"crossref","unstructured":"Zhou Z, Zhou X, Chen Z, et al. (2023) Memory network with pixel-level spatio-temporal learning for visual object tracking. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2023.3272319"},{"key":"7218_CR16","doi-asserted-by":"crossref","unstructured":"Dai K, Zhang Y, Wang D, et al. (2020) High-performance long-term tracking with meta-updater. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6298\u20136307","DOI":"10.1109\/CVPR42600.2020.00633"},{"key":"7218_CR17","doi-asserted-by":"crossref","unstructured":"Zhang H, Liang J, Zhang J, et al. (2023) Attention-driven memory network for online visual tracking. IEEE Transactions on Neural Networks and Learning Systems","DOI":"10.1109\/TNNLS.2023.3299412"},{"key":"7218_CR18","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1016\/j.ins.2022.03.047","volume":"597","author":"H Zhang","year":"2022","unstructured":"Zhang H, Zhang J, Nie G et al (2022) Residual memory inference network for regression tracking with weighted gradient harmonized loss. Inf Sci 597:105\u2013124","journal-title":"Inf Sci"},{"key":"7218_CR19","doi-asserted-by":"crossref","unstructured":"Feng Q, Ablavsky V, Bai Q, et al. (2021) Siamese natural language tracker: Tracking by natural language descriptions with Siamese trackers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5851\u20135860","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"7218_CR20","doi-asserted-by":"crossref","unstructured":"Wang X, Shu X, Zhang Z, et al. (2021) Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13763\u201313773","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"7218_CR21","doi-asserted-by":"crossref","unstructured":"Li Z, Tao R, Gavves E, et al. (2017) Tracking by natural language specification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6495\u20136503","DOI":"10.1109\/CVPR.2017.777"},{"key":"7218_CR22","first-page":"4446","volume":"35","author":"M Guo","year":"2022","unstructured":"Guo M, Zhang Z, Fan H et al (2022a) Divert more attention to vision-language tracking. Adv Neural Inf Process Syst 35:4446\u20134460","journal-title":"Adv Neural Inf Process Syst"},{"key":"7218_CR23","doi-asserted-by":"crossref","unstructured":"Zhou L, Zhou Z, Mao K, et al. (2023) Joint visual grounding and tracking with natural language specification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 23151\u201323160","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"7218_CR24","doi-asserted-by":"crossref","unstructured":"Zheng Y, Zhong B, Liang Q, et al. (2023) Towards unified token learning for vision-language tracking. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2023.3301933"},{"key":"7218_CR25","doi-asserted-by":"crossref","unstructured":"Ma Y, Tang Y, Yang W, et al. (2024) Unifying visual and vision-language tracking via contrastive learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 4107\u20134116","DOI":"10.1609\/aaai.v38i5.28205"},{"issue":"12","key":"7218_CR26","doi-asserted-by":"publisher","first-page":"1041","DOI":"10.1016\/j.tics.2019.09.008","volume":"23","author":"L Renoult","year":"2019","unstructured":"Renoult L, Irish M, Moscovitch M et al (2019) From knowing to remembering: the semantic-episodic distinction. Trends Cogn Sci 23(12):1041\u20131057","journal-title":"Trends Cogn Sci"},{"key":"7218_CR27","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, et al. (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"7218_CR28","doi-asserted-by":"crossref","unstructured":"He K, Chen X, Xie S, et al. (2022) Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16000\u201316009","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"7218_CR29","unstructured":"Devlin J, Chang MW, Lee K, et al. (2018) BERT: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805"},{"key":"7218_CR30","doi-asserted-by":"crossref","unstructured":"Fan H, Lin L, Yang F, et al. (2019) LaSOT: A high-quality benchmark for large-scale single object tracking. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5374\u20135383","DOI":"10.1109\/CVPR.2019.00552"},{"key":"7218_CR31","doi-asserted-by":"crossref","unstructured":"Chen X, Yan B, Zhu J, et al. (2021) Transformer tracking. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8126\u20138135","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"7218_CR32","doi-asserted-by":"crossref","unstructured":"Yan B, Peng H, Fu J, et al. (2021) Learning spatio-temporal transformer for visual tracking. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10448\u201310457","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"7218_CR33","doi-asserted-by":"crossref","unstructured":"Zhang Z, Liu Y, Wang X, et al. (2021) Learn to match: Automatic matching network design for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 13339\u201313348","DOI":"10.1109\/ICCV48922.2021.01309"},{"key":"7218_CR34","doi-asserted-by":"crossref","unstructured":"Guo M, Zhang Z, Fan H, et al. (2022) Learning target-aware representation for visual tracking via informative interactions. arXiv:2201.02526","DOI":"10.24963\/ijcai.2022\/130"},{"key":"7218_CR35","doi-asserted-by":"crossref","unstructured":"Li Y, Yu J, Cai Z, et al. (2022) Cross-modal target retrieval for tracking by natural language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4931\u20134940","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"7218_CR36","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1016\/j.patrec.2023.02.023","volume":"168","author":"H Zhao","year":"2023","unstructured":"Zhao H, Wang X, Wang D et al (2023) Transformer vision-language tracking via proxy token guided cross-modal fusion. Pattern Recogn Lett 168:10\u201316","journal-title":"Pattern Recogn Lett"},{"key":"7218_CR37","unstructured":"Ge J, Chen X, Cao J, et al. (2023) Beyond visual cues: Synchronously exploring target-centric semantics for vision-language tracking. arXiv:2311.17085"},{"key":"7218_CR38","doi-asserted-by":"crossref","unstructured":"Shao Y, He S, Ye Q, et al. (2024) Context-aware integration of language and visual references for natural language tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 19208\u201319217","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"7218_CR39","doi-asserted-by":"crossref","unstructured":"Zhang G, Zhong B, Liang Q, et al. (2024) One-stream stepwise decreasing for vision-language tracking. IEEE Transactions on Circuits and Systems for Video Technology 34(10), 9053\u20139063. https:\/\/doi.org\/10.1109\/TCSVT.2024.3395352","DOI":"10.1109\/TCSVT.2024.3395352"},{"key":"7218_CR40","doi-asserted-by":"crossref","unstructured":"Zhu H, Lu Q, Xue L, et al. (2025) Vision-language tracking with clip and interactive prompt learning. IEEE Transactions on Intelligent Transportation Systems 26(3), 3659\u20133670. https:\/\/doi.org\/10.1109\/TITS.2024.3520103","DOI":"10.1109\/TITS.2024.3520103"},{"key":"7218_CR41","doi-asserted-by":"crossref","unstructured":"Ge J, Cao J, Zhu X, et al. (2024) Consistencies are all you need for semi-supervised vision-language tracking. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp 1895\u20131904","DOI":"10.1145\/3664647.3680657"},{"key":"7218_CR42","doi-asserted-by":"crossref","unstructured":"Tang Y, Ma Y, Zhang T (2025) Semantic-aware network for natural language tracking. IEEE Transactions on Circuits and Systems for Video Technology 35(10), 10339\u201310352. https:\/\/doi.org\/10.1109\/TCSVT.2025.3563267","DOI":"10.1109\/TCSVT.2025.3563267"},{"key":"7218_CR43","doi-asserted-by":"crossref","unstructured":"Shi L, Zhong B, Liang Q, et al. (2025) Mamba adapter: Efficient multi-modal fusion for vision-language tracking. IEEE Transactions on Circuits and Systems for Video Technology 35(9), 9300\u20139311. https:\/\/doi.org\/10.1109\/TCSVT.2025.3557570","DOI":"10.1109\/TCSVT.2025.3557570"},{"key":"7218_CR44","doi-asserted-by":"crossref","unstructured":"Zheng Y, Zhong B, Liang Q, et al. (2024) Toward unified token learning for vision-language tracking. IEEE Transactions on Circuits and Systems for Video Technology 34(4), 2125\u20132135. https:\/\/doi.org\/10.1109\/TCSVT.2023.3301933","DOI":"10.1109\/TCSVT.2023.3301933"},{"key":"7218_CR45","doi-asserted-by":"crossref","unstructured":"Li X, Huang Y, He Z, et al. (2023) Citetracker: Correlating image and text for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 9974\u20139983","DOI":"10.1109\/ICCV51070.2023.00915"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-026-07218-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-026-07218-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-026-07218-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T10:47:28Z","timestamp":1779792448000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-026-07218-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,31]]},"references-count":45,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["7218"],"URL":"https:\/\/doi.org\/10.1007\/s10489-026-07218-w","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,31]]},"assertion":[{"value":"13 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 March 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}}],"article-number":"181"}}