{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T14:14:04Z","timestamp":1780064044137,"version":"3.54.0"},"reference-count":43,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T00:00:00Z","timestamp":1778198400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100002701","name":"Korea Ministry of Education","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002701","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003661","name":"Korea Institute for Advancement of Technology","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003661","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute of Information & Communications Technology Planning & Evaluation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.neucom.2026.133829","type":"journal-article","created":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T15:32:05Z","timestamp":1777563125000},"page":"133829","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["ToTracker: Enhancing visual tracking via textual and occlusion cues"],"prefix":"10.1016","volume":"692","author":[{"given":"Yuning","family":"Ye","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1190-6863","authenticated-orcid":false,"given":"Yuseok","family":"Ban","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133829_bib0005","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2026.133829_bib0010","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"9974","article-title":"Citetracker: correlating image and text for visual tracking","author":"Li","year":"2023"},{"key":"10.1016\/j.neucom.2026.133829_bib0015","doi-asserted-by":"crossref","first-page":"2682","DOI":"10.1109\/TMM.2025.3535323","article-title":"Improving visual object tracking through visual prompting","volume":"27","author":"Chen","year":"2025","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.neucom.2026.133829_bib0020","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1420","article-title":"Siamese instance search for tracking","author":"Tao","year":"2016"},{"key":"10.1016\/j.neucom.2026.133829_bib0025","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXI 16","first-page":"771","article-title":"Ocean: object-aware anchor-free tracking","author":"Zhang","year":"2020"},{"key":"10.1016\/j.neucom.2026.133829_bib0030","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8126","article-title":"Transformer tracking","author":"Chen","year":"2021"},{"key":"10.1016\/j.neucom.2026.133829_bib0035","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"10448","article-title":"Learning spatio-temporal transformer for visual tracking","author":"Yan","year":"2021"},{"key":"10.1016\/j.neucom.2026.133829_bib0040","series-title":"European Conference on Computer Vision","first-page":"341","article-title":"Joint feature learning and relation modeling for tracking: a one-stream framework","author":"Ye","year":"2022"},{"key":"10.1016\/j.neucom.2026.133829_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.130599","article-title":"End-to-end one-stream object tracking based on uncertainty regression","author":"Tang","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133829_bib0050","article-title":"Acntrack: agent cross-attention guided multimodal multi-object tracking with neural kalman filter","author":"Zhang","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133829_bib0055","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.131234","article-title":"Ggstrack: geometric graph with spatio-temporal convolution for multi-object tracking","author":"Yan","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133829_bib0060","article-title":"Efficient hybrid linear self-attention based visual object tracking with lora","author":"Xu","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133829_bib0065","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19079","article-title":"Onetracker: unifying visual object tracking with foundation models and efficient tuning","author":"Hong","year":"2024"},{"key":"10.1016\/j.neucom.2026.133829_bib0070","author":"Wang"},{"key":"10.1016\/j.neucom.2026.133829_bib0075","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111330","article-title":"Visible\u2013thermal multiple object tracking: large-scale video dataset and progressive fusion approach","volume":"161","author":"Zhu","year":"2025","journal-title":"Pattern Recognit."},{"issue":"5","key":"10.1016\/j.neucom.2026.133829_bib0080","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3726529","article-title":"Beyond visual cues: synchronously exploring target-centric semantics for vision-language tracking","volume":"21","author":"Ge","year":"2025","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"10.1016\/j.neucom.2026.133829_bib0085","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19208","article-title":"Context-aware integration of language and visual references for natural language tracking","author":"Shao","year":"2024"},{"key":"10.1016\/j.neucom.2026.133829_bib0090","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"8731","article-title":"Mambavlt: time-evolving multimodal state space model for vision-language tracking","author":"Liu","year":"2025"},{"key":"10.1016\/j.neucom.2026.133829_bib0095","series-title":"Proceedings of the 33rd ACM International Conference on Multimedia","first-page":"3037","article-title":"Gen4track: a tuning-free data augmentation framework via self-correcting diffusion model for vision-language tracking","author":"Ge","year":"2025"},{"key":"10.1016\/j.neucom.2026.133829_bib0100","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"1895","article-title":"Consistencies are all you need for semi-supervised vision-language tracking","author":"Ge","year":"2024"},{"key":"10.1016\/j.neucom.2026.133829_bib0105","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part v 13","first-page":"740","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.neucom.2026.133829_bib0110","series-title":"International Conference on Learning Representations (ICLR)","article-title":"An image is worth 16\u00d716 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.neucom.2026.133829_bib0115","first-page":"4446","article-title":"Divert more attention to vision-language tracking","volume":"35","author":"Guo","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133829_bib0120","series-title":"European Conference on Computer Vision","first-page":"375","article-title":"Backbone is all your need: a simplified architecture for visual object tracking","author":"Chen","year":"2022"},{"key":"10.1016\/j.neucom.2026.133829_bib0125","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13608","article-title":"Mixformer: end-to-end tracking with iterative mixed attention","author":"Cui","year":"2022"},{"key":"10.1016\/j.neucom.2026.133829_bib0130","first-page":"16743","article-title":"Swintrack: a simple and strong baseline for transformer tracking","volume":"35","author":"Lin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133829_bib0135","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14572","article-title":"Seqtrack: sequence to sequence learning for visual object tracking","author":"Chen","year":"2023"},{"key":"10.1016\/j.neucom.2026.133829_bib0140","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19048","article-title":"Artrackv2: prompting autoregressive tracker where to look and how to describe","author":"Bai","year":"2024"},{"key":"10.1016\/j.neucom.2026.133829_bib0145","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10959","article-title":"Two-stream beats one-stream: asymmetric siamese network for efficient visual tracking","volume":"vol. 39","author":"Zhu","year":"2025"},{"key":"10.1016\/j.neucom.2026.133829_bib0150","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"13427","article-title":"General compression framework for efficient transformer object tracking","author":"Hong","year":"2025"},{"key":"10.1016\/j.neucom.2026.133829_bib0155","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"6182","article-title":"Learning discriminative model prediction for tracking","author":"Bhat","year":"2019"},{"key":"10.1016\/j.neucom.2026.133829_bib0160","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1571","article-title":"Transformer meets tracker: exploiting temporal context for robust visual tracking","author":"Wang","year":"2021"},{"key":"10.1016\/j.neucom.2026.133829_bib0165","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9697","article-title":"Autoregressive visual tracking","author":"Wei","year":"2023"},{"key":"10.1016\/j.neucom.2026.133829_bib0170","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16000","article-title":"Masked autoencoders are scalable vision learners","author":"He","year":"2022"},{"key":"10.1016\/j.neucom.2026.133829_bib0175","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5374","article-title":"Lasot: a high-quality benchmark for large-scale single object tracking","author":"Fan","year":"2019"},{"issue":"5","key":"10.1016\/j.neucom.2026.133829_bib0180","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","article-title":"Got-10k: a large high-diversity benchmark for generic object tracking in the wild","volume":"43","author":"Huang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133829_bib0185","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"300","article-title":"Trackingnet: a large-scale dataset and benchmark for object tracking in the wild","author":"Muller","year":"2018"},{"key":"10.1016\/j.neucom.2026.133829_bib0190","series-title":"Computer Vision\u2013ECCV 2020 Workshops: Glasgow, UK, August 23\u201328, 2020, Proceedings, Part v 16","first-page":"547","article-title":"The eighth visual object tracking vot2020 challenge results","author":"Kristan","year":"2020"},{"key":"10.1016\/j.neucom.2026.133829_bib0195","series-title":"International Conference on Learning Representations (ICLR)","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2019"},{"key":"10.1016\/j.neucom.2026.133829_bib0200","doi-asserted-by":"crossref","first-page":"439","DOI":"10.1007\/s11263-020-01387-y","article-title":"Lasot: a high-quality large-scale single object tracking benchmark","volume":"129","author":"Fan","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.neucom.2026.133829_bib0205","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"1125","article-title":"Need for speed: a benchmark for higher frame rate object tracking","author":"Kiani Galoogahi","year":"2017"},{"key":"10.1016\/j.neucom.2026.133829_bib0210","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, the Netherlands, October 11\u201314, 2016, Proceedings, Part I 14","first-page":"445","article-title":"A benchmark and simulator for UAV tracking","author":"Mueller","year":"2016"},{"key":"10.1016\/j.neucom.2026.133829_bib0215","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226012269?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231226012269?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:53:45Z","timestamp":1780062825000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231226012269"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":43,"alternative-id":["S0925231226012269"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133829","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"ToTracker: Enhancing visual tracking via textual and occlusion cues","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133829","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Authors. Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"133829"}}