{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T15:14:15Z","timestamp":1780413255466,"version":"3.54.1"},"reference-count":63,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004479","name":"Jiangxi Provincial Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004479","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109181","type":"journal-article","created":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T23:44:59Z","timestamp":1779925499000},"page":"109181","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["CWITrack: Transformer tracking via local-global cross-window interaction"],"prefix":"10.1016","volume":"203","author":[{"given":"Yuanyun","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pengcheng","family":"Sha","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shenmiao","family":"Jin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yichao","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6750-5105","authenticated-orcid":false,"given":"Jun","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109181_bib0001","series-title":"European conference on computer vision","first-page":"850","article-title":"Fully-convolutional siamese networks for object tracking","author":"Bertinetto","year":"2016"},{"key":"10.1016\/j.neunet.2026.109181_bib0002","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"1571","article-title":"Efficient visual tracking with exemplar transformers","author":"Blatter","year":"2023"},{"issue":"7","key":"10.1016\/j.neunet.2026.109181_bib0003","doi-asserted-by":"crossref","first-page":"9370","DOI":"10.1109\/TITS.2025.3570076","article-title":"Hierarchical attention-enhanced correlation refinement for robust visual tracking","volume":"26","author":"Chen","year":"2025","journal-title":"IEEE Transactions on Intelligent Transportation Systems"},{"key":"10.1016\/j.neunet.2026.109181_bib0004","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8126","article-title":"Transformer tracking","author":"Chen","year":"2021"},{"key":"10.1016\/j.neunet.2026.109181_bib0005","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"6668","article-title":"Siamese box adaptive network for visual tracking","author":"Chen","year":"2020"},{"key":"10.1016\/j.neunet.2026.109181_bib0006","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"13608","article-title":"MixFormer: End-to-end tracking with iterative mixed attention","author":"Cui","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0007","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"12124","article-title":"CSWin Transformer: A general vision transformer backbone with cross-shaped windows","author":"Dong","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0008","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S. et al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929."},{"key":"10.1016\/j.neunet.2026.109181_bib0009","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"5374","article-title":"LaSOT: A high-quality benchmark for large-scale single object tracking","author":"Fan","year":"2019"},{"key":"10.1016\/j.neunet.2026.109181_bib0010","doi-asserted-by":"crossref","unstructured":"Fu, Z., Fu, Z., Liu, Q., Cai, W., & Wang, Y. (2022). SparseTT: Visual tracking with sparse transformers. arXiv preprint arXiv: 2205.03776.","DOI":"10.24963\/ijcai.2022\/127"},{"key":"10.1016\/j.neunet.2026.109181_bib0011","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.119890","article-title":"A joint local\u2013global search mechanism for long-term tracking with dynamic memory network","volume":"223","author":"Gao","year":"2023","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2026.109181_bib0012","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"9543","article-title":"Graph attention tracking","author":"Guo","year":"2021"},{"issue":"2","key":"10.1016\/j.neunet.2026.109181_bib0013","doi-asserted-by":"crossref","first-page":"1020","DOI":"10.1109\/TCSVT.2023.3289624","article-title":"Transformer tracking via frequency fusion","volume":"34","author":"Hu","year":"2023","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"5","key":"10.1016\/j.neunet.2026.109181_bib0014","doi-asserted-by":"crossref","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","article-title":"Got-10k: A large high-diversity benchmark for generic object tracking in the wild","volume":"43","author":"Huang","year":"2019","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.109181_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.121377","article-title":"Spatio-temporal mix deformable feature extractor in visual tracking","volume":"237","author":"Huang","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.neunet.2026.109181_bib0016","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"9612","article-title":"Exploring lightweight hierarchical vision transformers for efficient visual tracking","author":"Kang","year":"2023"},{"key":"10.1016\/j.neunet.2026.109181_bib0017","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"1125","article-title":"Need for speed: A benchmark for higher frame rate object tracking","author":"Kiani Galoogahi","year":"2017"},{"key":"10.1016\/j.neunet.2026.109181_bib0018","unstructured":"Kokkeby, K. L., Lutter, R. P., Munoz, M. L., Cathey, F. W., Hilliard, D. J., & Olson, T. L. (2015). Methods for autonomous tracking and surveillance. US Patent 9,026,272."},{"issue":"9","key":"10.1016\/j.neunet.2026.109181_bib0019","doi-asserted-by":"crossref","first-page":"1429","DOI":"10.1109\/TMM.2015.2455418","article-title":"On-road pedestrian tracking across multiple driving recorders","volume":"17","author":"Lee","year":"2015","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109181_bib0020","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"8971","article-title":"High performance visual tracking with siamese region proposal network","author":"Li","year":"2018"},{"key":"10.1016\/j.neunet.2026.109181_bib0021","unstructured":"Li, W., Wang, X., Xia, X., Wu, J., Li, J., Xiao, X., Zheng, M., & Wen, S. (2022). SepViT: Separable vision transformer. arXiv preprint arXiv: 2203.15380."},{"key":"10.1016\/j.neunet.2026.109181_bib0022","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2022.109435","article-title":"Siamese visual tracking combining granular level multi-scale features and global information","volume":"252","author":"Liang","year":"2022","journal-title":"Knowledge-Based Systems"},{"key":"10.1016\/j.neunet.2026.109181_bib0023","first-page":"16743","article-title":"SwinTrack: A simple and strong baseline for transformer tracking","volume":"35","author":"Lin","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109181_bib0024","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"2980","article-title":"Focal loss for dense object detection","author":"Lin","year":"2017"},{"key":"10.1016\/j.neunet.2026.109181_bib0025","series-title":"European conference on computer vision","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.neunet.2026.109181_bib0026","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"10012","article-title":"Swin Transformer: Hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.neunet.2026.109181_bib0027","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv: 1711.05101."},{"key":"10.1016\/j.neunet.2026.109181_bib0028","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8731","article-title":"Transforming model prediction for tracking","author":"Mayer","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0029","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109630","article-title":"SiamRank: A siamese based visual tracking network with ranking strategy","volume":"141","author":"Meng","year":"2023","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.neunet.2026.109181_bib0030","series-title":"European conference on computer vision","first-page":"445","article-title":"A benchmark and simulator for uav tracking","author":"Mueller","year":"2016"},{"key":"10.1016\/j.neunet.2026.109181_bib0031","series-title":"Proceedings of the european conference on computer vision","first-page":"300","article-title":"TrackingNet: A large-scale dataset and benchmark for object tracking in the wild","author":"Muller","year":"2018"},{"key":"10.1016\/j.neunet.2026.109181_bib0032","unstructured":"Nie, J., Wu, H., He, Z., Yang, Y., Gao, M., & Dong, Z. (2022). Learning localization-aware target confidence for siamese visual tracking. arXiv preprint arXiv: 2204.14093."},{"key":"10.1016\/j.neunet.2026.109181_bib0033","first-page":"91","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","volume":"28","author":"Ren","year":"2015","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109181_bib0034","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"658","article-title":"Generalized intersection over union: A metric and a loss for bounding box regression","author":"Rezatofighi","year":"2019"},{"key":"10.1016\/j.neunet.2026.109181_bib0035","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"17425","article-title":"SwiftFormer: Efficient additive attention for transformer-based real-time mobile vision applications","author":"Shaker","year":"2023"},{"key":"10.1016\/j.neunet.2026.109181_bib0036","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"19208","article-title":"Context-aware integration of language and visual references for natural language tracking","author":"Shao","year":"2024"},{"key":"10.1016\/j.neunet.2026.109181_bib0037","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8101","article-title":"Unsupervised learning of accurate siamese tracking","author":"Shen","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0038","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"2321","article-title":"Compact transformer tracker with correlative masked modeling","volume":"vol. 37","author":"Song","year":"2023"},{"key":"10.1016\/j.neunet.2026.109181_bib0039","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8791","article-title":"Transformer tracking with cyclic shifting window attention","author":"Song","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0040","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.108502","article-title":"Two-stage aware attentional siamese network for visual tracking","volume":"124","author":"Sun","year":"2022","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.neunet.2026.109181_bib0041","doi-asserted-by":"crossref","unstructured":"Sun, X., Sun, H., Jiang, S., Wang, J., Wei, X., & Hu, Z. (2024). Multi-attention associate prediction network for visual tracking. arXiv preprint arXiv: 2403.16395.","DOI":"10.1016\/j.neucom.2024.128785"},{"key":"10.1016\/j.neunet.2026.109181_bib0042","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8741","article-title":"Ranking-based siamese visual tracking","author":"Tang","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0043","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109181_bib0044","doi-asserted-by":"crossref","first-page":"326","DOI":"10.1109\/TMM.2023.3264851","article-title":"CMAT: Integrating convolution mixer and self-attention for visual tracking","volume":"26","author":"Wang","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109181_bib0045","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"1571","article-title":"Transformer meets tracker: Exploiting temporal context for robust visual tracking","author":"Wang","year":"2021"},{"key":"10.1016\/j.neunet.2026.109181_bib0046","series-title":"Proceedings of the IEEE\/CVF international conference on computer vision","first-page":"568","article-title":"Pyramid vision transformer: A versatile backbone for dense prediction without convolutions","author":"Wang","year":"2021"},{"key":"10.1016\/j.neunet.2026.109181_bib0047","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"13763","article-title":"Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark","author":"Wang","year":"2021"},{"issue":"6","key":"10.1016\/j.neunet.2026.109181_bib0048","doi-asserted-by":"crossref","first-page":"3638","DOI":"10.3934\/era.2025162","article-title":"A multi-scale cyclic-shift window transformer object tracker based on fast fourier transform","volume":"33","author":"Wu","year":"2025","journal-title":"Electronic Research Archive"},{"key":"10.1016\/j.neunet.2026.109181_bib0049","series-title":"Proceedings of the computer vision and pattern recognition conference","first-page":"17103","article-title":"Learning occlusion-robust vision transformers for real-time UAV tracking","author":"Wu","year":"2025"},{"key":"10.1016\/j.neunet.2026.109181_bib0050","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"8751","article-title":"Correlation-aware deep tracking","author":"Xie","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0051","unstructured":"Xie, F., Yang, W., Wang, C., Chu, L., Cao, Y., Ma, C., & Zeng, W. (2024). Correlation-embedded transformer tracking: A single-branch framework. arXiv preprint arXiv: 2401.12743."},{"key":"10.1016\/j.neunet.2026.109181_bib0052","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"2139","article-title":"Siamese transformer pyramid networks for real-time UAV tracking","author":"Xing","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0053","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2025.122364","article-title":"AMST: Object tracking based on collaborative framework with adaptive multi-strategy","volume":"718","author":"Xu","year":"2025","journal-title":"Information Sciences"},{"key":"10.1016\/j.neunet.2026.109181_bib0054","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"11604","article-title":"Motion-aware object tracking via motion and geometry-aware cues","volume":"vol. 40","author":"Yang","year":"2026"},{"key":"10.1016\/j.neunet.2026.109181_bib0055","doi-asserted-by":"crossref","first-page":"1956","DOI":"10.1109\/TMM.2021.3074239","article-title":"SiamCorners: Siamese corner networks for visual tracking","volume":"24","author":"Yang","year":"2021","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109181_bib0056","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106380","article-title":"DeforT: Deformable transformer for visual tracking","volume":"176","author":"Yang","year":"2024","journal-title":"Neural Networks"},{"issue":"1","key":"10.1016\/j.neunet.2026.109181_bib0057","doi-asserted-by":"crossref","first-page":"245","DOI":"10.1109\/TCSVT.2024.3460400","article-title":"Attention-based gating network for robust segmentation tracking","volume":"35","author":"Yang","year":"2024","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.neunet.2026.109181_bib0058","first-page":"2491","article-title":"Associating objects with transformers for video object segmentation","volume":"34","author":"Yang","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.109181_bib0059","series-title":"European conference on computer vision","first-page":"341","article-title":"Joint feature learning and relation modeling for tracking: A one-stream framework","author":"Ye","year":"2022"},{"key":"10.1016\/j.neunet.2026.109181_bib0060","series-title":"2025\u202fIEEE\/CVF Winter conference on applications of computer vision (WACV)","first-page":"9468","article-title":"Improving accuracy and generalization for efficient visual tracking","author":"Zaveri","year":"2025"},{"key":"10.1016\/j.neunet.2026.109181_bib0061","series-title":"Proceedings of the 33rd ACM international conference on multimedia","first-page":"8067","article-title":"Explicit context reasoning with supervision for visual tracking","author":"Zeng","year":"2025"},{"issue":"08","key":"10.1016\/j.neunet.2026.109181_bib0062","first-page":"1213","article-title":"Human body target tracking algorithm in human-computer interaction","volume":"49","author":"Zhang","year":"2015","journal-title":"Journal of Shanghai Jiao Tong University"},{"key":"10.1016\/j.neunet.2026.109181_bib0063","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"10959","article-title":"Two-stream beats one-stream: Asymmetric siamese network for efficient visual tracking","volume":"vol. 39","author":"Zhu","year":"2025"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006428?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006428?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T14:43:41Z","timestamp":1780411421000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026006428"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":63,"alternative-id":["S0893608026006428"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109181","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"CWITrack: Transformer tracking via local-global cross-window interaction","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109181","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109181"}}