{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,9]],"date-time":"2025-06-09T12:10:10Z","timestamp":1749471010446,"version":"3.41.0"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,6,9]],"date-time":"2025-06-09T00:00:00Z","timestamp":1749427200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,9]],"date-time":"2025-06-09T00:00:00Z","timestamp":1749427200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100017596","name":"Natural Science Basic Research Program of Shaanxi Province","doi-asserted-by":"publisher","award":["2023-JC-YB-826"],"award-info":[{"award-number":["2023-JC-YB-826"]}],"id":[{"id":"10.13039\/501100017596","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Open Projects funded by Hubei Engineering Research Center for Intelligent Detection and Identification of Complex Parts","award":["IDICP-KF-2024-20"],"award-info":[{"award-number":["IDICP-KF-2024-20"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-07498-y","type":"journal-article","created":{"date-parts":[[2025,6,9]],"date-time":"2025-06-09T11:50:45Z","timestamp":1749469845000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["TMATrack: token merging for autoregressive visual object tracking"],"prefix":"10.1007","volume":"81","author":[{"given":"Jinguang","family":"Chen","sequence":"first","affiliation":[]},{"given":"Hongxiao","family":"Yao","sequence":"additional","affiliation":[]},{"given":"Lili","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,9]]},"reference":[{"key":"7498_CR1","doi-asserted-by":"crossref","unstructured":"Bertinetto L, Valmadre J, Henriques JF, Vedaldi A, Torr PH (2016) Fully-convolutional siamese networks for object tracking. In: Computer Vision\u2013ECCV 2016 Workshops: Amsterdam, The Netherlands, October 8-10 and 15-16, 2016, Proceedings, Part II 14, pp. 850\u2013865. Springer","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"7498_CR2","doi-asserted-by":"crossref","unstructured":"Danelljan M, Bhat G, Shahbaz Khan F, Felsberg M (2017) Eco: efficient convolution operators for tracking. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognitionm, pp 6638\u20136646","DOI":"10.1109\/CVPR.2017.733"},{"key":"7498_CR3","doi-asserted-by":"crossref","unstructured":"Wang N, Zhou W, Wang J, Li H (2021) Transformer meets tracker: exploiting temporal context for robust visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 1571\u20131580","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"7498_CR4","doi-asserted-by":"crossref","unstructured":"Chen X, Yan B, Zhu J, Wang D, Yang X, Lu H (2021) Transformer tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8126\u20138135","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"7498_CR5","doi-asserted-by":"crossref","unstructured":"Yan B, Peng H, Fu J, Wang D, Lu H (2021) Learning spatio-temporal transformer for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 10448\u201310457","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"7498_CR6","doi-asserted-by":"crossref","unstructured":"Cui Y, Jiang C, Wang L, Wu G (2022) Mixformer: end-to-end tracking with iterative mixed attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13608\u201313618","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"7498_CR7","doi-asserted-by":"crossref","unstructured":"Ye B, Chang H, Ma B, Shan S, Chen X (2022) Joint feature learning and relation modeling for tracking: a one-stream framework. In: European Conference on Computer Vision, pp 341\u2013357. Springer","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"7498_CR8","doi-asserted-by":"crossref","unstructured":"Li B, Wu W, Wang Q, Zhang F, Xing J, Yan J (2019) Siamrpn++: evolution of siamese visual tracking with very deep networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4282\u20134291","DOI":"10.1109\/CVPR.2019.00441"},{"key":"7498_CR9","doi-asserted-by":"crossref","unstructured":"Xu Y, Wang Z, Li Z, Yuan Y, Yu G (2020) Siamfc++: towards robust and accurate visual tracking with target estimation guidelines. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol 34, pp 12549\u201312556","DOI":"10.1609\/aaai.v34i07.6944"},{"key":"7498_CR10","doi-asserted-by":"crossref","unstructured":"Li B, Yan J, Wu W, Zhu Z, Hu X (2018) High performance visual tracking with siamese region proposal network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 8971\u20138980","DOI":"10.1109\/CVPR.2018.00935"},{"key":"7498_CR11","doi-asserted-by":"crossref","unstructured":"Yu Y, Xiong Y, Huang W, Scott MR (2020) Deformable siamese attention networks for visual object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6728\u20136737","DOI":"10.1109\/CVPR42600.2020.00676"},{"key":"7498_CR12","first-page":"1","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani A (2017) Attention is all you need. Adv Neural Inf Process Syst 30:1","journal-title":"Adv Neural Inf Process Syst"},{"key":"7498_CR13","doi-asserted-by":"crossref","unstructured":"Chen X, Peng H, Wang D, Lu H, Hu H (2023) Seqtrack: sequence to sequence learning for visual object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14572\u201314581","DOI":"10.1109\/CVPR52729.2023.01400"},{"key":"7498_CR14","doi-asserted-by":"crossref","unstructured":"Zhang L, Gonzalez-Garcia A, Weijer JVD, Danelljan M, Khan FS (2019) Learning the model update for siamese trackers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4010\u20134019","DOI":"10.1109\/ICCV.2019.00411"},{"key":"7498_CR15","unstructured":"Bolya D, Fu C.-Y, Dai X, Zhang P, Feichtenhofer C, Hoffman J (2022) Token merging: your vit but faster. arXiv preprint arXiv:2210.09461"},{"key":"7498_CR16","doi-asserted-by":"crossref","unstructured":"Wei X, Bai Y, Zheng Y, Shi D, Gong Y (2023) Autoregressive visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9697\u20139706","DOI":"10.1109\/CVPR52729.2023.00935"},{"key":"7498_CR17","doi-asserted-by":"crossref","unstructured":"Muller M, Bibi A, Giancola S, Alsubaihi S, Ghanem B (2018) Trackingnet: a large-scale dataset and benchmark for object tracking in the wild. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 300\u2013317","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"7498_CR18","doi-asserted-by":"crossref","unstructured":"Wu Y, Lim J, Yang M-H (2013) Online object tracking: a benchmark. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 2411\u20132418","DOI":"10.1109\/CVPR.2013.312"},{"key":"7498_CR19","doi-asserted-by":"crossref","unstructured":"Danelljan M, Bhat G, Khan FS, Felsberg M (2019) Atom: accurate tracking by overlap maximization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4660\u20134669","DOI":"10.1109\/CVPR.2019.00479"},{"key":"7498_CR20","doi-asserted-by":"crossref","unstructured":"Bhat G, Danelljan M, Gool LV, Timofte R (2019) Learning discriminative model prediction for tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 6182\u20136191","DOI":"10.1109\/ICCV.2019.00628"},{"key":"7498_CR21","doi-asserted-by":"crossref","unstructured":"Girshick R (2015) Fast r-cnn. In: Proceedings of the IEEE International Conference on Computer Vision, pp 1440\u20131448","DOI":"10.1109\/ICCV.2015.169"},{"key":"7498_CR22","doi-asserted-by":"crossref","unstructured":"Fan H, Ling H (2019) Siamese cascaded region proposal networks for real-time visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 7952\u20137961","DOI":"10.1109\/CVPR.2019.00814"},{"key":"7498_CR23","doi-asserted-by":"crossref","unstructured":"Zhang Z, Liu Y, Wang X, Li B, Hu W (2021) Learn to match: automatic matching network design for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 13339\u201313348","DOI":"10.1109\/ICCV48922.2021.01309"},{"key":"7498_CR24","doi-asserted-by":"crossref","unstructured":"Guo M, Zhang Z, Fan H, Jing L, Lyu Y, Li B, Hu W (2022) Learning target-aware representation for visual tracking via informative interactions. arXiv preprint arXiv:2201.02526","DOI":"10.24963\/ijcai.2022\/130"},{"key":"7498_CR25","doi-asserted-by":"crossref","unstructured":"Chen B, Li P, Bai L, Qiao L, Shen Q, Li B, Gan W, Wu W, Ouyang W (2022) Backbone is all your need: a simplified architecture for visual object tracking. In: European Conference on Computer Vision, pp 375\u2013392. Springer","DOI":"10.1007\/978-3-031-20047-2_22"},{"key":"7498_CR26","unstructured":"Alexey D (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv: 2010.11929"},{"key":"7498_CR27","first-page":"16743","volume":"35","author":"L Lin","year":"2022","unstructured":"Lin L, Fan H, Zhang Z, Xu Y, Ling H (2022) Swintrack: a simple and strong baseline for transformer tracking. Adv Neural Inf Process Syst 35:16743\u201316754","journal-title":"Adv Neural Inf Process Syst"},{"key":"7498_CR28","doi-asserted-by":"crossref","unstructured":"He K, Zhang C, Xie S, Li Z, Wang Z (2023) Target-aware tracking with long-term context attention. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol 37, pp 773\u2013780","DOI":"10.1609\/aaai.v37i1.25155"},{"key":"7498_CR29","doi-asserted-by":"crossref","unstructured":"Song Z, Luo R, Yu J, Chen Y-PP, Yang W (2023) Compact transformer tracker with correlative masked modeling. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol 37, pp 2321\u20132329","DOI":"10.1609\/aaai.v37i2.25327"},{"key":"7498_CR30","doi-asserted-by":"crossref","unstructured":"Haurum JB, Escalera S, Taylor GW, Moeslund TB (2023) Which tokens to use? Investigating token reduction in vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 773\u2013783","DOI":"10.1109\/ICCVW60793.2023.00085"},{"key":"7498_CR31","first-page":"13937","volume":"34","author":"Y Rao","year":"2021","unstructured":"Rao Y, Zhao W, Liu B, Lu J, Zhou J, Hsieh C-J (2021) Dynamicvit: efficient vision transformers with dynamic token sparsification. Adv Neural Inf Process Syst 34:13937\u201313949","journal-title":"Adv Neural Inf Process Syst"},{"key":"7498_CR32","unstructured":"Liang Y, Ge C, Tong Z, Song Y, Wang J, Xie P (2022) Not all patches are what you need: expediting vision transformers via token reorganizations. arXiv preprint arXiv:2202.07800"},{"key":"7498_CR33","doi-asserted-by":"crossref","unstructured":"Fayyaz M, Koohpayegani SA, Jafari FR, Sengupta S, Joze HRV, Sommerlade E, Pirsiavash H, Gall J (2022) Adaptive token sampling for efficient vision transformers. In: European Conference on Computer Vision, pp 396\u2013414. Springer","DOI":"10.1007\/978-3-031-20083-0_24"},{"key":"7498_CR34","doi-asserted-by":"crossref","unstructured":"Zong Z, Li K, Song G, Wang Y, Qiao Y, Leng B, Liu Y(2022) Self-slimmed vision transformer. In: European Conference on Computer Vision, pp 432\u2013448. Springer","DOI":"10.1007\/978-3-031-20083-0_26"},{"key":"7498_CR35","doi-asserted-by":"crossref","unstructured":"Marin D, Chang J-HR, Ranjan A, Prabhu A, Rastegari M, Tuzel O (2023) Token pooling in vision transformers for image classification. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 12\u201321","DOI":"10.1109\/WACV56688.2023.00010"},{"key":"7498_CR36","doi-asserted-by":"crossref","unstructured":"Fu Z, Liu Q, Fu Z, Wang Y (2021) Stmtrack: template-free visual tracking with space-time memory networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13774\u201313783","DOI":"10.1109\/CVPR46437.2021.01356"},{"key":"7498_CR37","doi-asserted-by":"crossref","unstructured":"Cao Z, Huang Z, Pan L, Zhang S, Liu Z, Fu C (2022) Tctrack: temporal contexts for aerial tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14798\u201314808","DOI":"10.1109\/CVPR52688.2022.01438"},{"key":"7498_CR38","unstructured":"Chen T, Saxena S, Li L, Fleet DJ, Hinton G (2021) Pix2seq: a language modeling framework for object detection. arXiv preprint arXiv:2109.10852"},{"key":"7498_CR39","unstructured":"Gevorgyan Z (2022) Siou loss: more powerful learning for bounding box regression. arXiv preprint arXiv:2205.12740"},{"key":"7498_CR40","doi-asserted-by":"crossref","unstructured":"He K, Chen X, Xie S, Li Y, Doll\u00e1r P, Girshick R (2022) Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 16000\u201316009","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"7498_CR41","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp 740\u2013755. Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"7498_CR42","doi-asserted-by":"crossref","unstructured":"Fan H, Lin L, Yang F, Chu P, Deng G, Yu S, Bai H, Xu Y, Liao C, Ling H (2019) Lasot: a high-quality benchmark for large-scale single object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5374\u20135383","DOI":"10.1109\/CVPR.2019.00552"},{"key":"7498_CR43","unstructured":"Loshchilov I, Hutter F (2017) Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101"},{"key":"7498_CR44","unstructured":"Benchmark U (2016) A benchmark and simulator for uav tracking. In: European Conference on Computer Vision, 7"},{"key":"7498_CR45","unstructured":"Kristan M, Leonardis A, Matas J, Felsberg M, Pflugfelder R, K\u00e4m\u00e4r\u00e4inen J-K, Chang HJ, Danelljan M, Zajc L\u010c, Luke\u017ei\u010d A, et al (2022) The tenth visual object tracking vot2022 challenge results. In: European Conference on Computer Vision, pp 431\u2013460. Springer"},{"key":"7498_CR46","doi-asserted-by":"crossref","unstructured":"Wang X, Shu X, Zhang Z, Jiang B, Wang Y, Tian Y, Wu F (2021) Towards more flexible and accurate object tracking with natural language: algorithms and benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13763\u201313773","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"7498_CR47","doi-asserted-by":"crossref","unstructured":"Kiani Galoogahi H, Fagg A, Huang C, Ramanan D, Lucey S (2017) Need for speed: a benchmark for higher frame rate object tracking. In: Proceedings of the IEEE International Conference on Computer Vision, pp 1125\u20131134","DOI":"10.1109\/ICCV.2017.128"},{"key":"7498_CR48","doi-asserted-by":"crossref","unstructured":"Zhang Z, Peng H, Fu J, Li B, Hu W (2020) Ocean: object-aware anchor-free tracking. In: European Conference on Computer Vision, pp 771\u2013787. Springer","DOI":"10.1007\/978-3-030-58589-1_46"},{"key":"7498_CR49","doi-asserted-by":"crossref","unstructured":"Yan B, Zhang X, Wang D, Lu H, Yang X (2021) Alpha-refine: boosting tracking performance by precise bounding box estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp 5289\u20135298","DOI":"10.1109\/CVPR46437.2021.00525"},{"key":"7498_CR50","doi-asserted-by":"crossref","unstructured":"Xie F, Wang C, Wang G, Cao Y, Yang W, Zeng W (2022) Correlation-aware deep tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 8751\u20138760","DOI":"10.1109\/CVPR52688.2022.00855"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07498-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-07498-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07498-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,9]],"date-time":"2025-06-09T11:50:56Z","timestamp":1749469856000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-07498-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,9]]},"references-count":50,"journal-issue":{"issue":"8","published-online":{"date-parts":[[2025,6]]}},"alternative-id":["7498"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-07498-y","relation":{},"ISSN":["1573-0484"],"issn-type":[{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,9]]},"assertion":[{"value":"23 May 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 June 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval and consent to participate"}}],"article-number":"993"}}