{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:11:01Z","timestamp":1774602661849,"version":"3.50.1"},"reference-count":77,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,15]],"date-time":"2026-02-15T00:00:00Z","timestamp":1771113600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,15]],"date-time":"2026-02-15T00:00:00Z","timestamp":1771113600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-026-02774-7","type":"journal-article","created":{"date-parts":[[2026,2,15]],"date-time":"2026-02-15T04:51:08Z","timestamp":1771131068000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["GLAD: Generative Language-Assisted Visual Tracking for Low-Semantic Templates"],"prefix":"10.1007","volume":"134","author":[{"given":"Xingyu","family":"Luo","sequence":"first","affiliation":[]},{"given":"Yidong","family":"Cai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9297-7729","authenticated-orcid":false,"given":"Jie","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Gangshan","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Limin","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,15]]},"reference":[{"key":"2774_CR1","doi-asserted-by":"crossref","unstructured":"Bertinetto, L., Valmadre, J,. Golodetz, S., & et\u00a0al. (2016). Staple: Complementary learners for real-time tracking. In: CVPR. IEEE Computer Society, pp 1401\u20131409.","DOI":"10.1109\/CVPR.2016.156"},{"key":"2774_CR2","doi-asserted-by":"crossref","unstructured":"Bhat, G., Johnander, J., Danelljan, M., & et\u00a0al. (2018). Unveiling the power of deep tracking. In: ECCV (2), Lecture Notes in Computer Science, vol 11206. Springer, pp 493\u2013509.","DOI":"10.1007\/978-3-030-01216-8_30"},{"key":"2774_CR3","doi-asserted-by":"crossref","unstructured":"Bhat, G., Danelljan, M., Gool, L.V., & et\u00a0al. (2019). Learning discriminative model prediction for tracking. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 6182\u20136191.","DOI":"10.1109\/ICCV.2019.00628"},{"key":"2774_CR4","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., et al. (2020). Language models are few-shot learners. Advances in neural information processing systems, 33, 1877\u20131901.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR5","doi-asserted-by":"crossref","unstructured":"Cai, Y., Liu, J., Tang, J., & et\u00a0al. (2023). Robust object modeling for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 9589\u20139600.","DOI":"10.1109\/ICCV51070.2023.00879"},{"key":"2774_CR6","doi-asserted-by":"crossref","unstructured":"Chen, B., Li, P., Bai, L., & et\u00a0al. (2022). Backbone is all your need: A simplified architecture for visual object tracking. In: ECCV (22), Lecture Notes in Computer Science, vol 13682. Springer, pp 375\u2013392.","DOI":"10.1007\/978-3-031-20047-2_22"},{"key":"2774_CR7","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., & et\u00a0al. (2023a). Diffusiondet: Diffusion model for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 19830\u201319843.","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"2774_CR8","doi-asserted-by":"crossref","unstructured":"Chen, X., Yan, B., Zhu, J., & et\u00a0al. (2021). Transformer tracking. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8126\u20138135.","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"2774_CR9","doi-asserted-by":"crossref","unstructured":"Chen, X., Peng, H., Wang, D., & et\u00a0al. (2023b). Seqtrack: Sequence to sequence learning for visual object tracking. In: CVPR. IEEE, pp 14572\u201314581.","DOI":"10.1109\/CVPR52729.2023.01400"},{"key":"2774_CR10","doi-asserted-by":"crossref","unstructured":"Chen, X., Kang, B., Geng, W., & et\u00a0al. (2025). Sutrack: Towards simple and unified single object tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 2239\u20132247.","DOI":"10.1609\/aaai.v39i2.32223"},{"issue":"3","key":"2774_CR11","doi-asserted-by":"publisher","first-page":"1195","DOI":"10.1007\/s11263-024-02221-5","volume":"133","author":"Z Cheng","year":"2025","unstructured":"Cheng, Z., Zhu, F., Zhang, X. Y., et al. (2025). Breaking the limits of reliable prediction via generated data. International Journal of Computer Vision, 133(3), 1195\u20131221.","journal-title":"International Journal of Computer Vision"},{"key":"2774_CR12","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jiang, C., Wang, L., & et\u00a0al. (2022). Mixformer: End-to-end tracking with iterative mixed attention. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13608\u201313618","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"2774_CR13","doi-asserted-by":"publisher","first-page":"58736","DOI":"10.52202\/075280-2561","volume":"36","author":"Y Cui","year":"2023","unstructured":"Cui, Y., Song, T., Wu, G., et al. (2023). Mixformerv2: Efficient fully transformer tracking. Advances in neural information processing systems, 36, 58736\u201358751.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR14","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Bhat, G., Khan, F.S., & et\u00a0al. (2019). ATOM: accurate tracking by overlap maximization. In: CVPR. Computer Vision Foundation \/ IEEE, pp 4660\u20134669","DOI":"10.1109\/CVPR.2019.00479"},{"key":"2774_CR15","unstructured":"Devlin, J., Chang, M.W., Lee, K., & et\u00a0al. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"2774_CR16","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., & Nichol, A. (2021). Diffusion models beat gans on image synthesis. Advances in neural information processing systems, 34, 8780\u20138794.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR17","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., & et\u00a0al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"2774_CR18","doi-asserted-by":"crossref","unstructured":"Fan, H., Lin, L., Yang, F., & et\u00a0al. (2019). Lasot: A high-quality benchmark for large-scale single object tracking. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5374\u20135383.","DOI":"10.1109\/CVPR.2019.00552"},{"key":"2774_CR19","doi-asserted-by":"publisher","first-page":"439","DOI":"10.1007\/s11263-020-01387-y","volume":"129","author":"H Fan","year":"2021","unstructured":"Fan, H., Bai, H., Lin, L., et al. (2021). Lasot: A high-quality large-scale single object tracking benchmark. International Journal of Computer Vision, 129, 439\u2013461.","journal-title":"International Journal of Computer Vision"},{"key":"2774_CR20","doi-asserted-by":"crossref","unstructured":"Feng, Q., Ablavsky, V., Bai, Q., & et\u00a0al. (2020). Real-time visual object tracking with natural language description. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 700\u2013709","DOI":"10.1109\/WACV45572.2020.9093425"},{"key":"2774_CR21","doi-asserted-by":"crossref","unstructured":"Feng, Q., Ablavsky, V., Bai, Q., & et\u00a0al. (2021). Siamese natural language tracker: Tracking by natural language descriptions with siamese trackers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5851\u20135860","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"2774_CR22","doi-asserted-by":"crossref","unstructured":"Gao, S., Zhou, C., Ma, C., & et\u00a0al. (2022). Aiatrack: Attention in attention for transformer visual tracking. In: ECCV (22), Lecture Notes in Computer Science, vol 13682. Springer, pp 146\u2013164.","DOI":"10.1007\/978-3-031-20047-2_9"},{"key":"2774_CR23","doi-asserted-by":"crossref","unstructured":"Ge, S.R., Cao, J., & He, R. (2025). Improving object detection models via llm-based training data synthesis. International Journal of Computer Vision pp 1\u201316.","DOI":"10.1007\/s11263-025-02560-x"},{"key":"2774_CR24","unstructured":"Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks. In: Proceedings of the thirteenth international conference on artificial intelligence and statistics, JMLR Workshop and Conference Proceedings, pp 249\u2013256."},{"key":"2774_CR25","first-page":"4446","volume":"35","author":"M Guo","year":"2022","unstructured":"Guo, M., Zhang, Z., Fan, H., et al. (2022). Divert more attention to vision-language tracking. Advances in Neural Information Processing Systems, 35, 4446\u20134460.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2774_CR26","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & et\u00a0al. (2016). Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2774_CR27","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., & et\u00a0al. (2022). Masked autoencoders are scalable vision learners. In: CVPR. IEEE, pp 15979\u201315988","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2774_CR28","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in neural information processing systems, 33, 6840\u20136851.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR29","doi-asserted-by":"crossref","unstructured":"Huang, B., Lian, D., Luo, W., & et\u00a0al. (2021). Look before you leap: Learning landmark features for one-stage visual grounding. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16888\u201316897","DOI":"10.1109\/CVPR46437.2021.01661"},{"issue":"5","key":"2774_CR30","doi-asserted-by":"publisher","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","volume":"43","author":"L Huang","year":"2019","unstructured":"Huang, L., Zhao, X., & Huang, K. (2019). Got-10k: A large high-diversity benchmark for generic object tracking in the wild. IEEE transactions on pattern analysis and machine intelligence, 43(5), 1562\u20131577.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2774_CR31","first-page":"26565","volume":"35","author":"T Karras","year":"2022","unstructured":"Karras, T., Aittala, M., Aila, T., et al. (2022). Elucidating the design space of diffusion-based generative models. Advances in Neural Information Processing Systems, 35, 26565\u201326577.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2774_CR32","unstructured":"Kingma, D.P., & Welling, M. (2013). Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114."},{"key":"2774_CR33","doi-asserted-by":"crossref","unstructured":"Li, X., Ren, Y., Jin, X., & et\u00a0al. (2025). Diffusion models for image restoration and enhancement: a comprehensive survey. International Journal of Computer Vision pp 1\u201331.","DOI":"10.1007\/s11263-025-02570-9"},{"key":"2774_CR34","doi-asserted-by":"crossref","unstructured":"Li, Y., Yu, J., Cai, Z., & et\u00a0al. (2022). Cross-modal target retrieval for tracking by natural language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4931\u20134940.","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"2774_CR35","doi-asserted-by":"crossref","unstructured":"Li, Z., Tao, R., Gavves, E., & et\u00a0al. (2017). Tracking by natural language specification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6495\u20136503","DOI":"10.1109\/CVPR.2017.777"},{"key":"2774_CR36","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101"},{"key":"2774_CR37","doi-asserted-by":"crossref","unstructured":"Lugmayr, A., Danelljan, M., Romero, A., & et\u00a0al. (2022). Repaint: Inpainting using denoising diffusion probabilistic models. In: CVPR. IEEE, pp 11451\u201311461","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"2774_CR38","unstructured":"Luo, S., Tan, Y., Patil, S., & et\u00a0al. (2023). Lcm-lora: A universal stable-diffusion acceleration module. arXiv preprint arXiv:2311.05556"},{"key":"2774_CR39","doi-asserted-by":"crossref","unstructured":"Ma, D., & Wu, X. (2023). Tracking by natural language specification with long short-term context decoupling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 14012\u201314021","DOI":"10.1109\/ICCV51070.2023.01288"},{"key":"2774_CR40","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., & et\u00a0al. (2016). Generation and comprehension of unambiguous object descriptions. In: CVPR. IEEE Computer Society, pp 11\u201320","DOI":"10.1109\/CVPR.2016.9"},{"key":"2774_CR41","doi-asserted-by":"crossref","unstructured":"Muller, M., Bibi, A., Giancola, S., & et\u00a0al. (2018). Trackingnet: A large-scale dataset and benchmark for object tracking in the wild. In: Proceedings of the European conference on computer vision (ECCV), pp 300\u2013317","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"2774_CR42","doi-asserted-by":"crossref","unstructured":"Pnvr, K., Singh, B., Ghosh, P., & et\u00a0al. (2023). Ld-znet: A latent diffusion approach for text-based image segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4157\u20134168","DOI":"10.1109\/ICCV51070.2023.00384"},{"issue":"7","key":"2774_CR43","doi-asserted-by":"publisher","first-page":"2367","DOI":"10.1007\/s11263-023-01977-6","volume":"132","author":"W Quan","year":"2024","unstructured":"Quan, W., Chen, J., Liu, Y., et al. (2024). Deep learning-based image and video inpainting: A survey. International Journal of Computer Vision, 132(7), 2367\u20132400.","journal-title":"International Journal of Computer Vision"},{"key":"2774_CR44","unstructured":"Radford, A., Kim, J.W., Hallacy, C., & et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In: International conference on machine learning, PMLR, pp 8748\u20138763"},{"key":"2774_CR45","first-page":"1","volume-title":"SC20: International Conference for High Performance Computing","author":"S Rajbhandari","year":"2020","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., et al. (2020). Zero: Memory optimizations toward training trillion parameter models. SC20: International Conference for High Performance Computing (pp. 1\u201316). Storage and Analysis, IEEE: Networking."},{"key":"2774_CR46","doi-asserted-by":"crossref","unstructured":"Rasley, J., Rajbhandari, S., Ruwase, O., & et\u00a0al. (2020). Deepspeed: System optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp 3505\u20133506","DOI":"10.1145\/3394486.3406703"},{"key":"2774_CR47","doi-asserted-by":"crossref","unstructured":"Rassin, R., Ravfogel, S., & Goldberg, Y. (2022). Dalle-2 is seeing double: Flaws in word-to-concept mapping in text2image models. arXiv preprint arXiv:2210.10606","DOI":"10.18653\/v1\/2022.blackboxnlp-1.28"},{"key":"2774_CR48","unstructured":"Rezende, D.J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In: International conference on machine learning, PMLR, pp 1278\u20131286"},{"key":"2774_CR49","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., & et\u00a0al. (2022). High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2774_CR50","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., & Brox, T. (2015). U-net: Convolutional networks for biomedical image segmentation. In: Medical image computing and computer-assisted intervention\u2013MICCAI 2015: 18th international conference, Munich, Germany, October 5-9, 2015, proceedings, part III 18, Springer, pp 234\u2013241","DOI":"10.1007\/978-3-319-24574-4_28"},{"issue":"3","key":"2774_CR51","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., et al. (2015). Imagenet large scale visual recognition challenge. International journal of computer vision, 115(3), 211\u2013252.","journal-title":"International journal of computer vision"},{"key":"2774_CR52","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., et al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, 35, 36479\u201336494.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR53","doi-asserted-by":"crossref","unstructured":"Shao, Y., He, S., Ye, Q., & et\u00a0al. (2024). Context-aware integration of language and visual references for natural language tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 19208\u201319217","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"2774_CR54","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., & et\u00a0al. (2015). Deep unsupervised learning using nonequilibrium thermodynamics. In: International conference on machine learning, PMLR, pp 2256\u20132265"},{"key":"2774_CR55","unstructured":"Song, Y., & Ermon, S. (2019). Generative modeling by estimating gradients of the data distribution. Advances in neural information processing systems 32"},{"key":"2774_CR56","first-page":"12438","volume":"33","author":"Y Song","year":"2020","unstructured":"Song, Y., & Ermon, S. (2020). Improved techniques for training score-based generative models. Advances in neural information processing systems, 33, 12438\u201312448.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR57","doi-asserted-by":"crossref","unstructured":"Song, Y., Ma, C., Wu, X., & et\u00a0al. (2018). VITAL: visual tracking via adversarial learning. In: CVPR. Computer Vision Foundation \/ IEEE Computer Society, pp 8990\u20138999","DOI":"10.1109\/CVPR.2018.00937"},{"key":"2774_CR58","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., & et\u00a0al. (2020). Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456"},{"key":"2774_CR59","first-page":"1415","volume":"34","author":"Y Song","year":"2021","unstructured":"Song, Y., Durkan, C., Murray, I., et al. (2021). Maximum likelihood training of score-based diffusion models. Advances in neural information processing systems, 34, 1415\u20131428.","journal-title":"Advances in neural information processing systems"},{"key":"2774_CR60","unstructured":"Song, Y., Dhariwal, P., Chen, M., & et\u00a0al. (2023). Consistency models. arXiv preprint arXiv:2303.01469"},{"key":"2774_CR61","unstructured":"Sun, Y., Yu, F., Chen, S., & et\u00a0al. (2024). Chattracker: Enhancing visual tracking performance via chatting with multimodal large language model. In: NeurIPS"},{"key":"2774_CR62","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., & et\u00a0al. (2017). Attention is all you need. Advances in neural information processing systems 30"},{"key":"2774_CR63","doi-asserted-by":"crossref","unstructured":"Wang, X., Shu, X., Zhang, Z., & et\u00a0al. (2021a). Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 13763\u201313773","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"2774_CR64","unstructured":"Wang, Z., Zhao, H., Li, Y., & et\u00a0al. (2021b). Do different tracking tasks require different appearance models? In: NeurIPS, pp 726\u2013738"},{"key":"2774_CR65","doi-asserted-by":"crossref","unstructured":"Wei, X., Bai, Y., Zheng, Y., & et\u00a0al. (2023). Autoregressive visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 9697\u20139706","DOI":"10.1109\/CVPR52729.2023.00935"},{"key":"2774_CR66","doi-asserted-by":"crossref","unstructured":"Yan, B., Peng, H., Fu, J., & et\u00a0al. (2021). Learning spatio-temporal transformer for visual tracking. In: ICCV. IEEE, pp 10428\u201310437","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"2774_CR67","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., & et\u00a0al. (2019). A fast and accurate one-stage approach to visual grounding. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 4683\u20134693","DOI":"10.1109\/ICCV.2019.00478"},{"issue":"9","key":"2774_CR68","doi-asserted-by":"publisher","first-page":"3433","DOI":"10.1109\/TCSVT.2020.3038720","volume":"31","author":"Z Yang","year":"2020","unstructured":"Yang, Z., Kumar, T., Chen, T., et al. (2020). Grounding-tracking-integration. IEEE Transactions on Circuits and Systems for Video Technology, 31(9), 3433\u20133443.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2774_CR69","doi-asserted-by":"crossref","unstructured":"Ye, B., Chang, H., Ma, B., & et\u00a0al. (2022). Joint feature learning and relation modeling for tracking: A one-stream framework. In: European conference on computer vision, Springer, pp 341\u2013357","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"2774_CR70","doi-asserted-by":"crossref","unstructured":"Zhang, C., Sun, X., Yang, Y., & et\u00a0al. (2023). All in one: Exploring unified vision-language tracking with multi-modal alignment. In: Proceedings of the 31st ACM International Conference on Multimedia, pp 5552\u20135561","DOI":"10.1145\/3581783.3611803"},{"key":"2774_CR71","unstructured":"Zhang, G., Zhong, B., Liang, Q., & et\u00a0al. (2024). Diffusion mask-driven visual-language tracking. In: IJCAI. ijcai.org, pp 1652\u20131660"},{"key":"2774_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Peng, H., Fu, J., & et\u00a0al. (2020). Ocean: Object-aware anchor-free tracking. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXI 16, Springer, pp 771\u2013787","DOI":"10.1007\/978-3-030-58589-1_46"},{"issue":"10","key":"2774_CR73","doi-asserted-by":"publisher","first-page":"2418","DOI":"10.1007\/s11263-020-01300-7","volume":"128","author":"B Zhao","year":"2020","unstructured":"Zhao, B., Yin, W., Meng, L., et al. (2020). Layout2image: Image generation from layout. International journal of computer vision, 128(10), 2418\u20132435.","journal-title":"International journal of computer vision"},{"key":"2774_CR74","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1016\/j.patrec.2023.02.023","volume":"168","author":"H Zhao","year":"2023","unstructured":"Zhao, H., Wang, X., Wang, D., et al. (2023). Transformer vision-language tracking via proxy token guided cross-modal fusion. Pattern Recognition Letters, 168, 10\u201316.","journal-title":"Pattern Recognition Letters"},{"issue":"4","key":"2774_CR75","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/TCSVT.2023.3301933","volume":"34","author":"Y Zheng","year":"2023","unstructured":"Zheng, Y., Zhong, B., Liang, Q., et al. (2023). Toward unified token learning for vision-language tracking. IEEE Transactions on Circuits and Systems for Video Technology, 34(4), 2125\u20132135.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2774_CR76","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Z., Mao, K., & et\u00a0al. (2023). Joint visual grounding and tracking with natural language specification. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 23151\u201323160","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"2774_CR77","unstructured":"Zhou, X., Wang, D., & Kr\u00e4henb\u00fchl, P. (2019). Objects as points. CoRR arxiv:abs\/1904.07850"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02774-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02774-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02774-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:34:22Z","timestamp":1774600462000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02774-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,15]]},"references-count":77,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2774"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02774-7","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,15]]},"assertion":[{"value":"22 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"There is no specific conflict of interest statement with this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}},{"value":"There is no specific compliance of ethical standard statement with this manuscript.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance of Ethical Standard"}},{"value":"There is no specific informed consent with this manuscript.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed Consent"}}],"article-number":"121"}}