{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,25]],"date-time":"2025-09-25T16:26:10Z","timestamp":1758817570478,"version":"3.40.3"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781094"},{"type":"electronic","value":"9783031781100"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78110-0_24","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:53:40Z","timestamp":1733090020000},"page":"366-381","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Adaptive Text Feature Updating for\u00a0Visual-Language Tracking"],"prefix":"10.1007","author":[{"given":"Xuexin","family":"Liu","sequence":"first","affiliation":[]},{"given":"Zhuojun","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Hao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"24_CR1","doi-asserted-by":"crossref","unstructured":"Li, Z., Tao, R., Gavves, E., Snoek, C.G.M., Smeulders, A.W.M.: Tracking by natural language specification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6495\u20136503 (2017)","DOI":"10.1109\/CVPR.2017.777"},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Z., Mao, K., He, Z.: Joint visual grounding and tracking with natural language specification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23151\u201323160 (2023)","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"24_CR3","first-page":"65801","volume":"36","author":"X Zhao","year":"2023","unstructured":"Zhao, X., Zhang, D., Liyuan, H., Zhang, T., Bo, X.: Ode-based recurrent model-free reinforcement learning for pomdps. Adv. Neural. Inf. Process. Syst. 36, 65801\u201365817 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Wang, Z., Zhu, X., Zhang, T., Wang, B., Lei, Z.: 3d face reconstruction with the geometric guidance of facial part segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1672\u20131682 (2024)","DOI":"10.1109\/CVPR52733.2024.00165"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Towards more flexible and accurate object tracking with natural language: Algorithms and benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13763\u201313773 (2021)","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: All in one: exploring unified vision-language tracking with multi-modal alignment. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5552\u20135561 (2023)","DOI":"10.1145\/3581783.3611803"},{"key":"24_CR7","unstructured":"Ma, Y., Tang, Y., Yang, W., Zhang, T., Zhang, J., Kang, M.: Unifying visual and vision-language tracking via contrastive learning. arXiv preprint arXiv: 2401.11228 (2024)"},{"key":"24_CR8","doi-asserted-by":"crossref","unstructured":"Shao, Y., He, S., Ye, Q., Feng, Y., Luo, W., Chen, J.: Context-aware integration of language and visual references for natural language tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19208\u201319217 (2024)","DOI":"10.1109\/CVPR52733.2024.01817"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Yan, B., Peng, H., Fu, J., Wang, D., Lu, H.: Learning spatio-temporal transformer for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10448\u201310457 (2021)","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"24_CR10","doi-asserted-by":"publisher","unstructured":"Ye, B., Chang, H., Ma, B., Shan, S., Chen, X.: Joint feature learning and relation modeling for tracking: a one-stream framework. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13682. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_20","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"24_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Peng, H., Wang, D., Lu, H., Hu, H.: Seqtrack: sequence to sequence learning for visual object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14572\u201314581 (2023)","DOI":"10.1109\/CVPR52729.2023.01400"},{"key":"24_CR12","doi-asserted-by":"crossref","unstructured":"Wei, X., Bai, Y., Zheng, Y., Shi, D., Gong, Y.: Autoregressive visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9697\u20139706 (2023)","DOI":"10.1109\/CVPR52729.2023.00935"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Zou, Z., Hao, J., Shu, L.: Online feature classification and clustering for transformer-based visual tracker. In: 2022 26th International Conference on Pattern Recognition (ICPR), pp. 3514\u20133521. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9956181"},{"issue":"9","key":"24_CR14","doi-asserted-by":"publisher","first-page":"3433","DOI":"10.1109\/TCSVT.2020.3038720","volume":"31","author":"Z Yang","year":"2020","unstructured":"Yang, Z., Kumar, T., Chen, T., Jingsong, S., Luo, J.: Grounding-tracking-integration. IEEE Trans. Circuits Syst. Video Technol. 31(9), 3433\u20133443 (2020)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"24_CR15","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint, arXiv: 2303.08774 (2023)"},{"key":"24_CR16","unstructured":"Anil, R., et\u00a0al.: Palm 2 technical report. arXiv preprint, arXiv: 2305.1040 (2023)"},{"key":"24_CR17","unstructured":"Bai, J., et\u00a0al.: Qwen technical report. arXiv preprint, arXiv: 2309.16609 (2023)"},{"key":"24_CR18","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. neural inform. process. syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. neural inform. process. syst."},{"key":"24_CR19","unstructured":"Zhang, R., et al.: Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv preprint, arXiv: 2303.16199 (2023)"},{"key":"24_CR20","unstructured":"Bai, J., et al.: Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint, arXiv: 2308.12966 (2023)"},{"key":"24_CR21","unstructured":"Wang, W., et\u00a0al.: Cogvlm: Visual expert for pretrained language models. arXiv preprint, arXiv: 2311.03079 (2023)"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Hong, W., et\u00a0al.: Cogagent: a visual language model for gui agents. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14281\u201314290 (2024)","DOI":"10.1109\/CVPR52733.2024.01354"},{"key":"24_CR23","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"24_CR24","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning. Adv. Neural Inform. Process. Syst. 36 (2024)"},{"key":"24_CR25","unstructured":"Hu, E.J., et al.: Lora: Low-rank adaptation of large language models arXiv preprint arXiv: 2106.09685 (2021)"},{"key":"24_CR26","doi-asserted-by":"crossref","unstructured":"Li, Z., Tao, R., Gavves, E., Snoek, C.G.M., Smeulders, A.W.M.: Tracking by natural language specification. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7350\u20137358 (2017)","DOI":"10.1109\/CVPR.2017.777"},{"key":"24_CR27","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Lasot: ahigh-quality benchmark for large-scale single object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5374\u20135383 (2019)","DOI":"10.1109\/CVPR.2019.00552"},{"key":"24_CR28","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. arXiv preprint, arXiv: 1412.6980 (2014)"},{"key":"24_CR29","unstructured":"Dosovitskiy, A.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"24_CR30","doi-asserted-by":"crossref","unstructured":"Li, Y., Yu, J., Cai, Z., Pan, Y.: Cross-modal target retrieval for tracking by natural language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4931\u20134940 (2022)","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Feng, Q., Ablavsky, V., Bai, Q., Sclaroff, S.: Siamese natural language tracker: Tracking by natural language descriptions with siamese trackers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5851\u20135860 (2021)","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"24_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"771","DOI":"10.1007\/978-3-030-58589-1_46","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Zhang","year":"2020","unstructured":"Zhang, Z., Peng, H., Fu, J., Li, B., Hu, W.: Ocean: object-aware anchor-free tracking. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 771\u2013787. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_46"},{"key":"24_CR33","doi-asserted-by":"crossref","unstructured":"Chen, X., Yan, B., Zhu, J., Wang, D., Yang, X., Lu, H.: Transformer tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8126\u20138135 (2021)","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jiang, C., Wang, L., Wu, G.: Mixformer: end-to-end tracking with iterative mixed attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13608\u201313618 (2022)","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"24_CR35","doi-asserted-by":"publisher","unstructured":"Chen, B., et al.: Backbone is all your need: a simplified architecture for visual object tracking. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022, pp. 375\u2013392, Springer Nature Switzerland, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_22","DOI":"10.1007\/978-3-031-20047-2_22"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78110-0_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:34:50Z","timestamp":1733096090000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78110-0_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031781094","9783031781100"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78110-0_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}