{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T16:13:19Z","timestamp":1770394399890,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":52,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557578","type":"print"},{"value":"9789819557585","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5758-5_39","type":"book-chapter","created":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T04:59:33Z","timestamp":1770353973000},"page":"547-567","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["RGBT Tracking Based on\u00a0Multimodal Spatio-Temporal Feature Interaction and\u00a0Progressive Mamba Fusion"],"prefix":"10.1007","author":[{"given":"Zhiyuan","family":"Chang","sequence":"first","affiliation":[]},{"given":"Peng","family":"Yuan","sequence":"additional","affiliation":[]},{"given":"Zining","family":"Song","sequence":"additional","affiliation":[]},{"given":"He","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,7]]},"reference":[{"issue":"2","key":"39_CR1","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1007\/s41095-023-0345-5","volume":"10","author":"P Zhang","year":"2024","unstructured":"Zhang, P., Wang, D., Huchuan, L.: Multi-modal visual tracking: review and experimental comparison. Comput. Vis. Media 10(2), 193\u2013214 (2024)","journal-title":"Comput. Vis. Media"},{"key":"39_CR2","unstructured":"Tang, Z., Xu, T., Wu, X.J.: A survey for deep RGBT tracking. arXiv preprint arXiv:2201.09296 (2022)"},{"key":"39_CR3","doi-asserted-by":"crossref","unstructured":"Long Li, C., Lu, A., Hua Zheng, A., Tu, Z., Tang, J.: Multi-adapter RGBT tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops, pp. 2262\u20132270 (2019)","DOI":"10.1109\/ICCVW.2019.00279"},{"key":"39_CR4","doi-asserted-by":"publisher","first-page":"5613","DOI":"10.1109\/TIP.2021.3087341","volume":"30","author":"L Andong","year":"2021","unstructured":"Andong, L., Li, C., Yan, Y., Tang, J., Luo, B.: RGBT tracking via multi-adapter network with hierarchical divergence loss. IEEE Trans. Image Process. 30, 5613\u20135625 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR5","doi-asserted-by":"crossref","unstructured":"Hui, T., et al.: Bridging search region interaction with template for RGB-T tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13630\u201313639 (2023)","DOI":"10.1109\/CVPR52729.2023.01310"},{"key":"39_CR6","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Li, C., Luo, B., Tang, J., Wang, X.: Dense feature aggregation and pruning for RGBT tracking. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 465\u2013472 (2019)","DOI":"10.1145\/3343031.3350928"},{"key":"39_CR7","doi-asserted-by":"crossref","unstructured":"Gao, Y., Li, C., Zhu, Y., Tang, J., He, T., Wang, F.: Deep adaptive fusion network for high performance RGBT tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00017"},{"key":"39_CR8","doi-asserted-by":"crossref","unstructured":"Wei, X., Bai, Y., Zheng, Y., Shi, D., Gong, Y.: Autoregressive visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9697\u20139706 (2023)","DOI":"10.1109\/CVPR52729.2023.00935"},{"key":"39_CR9","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Zhong, B., Liang, Q., Mo, Z., Zhang, S., Li, X.: ODTrack: online dense temporal token learning for visual tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 7588\u20137596 (2024)","DOI":"10.1609\/aaai.v38i7.28591"},{"key":"39_CR10","unstructured":"Vaswani, A.: Attention is all you need. In: Advances in Neural Information Processing Systems (2017)"},{"key":"39_CR11","unstructured":"Lu, A., Qian, C., Li, C., Tang, J., Wang, L.: Duality-gated mutual condition network for RGBT tracking. IEEE Trans. Neural Netw. Learn. Syst. (2022)"},{"key":"39_CR12","doi-asserted-by":"crossref","unstructured":"Cao, B., Guo, J., Zhu, P., Qinghua, H.: Bi-directional adapter for multimodal tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 927\u2013935 (2024)","DOI":"10.1609\/aaai.v38i2.27852"},{"key":"39_CR13","unstructured":"Luo, Y., Guo, X., Feng, H., Ao, L.: RGB-T tracking via multi-modal mutual prompt learning. arXiv preprint arXiv:2308.16386 (2023)"},{"key":"39_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"222","DOI":"10.1007\/978-3-030-58542-6_14","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Li","year":"2020","unstructured":"Li, C., Liu, L., Lu, A., Ji, Q., Tang, J.: Challenge-aware RGBT tracking. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12367, pp. 222\u2013237. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58542-6_14"},{"key":"39_CR15","doi-asserted-by":"publisher","first-page":"2714","DOI":"10.1007\/s11263-021-01495-3","volume":"129","author":"P Zhang","year":"2021","unstructured":"Zhang, P., Wang, D., Huchuan, L., Yang, X.: Learning adaptive attribute-driven representation for real-time RGB-T tracking. Int. J. Comput. Vis. 129, 2714\u20132729 (2021)","journal-title":"Int. J. Comput. Vis."},{"issue":"2","key":"39_CR16","doi-asserted-by":"publisher","first-page":"579","DOI":"10.1109\/TCSVT.2021.3067997","volume":"32","author":"Y Zhu","year":"2021","unstructured":"Zhu, Y., Li, C., Tang, J., Luo, B., Wang, L.: RGBT tracking by trident fusion network. IEEE Trans. Circuits Syst. Video Technol. 32(2), 579\u2013592 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"39_CR17","unstructured":"Tang, Z., Xu, T., Wu, X.J.: Temporal aggregation for adaptive RGBT tracking. arXiv preprint arXiv:2201.08949 (2022)"},{"key":"39_CR18","first-page":"1","volume":"72","author":"F Zhang","year":"2023","unstructured":"Zhang, F., Peng, H., Lingli, Yu., Zhao, Y., Chen, B.: Dual-modality space-time memory network for RGBT tracking. IEEE Trans. Instrum. Meas. 72, 1\u201312 (2023)","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"39_CR19","doi-asserted-by":"crossref","unstructured":"Sun, D., Pan, Y., Lu, A., Li, C., Luo, B.: Transformer RGBT tracking with spatio-temporal multimodal tokens. IEEE Trans. Circ. Syst. Video Technol. (2024)","DOI":"10.1109\/TCSVT.2024.3425455"},{"key":"39_CR20","doi-asserted-by":"crossref","unstructured":"Wang, H., Liu, X., Li, Y., Sun, M., Yuan, D., Liu, J.: Temporal adaptive RGBT tracking with modality prompt. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 5436\u20135444 (2024)","DOI":"10.1609\/aaai.v38i6.28352"},{"key":"39_CR21","doi-asserted-by":"crossref","unstructured":"Shaker, A., Wasim, S.T., Khan, S., Gall, J., Khan, F.S.: GroupMAMBA: parameter-efficient and accurate group visual state space model. arXiv preprint arXiv:2407.13772 (2024)","DOI":"10.1109\/CVPR52734.2025.01389"},{"key":"39_CR22","doi-asserted-by":"crossref","unstructured":"Shi, Y., Dong, M., Xu, C.: Multi-scale VMAMBA: hierarchy in hierarchy visual state space model. arXiv preprint arXiv:2405.14174 (2024)","DOI":"10.52202\/079017-0808"},{"key":"39_CR23","unstructured":"Xiao, C., Li, M., Zhang, Z., Meng, D., Zhang, L.: Spatial-mamba: effective visual state space models via structure-aware state fusion. arXiv preprint arXiv:2410.15091 (2024)"},{"key":"39_CR24","unstructured":"Gu, A., Dao, T.: Mamba: linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752 (2023)"},{"key":"39_CR25","unstructured":"Zhu, L., Liao, B., Zhang, Q., Wang, X., Liu, W. and Wang, X.: Vision mamba: efficient visual representation learning with bidirectional state space model. In: Proceedings of the 41st International Conference on Machine Learning, vol. 235, pp. 62429\u201362442 (2024)"},{"key":"39_CR26","unstructured":"Liu, Y., et al.: Visual state space model, VMAMBA (2024)"},{"key":"39_CR27","first-page":"127181","volume":"37","author":"D Han","year":"2025","unstructured":"Han, D., et al.: Demystify mamba in vision: a linear attention perspective. Adv. Neural. Inf. Process. Syst. 37, 127181\u2013127203 (2025)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"12","key":"39_CR28","doi-asserted-by":"publisher","first-page":"5743","DOI":"10.1109\/TIP.2016.2614135","volume":"25","author":"C Li","year":"2016","unstructured":"Li, C., Cheng, H., Shiyi, H., Liu, X., Tang, J., Lin, L.: Learning collaborative sparse representation for grayscale-thermal tracking. IEEE Trans. Image Process. 25(12), 5743\u20135756 (2016)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR29","doi-asserted-by":"crossref","unstructured":"Li, C., Zhao, N., Lu, Y., Zhu, C., Tang, J.: Weighted sparse representation regularized graph learning for RGB-T object tracking. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 1856\u20131864 (2017)","DOI":"10.1145\/3123266.3123289"},{"key":"39_CR30","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.106977","volume":"96","author":"C Li","year":"2019","unstructured":"Li, C., Liang, X., Yijuan, L., Zhao, N., Tang, J.: RGB-T object tracking: benchmark and baseline. Pattern Recogn. 96, 106977 (2019)","journal-title":"Pattern Recogn."},{"key":"39_CR31","doi-asserted-by":"publisher","first-page":"392","DOI":"10.1109\/TIP.2021.3130533","volume":"31","author":"C Li","year":"2021","unstructured":"Li, C., et al.: LASHER: a large-scale high-diversity benchmark for RGBT tracking. IEEE Trans. Image Process. 31, 392\u2013404 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR32","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: LASOT: a high-quality benchmark for large-scale single object tracking. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5374\u20135383 (2019)","DOI":"10.1109\/CVPR.2019.00552"},{"key":"39_CR33","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"39_CR34","doi-asserted-by":"publisher","unstructured":"Ye, B., Chang, H., Ma, B., Shan, S., Chen, X.: Joint feature learning and relation modeling for tracking: a one-stream framework. In: European Conference on Computer Vision, pp. 341\u2013357. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_20","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"39_CR35","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Yang, M., Li, C., Liu, L., Tang, J.: Attribute-based progressive fusion network for RGBT tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 2831\u20132838 (2022)","DOI":"10.1609\/aaai.v36i3.20187"},{"key":"39_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, T., Guo, H., Jiao, Q., Zhang, Q., Han, J.: Efficient RGB-T tracking via cross-modality distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5404\u20135413 (2023)","DOI":"10.1109\/CVPR52729.2023.00523"},{"key":"39_CR37","doi-asserted-by":"publisher","first-page":"1753","DOI":"10.1109\/TIP.2024.3371355","volume":"33","author":"L Liu","year":"2024","unstructured":"Liu, L., Li, C., Xiao, Y., Ruan, R., Fan, M.: Rgbt tracking via challenge-based appearance disentanglement and interaction. IEEE Trans. Image Process. 33, 1753\u20131767 (2024)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR38","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2020.102881","volume":"72","author":"M Feng","year":"2020","unstructured":"Feng, M., Song, K., Wang, Y., Liu, J., Yan, Y.: Learning discriminative update adaptive spatial-temporal regularized correlation filter for RGB-T tracking. J. Vis. Commun. Image Represent. 72, 102881 (2020)","journal-title":"J. Vis. Commun. Image Represent."},{"key":"39_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, L., Danelljan, M., Gonzalez-Garcia, A., Van De Weijer, J., Shahbaz Khan, F.: Multi-modal fusion for end-to-end RGB-T tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00278"},{"key":"39_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, P., Zhao, J., Wang, D., Lu, H., Ruan, X.: Visible-thermal UAV tracking: a large-scale benchmark and new baseline. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8886\u20138895 (2022)","DOI":"10.1109\/CVPR52688.2022.00868"},{"key":"39_CR41","doi-asserted-by":"crossref","unstructured":"Nam, H., Han, B.: Learning multi-domain convolutional neural networks for visual tracking. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4293\u20134302 (2016)","DOI":"10.1109\/CVPR.2016.465"},{"key":"39_CR42","doi-asserted-by":"crossref","unstructured":"Zhu, J., Lai, S., Chen, X., Wang, D., Lu, H.: Visual prompt multi-modal tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9516\u20139526 (2023)","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"39_CR43","unstructured":"Wang, G., Luo, C., Sun, X., Xiong, Z., Zeng, W.: Real-time MDNET. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 83\u201398 (2018)"},{"key":"39_CR44","doi-asserted-by":"publisher","first-page":"3335","DOI":"10.1109\/TIP.2021.3060862","volume":"30","author":"P Zhang","year":"2021","unstructured":"Zhang, P., Zhao, J., Bo, C., Wang, D., Huchuan, L., Yang, X.: Jointly modeling motion and appearance cues for robust RGB-T tracking. IEEE Trans. Image Process. 30, 3335\u20133347 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR45","doi-asserted-by":"crossref","unstructured":"Danelljan, M., Bhat, G., Shahbaz Khan, F., Felsberg, M.: ECO: efficient convolution operators for tracking. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6638\u20136646 (2017)","DOI":"10.1109\/CVPR.2017.733"},{"issue":"2","key":"39_CR46","doi-asserted-by":"publisher","first-page":"393","DOI":"10.3390\/s20020393","volume":"20","author":"H Zhang","year":"2020","unstructured":"Zhang, H., Zhang, L., Zhuo, L., Zhang, J.: Object tracking in RGB-T videos using modal-aware attention network and competitive learning. Sensors 20(2), 393 (2020)","journal-title":"Sensors"},{"issue":"1","key":"39_CR47","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1109\/TIV.2020.2980735","volume":"6","author":"Y Zhu","year":"2020","unstructured":"Zhu, Y., Li, C., Tang, J., Luo, B.: Quality-aware feature aggregation network for robust RGBT tracking. IEEE Trans. Intell. Veh. 6(1), 121\u2013130 (2020)","journal-title":"IEEE Trans. Intell. Veh."},{"key":"39_CR48","doi-asserted-by":"crossref","unstructured":"Yang, J., Li, Z., Zheng, F., Leonardis, A., Song, J.: Prompting for multi-modal tracking. In Proceedings of the 30th ACM International Conference on Multimedia, pp. 3492\u20133500, 2022","DOI":"10.1145\/3503161.3547851"},{"key":"39_CR49","doi-asserted-by":"crossref","unstructured":"Tang, Z., Tianyang, X., Xiaojun, W., Zhu, X.-F., Kittler, J.: Generative-based fusion mechanism for multi-modal tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 5189\u20135197 (2024)","DOI":"10.1609\/aaai.v38i6.28325"},{"key":"39_CR50","doi-asserted-by":"crossref","unstructured":"Hong, L., et al.: OneTracker: unifying visual object tracking with foundation models and efficient tuning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19079\u201319091 (2024)","DOI":"10.1109\/CVPR52733.2024.01805"},{"key":"39_CR51","doi-asserted-by":"crossref","unstructured":"Wu, Z., et al.: Single-model and any-modality for video object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19156\u201319166 (2024)","DOI":"10.1109\/CVPR52733.2024.01812"},{"key":"39_CR52","unstructured":"Xiaojun Hou, X., et al. SDSTrack: self-distillation symmetric adapter learning for multi-modal visual object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26551\u201326561 (2024)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5758-5_39","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T04:59:42Z","timestamp":1770353982000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5758-5_39"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557578","9789819557585"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5758-5_39","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"7 February 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}