{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T05:03:55Z","timestamp":1768280635916,"version":"3.49.0"},"reference-count":94,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62576264"],"award-info":[{"award-number":["62576264"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Project supported by the National Science and Technology Major Project of the Ministry of Science and Technology of China","award":["2025ZD0551500"],"award-info":[{"award-number":["2025ZD0551500"]}]},{"name":"Project supported by the National Science and Technology Major Project of the Ministry of Science and Technology of China","award":["2025ZD0551502"],"award-info":[{"award-number":["2025ZD0551502"]}]},{"name":"Key Project of National Natural Science Foundation of China","award":["62431020"],"award-info":[{"award-number":["62431020"]}]},{"name":"Key Project of National Natural Science Foundation of China","award":["62231027"],"award-info":[{"award-number":["62231027"]}]},{"name":"Joint Fund Project of National Natural Science Foundation of China","award":["U22B2054"],"award-info":[{"award-number":["U22B2054"]}]},{"name":"Fund for Foreign Scholars in University Research and Teaching Programs","award":["B07048"],"award-info":[{"award-number":["B07048"]}]},{"name":"Postdoctoral Fellowship Program of China Postdoctoral Science Foundation","award":["GZC20232033"],"award-info":[{"award-number":["GZC20232033"]}]},{"name":"Program for Cheung Kong Scholars and Innovative Research Team in University","award":["IRT 15R53"],"award-info":[{"award-number":["IRT 15R53"]}]},{"name":"Key Scientific Technological Innovation Research Project by Ministry of Education and the National Key Laboratory of Human-Machine Hybrid Augmented Intelligence"},{"name":"Xi&#x2019;an Jiaotong University","award":["HMHAI-202404"],"award-info":[{"award-number":["HMHAI-202404"]}]},{"name":"Xi&#x2019;an Jiaotong University","award":["HMHAI-202405"],"award-info":[{"award-number":["HMHAI-202405"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tmm.2025.3632650","type":"journal-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:50:18Z","timestamp":1763146218000},"page":"915-928","source":"Crossref","is-referenced-by-count":0,"title":["Adaptive Multi-Modal Visual Tracking With Dynamic Semantic Prompts"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0375-9253","authenticated-orcid":false,"given":"Jiahao","family":"Wang","sequence":"first","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5669-9354","authenticated-orcid":false,"given":"Fang","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3354-9617","authenticated-orcid":false,"given":"Licheng","family":"Jiao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"given":"Hao","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2002-3894","authenticated-orcid":false,"given":"Shuo","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6130-2518","authenticated-orcid":false,"given":"Lingling","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5472-1426","authenticated-orcid":false,"given":"Puhua","family":"Chen","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8780-5455","authenticated-orcid":false,"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8872-2195","authenticated-orcid":false,"given":"Wenping","family":"Ma","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]},{"given":"Xinyi","family":"Wang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of Ministry of Education, International Research Center for Intelligent Perception and Computation, Joint International Research Laboratory of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.777"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093425"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00540"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3235720"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02217"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.106977"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/JSEN.2019.2925821"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2916895"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0345-5"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00868"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"ref13","first-page":"4446","article-title":"Divert more attention to vision-language tracking","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Guo","year":"2022"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3038720"},{"key":"ref15","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Houlsby","year":"2019"},{"key":"ref16","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28347"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3238511"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289700"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321497"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3358549"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3377471"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3428316"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00935"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00441"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-018-6871-y"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-022-06979-y"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/JSEN.2020.3033034"},{"key":"ref33","article-title":"Bootstrapping referring multi-object tracking","author":"Zhang","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1111\/2041-210X.13166"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01711-8"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2017.2740952"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123289"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.infrared.2019.04.017"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3310295"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3371355"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3391802"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3393298"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20187"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547851"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00918"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02507"},{"key":"ref47","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref48","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Tsimpoukelli","year":"2021"},{"key":"ref49","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Alayrac","year":"2022"},{"key":"ref50","article-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023"},{"key":"ref51","first-page":"19","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref52","article-title":"mPLUG-Owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023"},{"key":"ref53","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Liu","year":"2024"},{"key":"ref54","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023"},{"key":"ref55","first-page":"49250","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Dai","year":"2024"},{"key":"ref56","first-page":"61501","article-title":"VisionLLM: Large language model is also an open-ended decoder for vision-centric tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Wang","year":"2024"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-91813-1_4"},{"key":"ref58","article-title":"An image is worth 16  16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref59","article-title":"Towards efficient visual adaption via structural re-parameterization","author":"Luo","year":"2023"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_20"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2863604"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412984"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-68238-5_39"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01055"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25500"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01805"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00479"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00305"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25085-9_25"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3130533"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2023.3318601"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00552"},{"key":"ref73","first-page":"249","article-title":"Understanding the difficulty of training deep feedforward neural networks","volume-title":"Proc. 13th Int. Conf. Artif. Intell. Statist. JMLR Workshop Conf. Proc.","author":"Glorot","year":"2010"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TIV.2020.2980735"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58542-6_14"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3060862"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00709"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01812"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102531"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28325"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01309"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/130"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3206668"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01792"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00879"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3347644"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475349"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3301933"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611803"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01288"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73235-5_24"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/11342315\/11248958.pdf?arnumber=11248958","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:02:14Z","timestamp":1768255334000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11248958\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":94,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3632650","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}