{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T01:14:20Z","timestamp":1773018860326,"version":"3.50.1"},"reference-count":290,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U23A20387"],"award-info":[{"award-number":["U23A20387"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62322212"],"award-info":[{"award-number":["62322212"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62036012"],"award-info":[{"award-number":["62036012"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072455"],"award-info":[{"award-number":["62072455"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62536003"],"award-info":[{"award-number":["62536003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402252"],"award-info":[{"award-number":["62402252"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Key Project of PCL","award":["PCL2025A14"],"award-info":[{"award-number":["PCL2025A14"]}]},{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2021ZD0112200"],"award-info":[{"award-number":["2021ZD0112200"]}],"id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CAS Project for Young Scientists in Basic Research","award":["YSBR-116"],"award-info":[{"award-number":["YSBR-116"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. 
Intell."],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1109\/tpami.2025.3630635","type":"journal-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T18:10:09Z","timestamp":1762539009000},"page":"2749-2771","source":"Crossref","is-referenced-by-count":3,"title":["Toward Visual Grounding: A Survey"],"prefix":"10.1109","volume":"48","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2592-5264","authenticated-orcid":false,"given":"Linhui","family":"Xiao","sequence":"first","affiliation":[{"name":"Pengcheng Laboratory (PCL), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5453-9755","authenticated-orcid":false,"given":"Xiaoshan","family":"Yang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8564-0346","authenticated-orcid":false,"given":"Xiangyuan","family":"Lan","sequence":"additional","affiliation":[{"name":"Pengcheng Laboratory (PCL), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2197-9038","authenticated-orcid":false,"given":"Yaowei","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology (Shenzhen), Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-9665","authenticated-orcid":false,"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/nature14539"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JRPROC.1961.287775"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-09438-9"},{"key":"ref4","volume-title":"Artificial Intelligence","author":"Winston","year":"1984"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3115\/1706269.1706296"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.3115\/1708322.1708334"},{"key":"ref15","first-page":"410","article-title":"A game-theoretic approach to generating spatial descriptions","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Golland","year":"2010"},{"key":"ref16","first-page":"95","article-title":"Natural reference to objects in a visual domain","volume-title":"Proc. 6th Int. Natural Lang. Gener. Conf.","author":"Mitchell","year":"2010"},{"key":"ref17","first-page":"1174","article-title":"Generating expressions that refer to visible objects","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics, Hum. Lang. 
Technol.","author":"Mitchell","year":"2013"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D13-1197"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"ref22","article-title":"Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic","author":"Chen","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d16-1044"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042066"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijinfomgt.2019.01.021"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1163\/9789004368811_003"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3321501"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1162\/COLI_a_00088"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00177"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref34","first-page":"1","article-title":"Grounding multimodal large language models to the world","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Peng","year":"2024"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2911066"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3090426"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref45","first-page":"1","article-title":"DINO: DETR with improved denoising anchor boxes for end-to-end object detection","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Zhang","year":"2023"},{"key":"ref46","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Li","year":"2021"},{"key":"ref47","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"ref49","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2022"},{"key":"ref50","first-page":"30016","article-title":"Training compute-optimal large language models","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Hoffmann","year":"2022"},{"issue":"8","key":"ref51","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref52","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1036\/1097-8542.253500"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02506"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.558"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00226"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CRV55824.2022.00015"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00641"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00479"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127738"},{"key":"ref63","first-page":"79095","article-title":"Described object detection: Liberating object detection with flexible expressions","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Xie","year":"2023"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_30"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25085-9_1"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3328185"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-53311-2_36"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00476"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.3390\/electronics14142815"},{"key":"ref70","article-title":"GREC: Generalized referring expression comprehension","author":"He","year":"2023"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2015.303"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3296823"},{"key":"ref73","first-page":"139854","article-title":"OneRef: Unified one-tower expression grounding and segmentation with mask referring modeling","volume-title":"Adv. Proc. Neural Inf. Process. Syst.","author":"Xiao","year":"2024"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00425"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3058684"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00669"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00263"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20123"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.375"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28363"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"ref83","first-page":"19652","article-title":"Referring Transformer: A one-step approach to multi-task visual grounding","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Li","year":"2021"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01607"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00782"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00431"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01010"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.644"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00376"},{"key":"ref93","article-title":"Refdrone: A challenging benchmark for referring expression comprehension in drone scenes","author":"Sun","year":"2025"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2222"},{"key":"ref95","first-page":"21667","article-title":"Referring to any person","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Jiang","year":"2025"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72775-7_2"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW67362.2025.00056"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-acl.512"},{"key":"ref99","first-page":"17675","article-title":"MC-bench: A benchmark for multi-context visual grounding in the era of MLLMs","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Xu","year":"2025"},{"key":"ref100","first-page":"22119","article-title":"When visual grounding meets gigapixel-level large-scale scenes: Benchmark and approach","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Ma","year":"2024"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1162"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1082"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00474"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.470"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_42"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.213"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.95"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"ref111","first-page":"1","article-title":"Pix2seq: A language modeling framework for object detection","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Chen","year":"2022"},{"key":"ref112","first-page":"1","article-title":"Unified-IO: A unified model for vision, language, and multi-modal tasks","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Lu","year":"2022"},{"key":"ref113","first-page":"1","article-title":"Git: A generative image-to-text transformer for vision and language","volume":"1","author":"Wang","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref114","first-page":"61501","article-title":"VisionLLM: Large language model is also an open-ended decoder for vision-centric tasks","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"36","author":"Wang","year":"2023"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1016\/0010-0285(72)90002-3"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.127599"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref119","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"28","author":"Ren","year":"2015"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.l007\/978-3-319-46448-0_2"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3331778"},{"key":"ref122","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Simonyan","year":"2015"},{"key":"ref123","first-page":"1","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","volume-title":"Proc. NIPS Workshop Deep Learn.","volume":"2014","author":"Chung","year":"2014"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2006.479"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_16"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/155"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20870-7_28"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6833"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1804.02767"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2016.2582924"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.520"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.333"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00447"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00206"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00477"},{"key":"ref140","article-title":"Real-time referring expression comprehension by single-stage grounding network","author":"Chen","year":"2018"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00255"},{"key":"ref143","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"Chung","year":"2014"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"ref146","first-page":"2204","article-title":"Recurrent models of visual attention","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"27","author":"Mnih","year":"2014"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.26.041002.131047"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d16-1053"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00808"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351074"},{"key":"ref153","first-page":"1","article-title":"An image is worth 16 x 16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2020"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681071"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25331"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25261"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681256"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"issue":"3","key":"ref161","first-page":"6","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","volume":"2","author":"Chiang","year":"2023"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.360"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3311646"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02796"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00626-4"},{"key":"ref167","first-page":"1","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Hu","year":"2021"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01667"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.258"},{"key":"ref173","first-page":"36067","article-title":"Glipv2: Unifying localization and vision-language understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Zhang","year":"2022"},{"key":"ref174","article-title":"DINO-X: A unified vision model for open-world object detection and understanding","author":"Ren","year":"2024"},{"key":"ref175","first-page":"21429","article-title":"Hierarchical open-vocabulary universal image segmentation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Wang","year":"2023"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"ref177","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"32","author":"Lu","year":"2019"},{"key":"ref178","first-page":"1","article-title":"Vl-BERT: Pre-training of generic visual-linguistic representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Su","year":"2020"},{"key":"ref179","article-title":"One-peace: Exploring one general representation model toward unlimited modalities","author":"Wang","year":"2023"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1561\/0600000105"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1410-8"},{"key":"ref183","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref184","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref185","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"issue":"240","key":"ref186","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref187","article-title":"Stanford alpaca: An instruction-following llama model","author":"Taori","year":"2023"},{"key":"ref188","article-title":"Instruction tuning with GPT-4","author":"Peng","year":"2023"},{"key":"ref189","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Liu","year":"2023"},{"key":"ref190","first-page":"72096","article-title":"Language is not all you need: Aligning perception with language models","volume-title":"Proc. Neural Inf. Process. Syst.","volume":"36","author":"Huang","year":"2023"},{"key":"ref191","article-title":"Ferret-v2: An improved baseline for referring and grounding with large language models","author":"Zhang","year":"2024"},{"key":"ref192","first-page":"1","article-title":"DINOv2: Learning robust visual features without supervision","volume":"1","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res. J."},{"key":"ref193","first-page":"16664","article-title":"Adaptformer: Adapting vision transformers for scalable visual recognition","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Chen","year":"2022"},{"key":"ref194","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Li","year":"2023"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00370"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.19"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02301"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01335"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_24"},{"key":"ref202","article-title":"MiniGPT-v2: Large language model as a unified interface for vision-language multi-task learning","author":"Chen","year":"2023"},{"key":"ref203","article-title":"Qwen-Vl: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888428"},{"key":"ref205","first-page":"8612","article-title":"Visual cot: Advancing multi-modal language models with a comprehensive dataset and benchmark for chain-of-thought reasoning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"37","author":"Shao","year":"2024"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73030-6_3"},{"key":"ref207","article-title":"BuboGPT: Enabling visual grounding in multi-modal llms","author":"Zhao","year":"2023"},{"key":"ref208","first-page":"1","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","volume-title":"Proc. 12th Int. Conf. Learn. Representations","author":"Zhu","year":"2024"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01309"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3860"},{"key":"ref211","first-page":"60116","article-title":"NeXT-Chat: An LMM for chat, detection and segmentation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zhang","year":"2024"},{"key":"ref212","article-title":"Texthawk: Exploring efficient fine-grained perception of multimodal large language models","author":"Yu","year":"2024"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1145\/3672398"},{"key":"ref214","first-page":"1","article-title":"VisRL: Intention-driven visual perception via reinforced reasoning","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","author":"Chen","year":"2025"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1145\/3660638"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3867"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.3233\/faia240541"},{"key":"ref221","first-page":"32942","article-title":"Coarse-to-fine vision-language pre-training with fusion in the backbone","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Dou","year":"2022"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01277"},{"key":"ref223","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Kim","year":"2021"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00597"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00270"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00269"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_44"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.159"},{"key":"ref230","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Zhang","year":"2020"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01387"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3186410"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3265816"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3311917"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161294"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3433547"},{"key":"ref237","first-page":"25994","article-title":"Multi-grained vision language pre-training: Aligning texts with visual concepts","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Zeng","year":"2022"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00265"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681058"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1145\/3656045"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611757"},{"key":"ref246","article-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445738"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00182"},{"key":"ref249","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Li","year":"2022"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095558"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16852"},{"key":"ref253","article-title":"Actress: Active retraining for semi-supervised visual grounding","author":"Kang","year":"2024"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.10.079"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/112"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128904"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01352"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10688227"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3297312"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128621"},{"key":"ref261","article-title":"Adapting clip for phrase localization without further training","author":"Li","year":"2022"},{"key":"ref262","first-page":"1","article-title":"Language models can do zero-shot visual referring expression comprehension","volume-title":"Proc. ICLR Tiny Paper","author":"Sui","year":"2023"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2024.01.004"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01362"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28278"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3462100"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72986-7_12"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN60899.2024.10649948"},{"key":"ref270","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3339628"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3361862"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00367"},{"key":"ref273","article-title":"Grill: Grounded vision-language pre-training via aligning text and image regions","author":"Jin","year":"2023"},{"key":"ref274","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_35"},{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72995-9_8"},{"key":"ref276","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128227"},{"key":"ref277","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01851"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_38"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.727"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.241"},{"key":"ref281","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2020.103968"},{"key":"ref282","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01074"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.1214"},{"key":"ref285","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32867"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-024-07566-y"},{"key":"ref287","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3615586"},{"key":"ref288","article-title":"Laion-400 m: Open 
dataset of clip-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021","journal-title":"NeurIPS Workshop Datacentric AI"},{"key":"ref289","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Schuhmann","year":"2022"},{"key":"ref290","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00254"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11372200\/11235566.pdf?arnumber=11235566","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T21:05:41Z","timestamp":1770671141000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11235566\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":290,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3630635","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3]]}}}
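
The record above is a standard Crossref REST API "work" envelope ({"status", "message-type", "message": {...}}), retrievable at https://api.crossref.org/works/{DOI}. A minimal sketch of fetching and inspecting it, assuming network access to api.crossref.org; the live record may have been re-deposited since this snapshot, and only the fields present above are read:

```python
# Minimal sketch: retrieve the Crossref work record shown above and read a
# few of its fields. Uses only the Python standard library.
import json
import urllib.request

DOI = "10.1109/tpami.2025.3630635"  # DOI field from the record above
URL = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(URL) as resp:
    record = json.load(resp)

# The work metadata itself sits under "message" in the envelope.
msg = record["message"]
print(msg["title"][0])             # "Toward Visual Grounding: A Survey"
print(msg["container-title"][0])   # journal name
print(msg["volume"], msg["issue"], msg["page"])
print(msg["reference-count"], "references")

# Authors are objects with "given", "family", and "affiliation" entries.
for author in msg["author"]:
    print(f'{author["given"]} {author["family"]}')

# Each entry in msg["reference"] carries a "key" and, where Crossref could
# match it, a "DOI"; unmatched entries carry bibliographic fields instead.
matched = sum(1 for ref in msg.get("reference", []) if "DOI" in ref)
print(f"{matched} of {len(msg['reference'])} references have a matched DOI")
```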