{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T18:23:50Z","timestamp":1773512630064,"version":"3.50.1"},"reference-count":65,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Image Process."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tip.2025.3644175","type":"journal-article","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:04:36Z","timestamp":1768255476000},"page":"388-401","source":"Crossref","is-referenced-by-count":1,"title":["RAR: Retrieving and Ranking Augmented MLLMs for Visual Recognition"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4489-0005","authenticated-orcid":false,"given":"Ziyu","family":"Liu","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4264-4281","authenticated-orcid":false,"given":"Zeyi","family":"Sun","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1110-5062","authenticated-orcid":false,"given":"Yuhang","family":"Zang","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory, Shanghai, China"}]},{"given":"Wei","family":"Li","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Nanyang, China"}]},{"given":"Pan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory, Shanghai, China"}]},{"given":"Xiaoyi","family":"Dong","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6391-4921","authenticated-orcid":false,"given":"Yuanjun","family":"Xiong","sequence":"additional","affiliation":[{"name":"MThreads Inc, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8865-7896","authenticated-orcid":false,"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shatin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6877-5353","authenticated-orcid":false,"given":"Jiaqi","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 
ICML","author":"Radford"},{"key":"ref2","article-title":"EVA-CLIP: Improved training techniques for CLIP at scale","author":"Sun","year":"2023","journal-title":"arXiv:2303.15389"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01058"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"ref5","volume-title":"GPT-4V(ision) System Card","year":"2023"},{"key":"ref6","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref7","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. NeurIPS","author":"Liu"},{"key":"ref8","first-page":"49250","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai"},{"key":"ref9","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023","journal-title":"arXiv:2306.14824"},{"key":"ref10","article-title":"MPLUG-owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023","journal-title":"arXiv:2304.14178"},{"key":"ref11","article-title":"OpenFlamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023","journal-title":"arXiv:2308.01390"},{"key":"ref12","article-title":"InternLM-XComposer: A vision-language large model for advanced text-image comprehension and composition","author":"Zhang","year":"2023","journal-title":"arXiv:2309.15112"},{"key":"ref13","article-title":"Qwen-VL: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023","journal-title":"arXiv:2308.12966"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3860"},{"key":"ref15","article-title":"ShareGPT4 V: Improving large multi-modal models with better captions","author":"Chen","year":"2023","journal-title":"arXiv:2311.12793"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3617827"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3158317"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01817"},{"key":"ref19","first-page":"39755","article-title":"Retrieval-augmented multimodal language modeling","volume-title":"Proc. ICML","author":"Yasunaga"},{"key":"ref20","article-title":"RankRAG: Unifying context ranking with retrieval-augmented generation in LLMs","author":"Yu","year":"2024","journal-title":"arXiv:2407.02485"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.194"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"ref24","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","volume-title":"Proc. ICLR","author":"Gu"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_7"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref27","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. 
ICML","author":"Jia"},{"key":"ref28","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. ICML","author":"Li"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"ref32","article-title":"Alpha-CLIP: A CLIP model focusing on wherever you want","author":"Sun","year":"2023","journal-title":"arXiv:2312.03818"},{"key":"ref33","first-page":"24993","article-title":"Fine-grained visual prompting","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01101"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00088"},{"key":"ref39","article-title":"Recognize any regions","author":"Yang","year":"2023","journal-title":"arXiv:2311.01373"},{"key":"ref40","article-title":"Democratizing fine-grained visual recognition with large language models","volume-title":"Proc. ICLR","author":"Liu"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889473"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"ref44","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv:2310.03744"},{"key":"ref45","article-title":"GPT4 Vis: What can GPT-4 do for zero-shot visual recognition?","author":"Wu","year":"2023","journal-title":"arXiv:2311.15732"},{"key":"ref46","article-title":"The caltech-UCSD birds-200\u20132011","author":"Wah","year":"2011"},{"key":"ref47","article-title":"Novel dataset for fine-grained image categorization: Stanford dogs","volume-title":"Proc. CVPR Workshop","author":"Khosla"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.277"},{"key":"ref56","article-title":"Fine-grained visual classification of aircraft","author":"Maji","year":"2013","journal-title":"arXiv:1306.5151"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/219717.219748"},{"key":"ref58","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. ICML","author":"Li"},{"key":"ref59","first-page":"30662","article-title":"Vocabulary-free image classification","volume-title":"Proc. 
NeurIPS","author":"Conti"},{"key":"ref60","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv:2106.09685"},{"key":"ref61","article-title":"InternLM-XComposer2: Mastering free-form text-image composition and comprehension in vision-language large model","author":"Dong","year":"2024","journal-title":"arXiv:2401.16420"},{"key":"ref62","article-title":"Openclip","author":"Ilharco","year":"2021"},{"key":"ref63","first-page":"770","article-title":"Deep residual learning for image recognition","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR)","author":"He"},{"key":"ref64","article-title":"Qwen2.5-VL technical report","volume-title":"arXiv:2502.13923","author":"Bai","year":"2025"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/tbdata.2025.3618474"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/83\/11355710\/11346906.pdf?arnumber=11346906","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T05:58:15Z","timestamp":1770703095000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11346906\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":65,"URL":"https:\/\/doi.org\/10.1109\/tip.2025.3644175","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}