{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T20:21:19Z","timestamp":1778617279796,"version":"3.51.4"},"reference-count":88,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202163"],"award-info":[{"award-number":["62202163"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072166"],"award-info":[{"award-number":["62072166"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372150"],"award-info":[{"award-number":["62372150"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472161"],"award-info":[{"award-number":["62472161"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004735","name":"Natural Science Foundation of Hunan Province","doi-asserted-by":"publisher","award":["2022JJ40190"],"award-info":[{"award-number":["2022JJ40190"]}],"id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004735","name":"Natural Science Foundation of Hunan Province","doi-asserted-by":"publisher","award":["2022JJ30231"],"award-info":[{"award-number":["2022JJ30231"]}],"id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004735","name":"Natural Science Foundation of Hunan Province","doi-asserted-by":"publisher","award":["2023JJ30169"],"award-info":[{"award-number":["2023JJ30169"]}],"id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tmm.2026.3660119","type":"journal-article","created":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T20:45:03Z","timestamp":1770065103000},"page":"3444-3457","source":"Crossref","is-referenced-by-count":0,"title":["Multi-Modal Refined Prompting for Advancing Knowledge-Based Visual Question Answering"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4569-1429","authenticated-orcid":false,"given":"Lei","family":"Zhu","sequence":"first","affiliation":[{"name":"College of Information and Intelligence, Hunan Agricultural University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5566-3900","authenticated-orcid":false,"given":"Mengxi","family":"Ying","sequence":"additional","affiliation":[{"name":"School of Computer, Hunan University of Technology, Zhuzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2721-6867","authenticated-orcid":false,"given":"Chengyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0371-9921","authenticated-orcid":false,"given":"Deyin","family":"Liu","sequence":"additional","affiliation":[{"name":"Anhui University, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6119-058X","authenticated-orcid":false,"given":"Lin Yuanbo","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Engineering, University of Warwick, England, U.K."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9981-2970","authenticated-orcid":false,"given":"Shichao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Guangxi Normal University, Guilin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2924-946X","authenticated-orcid":false,"given":"Xuelong","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence (TeleAI) of China Telecom, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3169065"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00768"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2180"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3521646"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3222965"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3235495"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3380259"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3521709"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-020-09832-7"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3292597"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2754246"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-024-02166-8"},{"key":"ref16","first-page":"10560","article-title":"Revive: Regional visual representation matters in knowledge-based visual question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Lin","year":"2022"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.517"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00501"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20174"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/153"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.44"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01389"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"ref25","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref26","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","volume":"139","author":"Radford","year":"2021"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01046"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27888"},{"key":"ref32","article-title":"PromptCap: Prompt-guided task-aware image captioning","author":"Hu","year":"2022"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.70"},{"key":"ref35","first-page":"1571","article-title":"Bilinear attention networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 31","author":"Kim","year":"2018"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2817340"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_25"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3104937"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3120194"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3173131"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3216770"},{"key":"ref44","first-page":"6616","article-title":"Large-scale adversarial training for vision-and-language representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Gan","year":"2020"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00553"},{"key":"ref48","article-title":"SimVLM: Simple visual language model pretraining with weak supervision","author":"Wang","year":"2021"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.187"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"ref52","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2022"},{"key":"ref53","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2022"},{"key":"ref54","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289729"},{"key":"ref57","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"32","author":"Lu","year":"2019"},{"key":"ref58","article-title":"VisualBERT: A simple and performant baseline for vision and language","author":"Li","year":"2019"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3272224"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01954-z"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3294991"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3105284"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3180563"},{"key":"ref65","article-title":"FortisAVQA and MAVEN: A benchmark dataset and debiasing framework for robust multimodal reasoning","author":"Ma","year":"2025"},{"key":"ref66","first-page":"9507","article-title":"Look, listen, and answer: Overcoming biases for audio-visual question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"38","author":"Ma","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3366154"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3355638"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1023\/B:BTTJ.0000047600.45421.6d"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3224577"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i16.33918"},{"key":"ref74","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.772"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.20"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-023-02028-9"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448108"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3384270"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.289"},{"key":"ref84","article-title":"Gemma 2: Improving open language models at a practical size","author":"Team","year":"2024"},{"key":"ref85","article-title":"Qwen2.5-Coder technical report","author":"Hui","year":"2024"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.21105\/joss.07211"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1171"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1612"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/11342315\/11370296.pdf?arnumber=11370296","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T19:49:50Z","timestamp":1778615390000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11370296\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":88,"URL":"https:\/\/doi.org\/10.1109\/tmm.2026.3660119","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}