{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,15]],"date-time":"2026-03-15T22:22:59Z","timestamp":1773613379158,"version":"3.50.1"},"reference-count":62,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Jiangsu Science and Technology Major Program","award":["BG2024041"],"award-info":[{"award-number":["BG2024041"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376069"],"award-info":[{"award-number":["62376069"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2024A1515012027"],"award-info":[{"award-number":["2024A1515012027"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Science and Technology Program","award":["ZDSYS20230626091203008"],"award-info":[{"award-number":["ZDSYS20230626091203008"]}]},{"name":"Shenzhen Science and Technology Program","award":["KQTD20240729102207002"],"award-info":[{"award-number":["KQTD20240729102207002"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. 
Multimedia"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tmm.2025.3599070","type":"journal-article","created":{"date-parts":[[2025,8,14]],"date-time":"2025-08-14T18:48:19Z","timestamp":1755197299000},"page":"7522-7532","source":"Crossref","is-referenced-by-count":5,"title":["RA-BLIP: Multimodal Adaptive Retrieval-Augmented Bootstrapping Language-Image Pre-Training"],"prefix":"10.1109","volume":"27","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9009-3615","authenticated-orcid":false,"given":"Muhe","family":"Ding","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}]},{"given":"Yang","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Computer Science, University of Sydney, Sydney, NSW, Australia"}]},{"given":"Pengda","family":"Qin","sequence":"additional","affiliation":[{"name":"Security Department, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0247-5221","authenticated-orcid":false,"given":"Jianlong","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}]},{"given":"Yuhong","family":"Li","sequence":"additional","affiliation":[{"name":"Security Department, Alibaba Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Harbin Institute of Technology, Shenzhen, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1612"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01600"},{"key":"ref5","first-page":"1","article-title":"MultimodalQA: Complex question answering over text, tables and images","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Talmor","year":"2021"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.290"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3289729"},{"issue":"70","key":"ref8","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung","year":"2024","journal-title":"J. Mach. Learn. Res."},{"key":"ref9","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref10","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. 2023 Int. Conf. Mach. Learn.","author":"Li","year":"2023"},{"key":"ref11","article-title":"GPT-4 Technical report","year":"2023"},{"key":"ref12","first-page":"240:1","article-title":"PaLM: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref13","first-page":"3929","article-title":"REALM: Retrieval augmented language model pre-training","volume-title":"Proc. 2020 Int. Conf. Mach. 
Learn.","author":"Guu","year":"2020"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.375"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02238"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613848"},{"key":"ref17","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Lewis","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.74"},{"key":"ref19","first-page":"2206","article-title":"Improving language models by retrieving from trillions of tokens","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"162","author":"Borgeaud","year":"2022"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611964"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.292"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3026892"},{"key":"ref23","article-title":"Universal multi-modality retrieval with one unified embedding space","author":"Liu","year":"2022"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.389"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.772"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3591864"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2972830"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3280734"},{"key":"ref30","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dai","year":"2023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3543873.3584627"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539151"},{"key":"ref35","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia","year":"2021"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref37","first-page":"1","article-title":"SimVLM: Simple visual language model pretraining with weak supervision","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wang","year":"2022"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3233258"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00553"},{"key":"ref42","first-page":"200","article-title":"Multimodal few-shot learning with frozen language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tsimpoukelli","year":"2021"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3291588"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"ref45","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. 
Neural Inf. Process. Syst.","volume":"35","author":"Alayrac","year":"2022"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.292"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.69"},{"key":"ref48","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref49","first-page":"3887","article-title":"Accelerating large-scale inference with anisotropic vector quantization","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Guo","year":"2020"},{"key":"ref50","first-page":"27263","article-title":"BARTScore: Evaluating generated text as text generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Yuan","year":"2021"},{"key":"ref51","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref52","first-page":"140:1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref53","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2022"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1525\/9780520940420-020"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1161"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-3029"},{"key":"ref57","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2022"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"ref59","first-page":"1143","article-title":"IM2Text: Describing images using 1 million captioned photographs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ordonez","year":"2011"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401110"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6294"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/10844992\/11125516.pdf?arnumber=11125516","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T17:25:49Z","timestamp":1761153949000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11125516\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":62,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3599070","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}
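
The record above is a Crossref `/works` response for the RA-BLIP article: bibliographic fields live under `message`, and each `reference` entry carries either a resolved `DOI` or unstructured citation fields. Below is a minimal sketch of fetching and reading such a record, assuming the public Crossref REST API at `api.crossref.org` and the third-party `requests` package; the `mailto` contact address is an illustrative placeholder.

```python
# Minimal sketch: fetch this Crossref work record by DOI and read key fields.
# Assumptions: public Crossref REST API (api.crossref.org/works/{doi});
# `requests` installed; the mailto address below is a placeholder used for
# Crossref's "polite pool" etiquette, not a real contact.
import requests

DOI = "10.1109/tmm.2025.3599070"
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # placeholder contact address
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # same shape as the record shown above

# Scalar bibliographic fields (title/container-title are single-element lists).
print(work["title"][0])                    # article title
print(work["container-title"][0])          # "IEEE Transactions on Multimedia"
print(work["volume"], work["page"])        # "27", "7522-7532"
print(work["issued"]["date-parts"][0][0])  # publication year: 2025

# References: count how many entries resolved to a DOI vs. the declared total.
resolved = [r["DOI"] for r in work.get("reference", []) if "DOI" in r]
print(f"{len(resolved)} of {work['references-count']} references carry a DOI")
```

Note that Crossref dates use the `date-parts` convention, a nested list of `[year, month, day]` with trailing parts omitted when unknown, which is why `issued` in this record carries only the year 2025.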