{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T02:10:12Z","timestamp":1776132612427,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611830","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"547-556","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":27,"title":["RAMM: Retrieval-augmented Biomedical Visual Question Answering with Multi-modal Pre-training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7179-2437","authenticated-orcid":false,"given":"Zheng","family":"Yuan","sequence":"first","affiliation":[{"name":"Alibaba DAMO Academy, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1268-7239","authenticated-orcid":false,"given":"Qiao","family":"Jin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6676-3057","authenticated-orcid":false,"given":"Chuanqi","family":"Tan","sequence":"additional","affiliation":[{"name":"Alibaba DAMO Academy, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9425-7752","authenticated-orcid":false,"given":"Zhengyun","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2597-1973","authenticated-orcid":false,"given":"Hongyi","family":"Yuan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3709-5053","authenticated-orcid":false,"given":"Fei","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba DAMO Academy, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8084-0904","authenticated-orcid":false,"given":"Songfang","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba DAMO Academy, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1371"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of CLEF (Conference and Labs of the Evaluation Forum) 2019 Working Notes.","author":"Abacha Asma Ben","year":"2019","unstructured":"Asma Ben Abacha, Sadid A Hasan, Vivek V Datla, Dina Demner-Fushman, and Henning M\u00fcller. 2019. VQA-Med: Overview of the Medical Visual Question Answering Task at ImageCLEF 2019. In Proceedings of CLEF (Conference and Labs of the Evaluation Forum) 2019 Working Notes. 9-12 September 2019."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"D. Nguyen Binh Do Thanh-Toan X. Nguyen Binh Do Tuong Tjiputra Erman and D. Tran Quang. 2019. Overcoming Data Limitation in Medical Visual Question Answering. In MICCAI.","DOI":"10.1007\/978-3-030-32251-9_57"},{"key":"e_1_3_2_1_4_1","volume-title":"Multi-Modal Masked Autoencoders for Medical Vision-and-Language Pre-Training. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer.","author":"Chen Zhihong","year":"2022","unstructured":"Zhihong Chen, Yuhao Du, Jinpeng Hu, Yang Liu, Guanbin Li, Xiang Wan, and Tsung-Hui Chang. 2022a. Multi-Modal Masked Autoencoders for Medical Vision-and-Language Pre-Training. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547948"},{"key":"e_1_3_2_1_6_1","volume-title":"Lin (Eds.)","volume":"33","author":"Cubuk Ekin Dogus","year":"2020","unstructured":"Ekin Dogus Cubuk, Barret Zoph, Jon Shlens, and Quoc Le. 2020. RandAugment: Practical Automated Data Augmentation with a Reduced Search Space. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 18613--18624."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-demo.3"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_1_9_1","unstructured":"Tuong Do Binh X. Nguyen Erman Tjiputra Minh Tran Quang D. Tran and Anh Nguyen. 2021. Multiple Meta-model Quantifying for Medical Visual Question Answering. In MICCAI."},{"key":"e_1_3_2_1_10_1","volume-title":"An Empirical Study of Training End-to-End Vision-and-Language Transformers. In Conference on Computer Vision and Pattern Recognition (CVPR). https:\/\/arxiv.org\/abs\/2111","author":"Dou Zi-Yi","year":"2022","unstructured":"Zi-Yi Dou, Yichong Xu, Zhe Gan, Jianfeng Wang, Shuohang Wang, Lijuan Wang, Chenguang Zhu, Pengchuan Zhang, Lu Yuan, Nanyun Peng, Zicheng Liu, and Michael Zeng. 2022. An Empirical Study of Training End-to-End Vision-and-Language Transformers. In Conference on Computer Vision and Pattern Recognition (CVPR). https:\/\/arxiv.org\/abs\/2111.02387"},{"key":"e_1_3_2_1_11_1","volume-title":"Article arXiv:2112.13906 (Dec.","author":"Eslami Sedigheh","year":"2021","unstructured":"Sedigheh Eslami, Gerard de Melo, and Christoph Meinel. 2021. Does CLIP Benefit Visual Question Answering in the Medical Domain as Much as it Does in the General Domain? arXiv e-prints, Article arXiv:2112.13906 (Dec. 2021). arxiv: 2112.13906 [cs.CV]"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463584"},{"key":"e_1_3_2_1_13_1","volume-title":"CLEF 2021 - Conference and Labs of the Evaluation Forum, September 21-24, 2021, Bucharest, Romania (CEUR Workshop Proceedings).","author":"Gong Haifan","year":"2021","unstructured":"Haifan Gong, Ricong Huang, Guanqi Chen, and Guanbin Li. 2021b. SYSU-HCP at VQA-Med 2021: A Data-centric Model with Efficient Training Methodology for Medical Visual Question Answering. In CLEF 2021 - Conference and Labs of the Evaluation Forum, September 21-24, 2021, Bucharest, Romania (CEUR Workshop Proceedings)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458754"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 3929--3938","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In International Conference on Machine Learning. PMLR, 3929--3938."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-85251-1_23"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3490238"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-019-0322-0"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434063"},{"key":"e_1_3_2_1_21_1","unstructured":"Jin-Hwa Kim Jaehyun Jun and Byoung-Tak Zhang. 2018. Bilinear Attention Networks. In Advances in Neural Information Processing Systems 31. 1571--1581."},{"key":"e_1_3_2_1_22_1","volume-title":"Asma Ben Abacha, and Dina Demner-Fushman","author":"Lau Jason J","year":"2018","unstructured":"Jason J Lau, Soumya Gayen, Asma Ben Abacha, and Dina Demner-Fushman. 2018. A dataset of clinically generated visual questions and answers about radiology images. Scientific data, Vol. 5, 1 (2018), 1--10."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btz682"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1612"},{"key":"e_1_3_2_1_25_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Chenliang Li Haiyang Xu Junfeng Tian Wei Wang Ming Yan Bin Bi Jiabo Ye Hehong Chen Guohai Xu Zheng Cao et al. 2022. mPLUG: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections. arXiv preprint arXiv:2205.12005 (2022).","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"e_1_3_2_1_27_1","volume-title":"Shafiq Joty, Caiming Xiong, and Steven Hoi.","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath R. Selvaraju, Akhilesh Deepak Gotmare, Shafiq Joty, Caiming Xiong, and Steven Hoi. 2021. Align before Fuse: Vision and Language Representation Learning with Momentum Distillation. In NeurIPS."},{"key":"e_1_3_2_1_28_1","unstructured":"Xiaobo Liang Lijun Wu Juntao Li Yue Wang Qi Meng Tao Qin Wei Chen Min Zhang and Tie-Yan Liu. 2021. R-Drop: Regularized Dropout for Neural Networks. In NeurIPS."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","author":"Liu Bo","unstructured":"Bo Liu, Li-Ming Zhan, and Xiao-Ming Wu. 2021b. Contrastive Pre-training and Representation Distillation for Medical Visual Question Answering Based on Radiology Images. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 210--220."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6959--6969","author":"Long Alexander","unstructured":"Alexander Long, Wei Yin, Thalaiyasingam Ajanthan, Vu Nguyen, Pulak Purkait, Ravi Garg, Alan Blair, Chunhua Shen, and Anton van den Hengel. 2022. Retrieval augmented classification for long-tail visual recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 6959--6969."},{"key":"e_1_3_2_1_33_1","volume-title":"Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In 7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May 6-9, 2019."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2022.3207502"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-32251-9_57"},{"key":"e_1_3_2_1_36_1","volume-title":"Intravascular Imaging and Computer Assisted Stenting and Large-Scale Annotation of Biomedical Data and Expert Label Synthesis","author":"Pelka Obioma","unstructured":"Obioma Pelka, Sven Koitka, Johannes R\u00fcckert, Felix Nensa, and Christoph M Friedrich. 2018. Radiology objects in COntext (ROCO): a multimodal image dataset. In Intravascular Imaging and Computer Assisted Stenting and Large-Scale Annotation of Biomedical Data and Expert Label Synthesis. Springer, 180--189."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000019"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3549555.3549585"},{"key":"e_1_3_2_1_40_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_41_1","volume-title":"Saksham Singhal, Subhojit Som, et al.","author":"Wang Wenhui","year":"2022","unstructured":"Wenhui Wang, Hangbo Bao, Li Dong, Johan Bjorck, Zhiliang Peng, Qiang Liu, Kriti Aggarwal, Owais Khan Mohammed, Saksham Singhal, Subhojit Som, et al. 2022. Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_43_1","volume-title":"Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering. IEEE International Conference on Computer Vision (ICCV) (2017)","author":"Yu Zhou","year":"2017","unstructured":"Zhou Yu, Jun Yu, Jianping Fan, and Dacheng Tao. 2017. Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering. IEEE International Conference on Computer Vision (ICCV) (2017), 1839--1848."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.bionlp-1.9"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.bionlp-1.20"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413761"},{"key":"e_1_3_2_1_47_1","volume-title":"PMC-Patients: A Large-scale Dataset of Patient Notes and Relations Extracted from Case Reports in PubMed Central. arXiv preprint arXiv:2202.13876","author":"Zhao Zhengyun","year":"2022","unstructured":"Zhengyun Zhao, Qiao Jin, and Sheng Yu. 2022. PMC-Patients: A Large-scale Dataset of Patient Notes and Relations Extracted from Case Reports in PubMed Central. arXiv preprint arXiv:2202.13876 (2022)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611830","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611830","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:11Z","timestamp":1755820571000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611830"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":47,"alternative-id":["10.1145\/3581783.3611830","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611830","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}