{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T03:53:18Z","timestamp":1778903598057,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["12126610"],"award-info":[{"award-number":["12126610"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Key Field Research and Development Plan","award":["2018B010109006"],"award-info":[{"award-number":["2018B010109006"]}]},{"name":"Guangzhou S&T Research Plan","award":["202007030010;202002020047"],"award-info":[{"award-number":["202007030010;202002020047"]}]},{"name":"Guangdong Key Field Research and Development Plan","award":["2019B020228001"],"award-info":[{"award-number":["2019B020228001"]}]},{"name":"National Key Research and Development Program of China","award":["2020YFB0204803"],"award-info":[{"award-number":["2020YFB0204803"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613848","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5399-5409","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Retrieval-based Knowledge Augmented Vision Language Pre-training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6840-8198","authenticated-orcid":false,"given":"Jiahua","family":"Rao","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8283-6498","authenticated-orcid":false,"given":"Zifei","family":"Shan","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2696-0014","authenticated-orcid":false,"given":"Longpo","family":"Liu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7001-4607","authenticated-orcid":false,"given":"Yao","family":"Zhou","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6782-2813","authenticated-orcid":false,"given":"Yuedong","family":"Yang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","volume-title":"Translating embeddings for modeling multi-relational data. Advances in neural information processing systems","author":"Bordes Antoine","year":"2013","unstructured":"Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, and Oksana Yakhnenko. 2013. Translating embeddings for modeling multi-relational data. Advances in neural information processing systems, Vol. 26 (2013)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Soravit Changpinyo Piyush Sharma Nan Ding and Radu Soricut. 2021. Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"e_1_3_2_1_4_1","volume-title":"workshop.","author":"Chen Kezhen","year":"2021","unstructured":"Kezhen Chen, Qiuyuan Huang, Yonatan Bisk, Daniel McDuff, and Jianfeng Gao. 2021. Kb-vlp: Knowledge based vision and language pretraining. In ICML, workshop."},{"key":"e_1_3_2_1_5_1","volume-title":"MuRAG: Multimodal Retrieval-Augmented Generator for Open Question Answering over Images and Text. arXiv preprint arXiv:2210.02928","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga, and William W Cohen. 2022a. MuRAG: Multimodal Retrieval-Augmented Generator for Open Question Answering over Images and Text. arXiv preprint arXiv:2210.02928 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794","author":"Chen Xi","year":"2022","unstructured":"Xi Chen, Xiao Wang, Soravit Changpinyo, AJ Piergiovanni, Piotr Padlewski, Daniel Salz, Sebastian Goodman, Adam Grycner, Basil Mustafa, Lucas Beyer, et al. 2022c. Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. Uniter: Universal image-text representation learning. In ECCV."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547948"},{"key":"e_1_3_2_1_9_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_10_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Machine Learning. PMLR, 3929--3938","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Mingwei Chang. 2020. Retrieval augmented language model pre-training. In International Conference on Machine Learning. PMLR, 3929--3938."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_13_1","volume-title":"REVEAL: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory. CVPR","author":"Hu Ziniu","year":"2023","unstructured":"Ziniu Hu, Ahmet Iscen, Chen Sun, Zirui Wang, Kai-Wei Chang, Yizhou Sun, Cordelia Schmid, David A. Ross, and Alireza Fathi. 2023. REVEAL: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory. CVPR (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning. PMLR, 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904--4916."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_16_1","volume-title":"Simple embedding for link prediction in knowledge graphs. Advances in neural information processing systems","author":"Kazemi Seyed Mehran","year":"2018","unstructured":"Seyed Mehran Kazemi and David Poole. 2018. Simple embedding for link prediction in knowledge graphs. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9434063"},{"key":"e_1_3_2_1_18_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"LAVIS: A Library for Language-Vision Intelligence. arXiv preprint arXiv:2209.09019","author":"Li Dongxu","year":"2022","unstructured":"Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, and Steven CH Hoi. 2022a. LAVIS: A Library for Language-Vision Intelligence. arXiv preprint arXiv:2209.09019 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022b. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_1_21_1","volume-title":"Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems, Vol. 34 (2021), 9694--9705."},{"key":"e_1_3_2_1_22_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i5.16554"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/544"},{"key":"e_1_3_2_1_27_1","volume-title":"A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge. arXiv preprint arXiv:2206.01718","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge. arXiv preprint arXiv:2206.01718 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"e_1_3_2_1_29_1","volume-title":"Visual Named Entity Linking: A New Dataset and A Baseline. arXiv preprint arXiv:2211.04872","author":"Sun Wenxiang","year":"2022","unstructured":"Wenxiang Sun, Yixing Fan, Jiafeng Guo, Ruqing Zhang, and Xueqi Cheng. 2022. Visual Named Entity Linking: A New Dataset and A Baseline. arXiv preprint arXiv:2211.04872 (2022)."},{"key":"e_1_3_2_1_30_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HkgEQnRqYQ","author":"Sun Zhiqing","year":"2019","unstructured":"Zhiqing Sun, Zhi-Hong Deng, Jian-Yun Nie, and Jian Tang. 2019. RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HkgEQnRqYQ"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022b. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318--23340."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00360"},{"key":"e_1_3_2_1_34_1","volume-title":"WikiDiverse: A Multimodal Entity Linking Dataset with Diversified Contextual Topics and Entity Types. arXiv preprint arXiv:2204.06347","author":"Wang Xuwu","year":"2022","unstructured":"Xuwu Wang, Junfeng Tian, Min Gui, Zhixu Li, Rui Wang, Ming Yan, Lihan Chen, and Yanghua Xiao. 2022a. WikiDiverse: A Multimodal Entity Linking Dataset with Diversified Contextual Topics and Entity Types. arXiv preprint arXiv:2204.06347 (2022)."},{"key":"e_1_3_2_1_35_1","volume-title":"Zihang Dai, Yulia Tsvetkov, and Yuan Cao.","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang, Jiahui Yu, Adams Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021b. Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)."},{"key":"e_1_3_2_1_36_1","volume-title":"Visual Entailment: A Novel Task for Fine-grained Image Understanding. arXiv preprint arXiv:1901.06706","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual Entailment: A Novel Task for Fine-grained Image Understanding. arXiv preprint arXiv:1901.06706 (2019)."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Learning Representations.","author":"Yang Bishan","year":"2015","unstructured":"Bishan Yang, Wen-tau Yih, Xiaodong He, Jianfeng Gao, and Li Deng. 2015. Embedding entities and relations for learning and inference in knowledge bases. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_38_1","unstructured":"Michihiro Yasunaga Antoine Bosselut Hongyu Ren Xikun Zhang Christopher D. Manning Percy Liang and Jure Leskovec. 2022. Deep Bidirectional Language-Knowledge Graph Pretraining. In Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_39_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1139"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-0152-y"},{"key":"e_1_3_2_1_42_1","volume-title":"Yuedong Yang, and Zhangming Niu.","author":"Zheng Shuangjia","year":"2021","unstructured":"Shuangjia Zheng, Jiahua Rao, Ying Song, Jixian Zhang, Xianglu Xiao, Evandro Fei Fang, Yuedong Yang, and Zhangming Niu. 2021. PharmKG: a dedicated knowledge graph benchmark for bomedical data mining. Briefings in bioinformatics, Vol. 22, 4 (2021), bbaa344."},{"key":"e_1_3_2_1_43_1","volume-title":"Cross-modal Graph Contrastive Learning with Cellular Images. bioRxiv","author":"Zheng Shuangjia","year":"2022","unstructured":"Shuangjia Zheng, Jiahua Rao, Jixian Zhang, Ethan Cohen, Chengtao Li, and Yuedong Yang. 2022. Cross-modal Graph Contrastive Learning with Cellular Images. bioRxiv (2022), 2022--06."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613848","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613848","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:21Z","timestamp":1755820581000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613848"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":43,"alternative-id":["10.1145\/3581783.3613848","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613848","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}