{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T16:46:38Z","timestamp":1772901998974,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,12]],"date-time":"2023-06-12T00:00:00Z","timestamp":1686528000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"ANR","award":["19-CE23-0028"],"award-info":[{"award-number":["19-CE23-0028"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,12]]},"DOI":"10.1145\/3591106.3592227","type":"proceedings-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T22:33:38Z","timestamp":1686263618000},"page":"29-38","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Explicit Knowledge Integration for Knowledge-Aware Visual Question Answering about Named Entities"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6021-7776","authenticated-orcid":false,"given":"Omar","family":"Adjali","sequence":"first","affiliation":[{"name":"CEA LIST, France"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5116-9417","authenticated-orcid":false,"given":"Paul","family":"Grimal","sequence":"additional","affiliation":[{"name":"CEA LIST, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0755-2361","authenticated-orcid":false,"given":"Olivier","family":"Ferret","sequence":"additional","affiliation":[{"name":"CEA LIST, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7531-2522","authenticated-orcid":false,"given":"Sahar","family":"Ghannay","sequence":"additional","affiliation":[{"name":"CNRS, LISN, Universit\u00e9 Paris-Saclay, France"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0520-8436","authenticated-orcid":false,"given":"Herv\u00e9","family":"Le Borgne","sequence":"additional","affiliation":[{"name":"CEA LIST, France"}]}],"member":"320","published-online":{"date-parts":[[2023,6,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International Conference on Language Resources and Evaluation (LREC). European Language Resources Association","author":"Adjali Omar","year":"2020","unstructured":"Omar Adjali, Romaric Besan\u00e7on, Olivier Ferret, Herv\u00e9 Le Borgne, and Brigitte Grau. 2020. Building a Multimodal Entity Linking Dataset From Tweets. In International Conference on Language Resources and Evaluation (LREC). European Language Resources Association, Marseille, France."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-45439-5_31"},{"key":"e_1_3_2_1_3_1","volume-title":"VQA: Visual Question Answering. In IEEE International Conference on Computer Vision, ICCV December 7-13","author":"Antol Stanislaw","year":"2015","unstructured":"Stanislaw Antol, Aishwarya Agrawal, Jiasen Lu, Margaret Mitchell, Dhruv Batra, C.\u00a0Lawrence Zitnick, and Devi Parikh. 2015. VQA: Visual Question Answering. In IEEE International Conference on Computer Vision, ICCV December 7-13. IEEE Computer Society, Santiago, Chile, 2425\u20132433."},{"key":"e_1_3_2_1_4_1","volume-title":"Commonsense Knowledge Reasoning and Generation with Pre-trained Language Models: A Survey. arXiv preprint arXiv:2201.12438 abs\/2201.12438","author":"Bhargava Prajjwal","year":"2022","unstructured":"Prajjwal Bhargava and Vincent Ng. 2022. Commonsense Knowledge Reasoning and Generation with Pre-trained Language Models: A Survey. arXiv preprint arXiv:2201.12438 abs\/2201.12438 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_7_1","volume-title":"Injecting entity types into entity-guided text generation. arXiv:2009.13401","author":"Dong Xiangyu","year":"2020","unstructured":"Xiangyu Dong, Wenhao Yu, Chenguang Zhu, and Meng Jiang. 2020. Injecting entity types into entity-guided text generation. arXiv:2009.13401 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.387"},{"key":"e_1_3_2_1_9_1","volume-title":"Fast Graph Representation Learning with PyTorch Geometric. In ICLR Workshop on Representation Learning on Graphs and Manifolds. MIT Press","author":"Fey Matthias","year":"2019","unstructured":"Matthias Fey and Jan\u00a0E. Lenssen. 2019. Fast Graph Representation Learning with PyTorch Geometric. In ICLR Workshop on Representation Learning on Graphs and Manifolds. MIT Press, New Orleans, LA, USA."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc\u00a0V. Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, 4904\u20134916. http:\/\/proceedings.mlr.press\/v139\/jia21b.html"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.567"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July(Proceedings of Machine Learning Research, Vol.\u00a0139)","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, Virtual Event, 5583\u20135594. http:\/\/proceedings.mlr.press\/v139\/kim21k.html"},{"key":"e_1_3_2_1_17_1","volume-title":"Semi-Supervised Classification with Graph Convolutional Networks. In 5th International Conference on Learning Representations ICLR April 24-26, Conference Track Proceedings. OpenReview.net","author":"N.","unstructured":"Thomas\u00a0N. Kipf and Max Welling. 2017. Semi-Supervised Classification with Graph Convolutional Networks. In 5th International Conference on Learning Representations ICLR April 24-26, Conference Track Proceedings. OpenReview.net, Toulon, France. https:\/\/openreview.net\/forum?id=SJU4ayYgl"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531753"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.438"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.522"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1598"},{"key":"e_1_3_2_1_22_1","first-page":"8","article-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems (December 8-14), Hanna\u00a0M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d\u2019Alch\u00e9-Buc, Emily\u00a0B. Fox, and Roman Garnett (Eds.). Neural Information Processing Systems Foundation, Inc., Vancouver, BC, Canada, 13\u201323.","journal-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.113"},{"key":"e_1_3_2_1_24_1","volume-title":"Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014","author":"Malinowski Mateusz","year":"2014","unstructured":"Mateusz Malinowski and Mario Fritz. 2014. A Multi-World Approach to Question Answering about Real-World Scenes based on Uncertain Input. In Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8-13, Zoubin Ghahramani, Max Welling, Corinna Cortes, Neil\u00a0D. Lawrence, and Kilian\u00a0Q. Weinberger (Eds.). MIT Press, Montreal, Quebec, Canada, 1682\u20131690. https:\/\/proceedings.neurips.cc\/paper\/2014\/hash\/d516b13671a4179d9b7b458a6ebdeb92-Abstract.html"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1159"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1078"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 10th international conference on Cross-language evaluation forum: multimedia experiments(Lecture Notes in Computer Science), Carol Peters, Barbara Caputo, Julio Gonzalo, Gareth\u00a0J","author":"Myoupo D\u00e9bora","unstructured":"D\u00e9bora Myoupo, Adrian Popescu, Herv\u00e9 Le\u00a0Borgne, and Pierre-Alain Mo\u00ebllic. 2010. Multimodal image retrieval over a large database. In Proceedings of the 10th international conference on Cross-language evaluation forum: multimedia experiments(Lecture Notes in Computer Science), Carol Peters, Barbara Caputo, Julio Gonzalo, Gareth\u00a0J.F. Jones, and Jayashree Kalpathy-Cramer (Eds.). Springer Berlin \/ Heidelberg, Berlin, Heidelberg, 177\u2013184."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems","author":"Narasimhan Medhini","year":"2018","unstructured":"Medhini Narasimhan, Svetlana Lazebnik, and Alexander\u00a0G. Schwing. 2018. Out of the Box: Reasoning with Graph Convolution Nets for Factual Visual Question Answering. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3-8, Samy Bengio, Hanna\u00a0M. Wallach, Hugo Larochelle, Kristen Grauman, Nicol\u00f2 Cesa-Bianchi, and Roman Garnett (Eds.). MIT Press, Montr\u00e9al, Canada, 2659\u20132670."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_28"},{"key":"e_1_3_2_1_31_1","volume-title":"NIPS 2017 Workshop on Autodiff. MIT Press","author":"Paszke Adam","year":"2017","unstructured":"Adam Paszke, Sam Gross, Soumith Chintala, Gregory Chanan, Edward Yang, Zachary DeVito, Zeming Lin, Alban Desmaison, Luca Antiga, and Adam Lerer. 2017. Automatic differentiation in pytorch. In NIPS 2017 Workshop on Autodiff. MIT Press, Long Beach, CA, USA."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML (18-24 July) (Proceedings of Machine Learning Research, Vol.\u00a0139)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML (18-24 July) (Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, Virtual Event, 8748\u20138763. http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"e_1_3_2_1_37_1","volume-title":"A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge. arXiv preprint arXiv:2206.01718 abs\/2206.01718","author":"Schwenk Dustin","year":"2022","unstructured":"Dustin Schwenk, Apoorv Khandelwal, Christopher Clark, Kenneth Marino, and Roozbeh Mottaghi. 2022. A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge. arXiv preprint arXiv:2206.01718 abs\/2206.01718 (2022). https:\/\/arxiv.org\/abs\/2206.01718"},{"key":"e_1_3_2_1_38_1","volume-title":"KVQA: Knowledge-Aware Visual Question Answering. In The Thirty-Third AAAI Conference on Artificial Intelligence, AAAI","author":"Shah Sanket","year":"2019","unstructured":"Sanket Shah, Anand Mishra, Naganand Yadati, and Partha\u00a0Pratim Talukdar. 2019. KVQA: Knowledge-Aware Visual Question Answering. In The Thirty-Third AAAI Conference on Artificial Intelligence, AAAI 2019, The Thirty-First Innovative Applications of Artificial Intelligence Conference, IAAI 2019, The Ninth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI (January 27 - February 1). AAAI Press, Honolulu, Hawaii, USA, 8876\u20138884."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1455"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6428"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). Association for Computational Linguistics Hong Kong China 5100\u20135111. https:\/\/doi.org\/10.18653\/v1\/D19-1514","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_43_1","volume-title":"Aggregating Image and Text Quantized Correlated Components. In IEEE Conference on Computer Vision and Pattern Recognition,(CVPR)","author":"Quynh\u00a0Nhi Tran Thi","year":"2016","unstructured":"Thi Quynh\u00a0Nhi Tran, Herv\u00e9 Le\u00a0Borgne, and Michel Crucianu. 2016. Aggregating Image and Text Quantized Correlated Components. In IEEE Conference on Computer Vision and Pattern Recognition,(CVPR). Las Vegas, USA."},{"key":"e_1_3_2_1_44_1","volume-title":"Composition-based multi-relational graph convolutional networks. arXiv preprint arXiv:1911.03082","author":"Vashishth Shikhar","year":"2019","unstructured":"Shikhar Vashishth, Soumya Sanyal, Vikram Nitin, and Partha Talukdar. 2019. Composition-based multi-relational graph convolutional networks. arXiv preprint arXiv:1911.03082 (2019)."},{"key":"e_1_3_2_1_45_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna\u00a0M. Wallach, Rob Fergus, S.\u00a0V.\u00a0N. Vishwanathan, and Roman Garnett (Eds.). Long Beach, CA, USA, 5998\u20136008."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_1_47_1","volume-title":"Fvqa: Fact-based visual question answering","author":"Wang Peng","year":"2017","unstructured":"Peng Wang, Qi Wu, Chunhua Shen, Anthony Dick, and Anton Van Den\u00a0Hengel. 2017. Fvqa: Fact-based visual question answering. IEEE transactions on pattern analysis and machine intelligence 40, 10 (2017), 2413\u20132427."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/179"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00360"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.500"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.523"},{"key":"e_1_3_2_1_53_1","volume-title":"A survey of knowledge enhanced pre-trained models. arXiv:2110.00269","author":"Yang Jian","year":"2021","unstructured":"Jian Yang, Gang Xiao, Yulong Shen, Wei Jiang, Xinyu Hu, Ying Zhang, and Jinghui Peng. 2021. A survey of knowledge enhanced pre-trained models. arXiv:2110.00269 (2021)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1237"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-1090"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.306"},{"key":"e_1_3_2_1_58_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 abs\/2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 abs\/2205.01917 (2022)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"}],"event":{"name":"ICMR '23: International Conference on Multimedia Retrieval","location":"Thessaloniki Greece","acronym":"ICMR '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2023 ACM International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592227","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3591106.3592227","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:51:22Z","timestamp":1750182682000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3591106.3592227"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,12]]},"references-count":59,"alternative-id":["10.1145\/3591106.3592227","10.1145\/3591106"],"URL":"https:\/\/doi.org\/10.1145\/3591106.3592227","relation":{},"subject":[],"published":{"date-parts":[[2023,6,12]]},"assertion":[{"value":"2023-06-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}