{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T23:28:18Z","timestamp":1774308498799,"version":"3.50.1"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887130","type":"print"},{"value":"9783031887147","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88714-7_38","type":"book-chapter","created":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T11:37:59Z","timestamp":1743853079000},"page":"391-400","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Entity-Aware Cross-Modal Pretraining for\u00a0Knowledge-Based Visual Question Answering"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6021-7776","authenticated-orcid":false,"given":"Omar","family":"Adjali","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0755-2361","authenticated-orcid":false,"given":"Olivier","family":"Ferret","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7531-2522","authenticated-orcid":false,"given":"Sahar","family":"Ghannay","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0520-8436","authenticated-orcid":false,"given":"Herv\u00e9","family":"Le Borgne","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,4]]},"reference":[{"key":"38_CR1","doi-asserted-by":"publisher","unstructured":"Adjali, O., Ferret, O., Ghannay, S., Le Borgne, H.: Multi-level information retrieval augmented generation for knowledge-based visual question answering. In: Al-Onaizan, Y., Bansal, M., Chen, Y.N. (eds.) 2024 Conference on Empirical Methods in Natural Language Processing (EMNLP 2024), pp. 16499\u201316513. Association for Computational Linguistics, Miami, Florida, USA (2024). https:\/\/doi.org\/10.18653\/v1\/2024.emnlp-main.922","DOI":"10.18653\/v1\/2024.emnlp-main.922"},{"key":"38_CR2","doi-asserted-by":"crossref","unstructured":"Adjali, O., Grimal, P., Ferret, O., Ghannay, S., Le\u00a0Borgne, H.: Explicit knowledge integration for knowledge-aware visual question answering about named entities. In: Proceedings of the 2023 ACM International Conference on Multimedia Retrieval, pp. 29\u201338 (2023)","DOI":"10.1145\/3591106.3592227"},{"key":"38_CR3","doi-asserted-by":"publisher","unstructured":"Bulian, J., Buck, C., Gajewski, W., B\u00f6rschinger, B., Schuster, T.: Tomayto, Tomahto. Beyond token-level answer equivalence for question answering evaluation. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 291\u2013305. Association for Computational Linguistics, Abu Dhabi, United Arab Emirates (2022). https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.20","DOI":"10.18653\/v1\/2022.emnlp-main.20"},{"key":"38_CR4","doi-asserted-by":"crossref","unstructured":"Chen, S., Wang, J., Jiang, F., Lin, C.Y.: Improving entity linking by modeling latent entity type information. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a034, pp. 7529\u20137537 (2020)","DOI":"10.1609\/aaai.v34i05.6251"},{"key":"38_CR5","doi-asserted-by":"publisher","unstructured":"Chen, Y., et al.: Can pre-trained vision and language models answer visual information-seeking questions? In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 14948\u201314968. Association for Computational Linguistics, Singapore (2023). https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.925","DOI":"10.18653\/v1\/2023.emnlp-main.925"},{"key":"38_CR6","doi-asserted-by":"publisher","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: ArcFace: additive angular margin loss for deep face recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR June 16-20, pp. 4690\u20134699. Computer Vision Foundation \/ IEEE, Long Beach, CA, USA (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00482","DOI":"10.1109\/CVPR.2019.00482"},{"key":"38_CR7","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of Deep Bidirectional Transformers for Language Understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"38_CR8","doi-asserted-by":"publisher","unstructured":"Dong, X., Yu, W., Zhu, C., Jiang, M.: Injecting entity types into entity-guided text generation. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 734\u2013741. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic (2021). https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.56","DOI":"10.18653\/v1\/2021.emnlp-main.56"},{"key":"38_CR9","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR, June 27-30, pp. 770\u2013778. IEEE Computer Society, Las Vegas, NV, USA (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"38_CR10","doi-asserted-by":"crossref","unstructured":"Hu, Y., Hua, H., Yang, Z., Shi, W., Smith, N.A., Luo, J.: PromptCap: Prompt-Guided Task-Aware Image Captioning. arXiv preprint arXiv:2211.09699 (2022)","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"38_CR11","unstructured":"Hu, Z., et al.: AVIS: autonomous visual information seeking with large language model agent. In: Thirty-seventh Conference on Neural Information Processing Systems (2023)"},{"key":"38_CR12","doi-asserted-by":"publisher","unstructured":"Karpukhin, V., et al.: Dense passage retrieval for open-domain question answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6769\u20136781. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.550","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"38_CR13","unstructured":"Lerner, P.: R\u00e9pondre aux questions visuelles \u00e0 propos d\u2019entit\u00e9s nomm\u00e9es. Ph.D. thesis, Universit\u00e9 Paris-Saclay (2023). https:\/\/theses.hal.science\/tel-04352321"},{"key":"38_CR14","doi-asserted-by":"crossref","unstructured":"Lerner, P., Ferret, O., Guinaudeau, C.: Multimodal inverse cloze task for knowledge-based visual question answering. In: European Conference on Information Retrieval, pp. 569\u2013587. Springer (2023)","DOI":"10.1007\/978-3-031-28244-7_36"},{"key":"38_CR15","doi-asserted-by":"crossref","unstructured":"Lerner, P., Ferret, O., Guinaudeau, C.: Cross-modal retrieval for knowledge-based visual question answering. In: 46th European Conference on Information Retrieval (ECIR 2024): Advances in Information Retrieval, pp. 421\u2013438. Springer Nature Switzerland, Glasgow, Scotland (2024). https:\/\/doi.org\/10.1007\/978-3-031-56027-9_26","DOI":"10.1007\/978-3-031-56027-9_26"},{"key":"38_CR16","doi-asserted-by":"crossref","unstructured":"Lerner, P., et al.: ViQuAE, a dataset for knowledge-based visual question answering about named entities. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 3108\u20133120. Association for Computing Machinery, Madrid, Spain (2022)","DOI":"10.1145\/3477495.3531753"},{"key":"38_CR17","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: Learning customized visual models with retrieval-augmented knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15148\u201315158 (2023)","DOI":"10.1109\/CVPR52729.2023.01454"},{"key":"38_CR18","doi-asserted-by":"crossref","unstructured":"Luo, Z., Xi, Y., Zhang, R., Li, G., Zhao, Z., Ma, J.: Conditioned masked language and image modeling for image-text dense retrieval. In: Findings of the Association for Computational Linguistics: EMNLP 2022, pp. 130\u2013140 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.10"},{"key":"38_CR19","doi-asserted-by":"crossref","unstructured":"Ma, H., et al.: EI-CLIP: entity-aware interventional contrastive learning for E-commerce cross-modal retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18051\u201318061 (2022)","DOI":"10.1109\/CVPR52688.2022.01752"},{"key":"38_CR20","doi-asserted-by":"publisher","unstructured":"Mensink, T., et al.: Encyclopedic VQA: visual questions about detailed properties of fine-grained categories . In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 3090\u20133101. IEEE Computer Society, Los Alamitos, CA, USA (2023). https:\/\/doi.org\/10.1109\/ICCV51070.2023.00289","DOI":"10.1109\/ICCV51070.2023.00289"},{"key":"38_CR21","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning, ICML. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR, Virtual Event (2021). http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"38_CR22","doi-asserted-by":"crossref","unstructured":"Saito, K., et al.: Prefix conditioning unifies language and label supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2861\u20132870 (2023)","DOI":"10.1109\/CVPR52729.2023.00280"},{"key":"38_CR23","doi-asserted-by":"crossref","unstructured":"Wang, Z., Ng, P., Ma, X., Nallapati, R., Xiang, B.: Multi-passage BERT: a globally normalized BERT model for open-domain question answering. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5878\u20135882 (2019)","DOI":"10.18653\/v1\/D19-1599"},{"key":"38_CR24","doi-asserted-by":"crossref","unstructured":"Yang, Y., et\u00a0al.: Attentive mask CLIP. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2771\u20132781 (2023)","DOI":"10.1109\/ICCV51070.2023.00260"},{"key":"38_CR25","unstructured":"Yao, L., et al.: FILIP: fine-grained interactive language-image pre-training. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=cpDhcsEDC2"},{"issue":"9","key":"38_CR26","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88714-7_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T11:38:15Z","timestamp":1743853095000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88714-7_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887130","9783031887147"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88714-7_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"4 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}