{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,30]],"date-time":"2026-01-30T05:13:41Z","timestamp":1769750021431,"version":"3.49.0"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031282430","type":"print"},{"value":"9783031282447","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-28244-7_36","type":"book-chapter","created":{"date-parts":[[2023,3,16]],"date-time":"2023-03-16T17:03:18Z","timestamp":1678986198000},"page":"569-587","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Multimodal Inverse Cloze Task for\u00a0Knowledge-Based Visual Question Answering"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0882-8684","authenticated-orcid":false,"given":"Paul","family":"Lerner","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0755-2361","authenticated-orcid":false,"given":"Olivier","family":"Ferret","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7249-8715","authenticated-orcid":false,"given":"Camille","family":"Guinaudeau","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,3,17]]},"reference":[{"key":"36_CR1","doi-asserted-by":"publisher","unstructured":"Antol, S., et al.: VQA: visual question answering. In: 2015 IEEE International Conference on Computer Vision (ICCV), pp. 2425\u20132433. IEEE, Santiago, December 2015. https:\/\/doi.org\/10.1109\/ICCV.2015.279, http:\/\/ieeexplore.ieee.org\/document\/7410636\/","DOI":"10.1109\/ICCV.2015.279"},{"key":"36_CR2","doi-asserted-by":"crossref","unstructured":"Aytar, Y., Castrejon, L., Vondrick, C., Pirsiavash, H., Torralba, A.: Cross-modal scene networks. IEEE Trans. Pattern Anal. Mach. Intell. 40(10), 2303\u20132314 (2017). Publisher: IEEE","DOI":"10.1109\/TPAMI.2017.2753232"},{"key":"36_CR3","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv:1607.06450 [cs, stat], July 2016. http:\/\/arxiv.org\/abs\/1607.06450, arXiv: 1607.06450"},{"key":"36_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1007\/978-3-030-99739-7_30","volume-title":"Advances in Information Retrieval","author":"E Bassani","year":"2022","unstructured":"Bassani, E.: ranx: A blazing-fast Python library for\u00a0ranking evaluation and\u00a0comparison. In: Hagen, M., et al. (eds.) ECIR 2022. LNCS, vol. 13186, pp. 259\u2013264. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-99739-7_30"},{"key":"36_CR5","doi-asserted-by":"publisher","unstructured":"Bugliarello, E., Cotterell, R., Okazaki, N., Elliott, D.: Multimodal pretraining unmasked: a meta-analysis and a unified framework of vision-and-language BERTs. Trans. Assoc. Comput. Linguist. 9, 978\u2013994 (2021). https:\/\/doi.org\/10.1162\/tacl_a_00408","DOI":"10.1162\/tacl_a_00408"},{"key":"36_CR6","doi-asserted-by":"publisher","unstructured":"Chen, T., Xu, B., Zhang, C., Guestrin, C.: Training deep nets with sublinear memory cost, April 2016 https:\/\/doi.org\/10.48550\/arXiv.1604.06174, http:\/\/arxiv.org\/abs\/1604.06174, number: arXiv:1604.06174 [cs]","DOI":"10.48550\/arXiv.1604.06174"},{"key":"36_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"36_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), June 2019. https:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Deng_ArcFace_Additive_Angular_Margin_Loss_for_Deep_Face_Recognition_CVPR_2019_paper.html","DOI":"10.1109\/CVPR.2019.00482"},{"key":"36_CR9","doi-asserted-by":"publisher","unstructured":"Depeursinge, A., M\u00fcller, H.: Fusion techniques for combining textual and visual information retrieval. In: M\u00fcller, H., Clough, P., Deselaers, T., Caputo, B. (eds.) ImageCLEF: Experimental Evaluation in Visual Information Retrieval. The Information Retrieval Series, pp. 95\u2013114, Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15181-1_6","DOI":"10.1007\/978-3-642-15181-1_6"},{"key":"36_CR10","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, June 2019. https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"36_CR11","doi-asserted-by":"publisher","unstructured":"Fan, Y., et al.: Pre-training methods in information retrieval. Found. Trends\u00ae Inf. Retrieval 16(3), 178\u2013317 (2022). https:\/\/doi.org\/10.1561\/1500000100","DOI":"10.1561\/1500000100"},{"key":"36_CR12","unstructured":"Fisher, R.A.: The Design of Experiments, 2nd edn. Oliver & Boyd, Edinburgh & London (1937). https:\/\/www.cabdirect.org\/cabdirect\/abstract\/19371601600"},{"key":"36_CR13","unstructured":"Fun, H., Gandhi, S., Ravi, S.: Efficient retrieval optimized multi-task learning. arXiv:2104.10129 [cs], April 2021. http:\/\/arxiv.org\/abs\/2104.10129, arXiv: 2104.10129"},{"key":"36_CR14","doi-asserted-by":"crossref","unstructured":"Gao, L., Callan, J.: Condenser: a pre-training architecture for dense retrieval. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 981\u2013993. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, November 2021. https:\/\/aclanthology.org\/2021.emnlp-main.75","DOI":"10.18653\/v1\/2021.emnlp-main.75"},{"key":"36_CR15","doi-asserted-by":"publisher","unstructured":"Gao, L., Callan, J.: Unsupervised corpus aware language model pre-training for dense passage retrieval. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2843\u20132853. Association for Computational Linguistics, Dublin, May 2022. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.203, https:\/\/aclanthology.org\/2022.acl-long.203","DOI":"10.18653\/v1\/2022.acl-long.203"},{"key":"36_CR16","doi-asserted-by":"publisher","unstructured":"Garcia-Olano, D., Onoe, Y., Ghosh, J.: Improving and diagnosing knowledge-based visual question answering via entity enhanced knowledge injection. In: Companion Proceedings of the Web Conference 2022. WWW 2022, pp. 705\u2013715. Association for Computing Machinery, New York (2022). https:\/\/doi.org\/10.1145\/3487553.3524648","DOI":"10.1145\/3487553.3524648"},{"key":"36_CR17","doi-asserted-by":"crossref","unstructured":"Gard\u00e8res, F., Ziaeefard, M.: ConceptBert: concept-aware representation for visual question answering. In: Findings of the Association for Computational Linguistics: EMNLP 2020, p. 10 (2020). https:\/\/aclanthology.org\/2020.findings-emnlp.44\/","DOI":"10.18653\/v1\/2020.findings-emnlp.44"},{"key":"36_CR18","doi-asserted-by":"publisher","unstructured":"Geigle, G., Pfeiffer, J., Reimers, N., Vuli\u0107, I., Gurevych, I.: Retrieve fast, rerank smart: cooperative and joint approaches for improved cross-modal retrieval. Trans. Assoc. Comput. Linguist. 10, 503\u2013521 (2022). https:\/\/doi.org\/10.1162\/tacl_a_00473","DOI":"10.1162\/tacl_a_00473"},{"key":"36_CR19","doi-asserted-by":"publisher","unstructured":"Guo, Y., Nie, L., Wong, Y., Liu, Y., Cheng, Z., Kankanhalli, M.: A unified end-to-end retriever-reader framework for knowledge-based VQA. In: Proceedings of the 30th ACM International Conference on Multimedia. MM 2022, pp. 2061\u20132069. Association for Computing Machinery, New York (2022). https:\/\/doi.org\/10.1145\/3503161.3547870","DOI":"10.1145\/3503161.3547870"},{"key":"36_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016). https:\/\/openaccess.thecvf.com\/content_cvpr_2016\/papers\/He_Deep_Residual_Learning_CVPR_2016_paper.pdf","DOI":"10.1109\/CVPR.2016.90"},{"key":"36_CR21","doi-asserted-by":"publisher","unstructured":"Heo, Y.J., Kim, E.S., Choi, W.S., Zhang, B.T.: Hypergraph transformer: weakly-supervised multi-hop reasoning for knowledge-based visual question answering. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 373\u2013390. Association for Computational Linguistics, Dublin, May 2022. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.29, https:\/\/aclanthology.org\/2022.acl-long.29","DOI":"10.18653\/v1\/2022.acl-long.29"},{"key":"36_CR22","doi-asserted-by":"publisher","unstructured":"Hessel, J., Lee, L.: Does my multimodal model learn cross-modal interactions? It\u2019s harder to tell than you might think! In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 861\u2013877. Association for Computational Linguistics, Online, November 2020. https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.62, https:\/\/aclanthology.org\/2020.emnlp-main.62","DOI":"10.18653\/v1\/2020.emnlp-main.62"},{"key":"36_CR23","doi-asserted-by":"publisher","unstructured":"Hofst\u00e4tter, S., Lin, S.C., Yang, J.H., Lin, J., Hanbury, A.: Efficiently teaching an effective dense retriever with balanced topic aware sampling. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval. SIGIR 2021, pp. 113\u2013122. Association for Computing Machinery, New York (2021). https:\/\/doi.org\/10.1145\/3404835.3462891","DOI":"10.1145\/3404835.3462891"},{"key":"36_CR24","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: Proceedings of the 36th International Conference on Machine Learning, pp. 2790\u20132799. PMLR, May 2019. https:\/\/proceedings.mlr.press\/v97\/houlsby19a.html, ISSN 2640-3498"},{"issue":"3","key":"36_CR25","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2019","unstructured":"Johnson, J., Douze, M., J\u00e9gou, H.: Billion-scale similarity search with GPUs. IEEE Trans. Big Data 7(3), 535\u2013547 (2019). https:\/\/doi.org\/10.1109\/TBDATA.2019.2921572","journal-title":"IEEE Trans. Big Data"},{"key":"36_CR26","unstructured":"Kalantidis, Y., Sariyildiz, M.B., Pion, N., Weinzaepfel, P., Larlus, D.: Hard negative mixing for contrastive learning. In: Advances in Neural Information Processing Systems, vol. 33, pp. 21798\u201321809. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/f7cade80b7cc92b991cf4d2806d6bd78-Abstract.html"},{"key":"36_CR27","doi-asserted-by":"crossref","unstructured":"Karpukhin, V., et al.: Dense passage retrieval for open-domain question answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6769\u20136781. Association for Computational Linguistics, Online, November 2020. https:\/\/www.aclweb.org\/anthology\/2020.emnlp-main.550","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"36_CR28","doi-asserted-by":"publisher","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: a survey. ACM Comput. Surv. (2021). https:\/\/doi.org\/10.1145\/3505244, just Accepted","DOI":"10.1145\/3505244"},{"key":"36_CR29","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (Poster) (2015). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"36_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"147","DOI":"10.1007\/978-3-540-79860-6_12","volume-title":"Adaptive Multimedia Retrieval: Retrieval, User, and Semantics","author":"J Kludas","year":"2008","unstructured":"Kludas, J., Bruno, E., Marchand-Maillet, S.: Information fusion in multimedia information retrieval. In: Boujemaa, N., Detyniecki, M., N\u00fcrnberger, A. (eds.) AMR 2007. LNCS, vol. 4918, pp. 147\u2013159. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-79860-6_12"},{"key":"36_CR31","doi-asserted-by":"publisher","unstructured":"Lee, K., Chang, M.W., Toutanova, K.: Latent retrieval for weakly supervised open domain question answering. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 6086\u20136096. Association for Computational Linguistics, Florence, Italy, July 2019. https:\/\/doi.org\/10.18653\/v1\/P19-1612, https:\/\/aclanthology.org\/P19-1612","DOI":"10.18653\/v1\/P19-1612"},{"key":"36_CR32","doi-asserted-by":"publisher","unstructured":"Lerner, P., et al.: ViQuAE, a dataset for knowledge-based visual question answering about named entities. In: Proceedings of The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval. SIGIR 2022. Association for Computing Machinery, New York (2022). https:\/\/doi.org\/10.1145\/3477495.3531753, https:\/\/hal.archives-ouvertes.fr\/hal-03650618","DOI":"10.1145\/3477495.3531753"},{"key":"36_CR33","unstructured":"Lhoest, Q., et al.: Datasets: a community library for natural language processing. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 175\u2013184. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, November 2021. https:\/\/aclanthology.org\/2021.emnlp-demo.21"},{"key":"36_CR34","doi-asserted-by":"publisher","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, no. 07, pp. 11336\u201311344, April 2020. https:\/\/doi.org\/10.1609\/aaai.v34i07.6795, https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/6795, number: 07","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"36_CR35","doi-asserted-by":"publisher","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: Visualbert: A simple and performant baseline for vision and language (2019). https:\/\/doi.org\/10.48550\/ARXIV.1908.03557, https:\/\/arxiv.org\/abs\/1908.03557","DOI":"10.48550\/ARXIV.1908.03557"},{"issue":"4","key":"36_CR36","doi-asserted-by":"publisher","first-page":"1","DOI":"10.2200\/S01123ED1V01Y202108HLT053","volume":"14","author":"J Lin","year":"2021","unstructured":"Lin, J., Nogueira, R., Yates, A.: Pretrained transformers for text ranking: BERT and beyond. Synth. Lect. Hum. Lang. Technol. 14(4), 1\u2013325 (2021). https:\/\/doi.org\/10.2200\/S01123ED1V01Y202108HLT053","journal-title":"Synth. Lect. Hum. Lang. Technol."},{"key":"36_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"36_CR38","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32, pp. 13\u201323 (2019). https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/c74d97b01eae257e44aa9d5bade97baf-Abstract.html"},{"key":"36_CR39","doi-asserted-by":"publisher","unstructured":"Luo, M., Zeng, Y., Banerjee, P., Baral, C.: Weakly-supervised visual-retriever-reader for knowledge-based question answering. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 6417\u20136431. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, November 2021. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.517, https:\/\/aclanthology.org\/2021.emnlp-main.517","DOI":"10.18653\/v1\/2021.emnlp-main.517"},{"key":"36_CR40","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019). https:\/\/ieeexplore.ieee.org\/document\/8953725\/","DOI":"10.1109\/CVPR.2019.00331"},{"key":"36_CR41","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems, vol. 26 (2013). https:\/\/papers.neurips.cc\/paper\/2013\/hash\/9aa42b31882ec039965f3c4923ce901b-Abstract.html"},{"key":"36_CR42","unstructured":"Paszke, A., et al.: PyTorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32 (2019). https:\/\/papers.nips.cc\/paper\/2019\/hash\/bdbca288fee7f92f2bfa9f7012727740-Abstract.html"},{"key":"36_CR43","doi-asserted-by":"publisher","unstructured":"Petroni, F., et al.: KILT: a benchmark for knowledge intensive language tasks. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 2523\u20132544. Association for Computational Linguistics, Online, June 2021. https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.200, https:\/\/aclanthology.org\/2021.naacl-main.200","DOI":"10.18653\/v1\/2021.naacl-main.200"},{"key":"36_CR44","doi-asserted-by":"publisher","unstructured":"Qu, C., Zamani, H., Yang, L., Croft, W.B., Learned-Miller, E.: Passage retrieval for outside-knowledge visual question answering. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval. SIGIR 2021, pp. 1753\u20131757. Association for Computing Machinery, New York (2021). https:\/\/doi.org\/10.1145\/3404835.3462987","DOI":"10.1145\/3404835.3462987"},{"key":"36_CR45","doi-asserted-by":"publisher","unstructured":"Qu, Y., et al.: RocketQA: an optimized training approach to dense passage retrieval for open-domain question answering. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 5835\u20135847. Association for Computational Linguistics, Online, June 2021. https:\/\/doi.org\/10.18653\/v1\/2021.naacl-main.466, https:\/\/aclanthology.org\/2021.naacl-main.466","DOI":"10.18653\/v1\/2021.naacl-main.466"},{"key":"36_CR46","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"36_CR47","doi-asserted-by":"crossref","unstructured":"Ram, O., Shachaf, G., Levy, O., Berant, J., Globerson, A.: Learning to retrieve passages without supervision. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 2687\u20132700. Association for Computational Linguistics, Seattle, July 2022. https:\/\/aclanthology.org\/2022.naacl-main.193","DOI":"10.18653\/v1\/2022.naacl-main.193"},{"key":"36_CR48","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, vol. 28, pp. 91\u201399 (2015). https:\/\/proceedings.neurips.cc\/paper\/2015\/hash\/14bfa6bb14875e45bba028a21ed38046-Abstract.html"},{"key":"36_CR49","unstructured":"Robertson, S.E., Walker, S., Jones, S., Hancock-Beaulieu, M.M., Gatford, M.: Okapi at TREC-3. In: Harman, D.K. (ed.) Third Text REtrieval Conference (TREC-3). NIST Special Publication, vol. 500\u2013225, pp. 109\u2013126. National Institute of Standards and Technology (NIST) (1995)"},{"key":"36_CR50","doi-asserted-by":"crossref","unstructured":"Shah, S., Mishra, A., Yadati, N., Talukdar, P.P.: KVQA: knowledge-aware visual question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 8876\u20138884 (2019). https:\/\/144.208.67.177\/ojs\/index.php\/AAAI\/article\/view\/4915","DOI":"10.1609\/aaai.v33i01.33018876"},{"key":"36_CR51","doi-asserted-by":"publisher","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565. Association for Computational Linguistics, Melbourne, Australia, July 2018. https:\/\/doi.org\/10.18653\/v1\/P18-1238, https:\/\/aclanthology.org\/P18-1238","DOI":"10.18653\/v1\/P18-1238"},{"key":"36_CR52","doi-asserted-by":"publisher","unstructured":"Smucker, M.D., Allan, J., Carterette, B.: A comparison of statistical significance tests for information retrieval evaluation. In: Proceedings of the Sixteenth ACM Conference On Conference on Information and Knowledge Management. CIKM 2007, pp. 623\u2013632. Association for Computing Machinery, New York, November 2007. https:\/\/doi.org\/10.1145\/1321440.1321528","DOI":"10.1145\/1321440.1321528"},{"key":"36_CR53","doi-asserted-by":"publisher","unstructured":"Srinivasan, K., Raman, K., Chen, J., Bendersky, M., Najork, M.: Wit: Wikipedia-based image text dataset for multimodal multilingual machine learning. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval. SIGIR 2021, pp. 2443\u20132449. Association for Computing Machinery, New York (2021). https:\/\/doi.org\/10.1145\/3404835.3463257","DOI":"10.1145\/3404835.3463257"},{"key":"36_CR54","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15(1), 1929\u20131958 (2014). Publisher: JMLR.org"},{"key":"36_CR55","unstructured":"Su, W., et al.: Vl-BERT: pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations (2020). https:\/\/openreview.net\/forum?id=SygXPaEYvH"},{"key":"36_CR56","doi-asserted-by":"publisher","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5100\u20135111. Association for Computational Linguistics, Hong Kong, November 2019. https:\/\/doi.org\/10.18653\/v1\/D19-1514, https:\/\/www.aclweb.org\/anthology\/D19-1514","DOI":"10.18653\/v1\/D19-1514"},{"key":"36_CR57","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"36_CR58","doi-asserted-by":"publisher","unstructured":"Wang, Z., Li, L., Li, Q., Zeng, D.: Multimodal data enhanced representation learning for knowledge graphs. In: 2019 International Joint Conference on Neural Networks (IJCNN), pp. 1\u20138, July 2019). https:\/\/doi.org\/10.1109\/IJCNN.2019.8852079, Issn 2161-4407","DOI":"10.1109\/IJCNN.2019.8852079"},{"key":"36_CR59","doi-asserted-by":"publisher","unstructured":"Weston, J., Chopra, S., Bordes, A.: Memory networks (2014). https:\/\/doi.org\/10.48550\/ARXIV.1410.3916, https:\/\/arxiv.org\/abs\/1410.3916","DOI":"10.48550\/ARXIV.1410.3916"},{"key":"36_CR60","unstructured":"Wolf, T., et al.: HuggingFace\u2019s transformers: state-of-the-art natural language processing. arXiv:1910.03771 [cs], July 2020. http:\/\/arxiv.org\/abs\/1910.03771"},{"key":"36_CR61","doi-asserted-by":"publisher","unstructured":"Zhang, H., et al.: Modeling temporal-modal entity graph for procedural multimodal machine comprehension. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1179\u20131189. Association for Computational Linguistics, Dublin, May 2022. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.84, https:\/\/aclanthology.org\/2022.acl-long.84","DOI":"10.18653\/v1\/2022.acl-long.84"},{"key":"36_CR62","doi-asserted-by":"crossref","unstructured":"Zhu, Y., et al.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), December 2015. https:\/\/www.cv-foundation.org\/openaccess\/content_iccv_2015\/html\/Zhu_Aligning_Books_and_ICCV_2015_paper.html","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-28244-7_36","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,5]],"date-time":"2024-03-05T13:42:28Z","timestamp":1709646148000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-28244-7_36"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031282430","9783031282447"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-28244-7_36","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"17 March 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dublin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ireland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 April 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 April 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"45","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2023.org\/index.html?v=1.0","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"489","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"77","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"83","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"16% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}