{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T03:48:46Z","timestamp":1770090526913,"version":"3.49.0"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach. Intell. Res."],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s11633-025-1591-z","type":"journal-article","created":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T14:11:34Z","timestamp":1770041494000},"page":"115-132","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Bootstrapping Large Language Models with Outsideknowledge for Knowledge-based Visual Question Answering"],"prefix":"10.1007","volume":"23","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2470-8399","authenticated-orcid":false,"given":"Yanze","family":"Min","sequence":"first","affiliation":[]},{"given":"Yawei","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Yin","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,2]]},"reference":[{"key":"1591_CR1","doi-asserted-by":"publisher","first-page":"2425","DOI":"10.1109\/ICCV.2015.279","volume-title":"Proceedings of IEEE International Conference on Computer Vision","author":"S Antol","year":"2015","unstructured":"S. Antol, A. Agrawal, J. Lu, M. Mitchell, D. Batra, C. L. Zitnick, D. Parikh. VQA: Visual question answering. In Proceedings of IEEE International Conference on Computer Vision, Santiago, Chile, pp. 2425\u20132433, 2015. DOI: https:\/\/doi.org\/10.1109\/ICCV.2015.279."},{"key":"1591_CR2","doi-asserted-by":"publisher","first-page":"3190","DOI":"10.1109\/CVPR.2019.00331","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K Marino","year":"2019","unstructured":"K. Marino, M. Rastegari, A. Farhadi, R. Mottaghi. OKVQA: A visual question answering benchmark requiring external knowledge. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA, pp. 3190\u20133199, 2019. DOI: https:\/\/doi.org\/10.1109\/CVPR.2019.00331."},{"issue":"10","key":"1591_CR3","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2017","unstructured":"P. Wang, Q. Wu, C. Shen, A. Dick, A. Van Den Hengel. FVQA: Fact-based visual question answering. IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 40, no. 10, pp. 2413\u20132427, 2017. DOI: https:\/\/doi.org\/10.1109\/TPAMI.2017.2754246.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1591_CR4","doi-asserted-by":"publisher","first-page":"14106","DOI":"10.1109\/CVPR46437.2021.01389","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K Marino","year":"2021","unstructured":"K. Marino, X. Chen, D. Parikh, A. Gupta, M. Rohrbach. KRISP: Integrating implicit and symbolic knowledge for open-domain knowledge-based VQA. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Nashville, USA, pp. 14106\u201314116, 2021. DOI: https:\/\/doi.org\/10.1109\/CVPR46437.2021.01389."},{"key":"1591_CR5","volume-title":"GPT-4 technical report","author":"J Achiam","year":"2024","unstructured":"J. Achiam, S. Adler, S. Agarwal, L. Ahmad, I. Akkaya, F. L. Aleman, D. Almeida, J. Altenschmidt, S. Altman, S. Anadkat, R. Avila, I. Babuschkin, S. Balaji, V. Balcom, P. Baltescu, H. Bao, M. Bavarian, J. Belgum, I. Bello, J. Berdine, G. Bernadett-Shapiro, C. Berner, L. Bogdonoff, O. Boiko, M. Boyd, A. L. Brakman, G. Brockman, T. Brooks, M. Brundage, K. Button, T. Cai, R. Campbell, A. Cann, B. Carey, C. Carlson, R. Carmichael, B. Chan, C. Chang, F. Chantzis, D. Chen, S. Chen, R. Chen, J. Chen, M. Chen, B. Chess, C. Cho, C. Chu, H. W. Chung, D. Cummings, J. Currier, Y. Dai, C. Decareaux, T. Degry, N. Deutsch, D. Deville, A. Dhar, D. Dohan, S. Dowling, S. Dunning, A. Ecoffet, A. Eleti, T. Eloundou, D. Farhi, L. Fedus, N. Felix, S. P. Fishman, J. Forte, I. Fulford, L. Gao, E. Georges, C. Gibson, V. Goel, T. Gogineni, G. Goh, R. Gontijo-Lopes, J. Gordon, M. Grafstein, S. Gray, R. Greene, J. Gross, S. S. Gu, Y. Guo, C. Hallacy, J. Han, J. Harris, Y. He, M. Heaton, J. Heidecke, C. Hesse, A. Hickey, W. Hickey, P. Hoeschele, B. Houghton, K. Hsu, S. Hu, X. Hu, J. Huizinga, S. Jain, S. Jain, J. Jang, A. Jiang, R. Jiang, H. Jin, D. Jin, S. Jomoto, B. Jonn, H. Jun, T. Kaftan, \u0141. Kaiser, A. Kamali, I. Kanitscheider, N. S. Keskar, T. Khan, L. Kilpatrick, J. W. Kim, C. Kim, Y. Kim, J. H. Kirchner, J. Kiros, M. Knight, D. Kokotajlo, \u0141 Kondraciuk, A. Kondrich, A. Konstantinidis, K. Kosic, G. Krueger, V. Kuo, M. Lampe, I. Lan, T. Lee, J. Leike, J. Leung, D. Levy, C. M. Li, R. Lim, M. Lin, S. Lin, M. Litwin, T. Lopez, R. Lowe, P. Lue, A. Makanju, K. Malfacini, S. Manning, T. Markov, Y. Markovski, B. Martin, K. Mayer, A. Mayne, B. McGrew, S. M. McKinney, C. McLeavey, P. McMillan, J. McNeil, D. Medina, A. Mehta, J. Menick, L. Metz, A. Mishchenko, P. Mishkin, V. Monaco, E. Morikawa, D. Mossing, T. Mu, M. Murati, O. Murk, D. M\u00e9ly, A. Nair, R. Nakano, R. Nayak, A. Neelakantan, R. Ngo, H. Noh, L. Ouyang, C. O\u2019Keefe, J. Pachocki, A. Paino, J. Palermo, A. Pantuliano, G. Parascandolo, J. Parish, E. Parparita, A. Passos, M. Pavlov, A. Peng, A. Perelman, F. de Avila Belbute Peres, M. Petrov, H. P. de Oliveira Pinto, M. Pokorny, M. Pokrass, V. H. Pong, T. Powell, A. Power, B. Power, E. Proehl, R. Puri, A. Radford, J. Rae, A. Ramesh, C. Raymond, F. Real, K. Rimbach, C. Ross, B. Rotsted, H. Roussez, N. Ryder, M. Saltarelli, T. Sanders, S. Santurkar, G. Sastry, H. Schmidt, D. Schnurr, J. Schulman, D. Selsam, K. Sheppard, T. Sherbakov, J. Shieh, S. Shoker, P. Shyam, S. Sidor, E. Sigler, M. Simens, J. Sitkin, K. Slama, I. Sohl, B. Sokolowsky, Y. Song, N. Staudacher, F. P. Such, N. Summers, I. Sutskever, J. Tang, N. Tezak, M. B. Thompson, P. Tillet, A. Tootoonchian, E. Tseng, P. Tuggle, N. Turley, J. Tworek, J. F. C. Uribe, A. Vallone, A. Vijayvergiya, C. Voss, C. Wainwright, J. J. Wang, A. Wang, B. Wang, J. Ward, J. Wei, C. J. Weinmann, A. Welihinda, P. Welinder, J. Weng, L. Weng, M. Wiethoff, D. Willner, C. Winter, S. Wolrich, H. Wong, L. Workman, S. Wu, J. Wu, M. Wu, K. Xiao, T. Xu, S. Yoo, K. Yu, Q. Yuan, W. Zaremba, R. Zellers, C. Zhang, M. Zhang, S. Zhao, T. Zheng, J. Zhuang, W. Zhuk, B. Zoph, GPT-4 technical report, [Onine], Available: https:\/\/arxiv.org\/abs\/2303.08774, 2024."},{"key":"1591_CR6","volume-title":"MiniGPT-v2: Large language model as a unified interface for vision-language multi-task learning","author":"J Chen","year":"2023","unstructured":"J. Chen, D. Zhu, X. Shen, X. Li, Z. Liu, P. Zhang, R. Krishnamoorthi, V. Chandra, Y. Xiong, M. Elhoseiny. MiniGPT-v2: Large language model as a unified interface for vision-language multi-task learning, [Online], Available: https:\/\/arxiv.org\/abs\/2310.09478, 2023."},{"key":"1591_CR7","volume-title":"Llama 2: Open foundation and fine-tuned chat models","author":"H Touvron","year":"2023","unstructured":"H. Touvron, L. Martin, K. Stone, P. Albert, A. Almahairi, Y. Babaei, N. Bashlykov, S. Batra, P. Bhargava, S. Bhosale, D. Bikel, L. Blecher, C. C. Ferrer, M. Chen, G. Cucurull, D. Esiobu, J. Fernandes, J. Fu, W. Fu, B. Fuller, C. Gao, V. Goswami, N. Goyal, A. Hartshorn, S. Hosseini, R. Hou, H. Inan, M. Kardas, V. Kerkez, M. Khabsa, I. Kloumann, A. Korenev, P. S. Koura, M. A. Lachaux, T. Lavril, J. Lee, D. Liskovich, Y. Lu, Y. Mao, X. Martinet, T. Mihaylov, P. Mishra, I. Molybog, Y. Nie, A. Poulton, J. Reizenstein, R. Rungta, K. Saladi, A. Schelten, R. Silva, E. M. Smith, R. Subramanian, X. E. Tan, B. Tang, R. Taylor, A. Williams, J. X. Kuan, P. Xu, Z. Yan, I. Zarov, Y. Zhang, A. Fan, M. Kambadur, S. Narang, A. Rodriguez, R. Stojnic, S. Edunov, T. Scialom. Llama 2: Open foundation and fine-tuned chat models, [Online], Available: https:\/\/arxiv.org\/abs\/2307.09288, 2023."},{"key":"1591_CR8","first-page":"1877","volume-title":"Proceedings of the 34th Conference on Neural Information Processing Systems","author":"T B Brown","year":"2020","unstructured":"T. B. Brown, B. Mann, N. Ryder, M. Subbiah, J. Kaplan, P. Dhariwal, A. Neelakantan, P. Shyam, G. Sastry, A. Askell, S. Agarwal, A. Herbert-Voss, G. Krueger, T. Henighan, R. Child, A. Ramesh, D. M. Ziegler, J. Wu, C. Winter, C. Hesse, M. Chen, E. Sigler, M. Litwin, S. Gray, B. Chess, J. Clark, C. Berner, S. McCandlish, A. Radford, I. Sutskever, D. Amodei. Language models are few-shot learners. In Proceedings of the 34th Conference on Neural Information Processing Systems, pp. 1877\u20131901, 2020."},{"key":"1591_CR9","doi-asserted-by":"publisher","first-page":"14974","DOI":"10.1109\/CVPR52729.2023.01438","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Shao","year":"2023","unstructured":"Z. Shao, Z. Yu, M. Wang, J. Yu. Prompting large language models with answer heuristics for knowledge-based visual question answering. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Vancouver, Canada, pp. 14974\u201314983, 2023. DOI: https:\/\/doi.org\/10.1109\/CVPR52729.2023.01438."},{"key":"1591_CR10","volume-title":"PromptCap: Prompt-guided task-aware image captioning","author":"Y Hu","year":"2023","unstructured":"Y. Hu, H. Hua, Z. Yang, W. Shi, N. A. Smith, J. Luo. PromptCap: Prompt-guided task-aware image captioning, [Online], Available: https:\/\/arxiv.org\/abs\/2211.09699, 2023."},{"key":"1591_CR11","doi-asserted-by":"publisher","first-page":"3081","DOI":"10.1609\/aaai.v36i3.20215","volume-title":"Proceedings of the 36th AAAI Conference on Artificial Intelligence","author":"Z Yang","year":"2022","unstructured":"Z. Yang, Z. Gan, J. Wang, X. Hu, Y. Lu, Z. Liu, L. Wang. An empirical study of GPT-3 for few-shot knowledgebased VQA. In Proceedings of the 36th AAAI Conference on Artificial Intelligence, pp. 3081\u20133089, 2022. DOI: https:\/\/doi.org\/10.1609\/aaai.v36i3.20215."},{"key":"1591_CR12","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Y Lin","year":"2022","unstructured":"Y. Lin, Y. Xie, D. Chen, Y. Xu, C. Zhu, L. Yuan. REVIVE: Regional visual representation matters in knowledge-based visual question answering. In Proceedings of the 36th International Conference on Neural Information Processing Systems, New Orleans, USA, Article number 767, 2022."},{"key":"1591_CR13","doi-asserted-by":"publisher","unstructured":"L. Huang, W. Yu, W. Ma, W. Zhong, Z. Feng, H. Wang, Q. Chen, W. Peng, X. Feng, B. Qin, T. Liu. A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, vol. 43, no. 2, Article number 42, 2025. DOI: https:\/\/doi.org\/10.1145\/3703155.","DOI":"10.1145\/3703155"},{"key":"1591_CR14","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3667112","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"W Lin","year":"2024","unstructured":"W. Lin, J. Chen, J. Mei, A. Coca, B. Byrne. Fine-grained late-interaction multi-modal retrieval for retrieval augmented visual question answering. In Proceedings of the 37th International Conference on Neural Information Processing Systems, New Orleans, USA, Article number 990, 2024. DOI: https:\/\/doi.org\/10.5555\/3666122.3667112."},{"key":"1591_CR15","volume-title":"Visual question answering: From early developments to recent advances\u2013a survey","author":"N D Huynh","year":"2025","unstructured":"N. D. Huynh, M. R. Bouadjenek, S. Aryal, I. Razzak, H. Hacid. Visual question answering: From early developments to recent advances\u2013a survey, [Online], Available: https:\/\/arxiv.org\/abs\/2501.03939, 2025."},{"key":"1591_CR16","doi-asserted-by":"publisher","first-page":"3567","DOI":"10.1609\/aaai.v30i1.10442","volume-title":"Proceedings of the 30th AAAI Conference on Artificial Intelligence","author":"L Ma","year":"2016","unstructured":"L. Ma, Z. Lu, H. Li. Learning to answer questions from image using convolutional neural network. In Proceedings of the 30th AAAI Conference on Artificial Intelligence, Phoenix, USA, pp. 3567\u20133573, 2016. DOI: https:\/\/doi.org\/10.1609\/aaai.v30i1.10442."},{"key":"1591_CR17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.9","volume-title":"Proceedings of IEEE International Conference on Computer Vision","author":"M Malinowski","year":"2015","unstructured":"M. Malinowski, M. Rohrbach, M. Fritz. Ask your neurons: A neural-based approach to answering questions about images. In Proceedings of IEEE International Conference on Computer Vision, Santiago, Chile, 2015. DOI: https:\/\/doi.org\/10.1109\/ICCV.2015.9."},{"key":"1591_CR18","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1109\/CVPR.2016.10","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"Z Yang","year":"2016","unstructured":"Z. Yang, X. He, J. Gao, L. Deng, A. Smola. Stacked attention networks for image question answering. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, USA, pp. 21\u201329, 2016. DOI: https:\/\/doi.org\/10.1109\/CVPR.2016.10."},{"key":"1591_CR19","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-319-46478-7_28","volume-title":"Proceedings of the 14th European Conference on Computer Vision","author":"H Xu","year":"2016","unstructured":"H. Xu, K. Saenko. Ask, attend and answer: Exploring question-guided spatial attention for visual question answering. In Proceedings of the 14th European Conference on Computer Vision, Amsterdam, The Netherlands, pp. 451\u2013466, 2016. DOI: https:\/\/doi.org\/10.1007\/978-3-319-46478-7_28."},{"key":"1591_CR20","doi-asserted-by":"publisher","first-page":"289","DOI":"10.5555\/3157096.3157129","volume-title":"Proceedings of the 30th International Conference on Neural Information Processing Systems","author":"J Lu","year":"2016","unstructured":"J. Lu, J. Yang, D. Batra, D. Parikh. Hierarchical questionimage co-attention for visual question answering. In Proceedings of the 30th International Conference on Neural Information Processing Systems, Barcelona, Spain, pp. 289\u2013297, 2016. DOI: https:\/\/doi.org\/10.5555\/3157096.3157129."},{"key":"1591_CR21","doi-asserted-by":"publisher","first-page":"6274","DOI":"10.1109\/CVPR.2019.00644","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Yu","year":"2019","unstructured":"Z. Yu, J. Yu, Y. Cui, D. Tao, Q. Tian. Deep modular coattention networks for visual question answering. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA, pp. 6274\u20136283, 2019. DOI: https:\/\/doi.org\/10.1109\/CVPR.2019.00644."},{"key":"1591_CR22","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1109\/CVPR.2016.12","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"J Andreas","year":"2016","unstructured":"J. Andreas, M. Rohrbach, T. Darrell, D. Klein. Neural module networks. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, USA, pp. 39\u201348, 2016. DOI: https:\/\/doi.org\/10.1109\/CVPR.2016.12."},{"key":"1591_CR23","doi-asserted-by":"publisher","first-page":"804","DOI":"10.1109\/ICCV.2017.93","volume-title":"Proceedings of IEEE International Conference on Computer Vision","author":"R Hu","year":"2017","unstructured":"R. Hu, J. Andreas, M. Rohrbach, T. Darrell, K. Saenko. Learning to reason: End-to-end module networks for visual question answering. In Proceedings of IEEE International Conference on Computer Vision, Venice, Italy, pp. 804\u2013813, 2017. DOI: https:\/\/doi.org\/10.1109\/ICCV.2017.93."},{"key":"1591_CR24","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1007\/978-3-030-01234-2_4","volume-title":"Proceedings of the 15th European Conference on Computer Vision","author":"R Hu","year":"2018","unstructured":"R. Hu, J. Andreas, T. Darrell, K. Saenko. Explainable neural computation via stack neural module networks. In Proceedings of the 15th European Conference on Computer Vision, Munich, Germany, pp. 55\u201371, 2018. DOI: https:\/\/doi.org\/10.1007\/978-3-030-01234-2_4."},{"key":"1591_CR25","doi-asserted-by":"publisher","first-page":"3233","DOI":"10.1109\/CVPR.2017.344","volume-title":"Proceedings of IEEE Conference on Computer Vision and Pattern Recognition","author":"D Teney","year":"2017","unstructured":"D. Teney, L. Liu, A. van den Hengel. Graph-structured representations for visual question answering. In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, USA, pp. 3233\u20133241, 2017. DOI: https:\/\/doi.org\/10.1109\/CVPR.2017.344."},{"key":"1591_CR26","doi-asserted-by":"publisher","first-page":"2659","DOI":"10.5555\/3327144.3327190","volume-title":"Proceedings of the 32nd International Conference on Neural Information Processing Systems","author":"M Narasimhan","year":"2018","unstructured":"M. Narasimhan, S. Lazebnik, A. G. Schwing. Out of the box: Reasoning with graph convolution nets for factual visual question answering. In Proceedings of the 32nd International Conference on Neural Information Processing Systems, Montreal, Canada, pp. 2659\u20132670, 2018. DOI: https:\/\/doi.org\/10.5555\/3327144.3327190."},{"issue":"2","key":"1591_CR27","doi-asserted-by":"publisher","first-page":"1023","DOI":"10.1109\/TNNLS.2021.3104937","volume":"34","author":"D Guo","year":"2023","unstructured":"D. Guo, C. Xu, D. Tao. Bilinear graph networks for visual question answering. IEEE Transactions on Neural Networks and Learning Systems, vol. 34, no. 2, pp. 1023\u20131034, 2023. DOI: https:\/\/doi.org\/10.1109\/TNNLS.2021.3104937.","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"1591_CR28","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Proceedings of the 16th European Conference on Computer Vision","author":"X Li","year":"2020","unstructured":"X. Li, X. Yin, C. Li, P. Zhang, X. Hu, L. Zhang, L. Wang, H. Hu, L. Dong, F. Wei, Y. Choi, J. Gao. OSCAR: Objectsemantics aligned pre-training for vision-language tasks. In Proceedings of the 16th European Conference on Computer Vision, Glasgow, UK, pp. 121\u2013137, 2020. DOI: https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8."},{"key":"1591_CR29","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"A Radford","year":"2021","unstructured":"A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, G. Krueger, I. Sutskever. Learning transferable visual models from natural language supervision. In Proceedings of the 38th International Conference on Machine Learning, pp. 8748\u20138763, 2021."},{"key":"1591_CR30","volume-title":"Florence: A new foundation model for computer vision","author":"L Yuan","year":"2021","unstructured":"L. Yuan, D. Chen, Y. L. Chen, N. Codella, X. Dai, J. Gao, H. Hu, X. Huang, B. Li, C. Li, C. Liu, M. Liu, Z. Liu, Y. Lu, Y. Shi, L. Wang, J. Wang, B. Xiao, Z. Xiao, J. Yang, M. Zeng, L. Zhou, P. Zhang. Florence: A new foundation model for computer vision, [Online], Available: https:\/\/arxiv.org\/abs\/2111.11432, 2021."},{"key":"1591_CR31","volume-title":"Proceedings of the 10th International Conference on Learning Representations","author":"Z Wang","year":"2022","unstructured":"Z. Wang, J. Yu, A. W. Yu, Z. Dai, Y. Tsvetkov, Y. Cao. SimVLM: Simple visual language model pretraining with weak supervision. In Proceedings of the 10th International Conference on Learning Representations, 2022."},{"key":"1591_CR32","doi-asserted-by":"publisher","first-page":"5575","DOI":"10.1109\/CVPR46437.2021.00553","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"P Zhang","year":"2021","unstructured":"P. Zhang, X. Li, X. Hu, J. Yang, L. Zhang, L. Wang, Y. Choi, J. Gao. VinVL: Revisiting visual representations in vision-language models. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, Nashville, USA, pp. 5575\u20135584, 2021. DOI: https:\/\/doi.org\/10.1109\/CVPR46437.2021.00553."},{"key":"1591_CR33","first-page":"2592","volume-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing","author":"W Li","year":"2021","unstructured":"W. Li, C. Gao, G. Niu, X. Xiao, H. Liu, J. Liu, H. Wu, H. Wang. UNIMO: Towards unified-modal understanding and generation via cross-modal contrastive learning. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, pp. 2592\u20132607, 2021."},{"key":"1591_CR34","doi-asserted-by":"publisher","first-page":"1290","DOI":"10.24963\/ijcai.2017\/179","volume-title":"Proceedings of the 26th International Joint Conference on Artificial Intelligence","author":"P Wang","year":"2017","unstructured":"P. Wang, Q. Wu, C. Shen, A. Dick, A. van den Hengel. Explicit knowledge-based reasoning for visual question answering. In Proceedings of the 26th International Joint Conference on Artificial Intelligence, Melbourne, Australia, pp. 1290\u20131296, 2017. DOI: https:\/\/doi.org\/10.24963\/ijcai.2017\/179."},{"key":"1591_CR35","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-030-01237-3_28","volume-title":"Proceedings of the 15th European Conference on Computer Vision","author":"M Narasimhan","year":"2018","unstructured":"M. Narasimhan, A. G. Schwing. Straight to the facts: Learning knowledge base retrieval for factual visual question answering. In Proceedings of the 15th European Conference on Computer Vision, Munich, Germany, pp. 451\u2013468, 2018. DOI: https:\/\/doi.org\/10.1007\/978-3-030-01237-3_28."},{"key":"1591_CR36","doi-asserted-by":"publisher","first-page":"8876","DOI":"10.1609\/aaai.v33i01.33018876","volume-title":"Proceedings of the 33rd AAAI Conference on Artificial Intelligence","author":"S Shah","year":"2019","unstructured":"S. Shah, A. Mishra, N. Yadati, P. P. Talukdar. KVQA: Knowledge-aware visual question answering. In Proceedings of the 33rd AAAI Conference on Artificial Intelligence, Honolulu, USA, pp. 8876\u20138884, 2019. DOI: https:\/\/doi.org\/10.1609\/aaai.v33i01.33018876."},{"key":"1591_CR37","doi-asserted-by":"publisher","first-page":"2712","DOI":"10.1609\/aaai.v36i3.20174","volume-title":"Proceedings of the 36th AAAI Conference on Artificial Intelligence","author":"J Wu","year":"2022","unstructured":"J. Wu, J. Lu, A. Sabharwal, R. Mottaghi. Multi-modal answer validation for knowledge-based VQA. In Proceedings of the 36th AAAI Conference on Artificial Intelligence, pp. 2712\u20132721, 2022. DOI: https:\/\/doi.org\/10.1609\/aaai.v36i3.20174."},{"key":"1591_CR38","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3668475","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"X Xing","year":"2023","unstructured":"X. Xing, M. Liang, Y. Wu. TOA: Task-oriented active VQA. In Proceedings of the 37th International Conference on Neural Information Processing Systems, New Orleans, USA, Article number 2353, 2023. DOI: https:\/\/doi.org\/10.5555\/3666122.3668475."},{"key":"1591_CR39","doi-asserted-by":"publisher","first-page":"956","DOI":"10.18653\/v1\/2022.naacl-main.70","volume-title":"Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"L Gui","year":"2022","unstructured":"L. Gui, B. Wang, Q. Huang, A. Hauptmann, Y. Bisk, J. Gao. KAT: A knowledge augmented transformer for vision-and-language. In Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Seattle, USA, pp. 956\u2013968, 2022. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.naacl-main.70."},{"key":"1591_CR40","doi-asserted-by":"publisher","first-page":"6417","DOI":"10.18653\/v1\/2021.emnlp-main.517","volume-title":"Proceedings of Conference on Empirical Methods in Natural Language Processing","author":"M Luo","year":"2021","unstructured":"M. Luo, Y. Zeng, P. Banerjee, C. Baral. Weakly-supervised visual-retriever-reader for knowledge-based question answering. In Proceedings of Conference on Empirical Methods in Natural Language Processing, Punta Cana, Dominican Republic, pp. 6417\u20136431, 2021. DOI: https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.517."},{"key":"1591_CR41","doi-asserted-by":"publisher","first-page":"5057","DOI":"10.1109\/CVPR52688.2022.00501","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"F Gao","year":"2022","unstructured":"F. Gao, Q. Ping, G. Thattai, A. Reganti, Y. N. Wu, P. Natarajan. Transform-retrieve-generate: Natural language-centric outside-knowledge visual question answering. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 5057\u20135067, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.00501."},{"key":"1591_CR42","doi-asserted-by":"publisher","first-page":"5079","DOI":"10.1109\/CVPR52688.2022.00503","volume-title":"Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Ding","year":"2022","unstructured":"Y. Ding, J. Yu, B. Liu, Y. Hu, M. Cui, Q. Wu. MuKEA: Multimodal knowledge extraction and accumulation for knowledge-based visual question answering. In Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, New Orleans, USA, pp. 5079\u20135089, 2022. DOI: https:\/\/doi.org\/10.1109\/CVPR52688.2022.00503."},{"key":"1591_CR43","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"W Dai","year":"2023","unstructured":"W. Dai, J. Li, D. Li, A. M. H. Tiong, J. Zhao, W. Wang, B. Li, P. Fung, S. Hoi. InstructBLIP: Towards general-purpose vision-language models with instruction tuning. In Proceedings of the 37th International Conference on Neural Information Processing Systems, New Orleans, USA, Article number 2142, 2023."},{"key":"1591_CR44","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3667638","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"H Liu","year":"2023","unstructured":"H. Liu, C. Li, Q. Wu, Y. J. Lee. Visual instruction tuning. In Proceedings of the 37th International Conference on Neural Information Processing Systems, New Orleans, USA, Article number 1516, 2023. DOI: https:\/\/doi.org\/10.5555\/3666122.3667638."},{"key":"1591_CR45","first-page":"8469","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"D Driess","year":"2023","unstructured":"D. Driess, F. Xia, M. S. M. Sajjadi, C. Lynch, A. Chowdhery, B. Ichter, A. Wahid, J. Tompson, Q. Vuong, T. Yu, W. Huang, Y. Chebotar, P. Sermanet, D. Duckworth, S. Levine, V. Vanhoucke, K. Hausman, M. Toussaint, K. Greff, A. Zeng, I. Mordatch, P. Florence. PaLM-E: An embodied multimodal language model. In Proceedings of the 40th International Conference on Machine Learning, Honolulu, USA, pp. 8469\u20138488, 2023."},{"key":"1591_CR46","doi-asserted-by":"publisher","DOI":"10.5555\/3600270.3601993","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"J B Alayrac","year":"2022","unstructured":"J. B. Alayrac, J. Donahue, P. Luc, A. Miech, I. Barr, Y. Hasson, K. Lenc, A. Mensch, K. Millican, M. Reynolds, A. Miech, I. Barr, Y. Hasson, K. Lenc, A. Mensch, K. Millicah, M. Reynolds, R. Ring, E. Rutherford, S. Cabi, T. Han, Z. Gong, S. Samangooei, M. Monteiro, J. Menick, S. Borgeaud, A. Brock, A. Nematzadeh, S. Sharifzadeh, M. Binkowski, R. Barreira, O. Vinyals, A. Zisserman, K. Simonyan. Flamingo: A visual language model for few-shot learning. In Proceedings of the 36th International Conference on Neural Information Processing Systems, New Orleans, USA, Article number 1723, 2022. DOI: https:\/\/doi.org\/10.5555\/3600270.3601993."},{"key":"1591_CR47","doi-asserted-by":"publisher","first-page":"39755","DOI":"10.48550\/arXiv.2211.12561","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"M Yasunaga","year":"2022","unstructured":"M. Yasunaga, A. Aghajanyan, W. Shi, R. James, J. Leskovec, P. Liang, M. Lewis, L. Zettlemoyer, W. T. Yih. Retrieval-augmented multimodal language modeling. In Proceedings of the 40th International Conference on Machine Learning, Honolulu, USA, pp. 39755\u201339769, 2022. DOI: https:\/\/doi.org\/10.48550\/arXiv.2211.12561."},{"key":"1591_CR48","doi-asserted-by":"publisher","first-page":"274","DOI":"10.1007\/978-3-031-72946-1_16","volume-title":"Proceedings of the 18th European Conference on Computer Vision","author":"H Wang","year":"2024","unstructured":"H. Wang, W. Ge. Q&A prompts: Discovering rich visual clues through mining question-answer prompts for VQA requiring diverse world knowledge. In Proceedings of the 18th European Conference on Computer Vision, Milan, Italy, pp. 274\u2013292, 2024. DOI: https:\/\/doi.org\/10.1007\/978-3-031-72946-1_16."},{"key":"1591_CR49","volume-title":"GeReA: Question-aware prompt captions for knowledge-based visual question answering","author":"Z Ma","year":"2024","unstructured":"Z. Ma, S. Li, B. Sun, J. Cai, Z. Long, F. Ma. GeReA: Question-aware prompt captions for knowledge-based visual question answering, [Online], Available: https:\/\/arxiv.org\/abs\/2402.02503, 2024."},{"key":"1591_CR50","doi-asserted-by":"publisher","first-page":"6769","DOI":"10.18653\/v1\/2020.emnlp-main.550","volume-title":"Proceedings of Conference on Empirical Methods in Natural Language Processing","author":"V Karpukhin","year":"2020","unstructured":"V. Karpukhin, B. O\u011fuz, S. Min, P. Lewis, L. Wu, S. Edunov, D. Chen, W. T. Yih. Dense passage retrieval for opendomain question answering. In Proceedings of Conference on Empirical Methods in Natural Language Processing, pp. 6769\u20136781, 2020. DOI: https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-main.550."},{"key":"1591_CR51","doi-asserted-by":"publisher","first-page":"8061","DOI":"10.18653\/v1\/2022.emnlp-main.551","volume-title":"Proceedings of Conference on Empirical Methods in Natural Language Processing","author":"J Wu","year":"2022","unstructured":"J. Wu, R. Mooney. Entity-focused dense passage retrieval for outside-knowledge visual question answering. In Proceedings of Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, UAE, pp. 8061\u20138072, 2022. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.551."},{"key":"1591_CR52","doi-asserted-by":"publisher","first-page":"11238","DOI":"10.18653\/v1\/2022.emnlp-main.772","volume-title":"Proceedings of Conference on Empirical Methods in Natural Language Processing","author":"W Lin","year":"2022","unstructured":"W. Lin, B. Byrne. Retrieval augmented visual question answering with outside knowledge. In Proceedings of Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, UAE, pp. 11238\u201311254, 2022. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.772."},{"key":"1591_CR53","doi-asserted-by":"publisher","first-page":"3715","DOI":"10.18653\/v1\/2022.naacl-main.272","volume-title":"Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"K Santhanam","year":"2022","unstructured":"K. Santhanam, O. Khattab, J. Saad-Falcon, C. Potts, M. Zaharia. ColBERTv2: Effective and efficient retrieval via lightweight late interaction. In Proceedings of Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Seattle, USA, pp. 3715\u20133734, 2022. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.naacl-main.272."},{"key":"1591_CR54","doi-asserted-by":"publisher","first-page":"5399","DOI":"10.1145\/3581783.3613848","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia","author":"J Rao","year":"2023","unstructured":"J. Rao, Z. Shan, L. Liu, Y. Zhou, Y. Yang. Retrieval-based knowledge augmented vision language pre-training. In Proceedings of the 31st ACM International Conference on Multimedia, Ottawa, Canada, pp. 5399\u20135409, 2023. DOI: https:\/\/doi.org\/10.1145\/3581783.3613848."},{"key":"1591_CR55","volume-title":"Retrieval-augmented generation for large language models: A survey","author":"Y Gao","year":"2024","unstructured":"Y. Gao, Y. Xiong, X. Gao, K. Jia, J. Pan, Y. Bi, Y. Dai, J. Sun, H. Wang, H. Wang. Retrieval-augmented generation for large language models: A survey, [Online], Available: https:\/\/arxiv.org\/abs\/2312.10997, 2024."},{"key":"1591_CR56","doi-asserted-by":"publisher","first-page":"5558","DOI":"10.18653\/v1\/2022.emnlp-main.375","volume-title":"Proceedings of Conference on Empirical Methods in Natural Language Processing","author":"W Chen","year":"2022","unstructured":"W. Chen, H. Hu, X. Chen, P. Verga, W. W. Cohen. MuRAG: Multimodal retrieval-augmented generator for open question answering over images and text. In Proceedings of Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, UAE, pp. 5558\u20135570, 2022. DOI: https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.375."},{"key":"1591_CR57","doi-asserted-by":"publisher","DOI":"10.5555\/3737916.3739976","volume-title":"Proceedings of the 38th International Conference on Neural Information Processing Systems","author":"C Li","year":"2025","unstructured":"C. Li, Z. Li, C. Jing, S. Liu, W. Shao, Y. Wu, P. Luo, Y. Qiao, K. Zhang. SearchLVLMs: A plug-and-play framework for augmenting large vision-language models by searching up-to-date internet knowledge. In Proceedings of the 38th International Conference on Neural Information Processing Systems, Vancouver, Canada, Article number 2060, 2025. DOI: https:\/\/doi.org\/10.5555\/3737916.3739976."},{"key":"1591_CR58","volume-title":"Retrieval meets reasoning: Even high-school textbook knowledge benefits multimodal reasoning","author":"C Tan","year":"2024","unstructured":"C. Tan, J. Wei, L. Sun, Z. Gao, S. Li, B. Yu, R. Guo, S. Z. Li. Retrieval meets reasoning: Even high-school textbook knowledge benefits multimodal reasoning, [Online], Available: https:\/\/arxiv.org\/abs\/2405.20834, 2024."},{"key":"1591_CR59","volume-title":"MLLM is a strong reranker: Advancing multimodal retrieval-augmented generation via knowledge-enhanced reranking and noise-injected training","author":"Z Chen","year":"2024","unstructured":"Z. Chen, C. Xu, Y. Qi, J. Guo. MLLM is a strong reranker: Advancing multimodal retrieval-augmented generation via knowledge-enhanced reranking and noise-injected training, [Online], Available: https:\/\/arxiv.org\/abs\/2407.21439, 2024."},{"key":"1591_CR60","doi-asserted-by":"publisher","first-page":"39","DOI":"10.1145\/3397271.3401075","volume-title":"Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval","author":"O Khattab","year":"2020","unstructured":"O. Khattab, M. Zaharia. ColBERT: Efficient and effective passage search via contextualized late interaction over BERT. In Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 39\u201348, 2020. DOI: https:\/\/doi.org\/10.1145\/3397271.3401075."},{"key":"1591_CR61","doi-asserted-by":"publisher","first-page":"4444","DOI":"10.1609\/aaai.v31i1.11164","volume-title":"Proceedings of the 31st AAAI Conference on Artificial Intelligence","author":"R Speer","year":"2017","unstructured":"R. Speer, J. Chin, C. Havasi. ConceptNet 5.5: An open multilingual graph of general knowledge. In Proceedings of the 31st AAAI Conference on Artificial Intelligence, San Francisco, USA, pp. 4444\u20134451, 2017. DOI: https:\/\/doi.org\/10.1609\/aaai.v31i1.11164."},{"key":"1591_CR62","doi-asserted-by":"publisher","first-page":"115","DOI":"10.18653\/v1\/P17-4020","volume-title":"Proceedings of ACL, System Demonstrations","author":"N Tandon","year":"2017","unstructured":"N. Tandon, G. de Melo, G. Weikum. WebChild 2.0: Finegrained commonsense knowledge distillation. In Proceedings of ACL, System Demonstrations, Association for Computational Linguistics, Vancouver, Canada, pp. 115\u2013120, 2017. DOI: https:\/\/doi.org\/10.18653\/v1\/P17-4020."},{"key":"1591_CR63","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1007\/978-3-540-76298-0_52","volume-title":"Proceedings of the 6th International Semantic Web Conference, the 2nd Asian Semantic Web Conference","author":"S Auer","year":"2007","unstructured":"S. Auer, C. Bizer, G. Kobilarov, J. Lehmann, R. Cyganiak, Z. Ives. DBpedia: A nucleus for a web of open data. In Proceedings of the 6th International Semantic Web Conference, the 2nd Asian Semantic Web Conference, Busan, Republic of Korea, pp. 722\u2013735, 2007. DOI: https:\/\/doi.org\/10.1007\/978-3-540-76298-0_52."},{"key":"1591_CR64","doi-asserted-by":"publisher","unstructured":"C. Raffel, N. Shazeer, A. Roberts, K. Lee, S. Narang, M. Matena, Y. Zhou, W. Li, P. J. Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research, vol. 21, no. 1, Article number 140, 2020. DOI: https:\/\/doi.org\/10.5555\/3455716.3455856.","DOI":"10.5555\/3455716.3455856"},{"key":"1591_CR65","first-page":"19730","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"J Li","year":"2023","unstructured":"J. Li, D. Li, S. Savarese, S. Hoi. BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In Proceedings of the 40th International Conference on Machine Learning, Honolulu, USA, pp. 19730\u201319742, 2023."},{"key":"1591_CR66","volume-title":"Proceedings of the 11th International Conference on Learning Representations","author":"X Chen","year":"2023","unstructured":"X. Chen, X. Wang, S. Changpinyo, A. J. Piergiovanni, P. Padlewski, D. Salz, S. Goodman, A. Grycner, B. Mustafa, L. Beyer, A. Kolesnikov, J. Puigcerver, N. Ding, K. Rong, H. Akbari, G. Mishra, L. Xue, A. V. Thapliyal, J. Bradbury, W. Kuo. PaLI: A jointly-scaled multilingual language-image model. In Proceedings of the 11th International Conference on Learning Representations, Kigali, Rwanda, 2023."},{"key":"1591_CR67","doi-asserted-by":"publisher","first-page":"489","DOI":"10.18653\/v1\/2020.findings-emnlp.44","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"F Gard\u00e8res","year":"2020","unstructured":"F. Gard\u00e8res, M. Ziaeefard, B. Abeloos, F. Lecue. ConceptBert: Concept-aware representation for visual question answering. In Proceedings of Findings of the Association for Computational Linguistics, pp. 489\u2013498, 2020. DOI: https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.44."},{"key":"1591_CR68","volume-title":"Proceedings of the 10th International Conference on Learning Representations","author":"E J Hu","year":"2022","unstructured":"E. J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, W. Chen. LoRA: Low-rank adaptation of large language models. In Proceedings of the 10th International Conference on Learning Representations, 2022."},{"key":"1591_CR69","volume-title":"Scaling laws for neural language models","author":"J Kaplan","year":"2020","unstructured":"J. Kaplan, S. McCandlish, T. Henighan, T. B. Brown, B. Chess, R. Child, S. Gray, A. Radford, J. Wu, D. Amodei. Scaling laws for neural language models, [Online], Available: https:\/\/arxiv.org\/abs\/2001.08361, 2020."}],"container-title":["Machine Intelligence Research"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-025-1591-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11633-025-1591-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-025-1591-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T15:04:07Z","timestamp":1770044647000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11633-025-1591-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":69,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["1591"],"URL":"https:\/\/doi.org\/10.1007\/s11633-025-1591-z","relation":{},"ISSN":["2731-538X","2731-5398"],"issn-type":[{"value":"2731-538X","type":"print"},{"value":"2731-5398","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"28 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Jun Zhu and Bo Zhang are editorial board members of\n                      Machine Intelligence Research\n                      and were not involved in the editorial review, or the decision to publish this article. All authors declare that there are no other competing interests. The authors declared that they have no conflicts of interest to this work.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations of conflict of interest"}}]}}