{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T22:13:05Z","timestamp":1778191985090,"version":"3.51.4"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032251589","type":"print"},{"value":"9783032251596","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-25159-6_19","type":"book-chapter","created":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T14:51:26Z","timestamp":1778165486000},"page":"360-377","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Context-Aware Visual Multi-turn Conversation Generation from\u00a0Wikipedia and\u00a0Wikidata"],"prefix":"10.1007","author":[{"given":"Basel","family":"Shbita","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pengyuan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Anna Lisa","family":"Gentile","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,5,8]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"19_CR2","unstructured":"Consortium, W.W.W., et\u00a0al.: SPARQL 1.1 overview. Tech. rep., World Wide Web Consortium (2013)"},{"key":"19_CR3","unstructured":"Consortium, W.W.W., et\u00a0al.: RDF 1.1 primer (2014)"},{"key":"19_CR4","unstructured":"Dai, W., et al.: NVLM: Open frontier-class multimodal LLMs. arXiv preprint arXiv:2409.11402 (2024)"},{"key":"19_CR5","unstructured":"Deitke, M., et\u00a0al.: Molmo and pixmo: Open weights and open data for state-of-the-art multimodal models. arXiv e-prints pp. arXiv\u20132409 (2024)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Duan, H., et\u00a0al.: Vlmevalkit: An open-source toolkit for evaluating large multi-modality models. In: Proceedings of the 32nd ACM International Conference On Multimedia, pp. 11198\u201311201 (2024)","DOI":"10.1145\/3664647.3685520"},{"key":"19_CR7","unstructured":"Fu, C., et\u00a0al.: MME-survey: a comprehensive survey on evaluation of multimodal LLMs. arXiv preprint arXiv:2411.15296 (2024)"},{"key":"19_CR8","unstructured":"Granite\u00a0Team, I.: Granite 3.0 language models. URL: https:\/\/github.com\/ibm-granite\/granite-3.0-language-models (2024)"},{"key":"19_CR9","unstructured":"Grattafiori, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"issue":"4","key":"19_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3447772","volume":"54","author":"A Hogan","year":"2021","unstructured":"Hogan, A., et al.: Knowledge graphs. ACM Comput. Surv. (Csur) 54(4), 1\u201337 (2021)","journal-title":"ACM Comput. Surv. (Csur)"},{"issue":"2","key":"19_CR11","first-page":"3","volume":"1","author":"EJ Hu","year":"2022","unstructured":"Hu, E.J., et al.: Lora: Low-rank adaptation of large language models. ICLR 1(2), 3 (2022)","journal-title":"ICLR"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Hu, H., et al.: Open-domain visual entity recognition: Towards recognizing millions of wikipedia entities. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12065\u201312075 (2023)","DOI":"10.1109\/ICCV51070.2023.01108"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Kembhavi, A., Salvato, M., Kolve, E., Seo, M., Hajishirzi, H., Farhadi, A.: A diagram is worth a dozen images. In: European Conference on Computer Vision, pp. 235\u2013251. Springer (2016)","DOI":"10.1007\/978-3-319-46493-0_15"},{"key":"19_CR14","unstructured":"Li, B., et\u00a0al.: Llava-onevision: Easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Li, Z., Wu, X., Du, H., Liu, F., Nghiem, H., Shi, G.: A survey of state of the art large vision language models: Alignment, benchmark, evaluations and challenges. arXiv preprint arXiv:2501.02189 (2025)","DOI":"10.1109\/CVPRW67362.2025.00147"},{"key":"19_CR16","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural. Inf. Process. Syst. 36, 34892\u201334916 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR17","unstructured":"Marafioti, A., et\u00a0al.: Smolvlm: Redefining small and efficient multimodal models. arXiv preprint arXiv:2504.05299 (2025)"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: OK-VQA: A visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Masry, A., Long, D.X., Tan, J.Q., Joty, S., Hoque, E.: Chartqa: A benchmark for question answering about charts with visual and logical reasoning. arXiv preprint arXiv:2203.10244 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: Docvqa: A dataset for VQA on document images. In: Proceedings of the IEEE\/CVF Winter Conference On Applications Of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Mensink, T., et al.: Encyclopedic VQA: Visual questions about detailed properties of fine-grained categories. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3113\u20133124 (2023)","DOI":"10.1109\/ICCV51070.2023.00289"},{"key":"19_CR22","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning, pp. 8748\u20138763. PmLR (2021)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Rahmanzadehgervi, P., Bolton, L., Taesiri, M.R., Nguyen, A.T.: Vision language models are blind. In: Proceedings of the Asian Conference on Computer Vision, pp. 18\u201334 (2024)","DOI":"10.1007\/978-981-96-0917-8_17"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Schwenk, D., Khandelwal, A., Clark, C., Marino, K., Mottaghi, R.: A-okvqa: A benchmark for visual question answering using world knowledge. In: European Conference on Computer Vision, pp. 146\u2013162. Springer (2022)","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Shah, S., Mishra, A., Yadati, N., Talukdar, P.P.: KVQA: knowledge-aware visual question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 8876\u20138884 (2019)","DOI":"10.1609\/aaai.v33i01.33018876"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Srinivasan, K., Raman, K., Chen, J., Bendersky, M., Najork, M.: Wit: Wikipedia-based image text dataset for multimodal multilingual machine learning. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 2443\u20132449 (2021)","DOI":"10.1145\/3404835.3463257"},{"key":"19_CR27","unstructured":"Team, G., et\u00a0al.: Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)"},{"key":"19_CR28","unstructured":"Team, G.V., et\u00a0al.: Granite vision: a lightweight, open-source multimodal model for enterprise intelligence. arXiv preprint arXiv:2502.09927 (2025)"},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"Tong, P., et\u00a0al.: Cambrian-1: A fully open, vision-centric exploration of multimodal LLMs. Adv. Neural Inform. Process. Syst. 37, 87310\u201387356 (2024)","DOI":"10.52202\/079017-2771"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? exploring the visual shortcomings of multimodal LLMs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 9568\u20139578 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"issue":"10","key":"19_CR31","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1145\/2629489","volume":"57","author":"D Vrande\u010di\u0107","year":"2014","unstructured":"Vrande\u010di\u0107, D., Kr\u00f6tzsch, M.: Wikidata: a free collaborative knowledgebase. Commun. ACM 57(10), 78\u201385 (2014)","journal-title":"Commun. ACM"},{"key":"19_CR32","unstructured":"Wang, P., et\u00a0al.: Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"19_CR33","doi-asserted-by":"crossref","unstructured":"Yan, Y., Xie, W.: Echosight: Advancing visual-language models with wiki knowledge. arXiv preprint arXiv:2407.12735 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.83"},{"key":"19_CR34","doi-asserted-by":"crossref","unstructured":"Yue, X., et\u00a0al.: Mmmu: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9556\u20139567 (2024)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"19_CR35","unstructured":"Zhang, J., Liu, O., Yu, T., Hu, J., Neiswanger, W.: Euclid: supercharging multimodal LLMs with synthetic high-fidelity visual descriptions. In: Will Synthetic Data Finally Solve the Data Access Problem? (2025). https:\/\/openreview.net\/forum?id=6DV6DCk8GS"},{"key":"19_CR36","unstructured":"Zhang, K., et al.: LMMS-EVAL: Reality check on the evaluation of large multimodal models (2024). https:\/\/arxiv.org\/abs\/2407.12772"}],"container-title":["Lecture Notes in Computer Science","The Semantic Web"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-25159-6_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T22:04:16Z","timestamp":1778191456000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-25159-6_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032251589","9783032251596"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-25159-6_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"8 May 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ESWC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Semantic Web Conference","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dubrovnik","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Croatia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 May 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 May 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"esws2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2026.eswc-conferences.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}