{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T06:57:36Z","timestamp":1781161056932,"version":"3.54.1"},"publisher-location":"Singapore","reference-count":21,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819200672","type":"print"},{"value":"9789819200689","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-92-0068-9_33","type":"book-chapter","created":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T06:11:39Z","timestamp":1781158299000},"page":"492-506","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["AutoViVQA: A Large-Scale, Automatically Constructed Dataset for\u00a0Vietnamese Visual Question Answering"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-3457-5745","authenticated-orcid":false,"given":"Anh Tuong","family":"Nguyen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1816-9224","authenticated-orcid":false,"given":"Ba Duc","family":"Phan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4569-3932","authenticated-orcid":false,"given":"Trung Quoc","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8187-4605","authenticated-orcid":false,"given":"Dac Thinh","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8658-6602","authenticated-orcid":false,"given":"Duy Lan","family":"Dang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6765-1276","authenticated-orcid":false,"given":"Quoc Thinh","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9900-7047","authenticated-orcid":false,"given":"Tung","family":"Le","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,6,1]]},"reference":[{"key":"33_CR1","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Proceedings of the 36th International Conference on Neural Information Processing Systems. NIPS \u201922, Curran Associates Inc., Red Hook, NY (2022)"},{"key":"33_CR2","doi-asserted-by":"publisher","unstructured":"Doan, K.T., et al.: Vintern-1b: an efficient multimodal large language model for vietnamese (2024). https:\/\/doi.org\/10.48550\/arXiv.2408.12480","DOI":"10.48550\/arXiv.2408.12480"},{"key":"33_CR3","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"33_CR4","doi-asserted-by":"publisher","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in VQA matter: elevating the role of image understanding in visual question answering. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6325\u20136334 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.670","DOI":"10.1109\/CVPR.2017.670"},{"key":"33_CR5","doi-asserted-by":"publisher","unstructured":"Guo, X., Chen, Y.: Generative AI for synthetic data generation: methods, challenges and the future (2024). https:\/\/doi.org\/10.48550\/arXiv.2403.04190","DOI":"10.48550\/arXiv.2403.04190"},{"key":"33_CR6","doi-asserted-by":"publisher","unstructured":"Honovich, O., Scialom, T., Levy, O., Schick, T.: Unnatural instructions: tuning language models with (almost) no human labor (2022). https:\/\/doi.org\/10.48550\/arXiv.2212.09689","DOI":"10.48550\/arXiv.2212.09689"},{"key":"33_CR7","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. ICML\u201923, JMLR.org (2023)"},{"key":"33_CR8","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: European Conference on Computer Vision (ECCV), pp. 740\u2013755 (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"33_CR9","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (NeurIPS), pp. 34892\u201334916 (2023)","DOI":"10.52202\/075280-1516"},{"key":"33_CR10","doi-asserted-by":"publisher","unstructured":"Meng, Y., Michalski, M., Huang, J., Zhang, Y., Abdelzaher, T., Han, J.: Tuning language models as training data generators for augmentation-enhanced few-shot learning (2023).https:\/\/doi.org\/10.48550\/arXiv.2211.03044","DOI":"10.48550\/arXiv.2211.03044"},{"key":"33_CR11","doi-asserted-by":"publisher","unstructured":"Nguyen, D.Q., Nguyen, A.T.: Phobert: pre-trained language models for vietnamese. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 1037\u20131042 (2020). https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.92","DOI":"10.18653\/v1\/2020.findings-emnlp.92"},{"key":"33_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101868","volume":"100","author":"NH Nguyen","year":"2023","unstructured":"Nguyen, N.H., Vo, D.T.D., Nguyen, K.V., Nguyen, N.L.T.: Openvivqa: task, dataset, and multimodal fusion models for visual question answering in vietnamese. Inf. Fusion 100, 101868 (2023). https:\/\/doi.org\/10.1016\/j.inffus.2023.101868","journal-title":"Inf. Fusion"},{"key":"33_CR13","doi-asserted-by":"publisher","unstructured":"Nguyen, Q.V., et al.: Vitextvqa: a large-scale visual question answering dataset for evaluating vietnamese text comprehension in images (2025). https:\/\/doi.org\/10.48550\/arXiv.2404.10652","DOI":"10.48550\/arXiv.2404.10652"},{"key":"33_CR14","doi-asserted-by":"publisher","unstructured":"Pham, H.Q., et al.: Viocrvqa: novel benchmark dataset and vision reader for visual question answering by understanding vietnamese text in images (2024). https:\/\/doi.org\/10.48550\/arXiv.2404.18397","DOI":"10.48550\/arXiv.2404.18397"},{"key":"33_CR15","unstructured":"Tran, K.Q., Nguyen, A.T., Le, A.T.H., Nguyen, K.V.: Vivqa: vietnamese visual question answering. In: Proceedings of the 35th Pacific Asia Conference on Language, Information and Computation, pp. 683\u2013691 (2021). https:\/\/aclanthology.org\/2021.paclic-1.72\/"},{"key":"33_CR16","doi-asserted-by":"publisher","unstructured":"Tran, K.V., Nguyen, K.V., Nguyen, N.L.T.: Bartphobeit: pre-trained sequence-to-sequence and image transformers models for vietnamese visual question answering (2023). https:\/\/doi.org\/10.48550\/arXiv.2307.15335","DOI":"10.48550\/arXiv.2307.15335"},{"key":"33_CR17","doi-asserted-by":"publisher","unstructured":"Tran, K.V., Phan, H.P., Nguyen, K.V., Nguyen, N.L.T.: Viclevr: a visual reasoning dataset and hybrid multimodal fusion model for visual question answering in vietnamese (2023). https:\/\/doi.org\/10.48550\/arXiv.2310.18046","DOI":"10.48550\/arXiv.2310.18046"},{"key":"33_CR18","unstructured":"Tran, O.N., Bui, H.V., Ha, H.H., Phan, P.V.: Vista (2024). https:\/\/huggingface.co\/datasets\/Vi-VLM\/Vista"},{"key":"33_CR19","doi-asserted-by":"publisher","unstructured":"Yang, Y., et al.: Generative data augmentation for commonsense reasoning. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 1008\u20131025 (2020). https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.90","DOI":"10.18653\/v1\/2020.findings-emnlp.90"},{"key":"33_CR20","doi-asserted-by":"publisher","unstructured":"Zhang, Z., et al.: Auto-instruct: automatic instruction generation and ranking for black-box language models (2023). https:\/\/doi.org\/10.48550\/arXiv.2310.13127","DOI":"10.48550\/arXiv.2310.13127"},{"key":"33_CR21","unstructured":"Zhou, Y., et al.: Large language models are human-level prompt engineers (2023). https:\/\/arxiv.org\/abs\/2211.01910"}],"container-title":["Communications in Computer and Information Science","Recent Challenges in Intelligent information and Database Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-92-0068-9_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T06:11:45Z","timestamp":1781158305000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-92-0068-9_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819200672","9789819200689"],"references-count":21,"URL":"https:\/\/doi.org\/10.1007\/978-981-92-0068-9_33","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"1 June 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACIIDS","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Intelligent Information and Database Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kaohsiung","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiwan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2026","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13 April 2026","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 April 2026","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"aciids2026","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/aciids.pwr.edu.pl\/2026\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}