{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:07:20Z","timestamp":1768255640842,"version":"3.49.0"},"reference-count":68,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T00:00:00Z","timestamp":1739318400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T00:00:00Z","timestamp":1739318400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]},{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]},{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]},{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]},{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]},{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]},{"name":"Vietnam National University HoChiMinh City","award":["DS2024-26-01"],"award-info":[{"award-number":["DS2024-26-01"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s00530-025-01696-7","type":"journal-article","created":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T15:00:45Z","timestamp":1739372445000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["ViOCRVQA: novel benchmark dataset and VisionReader for visual question answering by understanding Vietnamese text in images"],"prefix":"10.1007","volume":"31","author":[{"given":"Huy Quang","family":"Pham","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thang Kien-Bao","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quan","family":"Van Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dan Quang","family":"Tran","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nghia Hieu","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kiet","family":"Van Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ngan Luu-Thuy","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,12]]},"reference":[{"key":"1696_CR1","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"1696_CR2","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in vqa matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"1696_CR3","doi-asserted-by":"crossref","unstructured":"Singh, A., Natarajan, V., Shah, M., Jiang, Y., Chen, X., Batra, D., Parikh, D., Rohrbach, M.: Towards vqa models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"1696_CR4","doi-asserted-by":"crossref","unstructured":"Biten, A.F., Tito, R., Mafla, A., Gomez, L., Rusinol, M., Valveny, E., Jawahar, C., Karatzas, D.: Scene text visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4291\u20134301 (2019)","DOI":"10.1109\/ICCV.2019.00439"},{"issue":"8","key":"1696_CR5","doi-asserted-by":"publisher","first-page":"10803","DOI":"10.1007\/s13369-023-07687-y","volume":"48","author":"SM Kamel","year":"2023","unstructured":"Kamel, S.M., Hassan, S.I., Elrefaei, L.: Vaqa: visual Arabic question answering. Arab. J. Sci. Eng. 48(8), 10803\u201310823 (2023)","journal-title":"Arab. J. Sci. Eng."},{"key":"1696_CR6","doi-asserted-by":"crossref","unstructured":"Kim, M., Song, S., Lee, Y., Jang, H., Lim, K.: Bok-vqa: bilingual outside knowledge-based visual question answering via graph representation pretraining (2024). arXiv preprint arXiv:2401.06443","DOI":"10.1609\/aaai.v38i16.29798"},{"key":"1696_CR7","unstructured":"Shimizu, N., Rong, N., Miyazaki, T.: Visual question answering dataset for bilingual image understanding: a study of cross-lingual transfer using attention maps. In: Bender, E.M., Derczynski, L., Isabelle, P. (eds.) Proceedings of the 27th International Conference on Computational Linguistics, pp. 1918\u20131928. Association for Computational Linguistics, Santa Fe, New Mexico, USA (2018)"},{"key":"1696_CR8","unstructured":"Tran, K.Q., Nguyen, A.T., Le, A.T.-H., Van\u00a0Nguyen, K.: Vivqa: Vietnamese visual question answering. In: Proceedings of the 35th Pacific Asia Conference on Language, Information and Computation, pp. 683\u2013691 (2021)"},{"key":"1696_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101868","volume":"100","author":"NH Nguyen","year":"2023","unstructured":"Nguyen, N.H., Vo, D.T., Van Nguyen, K., Nguyen, N.L.-T.: Openvivqa: task, dataset, and multimodal fusion models for visual question answering in Vietnamese. Inf. Fusion 100, 101868 (2023)","journal-title":"Inf. Fusion"},{"key":"1696_CR10","doi-asserted-by":"crossref","unstructured":"Nguyen, N.L.-T., Nguyen, N.H., Vo, D.T., Tran, K.Q., Van\u00a0Nguyen, K.: Evjvqa challenge: multilingual visual question answering. J. Comput. Sci. Cybern., 237\u2013258 (2023)","DOI":"10.15625\/1813-9663\/18157"},{"key":"1696_CR11","doi-asserted-by":"crossref","unstructured":"Tran, K.V., Phan, H.P., Van\u00a0Nguyen, K., Nguyen, N.L.T.: Viclevr: a visual reasoning dataset and hybrid multimodal fusion model for visual question answering in Vietnamese (2023). arXiv preprint arXiv:2310.18046","DOI":"10.2139\/ssrn.4611463"},{"key":"1696_CR12","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1696_CR13","unstructured":"Kazemi, V., Elqursh, A.: Show, ask, attend, and answer: a strong baseline for visual question answering (2017). arXiv preprint arXiv:1704.03162"},{"key":"1696_CR14","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"1696_CR15","unstructured":"Kim, J.-H., Lee, S.-W., Kwak, D., Heo, M.-O., Kim, J., Ha, J.-W., Zhang, B.-T.: Multimodal residual learning for visual qa. AAdv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"1696_CR16","doi-asserted-by":"crossref","unstructured":"Teney, D., Anderson, P., He, X., Van Den\u00a0Hengel, A.: Tips and tricks for visual question answering: Learnings from the 2017 challenge. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4223\u20134232 (2018)","DOI":"10.1109\/CVPR.2018.00444"},{"key":"1696_CR17","doi-asserted-by":"crossref","unstructured":"Mathew, M., Karatzas, D., Jawahar, C.: Docvqa: A dataset for vqa on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2200\u20132209 (2021)","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"1696_CR18","doi-asserted-by":"crossref","unstructured":"Kantharaj, S., Do, X.L., Leong, R.T., Tan, J.Q., Hoque, E., Joty, S.: Opencqa: Open-ended question answering with charts. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 11817\u201311837 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.811"},{"key":"1696_CR19","doi-asserted-by":"crossref","unstructured":"Tanaka, R., Nishida, K., Yoshida, S.: Visualmrc: machine reading comprehension on document images. In: Proceedings of the AAAI Conference on Artificial Intelligence 35, pp. 13878\u201313888 (2021)","DOI":"10.1609\/aaai.v35i15.17635"},{"key":"1696_CR20","doi-asserted-by":"crossref","unstructured":"Mathew, M., Bagal, V., Tito, R., Karatzas, D., Valveny, E., Jawahar, C.: Infographicvqa. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1697\u20131706 (2022)","DOI":"10.1109\/WACV51458.2022.00264"},{"key":"1696_CR21","doi-asserted-by":"crossref","unstructured":"Mishra, A., Shekhar, S., Singh, A.K., Chakraborty, A.: Ocr-vqa: Visual question answering by reading text in images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 947\u2013952 (2019). IEEE","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"1696_CR22","doi-asserted-by":"crossref","unstructured":"Gurari, D., Li, Q., Stangl, A.J., Guo, A., Lin, C., Grauman, K., Luo, J., Bigham, J.P.: Vizwiz grand challenge: answering visual questions from blind people. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3608\u20133617 (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"key":"1696_CR23","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: Ok-vqa: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/cvf Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"1696_CR24","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"1696_CR25","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D.A.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"1696_CR26","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., Van Der\u00a0Maaten, L., Fei-Fei, L., Lawrence\u00a0Zitnick, C., Girshick, R.: Clevr: a diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"1696_CR27","doi-asserted-by":"crossref","unstructured":"Hasegawa, R., Thawonmas, R., Tanabe, J., Yu, L.: Minecraft video aesthetics quality assessment model. In: Proceedings of the 13th International Conference on Advances in Information Technology, pp. 1\u20135 (2023)","DOI":"10.1145\/3628454.3631667"},{"key":"1696_CR28","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: 3rd International Conference on Learning Representations (ICLR 2015) (2015). Computational and Biological Learning Society"},{"issue":"1","key":"1696_CR29","doi-asserted-by":"publisher","first-page":"667","DOI":"10.1109\/TVCG.2017.2744158","volume":"24","author":"H Strobelt","year":"2017","unstructured":"Strobelt, H., Gehrmann, S., Pfister, H., Rush, A.M.: Lstmvis: a tool for visual analysis of hidden state dynamics in recurrent neural networks. IEEE Trans. Vis. Comput. Gr. 24(1), 667\u2013676 (2017)","journal-title":"IEEE Trans. Vis. Comput. Gr."},{"key":"1696_CR30","doi-asserted-by":"crossref","unstructured":"Chen, S.Y.-C., Yoo, S., Fang, Y.-L.L.: Quantum long short-term memory. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8622\u20138626 (2022). IEEE","DOI":"10.1109\/ICASSP43922.2022.9747369"},{"key":"1696_CR31","unstructured":"Chen, K., Wang, J., Chen, L.-C., Gao, H., Xu, W., Nevatia, R.: Abc-cnn: an attention based convolutional neural network for visual question answering (2015). arXiv preprint arXiv:1511.05960"},{"key":"1696_CR32","doi-asserted-by":"crossref","unstructured":"Ma, L., Lu, Z., Li, H.: Learning to answer questions from image using convolutional neural network. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 30 (2016)","DOI":"10.1609\/aaai.v30i1.10442"},{"key":"1696_CR33","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Fei-Fei, L.: Visual7w: Grounded question answering in images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"},{"key":"1696_CR34","doi-asserted-by":"crossref","unstructured":"Shih, K.J., Singh, S., Hoiem, D.: Where to look: Focus regions for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4613\u20134621 (2016)","DOI":"10.1109\/CVPR.2016.499"},{"issue":"6","key":"1696_CR35","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2017","unstructured":"Wu, Q., Shen, C., Wang, P., Dick, A., Van Den Hengel, A.: Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1367\u20131381 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1696_CR36","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"1696_CR37","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"1696_CR38","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.-J., Chang, K.-W.: Visualbert: a simple and performant baseline for vision and language (2019). arXiv preprint arXiv:1908.03557"},{"key":"1696_CR39","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5100\u20135111 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"1696_CR40","unstructured":"Su, W., Zhu, X., Cao, Y., Li, B., Lu, L., Wei, F., Dai, J.: Vl-bert: pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations (2019)"},{"key":"1696_CR41","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: Uniter: universal image-text representation learning. In: European Conference on Computer Vision, pp. 104\u2013120 (2020). Springer","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"1696_CR42","doi-asserted-by":"crossref","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: Computer Vision\u2014ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16, pp. 121\u2013137 (2020). Springer","DOI":"10.1007\/978-3-030-58577-8_8"},{"issue":"1","key":"1696_CR43","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P.J.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"1696_CR44","doi-asserted-by":"crossref","unstructured":"Biten, A.F., Litman, R., Xie, Y., Appalaraju, S., Manmatha, R.: Latr: layout-aware transformer for scene-text vqa. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16548\u201316558 (2022)","DOI":"10.1109\/CVPR52688.2022.01605"},{"key":"1696_CR45","doi-asserted-by":"crossref","unstructured":"Kil, J., Changpinyo, S., Chen, X., Hu, H., Goodman, S., Chao, W.-L., Soricut, R.: Prestu: pre-training for scene-text understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15270\u201315280 (2023)","DOI":"10.1109\/ICCV51070.2023.01401"},{"key":"1696_CR46","unstructured":"Cho, J., Lei, J., Tan, H., Bansal, M.: Unifying vision-and-language tasks via text generation. In: International Conference on Machine Learning, pp. 1931\u20131942 (2021). PMLR"},{"key":"1696_CR47","doi-asserted-by":"crossref","unstructured":"Fang, C., Li, J., Li, L., Ma, C., Hu, D.: Separate and locate: rethink the text in text-based visual question answering. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 4378\u20134388 (2023)","DOI":"10.1145\/3581783.3611753"},{"key":"1696_CR48","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742 (2023). PMLR"},{"key":"1696_CR49","first-page":"23716","volume":"35","author":"J-B Alayrac","year":"2022","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1696_CR50","doi-asserted-by":"crossref","unstructured":"Singh, A., Hu, R., Goswami, V., Couairon, G., Galuba, W., Rohrbach, M., Kiela, D.: Flava: a foundational language and vision alignment model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15638\u201315650 (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"1696_CR51","doi-asserted-by":"crossref","unstructured":"Li, C., Xu, H., Tian, J., Wang, W., Yan, M., Bi, B., Ye, J., Chen, H., Xu, G., Cao, Z., : mplug: effective and efficient vision-language learning by cross-modal skip-connections. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 7241\u20137259 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"issue":"3","key":"1696_CR52","doi-asserted-by":"publisher","first-page":"1335","DOI":"10.5194\/gmd-13-1335-2020","volume":"13","author":"B Maronga","year":"2020","unstructured":"Maronga, B., Banzhaf, S., Burmeister, C., Esch, T., Forkel, R., Fr\u00f6hlich, D., Fuka, V., Gehrke, K.F., Geleti\u010d, J., Giersch, S.: Overview of the palm model system 6.0. Geosci. Model Dev. 13(3), 1335\u20131372 (2020)","journal-title":"Geosci. Model Dev."},{"key":"1696_CR53","unstructured":"Anil, R., Dai, A.M., Firat, O., Johnson, M., Lepikhin, D., Passos, A., Shakeri, S., Taropa, E., Bailey, P., Chen, Z., et al.: Palm 2 technical report (2023). arXiv preprint arXiv:2305.10403"},{"key":"1696_CR54","doi-asserted-by":"crossref","unstructured":"Phan, L., Tran, H., Nguyen, H., Trinh, T.H.: Vit5: Pretrained text-to-text transformer for vietnamese language generation. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop, pp. 136\u2013142 (2022)","DOI":"10.18653\/v1\/2022.naacl-srw.18"},{"key":"1696_CR55","doi-asserted-by":"crossref","unstructured":"Tran, N.L., Le, D.M., Nguyen, D.Q.: BARTpho: pre-trained sequence-to-sequence models for Vietnamese. In: Proceedings of the 23rd Annual Conference of the International Speech Communication Association (2022)","DOI":"10.21437\/Interspeech.2022-10177"},{"key":"1696_CR56","doi-asserted-by":"crossref","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., Zettlemoyer, L.: Bart: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (2020). Association for Computational Linguistics","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"1696_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., Yang, J., Zhang, L., Wang, L., Choi, Y., Gao, J.: Vinvl: revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5579\u20135588 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"1696_CR58","doi-asserted-by":"crossref","unstructured":"Huang, M., Liu, Y., Peng, Z., Liu, C., Lin, D., Zhu, S., Yuan, N., Ding, K., Jin, L.: Swintextspotter: scene text spotting via better synergy between text detection and text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4593\u20134603 (2022)","DOI":"10.1109\/CVPR52688.2022.00455"},{"key":"1696_CR59","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"1696_CR60","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"1696_CR61","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"1696_CR62","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: Visualgpt: data-efficient adaptation of pretrained language models for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18030\u201318040 (2022)","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"1696_CR63","unstructured":"Wang, W., Chen, Z., Chen, X., Wu, J., Zhu, X., Zeng, G., Luo, P., Lu, T., Zhou, J., Qiao, Y., et al.: Visionllm: large language model is also an open-ended decoder for vision-centric tasks. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"1696_CR64","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1696_CR65","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et al.: Llama: open and efficient foundation language models (2023). arXiv preprint arXiv:2302.13971"},{"key":"1696_CR66","doi-asserted-by":"crossref","unstructured":"Liu, X., Liang, D., Yan, S., Chen, D., Qiao, Y., Yan, J.: Fots: fast oriented text spotting with a unified network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5676\u20135685 (2018)","DOI":"10.1109\/CVPR.2018.00595"},{"key":"1696_CR67","doi-asserted-by":"crossref","unstructured":"Lyu, P., Liao, M., Yao, C., Wu, W., Bai, X.: Mask textspotter: an end-to-end trainable neural network for spotting text with arbitrary shapes. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 67\u201383 (2018)","DOI":"10.1007\/978-3-030-01264-9_5"},{"issue":"2","key":"1696_CR68","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2022.103207","volume":"60","author":"Z Xu","year":"2023","unstructured":"Xu, Z., Gu, J., Liu, M., Zhou, G., Fu, H., Qiu, C.: A question-guided multi-hop reasoning graph network for visual question answering. Inf. Process. Manag. 60(2), 103207 (2023)","journal-title":"Inf. Process. Manag."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01696-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01696-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01696-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T19:34:03Z","timestamp":1745264043000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01696-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,12]]},"references-count":68,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["1696"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01696-7","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,12]]},"assertion":[{"value":"28 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"106"}}