{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T23:07:34Z","timestamp":1764198454233,"version":"3.46.0"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819549627"},{"type":"electronic","value":"9789819549634"}],"license":[{"start":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T00:00:00Z","timestamp":1763769600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T00:00:00Z","timestamp":1763769600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-4963-4_30","type":"book-chapter","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T17:07:09Z","timestamp":1763744829000},"page":"363-374","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["RPR-MCAoAN: A Transformer-Based Co-attention Network with\u00a0Relative Positional Representations for\u00a0Visual Question Answering"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3339-8041","authenticated-orcid":false,"given":"Pham Hoai","family":"Nhan","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3512-4977","authenticated-orcid":false,"given":"Thai Gia","family":"Bao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4329-7428","authenticated-orcid":false,"given":"Nguyen Minh","family":"Hai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,22]]},"reference":[{"issue":"15","key":"30_CR1","doi-asserted-by":"publisher","first-page":"46397","DOI":"10.1007\/s11042-023-17275-9","volume":"83","author":"K Dittakan","year":"2024","unstructured":"Dittakan, K., Prompitak, K., Thungklang, P., Wongwattanakit, C.: Image caption generation using transformer learning methods: a case study on instagram image. Multimed. Tools Appl. 83(15), 46397\u201346417 (2024)","journal-title":"Multimed. Tools Appl."},{"key":"30_CR2","doi-asserted-by":"crossref","unstructured":"Cao, M., Li, S., Li, J., Nie, L., Zhang, M.: Image-text retrieval: a survey on recent research and development. arXiv preprint arXiv:2203.14713 (2022)","DOI":"10.24963\/ijcai.2022\/759"},{"issue":"10","key":"30_CR3","first-page":"1","volume":"57","author":"BS Kim","year":"2025","unstructured":"Kim, B.S., Kim, J., Lee, D., Jang, B.: Visual question answering: a survey of methods, datasets, evaluation, and challenges. ACM Comput. Surv. 57(10), 1\u201335 (2025)","journal-title":"ACM Comput. Surv."},{"key":"30_CR4","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1016\/j.neunet.2021.02.001","volume":"139","author":"J-J Kim","year":"2021","unstructured":"Kim, J.-J., Lee, D.-G., Wu, J., Jung, H.-G., Lee, S.-W.: Visual question answering based on local-scene-aware referring expression generation. Neural Netw. 139, 158\u2013167 (2021)","journal-title":"Neural Netw."},{"key":"30_CR5","unstructured":"Sikarwar, A., Kreiman, G.: On the efficacy of co-attention transformer layers in visual question answering. 
arXiv preprint arXiv:2201.03965 (2022)"},{"key":"30_CR6","doi-asserted-by":"crossref","unstructured":"Rahman, T., Chou, S.-H., Sigal, L., Carenini, G.: An improved attention for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1653\u20131662 (2021)","DOI":"10.1109\/CVPRW53098.2021.00181"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Dou, Z.-Y., et al.: An empirical study of training end-to-end vision-and-language transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18166\u201318176 (2022)","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"30_CR8","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, pp. 5583\u20135594. PMLR (2021)"},{"key":"30_CR9","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"4","key":"30_CR10","doi-asserted-by":"publisher","first-page":"114","DOI":"10.3390\/robotics12040114","volume":"12","author":"F Cao","year":"2023","unstructured":"Cao, F., Luo, S., Nunez, F., Wen, Z., Poon, J., Han, S.C.: Scenegate: scene-graph based co-attention networks for text visual question answering. Robotics 12(4), 114 (2023)","journal-title":"Robotics"},{"key":"30_CR11","unstructured":"Islam, M.A., Jia, S., Bruce, N.D.: How much position information do convolutional neural networks encode? arXiv preprint arXiv:2001.08248 (2020)"},{"key":"30_CR12","doi-asserted-by":"publisher","first-page":"6997","DOI":"10.1109\/TMM.2022.3216770","volume":"25","author":"A Mao","year":"2022","unstructured":"Mao, A., Yang, Z., Lin, K., Xuan, J., Liu, Y.-J.: Positional attention guided transformer-like architecture for visual question answering. IEEE Trans. Multimed. 25, 6997\u20137009 (2022)","journal-title":"IEEE Trans. Multimed."},{"issue":"8","key":"30_CR13","doi-asserted-by":"publisher","first-page":"7852","DOI":"10.1109\/TCYB.2021.3049537","volume":"52","author":"X Liu","year":"2021","unstructured":"Liu, X., Ji, Z., Pang, Y., Han, J., Li, X.: DGIG-Net: dynamic graph-in-graph networks for few-shot human-object interaction. IEEE Trans. Cybern. 52(8), 7852\u20137864 (2021)","journal-title":"IEEE Trans. Cybern."},{"key":"30_CR14","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. arXiv preprint arXiv:1803.02155 (2018)","DOI":"10.18653\/v1\/N18-2074"},{"key":"30_CR15","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"30_CR16","unstructured":"Kim, J.-H., Jun, J., Zhang, B.-T.: Bilinear attention networks. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q.: Deep modular co-attention networks for visual question answering. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6281\u20136290 (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"30_CR18","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"30_CR19","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.-J., Chang, K.-W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"30_CR20","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: VilBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"30_CR22","doi-asserted-by":"crossref","unstructured":"Wu, K., Peng, H., Chen, M., Fu, J., Chao, H.: Rethinking and improving relative position encoding for vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10033\u201310041 (2021)","DOI":"10.1109\/ICCV48922.2021.00988"},{"key":"30_CR23","doi-asserted-by":"crossref","unstructured":"Wang, Y., Sun, X., Fengzong, L., Kang, Z., Xu, C.X.: An anchor-based relative position embedding method for cross-modal tasks. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 5401\u20135413 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.362"},{"issue":"9","key":"30_CR24","doi-asserted-by":"publisher","first-page":"11921","DOI":"10.1007\/s11063-023-11403-0","volume":"55","author":"L Cai","year":"2023","unstructured":"Cai, L., Xu, N., Tian, H., Chen, K., Fan, H.: Multimodal Bi-direction guided attention networks for visual question answering. Neural Process. Lett. 55(9), 11921\u201311943 (2023)","journal-title":"Neural Process. Lett."},{"key":"30_CR25","doi-asserted-by":"crossref","unstructured":"Li, P., Si, Q., Fu, P., Lin, Z., Wang, Y.: Object attribute matters in visual question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, no. 17, pp. 18545\u201318553 (2024)","DOI":"10.1609\/aaai.v38i17.29816"},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.-Y.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"30_CR27","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"30_CR28","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision \u2013 ECCV 2014. ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Teney, D., Anderson, P., He, X., Van Den Hengel, A.: Tips and tricks for visual question answering: learnings from the 2017 challenge. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4223\u20134232 (2018)","DOI":"10.1109\/CVPR.2018.00444"},{"key":"30_CR30","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"issue":"1","key":"30_CR31","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: GloVe: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543, 2014","DOI":"10.3115\/v1\/D14-1162"}],"container-title":["Lecture Notes in Computer Science","Multi-disciplinary Trends in Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-4963-4_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,26]],"date-time":"2025-11-26T23:03:00Z","timestamp":1764198180000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-4963-4_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,22]]},"ISBN":["9789819549627","9789819549634"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-4963-4_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,11,22]]},"assertion":[{"value":"22 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MIWAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multi-disciplinary Trends in Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ho Chi Minh City","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 December 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference 
Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miwai2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/miwai25.miwai.org","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}