{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T19:56:42Z","timestamp":1769630202507,"version":"3.49.0"},"reference-count":50,"publisher":"Springer Science and Business Media LLC","issue":"24","license":[{"start":{"date-parts":[[2024,5,10]],"date-time":"2024-05-10T00:00:00Z","timestamp":1715299200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,5,10]],"date-time":"2024-05-10T00:00:00Z","timestamp":1715299200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s00521-024-09818-4","type":"journal-article","created":{"date-parts":[[2024,5,10]],"date-time":"2024-05-10T05:01:51Z","timestamp":1715317311000},"page":"14691-14708","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Multimodal attention-driven visual question answering for Malayalam"],"prefix":"10.1007","volume":"36","author":[{"given":"Abhishek Gopinath","family":"Kovath","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9821-6146","authenticated-orcid":false,"given":"Anand","family":"Nayyar","sequence":"additional","affiliation":[]},{"given":"O. K.","family":"Sikha","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,5,10]]},"reference":[{"key":"9818_CR1","unstructured":"Abacha AB, Hasan SA , Datla VV, Liu J, Demner-Fushman D, M\u00fcller H (2019) VAQ-Med: overview of the medical visual question answering task at ImageCLEF 2019. CLEF (Working Notes) 2(6)"},{"key":"9818_CR2","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering","DOI":"10.1109\/CVPR.2018.00636"},{"key":"9818_CR3","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick CL, Vqa DP (2015) Visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"9818_CR4","doi-asserted-by":"publisher","first-page":"10","DOI":"10.4324\/9781315002217","volume-title":"Malayalam","author":"R Asher","year":"2013","unstructured":"Asher R (2013) Malayalam. Routledge, London, p 10"},{"key":"9818_CR5","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"Piotr Bojanowski","year":"2017","unstructured":"Bojanowski Piotr, Grave Edouard, Joulin Armand, Mikolov Tomas (2017) Enriching word vectors with subword information. Trans Assoc Comput Linguist 5:135\u2013146","journal-title":"Trans Assoc Comput Linguist"},{"key":"9818_CR6","doi-asserted-by":"crossref","unstructured":"Cascante-Bonilla P, Wu H, Wang L, Feris RS, Ordonez V (2022) Simvqa: Exploring simulated environments for visual question answering. arXiv:2203.17219 [cs], 03","DOI":"10.1109\/CVPR52688.2022.00500"},{"key":"9818_CR7","doi-asserted-by":"crossref","unstructured":"Dancette C, Cad\u00e8ne R, Teney D, Cord M (2021) Beyond question-based biases: assessing multimodal shortcut learning in visual question answering","DOI":"10.1109\/ICCV48922.2021.00160"},{"key":"9818_CR8","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K(2018) Bert: pre-training of deep bidirectional transformers for language understanding, p 10"},{"key":"9818_CR9","doi-asserted-by":"crossref","unstructured":"Dey R, Salem FM (2017) Gate-variants of gated recurrent unit (GRU) neural networks. In: 2017 IEEE 60th international midwest symposium on circuits and systems (MWSCAS). IEEE, pp 1597\u20131600","DOI":"10.1109\/MWSCAS.2017.8053243"},{"key":"9818_CR10","doi-asserted-by":"crossref","unstructured":"Fukui A, Park DH, Yang D, Rohrbach A, Darrell T, Rohrbach M (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv:1606.01847 [cs], 09","DOI":"10.18653\/v1\/D16-1044"},{"issue":"1","key":"9818_CR11","doi-asserted-by":"publisher","first-page":"94","DOI":"10.33130\/AJCT.2022v08i01.014","volume":"8","author":"Anusha Garlapati","year":"2022","unstructured":"Garlapati Anusha, Malisetty Neeraj, Narayanan Gayathri (2022) Image captioning from wikipedia for multi-language using deep learning models. Asian J Converg Technol (AJCT) 8(1):94\u2013101 (ISSN-2350-1146)","journal-title":"Asian J Converg Technol (AJCT)"},{"issue":"12","key":"9818_CR12","doi-asserted-by":"publisher","first-page":"3618","DOI":"10.1073\/pnas.1422953112","volume":"112","author":"Donald Geman","year":"2015","unstructured":"Geman Donald, Geman Stuart, Hallonquist Neil, Younes Laurent (2015) Visual turing test for computer vision systems. Proc Natl Acad Sci 112(12):3618\u20133623","journal-title":"Proc Natl Acad Sci"},{"key":"9818_CR13","doi-asserted-by":"crossref","unstructured":"Girshick R (2015) Fast R-CNN. In: Proceedings of the IEEE international conference on computer vision, pp. 1440\u20131448","DOI":"10.1109\/ICCV.2015.169"},{"key":"9818_CR14","doi-asserted-by":"crossref","unstructured":"Goyal Y, Khot T, Summers-Stay D, Batra D, Parikh D (2017) Making the v in vqa matter: elevating the role of image understanding in visual question answering","DOI":"10.1109\/CVPR.2017.670"},{"key":"9818_CR15","unstructured":"Grave E, Bojanowski P, Gupta P, Joulin A, Mikolov T (2018) Learning word vectors for 157 languages. arXiv:1802.06893 [cs], 03"},{"key":"9818_CR16","doi-asserted-by":"crossref","unstructured":"Gu J, Zhao H, Lin Z, Li S, Cai J, Ling M (2019) Scene graph generation with external knowledge and image reconstruction","DOI":"10.1109\/CVPR.2019.00207"},{"key":"9818_CR17","unstructured":"Gupta D, Lenka P, Ekbal A, Bhattacharyya P (2020) A unified framework for multilingual and code-mixed visual question answering. In: Proceedings of the 1st conference of the Asia-Pacific chapter of the association for computational linguistics and the 10th international joint conference on natural language processing, pp. 900\u2013913 (2020)"},{"key":"9818_CR18","unstructured":"Gupta D, Lenka P, Ekbal A, Bhattacharyya P (2020) A unified framework for multilingual and code-mixed visual question answering, p 12"},{"key":"9818_CR19","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"9818_CR20","unstructured":"Kazemi V, Elqursh A (2017) Show, ask, attend, and answer: a strong baseline for visual question answering. arXiv:1704.03162 [cs], 04"},{"key":"9818_CR21","unstructured":"Lu J, Yang J, Batra D, Parikh D (2017) Dense-captioning events in videos"},{"key":"9818_CR22","unstructured":"Lu J, Yang J, Batra D, Parikh D (2016) Hierarchical question-image co-attention for visual question answering"},{"key":"9818_CR23","doi-asserted-by":"crossref","unstructured":"Malinowski M, Doersch C, Santoro A, Battaglia P (2018) Learning visual question answering by bootstrapping hard attention","DOI":"10.1007\/978-3-030-01231-1_1"},{"key":"9818_CR24","unstructured":"Malinowski M, Fritz M (2014) A multi-world approach to question answering about real-world scenes based on uncertain input"},{"key":"9818_CR25","doi-asserted-by":"crossref","unstructured":"Malinowski M, Rohrbach M, Fritz M (2015) Ask your neurons: a neural-based approach to answering questions about images","DOI":"10.1109\/ICCV.2015.9"},{"key":"9818_CR26","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1007\/s11263-017-1038-2","volume":"125","author":"M Malinowski","year":"2017","unstructured":"Malinowski M, Rohrbach M, Fritz M (2017) Ask your neurons: a deep learning approach to visual question answering. Int J Comput Vis 125:110\u2013135","journal-title":"Int J Comput Vis"},{"key":"9818_CR27","doi-asserted-by":"crossref","unstructured":"Mithun NC, Panda R, Papalexakis EE, Roy-Chowdhury AK (2018) Webly supervised joint embedding for cross-modal image-text retrieval. In: Proceedings of the 26th ACM international conference on Multimedia, pp 1856\u20131864","DOI":"10.1145\/3240508.3240712"},{"key":"9818_CR28","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: towards real-time object detection with region proposal networks. Adv Neural Inf Process Syst 28"},{"key":"9818_CR29","unstructured":"Sanjay SP, Ezhilarasan N, Kumar MA, Soman KP (2015) Amrita-cen@ fire2015: automated story illustration using word embedding. In: FIRE workshops, pp 67\u201370"},{"key":"9818_CR30","doi-asserted-by":"crossref","unstructured":"Shi Y, Furlanello T, Zha S, Anandkumar A (2018) Question type guided attention in visual question answering","DOI":"10.1007\/978-3-030-01225-0_10"},{"key":"9818_CR31","doi-asserted-by":"crossref","unstructured":"Shrestha R, Kafle K, Kanan C (2019) Answer them all! toward universal visual question answering models","DOI":"10.1109\/CVPR.2019.01072"},{"issue":"10","key":"9818_CR32","doi-asserted-by":"publisher","first-page":"15937","DOI":"10.1007\/s11042-020-10315-8","volume":"80","author":"OK Sikha","year":"2021","unstructured":"Sikha OK, Soman KP (2021) Dynamic mode decomposition based salient edge\/region features for content based image retrieval. Multim. Tools Appl. 80(10):15937\u201315958","journal-title":"Multim. Tools Appl."},{"key":"9818_CR33","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556"},{"key":"9818_CR34","unstructured":"Snover M, Dorr B, Schwartz R, Micciulla L, Makhoul J (2006) A study of translation edit rate with targeted human annotation"},{"key":"9818_CR35","doi-asserted-by":"crossref","unstructured":"Srihari K, Sikha OK (2022) Partially supervised image captioning model for urban road views. In: Intelligent data communication technologies and internet of things. Springer, pp 59\u201373","DOI":"10.1007\/978-981-16-7610-9_5"},{"key":"9818_CR36","unstructured":"Tan M, Le Q (2019) Efficientnet: rethinking model scaling for convolutional neural networks. In: International conference on machine learning. PMLR, pp 6105\u20136114"},{"key":"9818_CR37","doi-asserted-by":"crossref","unstructured":"Teney D, Anderson P, He X, van\u00a0den Hengel A (2018) Tips and tricks for visual question answering: learnings from the 2017 challenge","DOI":"10.1109\/CVPR.2018.00444"},{"key":"9818_CR38","doi-asserted-by":"publisher","first-page":"2921","DOI":"10.1007\/s11042-018-6097-z","volume":"78","author":"AS Toor","year":"2018","unstructured":"Toor AS, Wechsler H, Nappi M (2018) Question action relevance and editing for visual question answering. Multimed Tools Appl 78:2921\u20132935","journal-title":"Multimed Tools Appl"},{"key":"9818_CR39","doi-asserted-by":"crossref","unstructured":"Wang Z, Ji S (2018) Learning convolutional text representations for visual question answering. In: Proceedings of the 2018 SIAM international conference on data mining, pp 594\u2013602","DOI":"10.1137\/1.9781611975321.67"},{"key":"9818_CR40","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2018","unstructured":"Wu Q, Shen C, Wang P, Dick A, van den Hengel A (2018) Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans Pattern Anal Mach Intell 40:1367\u20131381","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"9818_CR41","unstructured":"Wu Y, Schuster M, Chen Z, Le QV, Norouzi M, Macherey W, Krikun M, Cao Y, Gao Q, Macherey K, Klingner J, Shah A, Johnson M, Liu X, Kaiser L, Gouws S, Kato Y, Kudo T, Kazawa H, Stevens K, Kurian G, Patil N, Wang W, Young C, Smith J, Riesa J, Rudnick A, Vinyals O, Corrado G, Hughes M, Dean J (2016) Bridging the gap between human and machine translation. Google\u2019s neural machine translation system"},{"key":"9818_CR42","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention"},{"key":"9818_CR43","doi-asserted-by":"crossref","unstructured":"Yang Z, He X, Gao J, Deng J, Smola A (2016) Stacked attention networks for image question answering","DOI":"10.1109\/CVPR.2016.10"},{"key":"9818_CR44","doi-asserted-by":"crossref","unstructured":"Yu D, Fu J, Mei T, Rui Y (2017) Multi-level attention networks for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4709\u20134717","DOI":"10.1109\/CVPR.2017.446"},{"key":"9818_CR45","doi-asserted-by":"crossref","unstructured":"Yu Z, Yu J, Cui Y, Tao D, Tian Q (2019) Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6281\u20136290","DOI":"10.1109\/CVPR.2019.00644"},{"key":"9818_CR46","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/TNNLS.2018.2817340","volume":"29","author":"Z Yu","year":"2018","unstructured":"Yu Z, Yu J, Xiang C, Fan J, Tao D (2018) Beyond bilinear: generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans Neural Netw Learn Syst 29:5947\u20135959","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"9818_CR47","doi-asserted-by":"crossref","unstructured":"Zagoruyko S, Komodakis N (2016) Wide residual networks. arXiv:1605.07146","DOI":"10.5244\/C.30.87"},{"key":"9818_CR48","doi-asserted-by":"crossref","unstructured":"Zhang Q, Lei Z, Zhang Z, Li SZ (2020) Context-aware attention network for image-text retrieval. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3536\u20133545","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"9818_CR49","unstructured":"Zhou B, Tian Y, Sukhbaatar S, Szlam A, Fergus R (2015) Simple baseline for visual question answering. arXiv:1512.02167"},{"key":"9818_CR50","doi-asserted-by":"crossref","unstructured":"Zhu Y, Groth O, Bernstein M, Fei-Fei L (2016) Visual7w: grounded question answering in images. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4995\u20135004","DOI":"10.1109\/CVPR.2016.540"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-09818-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-024-09818-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-09818-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T10:04:11Z","timestamp":1724148251000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-024-09818-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,10]]},"references-count":50,"journal-issue":{"issue":"24","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["9818"],"URL":"https:\/\/doi.org\/10.1007\/s00521-024-09818-4","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,5,10]]},"assertion":[{"value":"23 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 April 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 May 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest to report regarding the present study.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}