{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T02:21:54Z","timestamp":1764210114407},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2023,9,13]],"date-time":"2023-09-13T00:00:00Z","timestamp":1694563200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,9,13]],"date-time":"2023-09-13T00:00:00Z","timestamp":1694563200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"\u56fd\u5bb6\u81ea\u7136\u79d1\u5b66\u57fa\u91d1","award":["62277008","62277008","62277008","62277008","62277008"],"award-info":[{"award-number":["62277008","62277008","62277008","62277008","62277008"]}]},{"name":"\u91cd\u5e86\u90ae\u7535\u5927\u5b66\u6559\u80b2\u4fe1\u606f\u5316\u9879\u76ee","award":["xxhyf2022-08","xxhyf2022-08","xxhyf2022-08","xxhyf2022-08","xxhyf2022-08"],"award-info":[{"award-number":["xxhyf2022-08","xxhyf2022-08","xxhyf2022-08","xxhyf2022-08","xxhyf2022-08"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s11063-023-11403-0","type":"journal-article","created":{"date-parts":[[2023,9,13]],"date-time":"2023-09-13T17:02:01Z","timestamp":1694624521000},"page":"11921-11943","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Multimodal Bi-direction Guided Attention Networks for Visual Question Answering"],"prefix":"10.1007","volume":"55","author":[{"given":"Linqin","family":"Cai","sequence":"first","affiliation":[]},{"given":"Nuoying","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Kejia","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Haodu","family":"Fan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,9,13]]},"reference":[{"key":"11403_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2023.03.010","author":"Q Wang","year":"2023","unstructured":"Wang Q, Deng H, Wu X, Yang Z, Liu Y, Wang Y, Hao G (2023) LCM-Captioner: A lightweight text-based image captioning method with collaborative mechanism between vision and text. Neural Netw. https:\/\/doi.org\/10.1016\/j.neunet.2023.03.010","journal-title":"Neural Netw"},{"key":"11403_CR2","doi-asserted-by":"publisher","first-page":"102238","DOI":"10.1016\/j.displa.2022.102238","volume":"73","author":"W Jiang","year":"2022","unstructured":"Jiang W, Li Q, Zhan K, Fang Y, Shen F (2022) Hybrid attention network for image captioning. Displays 73:102238. https:\/\/doi.org\/10.1016\/j.displa.2022.102238","journal-title":"Displays"},{"key":"11403_CR3","doi-asserted-by":"publisher","first-page":"109548","DOI":"10.1016\/j.patcog.2023.109548","volume":"140","author":"Y Tian","year":"2023","unstructured":"Tian Y, Ding A, Wang D, Luo X, Wan B, Wang Y (2023) Bi-Attention enhanced representation learning for image-text matching. 
Pattern Recognition 140:109548","journal-title":"Pattern Recognition"},{"key":"11403_CR4","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1016\/j.neucom.2022.01.042","volume":"481","author":"Wu Dongqing","year":"2022","unstructured":"Dongqing Wu, Li H, Tang Y, Guo L, Liu H (2022) Global-guided asymmetric attention network for image-text matching. Neurocomputing 481:77\u201390. https:\/\/doi.org\/10.1016\/j.neucom.2022.01.042","journal-title":"Neurocomputing"},{"issue":"10","key":"11403_CR5","doi-asserted-by":"publisher","first-page":"4362","DOI":"10.1109\/TNNLS.2020.3017530","volume":"32","author":"L Zhang","year":"2020","unstructured":"Zhang L, Liu S, Liu D, Zeng P, Li X, Song J, Gao L (2020) Rich visual knowledge-based augmentation network for visual question answering. IEEE Trans Neural Netw Learn Syst 32(10):4362\u20134373","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"11403_CR6","doi-asserted-by":"publisher","first-page":"5936","DOI":"10.1109\/TIP.2022.3205212","volume":"31","author":"P Zeng","year":"2022","unstructured":"Zeng P, Zhang H, Gao L, Song J, Shen H (2022) Video question answering with prior knowledge and object-sensitive learning[J]. IEEE Trans Image Process 31:5936\u20135948","journal-title":"IEEE Trans Image Process"},{"key":"11403_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109339","volume":"138","author":"H Zhang","year":"2023","unstructured":"Zhang H, Zeng P, Yuxuan Hu, Qian J, Song J, Gao L (2023) Learning visual question answering on controlled semantic noisy labels. Pattern Recogn 138:109339","journal-title":"Pattern Recogn"},{"issue":"1","key":"11403_CR8","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1109\/TPAMI.2020.3004830","volume":"44","author":"L Peng","year":"2022","unstructured":"Peng L, Yang Y, Wang Z, Huang Zi, Shen HT (2022) MRA-Net: improving VQA Via multi-modal relation attention network. IEEE Trans Pattern Anal Mach Intell 44(1):318\u2013329. https:\/\/doi.org\/10.1109\/TPAMI.2020.3004830","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"11403_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.108980","volume":"132","author":"C Chen","year":"2022","unstructured":"Chen C, Han D, Chang C-C (2022) CAAN: Context-Aware attention network for visual question answering. Pattern Recogn 132:108980","journal-title":"Pattern Recogn"},{"key":"11403_CR10","doi-asserted-by":"publisher","unstructured":"Yu D, Gao X, Xiong H (2018) Structured semantic representation for visual question answering. In: 2018 25th IEEE International Conference on Image Processing (ICIP), 2286\u20132290. https:\/\/doi.org\/10.1109\/icip.2018.8451516","DOI":"10.1109\/icip.2018.8451516"},{"key":"11403_CR11","doi-asserted-by":"publisher","unstructured":"Wu J, Ge F, Shu P, Ma L, Hao Y(2022) Question-Driven Multiple Attention(DQMA) Model for Visual Question Answer. International Conference on Artificial Intelligence and Computer Information Technology (AICIT), 1\u20134. https:\/\/doi.org\/10.1109\/AICIT55386.2022.9930294","DOI":"10.1109\/AICIT55386.2022.9930294"},{"key":"11403_CR12","doi-asserted-by":"publisher","unstructured":"Guan W, Wu Z, Ping W (2022) Question-oriented cross-modal co-attention networks for visual question answering. 2nd International Conference on Consumer Electronics and Computer Engineering (ICCECE), 2022, 401\u2013407. 
https:\/\/doi.org\/10.1109\/ICCECE54139.2022.9712726","DOI":"10.1109\/ICCECE54139.2022.9712726"},{"key":"11403_CR13","doi-asserted-by":"publisher","unstructured":"Wang F, An G (2022) Visual Question Answering based on multimodal triplet knowledge accumulation. In: 2022 16th IEEE International Conference on Signal Processing (ICSP), 81\u201384. https:\/\/doi.org\/10.1109\/ICSP56322.2022.9965282","DOI":"10.1109\/ICSP56322.2022.9965282"},{"key":"11403_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107650","author":"L Liu","year":"2022","unstructured":"Liu L, Wang M, He X, Qing L, Chen H (2022) Fact-based visual question answering via dual-process system. Knowledge-Based Syst. https:\/\/doi.org\/10.1016\/j.knosys.2021.107650","journal-title":"Knowledge-Based Syst"},{"key":"11403_CR15","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1016\/j.neucom.2021.02.092","volume":"445","author":"Z Yang","year":"2021","unstructured":"Yang Z, Garcia N, Chu C, Otani M, Nakashima Y, Takemura H (2021) A comparative study of language transformers for video question answering. Neurocomputing 445:121\u2013133. https:\/\/doi.org\/10.1016\/j.neucom.2021.02.092","journal-title":"Neurocomputing"},{"key":"11403_CR16","doi-asserted-by":"publisher","unstructured":"Peng L, An G, Ruan Q (2022) Transformer-based Sparse Encoder and Answer Decoder for Visual Question Answering. In: 2022 16th IEEE International Conference on Signal Processing (ICSP), 120\u2013123. https:\/\/doi.org\/10.1109\/ICSP56322.2022.9965298","DOI":"10.1109\/ICSP56322.2022.9965298"},{"key":"11403_CR17","doi-asserted-by":"publisher","unstructured":"Yu Z, Yu J, Cui Y, Tao D, Tian Q (2019) Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 6281\u20136290. https:\/\/doi.org\/10.1109\/cvpr.2019.00644","DOI":"10.1109\/cvpr.2019.00644"},{"key":"11403_CR18","doi-asserted-by":"crossref","unstructured":"Bin Y, Yang Y, Zhou J, Huang Z, Shen HT(2017) Adaptively attending to visual attributes and linguistic knowledge for captioning. In: Proceedings of the 25th ACM international conference on multimedia, 1345\u20131353","DOI":"10.1145\/3123266.3123391"},{"key":"11403_CR19","doi-asserted-by":"publisher","unstructured":"Tasse FP, Kosinka J, Dodgson N (2015) Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision, 2425\u20132433. https:\/\/doi.org\/10.1109\/ICCV.2015.279","DOI":"10.1109\/ICCV.2015.279"},{"key":"11403_CR20","doi-asserted-by":"publisher","unstructured":"Fukui A, Park DH, Yang D, Rohrbach A, Darrell T (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, 457\u2013468. https:\/\/doi.org\/10.18653\/v1\/D16-1044","DOI":"10.18653\/v1\/D16-1044"},{"key":"11403_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/ICME52920.2022.9859591","volume":"2022","author":"Y Qian","year":"2022","unstructured":"Qian Y, Yuncong Hu, Wang R, Feng F, Wang X (2022) Question-driven graph fusion network for visual question answering. IEEE Int Conf Multimed Expo (ICME) 2022:1\u20136. 
https:\/\/doi.org\/10.1109\/ICME52920.2022.9859591","journal-title":"IEEE Int Conf Multimed Expo (ICME)"},{"issue":"12","key":"11403_CR22","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/tnnls.2018.2817340","volume":"29","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Xiang C, Fan J, Tao D (2018) Beyond bilinear: Generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans Neural Netw Learn Syst 29(12):5947\u20135959. https:\/\/doi.org\/10.1109\/tnnls.2018.2817340","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"11403_CR23","doi-asserted-by":"publisher","unstructured":"Ben-Younes H, Cadene R, Cord M, Thome N (2017) Mutan: Multimodal tucker fusion for visual question answering. In: Proceedings of the IEEE international conference on computer vision, 2631\u20132639. https:\/\/doi.org\/10.1109\/iccv.2017.285","DOI":"10.1109\/iccv.2017.285"},{"key":"11403_CR24","doi-asserted-by":"publisher","unstructured":"Yu Z, Yu J, Fan J, Tao D (2017) Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE international conference on computer vision, doi: https:\/\/doi.org\/10.1109\/iccv.2017.202","DOI":"10.1109\/iccv.2017.202"},{"key":"11403_CR25","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1016\/j.inffus.2021.02.006","volume":"72","author":"W Zhang","year":"2021","unstructured":"Zhang W, Jing Yu, Zhao W, Ran C (2021) DMRFNet: deep multimodal reasoning and fusion for visual question answering and explanation generation. Inform Fusion 72:70\u201379. https:\/\/doi.org\/10.1016\/j.inffus.2021.02.006","journal-title":"Inform Fusion"},{"key":"11403_CR26","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1016\/j.neucom.2020.10.071","volume":"423","author":"M Lao","year":"2021","unstructured":"Lao M, Guo Y, Nan P, Chen W, Liu Y, Lew MS (2021) Multi-stage hybrid embedding fusion network for visual question answering. Neurocomputing 423:541\u2013550. https:\/\/doi.org\/10.1016\/j.neucom.2020.10.071","journal-title":"Neurocomputing"},{"key":"11403_CR27","doi-asserted-by":"publisher","first-page":"106639","DOI":"10.1016\/j.knosys.2020.106639","volume":"212","author":"W Zhang","year":"2021","unstructured":"Zhang W, Jing Y, Wang Y, Wang W (2021) Multimodal deep fusion for image question answering. Knowledge-Based Syst 212:106639. https:\/\/doi.org\/10.1016\/j.knosys.2020.106639","journal-title":"Knowledge-Based Syst"},{"issue":"158","key":"11403_CR28","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1016\/j.neunet.2021.02.001","volume":"139","author":"J-J Kim","year":"2021","unstructured":"Kim J-J, Lee D-G, Jialin W, Jung H-G, Lee S-W (2021) Visual question answering based on local-scene-aware referring expression generation. Neural Netw 139(158):167. https:\/\/doi.org\/10.1016\/j.neunet.2021.02.001","journal-title":"Neural Netw"},{"key":"11403_CR29","doi-asserted-by":"publisher","first-page":"104165","DOI":"10.1016\/j.imavis.2021.104165","volume":"110","author":"H Sharma","year":"2021","unstructured":"Sharma H, Jalal AS (2021) Visual question answering model based on graph neural network and contextual attention. Image and Vis Comput 110:104165. https:\/\/doi.org\/10.1016\/j.imavis.2021.104165","journal-title":"Image and Vis Comput"},{"key":"11403_CR30","doi-asserted-by":"publisher","unstructured":"Peng, L., Yang, Y., Wang, Z., Wu, X. and Huang, Z (2019) Cra-net: Composed relation attention network for visual question answering. 
In: Proceedings of the 27th ACM International Conference on Multimedia, 1202\u20131210. https:\/\/doi.org\/10.1145\/3343031.3350925","DOI":"10.1145\/3343031.3350925"},{"key":"11403_CR31","doi-asserted-by":"publisher","unstructured":"Yang Z, He X, Gao J, Deng L, Smola A (2016) Stacked attention networks for image question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 21\u201329. https:\/\/doi.org\/10.1109\/cvpr.2016.10","DOI":"10.1109\/cvpr.2016.10"},{"key":"11403_CR32","doi-asserted-by":"publisher","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 6077\u20136086. https:\/\/doi.org\/10.1109\/cvpr.2018.00636","DOI":"10.1109\/cvpr.2018.00636"},{"key":"11403_CR33","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1606.00061","author":"J Lu","year":"2016","unstructured":"Lu J, Yang J, Batra D, Parikh D (2016) Hierarchical question image co-attention for visual question answering. Adv Neural Inform Process Syst. https:\/\/doi.org\/10.48550\/arXiv.1606.00061","journal-title":"Adv Neural Inform Process Syst"},{"key":"11403_CR34","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1805.07932","author":"J-H Kim","year":"2018","unstructured":"Kim J-H, Jun J, Zhang B-T (2018) Bilinear attention networks. Adv Neural Inform Process Syst. https:\/\/doi.org\/10.48550\/arXiv.1805.07932","journal-title":"Adv Neural Inform Process Syst"},{"key":"11403_CR35","doi-asserted-by":"publisher","unstructured":"Nguyen DK, Okatani T (2018) Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 6087\u20136096. https:\/\/doi.org\/10.1109\/CVPR.2018.00637","DOI":"10.1109\/CVPR.2018.00637"},{"key":"11403_CR36","doi-asserted-by":"publisher","unstructured":"Li RY, Kaabar MK, Wu Z (2022) A Lightweight Visual Question Answering Model based on Semantic Similarity. In Proceedings of the 2021 4th International Conference on Machine Learning and Machine Intelligence (MLMI '21). 71\u201376. https:\/\/doi.org\/10.1145\/3490725.3490736","DOI":"10.1145\/3490725.3490736"},{"issue":"23","key":"11403_CR37","doi-asserted-by":"publisher","first-page":"6758","DOI":"10.3390\/s20236758","volume":"20","author":"Z Guo","year":"2020","unstructured":"Guo Z, Han D (2020) Multi-modal explicit sparse attention networks for visual question answering. Sensors 20(23):6758. https:\/\/doi.org\/10.3390\/s20236758","journal-title":"Sensors"},{"key":"11403_CR38","doi-asserted-by":"publisher","first-page":"3518","DOI":"10.1109\/tmm.2020.3026892","volume":"23","author":"F Liu","year":"2020","unstructured":"Liu F, Liu J, Fang Z, Hong R, Hanqing Lu (2020) Visual question answering with dense inter-and intra-modality interactions. IEEE Trans Multimed 23:3518\u20133529. https:\/\/doi.org\/10.1109\/tmm.2020.3026892","journal-title":"IEEE Trans Multimed"},{"key":"11403_CR39","doi-asserted-by":"publisher","first-page":"107956","DOI":"10.1016\/j.patcog.2021.107956","volume":"117","author":"Y Liu","year":"2021","unstructured":"Liu Y, Zhang X, Zhang Q, Li C, Huang F, Tang X, Li Z (2021) Dual self-attention with co-attention networks for visual question answering. Pattern Recognition 117:107956. 
https:\/\/doi.org\/10.1016\/j.patcog.2021.107956","journal-title":"Pattern Recognition"},{"issue":"1","key":"11403_CR40","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L-J, Shamma DA, Bernstein MS, Fei-Fei Li (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vision 123(1):32\u201373. https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int J Comput Vision"},{"key":"11403_CR41","doi-asserted-by":"publisher","unstructured":"Pennington J, Socher R, Manning C (2014) Glove: Global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), 1532\u20131543. https:\/\/doi.org\/10.3115\/v1\/d14-1162","DOI":"10.3115\/v1\/d14-1162"},{"key":"11403_CR42","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1706.03762","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. Adv Neural Inform Process Syst. https:\/\/doi.org\/10.48550\/arXiv.1706.03762","journal-title":"Adv Neural Inform Process Syst"},{"issue":"12","key":"11403_CR43","doi-asserted-by":"publisher","first-page":"3196","DOI":"10.1109\/tmm.2020.2972830","volume":"22","author":"Yu Jing","year":"2020","unstructured":"Jing Yu, Zhang W, Yuhang Lu, Qin Z, Yue Hu, Tan J, Qi Wu (2020) Reasoning on the relation: enhancing visual representation for visual question answering and cross-modal retrieval. IEEE Trans Multimedia 22(12):3196\u20133209. https:\/\/doi.org\/10.1109\/tmm.2020.2972830","journal-title":"IEEE Trans Multimedia"},{"key":"11403_CR44","doi-asserted-by":"publisher","first-page":"1435","DOI":"10.1007\/s11063-021-10689-2","volume":"54","author":"Y Miao","year":"2022","unstructured":"Miao Y, Cheng W, He S, Jiang H (2022) Research on visual question answering based on GAT relational reasoning. Neural Process Lett 54:1435\u20131448. https:\/\/doi.org\/10.1007\/s11063-021-10689-2","journal-title":"Neural Process Lett"},{"key":"11403_CR45","doi-asserted-by":"publisher","unstructured":"Han Y, Guo Y, Yin J, Liu M, Hu Y, Nie L (2021) Focal and Composed Vision-semantic Modeling for Visual Question Answering. Proceedings of the 29th ACM International Conference on Multimedia, 4528\u20134536. https:\/\/doi.org\/10.1145\/3474085.3475609","DOI":"10.1145\/3474085.3475609"},{"key":"11403_CR46","doi-asserted-by":"publisher","DOI":"10.1145\/3498340","author":"Y Liu","year":"2022","unstructured":"Liu Y, Guo Y, Yin J, Song X, Liu W, Nie L, Zhang M (2022) Answer questions with right image regions: a visual attention regularization approach. ACM Trans Multimedia Comput Commun Appl. https:\/\/doi.org\/10.1145\/3498340","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"issue":"1","key":"11403_CR47","doi-asserted-by":"publisher","first-page":"116319","DOI":"10.1016\/j.image.2021.116319","volume":"96","author":"W Yirui","year":"2021","unstructured":"Yirui W, Ma Y, Wan S (2021) Multi-scale relation reasoning for multi-modal visual question answering. Signal Process Image Commun 96(1):116319. 
https:\/\/doi.org\/10.1016\/j.image.2021.116319","journal-title":"Signal Process Image Commun"},{"key":"11403_CR48","doi-asserted-by":"publisher","unstructured":"Gao L, Zeng P, Song J, Liu X, Shen HT (2018) From pixels to objects: Cubic visual attention for visual question answering. Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence Main track. 906\u2013912. https:\/\/doi.org\/10.24963\/ijcai.2018\/126","DOI":"10.24963\/ijcai.2018\/126"},{"key":"11403_CR49","doi-asserted-by":"publisher","unstructured":"Chen K, Wang J, Chen LC, Gao H, Xu W, Nevatia R (2015) Abc-cnn: An attention based convolutional neural network for visual question answering. arXiv preprint arXiv:1511.05960. https:\/\/doi.org\/10.48550\/arXiv.1511.05960","DOI":"10.48550\/arXiv.1511.05960"},{"key":"11403_CR50","doi-asserted-by":"publisher","unstructured":"Noh H, Seo PH, Han B (2016) Image question answering using convolutional neural network with dynamic parameter prediction. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 30\u201338. https:\/\/doi.org\/10.1109\/cvpr.2016.11","DOI":"10.1109\/cvpr.2016.11"},{"key":"11403_CR51","doi-asserted-by":"publisher","unstructured":"PLu P, Li H, Zhang W, Wang J, Wang X (2018) Co-attending free-form regions and detections with multi-modal multiplicative feature embedding for visual question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, 32. https:\/\/doi.org\/10.1609\/aaai.v32i1.12240","DOI":"10.1609\/aaai.v32i1.12240"},{"key":"11403_CR52","doi-asserted-by":"publisher","DOI":"10.1145\/3489142","author":"Fu Qun Li","year":"2022","unstructured":"Qun Li Fu, Xiao BB, Sheng B, Hong R (2022) Inner knowledge-based Img2Doc scheme for visual question answering. ACM Trans Multimedia Comput Commun Appl. https:\/\/doi.org\/10.1145\/3489142","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"11403_CR53","doi-asserted-by":"publisher","unstructured":"Voita E, Talbot D, Moiseev F, Sennrich R, Titov I (2019) Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, 5797\u20135808. 
https:\/\/doi.org\/10.18653\/v1\/P19-1580","DOI":"10.18653\/v1\/P19-1580"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11403-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-023-11403-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11403-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,27]],"date-time":"2023-12-27T09:18:54Z","timestamp":1703668734000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-023-11403-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,13]]},"references-count":53,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["11403"],"URL":"https:\/\/doi.org\/10.1007\/s11063-023-11403-0","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,9,13]]},"assertion":[{"value":"20 August 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 September 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
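The record above is a standard Crossref REST API "work" message: the bibliographic fields (title, authors, funders, the 53-entry reference list) all live under the top-level "message" key. As a minimal sketch of consuming it, the Python snippet below fetches the same record from the public api.crossref.org/works/{DOI} endpoint and extracts a few of those fields; the third-party `requests` dependency and the mailto address in the User-Agent header (Crossref "polite pool" etiquette) are assumptions for illustration, not part of the record.

```python
import requests  # third-party: pip install requests

DOI = "10.1007/s11063-023-11403-0"

# GET https://api.crossref.org/works/{doi} returns the same
# {"status": "ok", "message-type": "work", "message": {...}} envelope shown above.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "metadata-check/0.1 (mailto:you@example.org)"},  # placeholder contact
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]

# A few of the fields visible in the record above.
title = work["title"][0]
authors = ", ".join(f"{a['given']} {a['family']}" for a in work.get("author", []))
cited_dois = [ref["DOI"] for ref in work.get("reference", []) if "DOI" in ref]

print(title)
print(authors)
print(f"{len(cited_dois)} of {work['reference-count']} references carry a DOI")
```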