{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T15:54:11Z","timestamp":1777478051084,"version":"3.51.4"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2022,1,25]],"date-time":"2022-01-25T00:00:00Z","timestamp":1643068800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,25]],"date-time":"2022-01-25T00:00:00Z","timestamp":1643068800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"national natural science foundation of china","doi-asserted-by":"publisher","award":["61906044"],"award-info":[{"award-number":["61906044"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010031","name":"postdoctoral research foundation of china","doi-asserted-by":"publisher","award":["2020M681984"],"award-info":[{"award-number":["2020M681984"]}],"id":[{"id":"10.13039\/501100010031","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100009558","name":"university natural science research project of anhui province","doi-asserted-by":"publisher","award":["KJ2020ZD48"],"award-info":[{"award-number":["KJ2020ZD48"]}],"id":[{"id":"10.13039\/501100009558","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100009558","name":"university natural science research project of anhui province","doi-asserted-by":"publisher","award":["KJ2019A0532"],"award-info":[{"award-number":["KJ2019A0532"]}],"id":[{"id":"10.13039\/501100009558","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100009558","name":"university natural science research project of anhui province","doi-asserted-by":"publisher","award":["KJ2019A0536"],"award-info":[{"award-number":["KJ2019A0536"]}],"id":[{"id":"10.13039\/501100009558","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1007\/s00521-022-06923-0","type":"journal-article","created":{"date-parts":[[2022,1,25]],"date-time":"2022-01-25T11:03:30Z","timestamp":1643108610000},"page":"9015-9023","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Overcoming language priors in VQA via adding visual module"],"prefix":"10.1007","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7440-0109","authenticated-orcid":false,"given":"Jia","family":"Zhao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuesong","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuefeng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gang","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,1,25]]},"reference":[{"key":"6923_CR1","doi-asserted-by":"crossref","unstructured":"Fukui A, Park DH, Yang D, Rohrbach A, Darrell T, Rohrbach M (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Proceedings of the 2016 conference on empirical methods in natural language processing, pp 457\u2013468","DOI":"10.18653\/v1\/D16-1044"},{"key":"6923_CR2","doi-asserted-by":"crossref","unstructured":"Yu Z, Yu J, Fan J, Tao D (2017) Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 1821\u20131830","DOI":"10.1109\/ICCV.2017.202"},{"issue":"12","key":"6923_CR3","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/TNNLS.2018.2817340","volume":"29","author":"Z Yu","year":"2018","unstructured":"Yu Z, Yu J, Xiang C, Fan J, Tao D (2018) Beyond bilinear: generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans Neural Netw Learn Syst 29(12):5947\u20135959","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"6923_CR4","doi-asserted-by":"crossref","unstructured":"Ben-Younes H, Cadene R, Cord M, Thome N (2017) Mutan: multimodal tucker fusion for visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2612\u20132620","DOI":"10.1109\/ICCV.2017.285"},{"key":"6923_CR5","doi-asserted-by":"crossref","unstructured":"Ben-Younes H, Cadene R, Thome N, Cord M (2019) Block: bilinear superdiagonal fusion for visual question answering and visual relationship detection. In: Proceedings of the AAAI conference on artificial intelligence, vol 33, pp 8102\u20138109","DOI":"10.1609\/aaai.v33i01.33018102"},{"key":"6923_CR6","doi-asserted-by":"crossref","unstructured":"Yang Z, He X, Gao J, Deng L, Smola A (2016) Stacked attention networks for image question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 21\u201329","DOI":"10.1109\/CVPR.2016.10"},{"key":"6923_CR7","unstructured":"Lu J, Yang J, Batra D, Parikh D (2016) Hierarchical question-image co-attention for visual question answering. In: Proceedings of the 30th international conference on neural information processing systems, pp 289\u2013297"},{"key":"6923_CR8","doi-asserted-by":"crossref","unstructured":"Gao P, Jiang Z, You H, Lu P, Hoi SC, Wang X, Li H (2019) Dynamic fusion with intra-and inter-modality attention flow for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6639\u20136648","DOI":"10.1109\/CVPR.2019.00680"},{"key":"6923_CR9","doi-asserted-by":"publisher","first-page":"35662","DOI":"10.1109\/ACCESS.2020.2975093","volume":"8","author":"C Chen","year":"2020","unstructured":"Chen C, Han D, Wang J (2020) Multimodal encoder\u2013decoder attention networks for visual question answering. IEEE Access 8:35662\u201335671","journal-title":"IEEE Access"},{"key":"6923_CR10","unstructured":"Narasimhan M, Lazebnik S, Schwing AG (2018) Out of the box: reasoning with graph convolution nets for factual visual question answering. In: Proceedings of the 32nd international conference on neural information processing systems, pp 2659\u20132670"},{"key":"6923_CR11","doi-asserted-by":"publisher","unstructured":"Zhu Z, Yu J, Wang Y, Sun Y, Hu Y, Wu Q (2020) Mucko: multi-layer cross-modal knowledge reasoning for fact-based visual question answering. In: Proceedings of the twenty-ninth international joint conference on artificial intelligence main track, pp 1097\u20131103. https:\/\/doi.org\/10.24963\/ijcai.2020\/153","DOI":"10.24963\/ijcai.2020\/153"},{"issue":"10","key":"6923_CR12","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2017","unstructured":"Wang P, Wu Q, Shen C, Dick A, Van Den Hengel A (2017) Fvqa: fact-based visual question answering. IEEE Trans Pattern Anal Mach Intell 40(10):2413\u20132427","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"6923_CR13","doi-asserted-by":"crossref","unstructured":"Cadene R, Ben-Younes H, Cord M, Thome N (2019) Murel: multimodal relational reasoning for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1989\u20131998","DOI":"10.1109\/CVPR.2019.00209"},{"key":"6923_CR14","doi-asserted-by":"crossref","unstructured":"Li L, Gan Z, Cheng Y, Liu J (2019) Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10313\u201310322","DOI":"10.1109\/ICCV.2019.01041"},{"key":"6923_CR15","doi-asserted-by":"crossref","unstructured":"Huang Q, Wei J, Cai Y, Zheng C, Chen J, Leung H F, Li Q (2020) Aligned dual channel graph convolutional network for visual question answering. In: Proceedings of the 58th annual meeting of the association for computational linguistics, pp 7166\u20137176","DOI":"10.18653\/v1\/2020.acl-main.642"},{"key":"6923_CR16","doi-asserted-by":"crossref","unstructured":"Goyal Y, Khot T, Summers-Stay D, Batra D, Parikh D (2017) Making the v in vqa matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6904\u20136913","DOI":"10.1109\/CVPR.2017.670"},{"key":"6923_CR17","doi-asserted-by":"crossref","unstructured":"Agrawal A, Batra D, Parikh D (2016) Analyzing the behavior of visual question answering models. In: Proceedings of the 2016 conference on empirical methods in natural language processing, pp 1955\u20131960","DOI":"10.18653\/v1\/D16-1203"},{"key":"6923_CR18","doi-asserted-by":"crossref","unstructured":"Jabri A, Joulin A, Van Der Maaten L (2016) Revisiting visual question answering baselines. In: Proceedings of the European conference on computer vision. Springer, Cham, pp 727\u2013739","DOI":"10.1007\/978-3-319-46484-8_44"},{"key":"6923_CR19","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.cviu.2017.06.005","volume":"163","author":"K Kafle","year":"2017","unstructured":"Kafle K, Kanan C (2017) Visual question answering: datasets, algorithms, and future challenges. Comput Vision Image Underst 163:3\u201320","journal-title":"Comput Vision Image Underst"},{"key":"6923_CR20","doi-asserted-by":"crossref","unstructured":"Zhang P, Goyal Y, Summers-Stay D, Batra D, Parikh D (2016) Yin and yang: balancing and answering binary visual questions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5014\u20135022","DOI":"10.1109\/CVPR.2016.542"},{"key":"6923_CR21","unstructured":"Zhou B, Tian Y, Sukhbaatar S, Szlam A, Fergus R (2015) Simple baseline for visual question answering. arXiv preprint arXiv:1512.02167"},{"key":"6923_CR22","doi-asserted-by":"crossref","unstructured":"Selvaraju RR, Lee S, Shen Y, Jin H, Ghosh S, Heck L, Batra D, Parikh D (2019) Taking a hint: leveraging explanations to make vision and language models more grounded. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2591\u20132600","DOI":"10.1109\/ICCV.2019.00268"},{"key":"6923_CR23","unstructured":"Wu J, Mooney RJ (2019) Self-critical reasoning for robust visual question answering. arXiv preprint arXiv:1905.09998"},{"key":"6923_CR24","doi-asserted-by":"crossref","unstructured":"Shrestha R, Kafle K, Kanan C (2020) A negative case analysis of visual grounding methods for VQA. In: Proceedings of the 58th annual meeting of the association for computational linguistics, pp 8172\u20138181","DOI":"10.18653\/v1\/2020.acl-main.727"},{"key":"6923_CR25","doi-asserted-by":"crossref","unstructured":"Clark C, Yatskar M, Zettlemoyer L (2019) Don\u2019t take the easy way out: ensemble based methods for avoiding known dataset biases. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language Processing (EMNLP-IJCNLP), pp 4060\u20134073","DOI":"10.18653\/v1\/D19-1418"},{"key":"6923_CR26","doi-asserted-by":"crossref","unstructured":"Niu Y, Tang K, Zhang H, Lu Z, Hua X S, Wen J R (2020) Counterfactual vqa: a cause-effect look at language bias. arXiv preprint arXiv:2006.04315","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"6923_CR27","doi-asserted-by":"crossref","unstructured":"Agrawal A, Batra D, Parikh D, Kembhavi A (2018) Don\u2019t just assume; look and answer: overcoming priors for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4971\u20134980","DOI":"10.1109\/CVPR.2018.00522"},{"key":"6923_CR28","doi-asserted-by":"crossref","unstructured":"Liang Z, Jiang W, Hu H, Zhu J (2020) Learning to contrast the counterfactual samples for robust visual question answering. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP), pp 3285\u20133292","DOI":"10.18653\/v1\/2020.emnlp-main.265"},{"key":"6923_CR29","unstructured":"Gat I, Schwartz I, Schwing A, Hazan T (2020) Removing bias in multi-modal classifiers: regularization by maximizing functional entropies. arXiv preprint arXiv:2010.10802"},{"key":"6923_CR30","unstructured":"Guo Y, Nie L, Cheng Z, Tian Q (2020) Loss-rescaling VQA: revisiting Language Prior Problem from a Class-imbalance View. arXiv:2010.16010"},{"key":"6923_CR31","doi-asserted-by":"crossref","unstructured":"Guo Y, Cheng Z, Nie L, Liu Y, Wang Y, Kankanhalli M (2019) Quantifying and alleviating the language prior problem in visual question answering. In: Proceedings of the 42nd international ACM SIGIR conference on research and development in information retrieval, pp 75\u201384","DOI":"10.1145\/3331184.3331186"},{"key":"6923_CR32","doi-asserted-by":"publisher","unstructured":"Zhu X, Mao Z, Liu C, Zhang P, Wang B, Zhang Y (2020) Overcoming language priors with self-supervised learning for visual question answering. In: Proceedings of the twenty-ninth international joint conference on artificial intelligence main track, pp 1083\u20131089. https:\/\/doi.org\/10.24963\/ijcai.2020\/151","DOI":"10.24963\/ijcai.2020\/151"},{"key":"6923_CR33","doi-asserted-by":"crossref","unstructured":"Chen L, Yan X, Xiao J, Zhang H, Pu S, Zhuang Y (2020) Counterfactual samples synthesizing for robust visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10800\u201310809","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"6923_CR34","doi-asserted-by":"crossref","unstructured":"Grand G, Belinkov Y (2019) Adversarial regularization for visual question answering: strengths, shortcomings, and side effects. In: Proceedings of the second workshop on shortcomings in vision and language, pp 1\u201313","DOI":"10.18653\/v1\/W19-1801"},{"key":"6923_CR35","unstructured":"Ramakrishnan S, Agrawal A, Lee S (2018) Overcoming language priors in visual question answering with adversarial regularization. In: Proceedings of the 32nd international conference on neural information processing systems, pp 1548\u20131558"},{"key":"6923_CR36","unstructured":"Cadene R, Dancette C, Ben-Younes H, Cord M, Parikh D (2019) RUBi: reducing unimodal biases for visual question answering. In: Proceedings of the neural information processing systems, vol 32. Curran Associates Inc, pp 841\u2013852"},{"key":"6923_CR37","doi-asserted-by":"crossref","unstructured":"Gokhale T, Banerjee P, Baral C, Yang Y (2020) MUTANT: a training paradigm for out-of-distribution generalization in visual question answering. In: Proceedings of the 2020 conference on empirical methods in natural language processing (EMNLP), pp 878\u2013892","DOI":"10.18653\/v1\/2020.emnlp-main.63"},{"key":"6923_CR38","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick C L, Parikh D (2015) Vqa: visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp. 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"6923_CR39","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"6923_CR40","doi-asserted-by":"crossref","unstructured":"Cho K, van Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1724\u20131734","DOI":"10.3115\/v1\/D14-1179"},{"key":"6923_CR41","doi-asserted-by":"crossref","unstructured":"Teney D, Abbasnedjad E, Hengel A V D (2020) Learning what makes a difference from counterfactual examples and gradient supervision. arXiv preprint arXiv:2004.09034","DOI":"10.1007\/978-3-030-58607-2_34"},{"key":"6923_CR42","doi-asserted-by":"publisher","unstructured":"Yang C, Feng S, Li D, Shen H, Wang G, Jiang B (2020) Learning content and context with language bias for visual question answering. In: Proceedings of the IEEE international conference on multimedia and expo (ICME), pp 1\u20136. https:\/\/doi.org\/10.1109\/ICME51207.2021.9428098","DOI":"10.1109\/ICME51207.2021.9428098"},{"key":"6923_CR43","unstructured":"Kervadec C, Antipov G, Baccouche M, Wolf C (2020) Estimating semantic structure for the VQA answer space. arXiv preprint arXiv:2006.05726"},{"key":"6923_CR44","doi-asserted-by":"crossref","unstructured":"Wang M, Qi G, Wang H F, Zheng Q S (2019) Richpedia: a comprehensive multi-modal knowledge graph. In: Joint International Semantic Technology Conference(JIST), pp 130\u2013145","DOI":"10.1007\/978-3-030-41407-8_9"},{"issue":"2","key":"6923_CR45","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltru\u0161aitis","year":"2018","unstructured":"Baltru\u0161aitis T, Ahuja C, Morency LP (2018) Multimodal machine learning: a survey and taxonomy. IEEE Trans Pattern Anal Mach Intell 41(2):423\u2013443","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"6923_CR46","doi-asserted-by":"publisher","first-page":"100159","DOI":"10.1016\/j.bdr.2020.100159","volume":"22","author":"M Wang","year":"2020","unstructured":"Wang M, Wang H, Qi G, Zheng Q (2020) Richpedia: a large-scale, comprehensive multi-modal knowledge graph. Big Data Res 22:100159","journal-title":"Big Data Res"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-022-06923-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-022-06923-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-022-06923-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,14]],"date-time":"2022-05-14T05:38:36Z","timestamp":1652506716000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-022-06923-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,1,25]]},"references-count":46,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2022,6]]}},"alternative-id":["6923"],"URL":"https:\/\/doi.org\/10.1007\/s00521-022-06923-0","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,1,25]]},"assertion":[{"value":"24 June 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 January 2022","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 January 2022","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}