{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:24:05Z","timestamp":1772907845415,"version":"3.50.1"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2022,11,10]],"date-time":"2022-11-10T00:00:00Z","timestamp":1668038400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,11,10]],"date-time":"2022-11-10T00:00:00Z","timestamp":1668038400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"national natural science foundation of china","doi-asserted-by":"publisher","award":["81860318"],"award-info":[{"award-number":["81860318"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"national natural science foundation of china","doi-asserted-by":"publisher","award":["81560296"],"award-info":[{"award-number":["81560296"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2023,5]]},"DOI":"10.1007\/s11042-022-14167-2","type":"journal-article","created":{"date-parts":[[2022,11,10]],"date-time":"2022-11-10T15:04:31Z","timestamp":1668092671000},"page":"16343-16358","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Overcoming language priors with self-contrastive learning for visual question 
answering"],"prefix":"10.1007","volume":"82","author":[{"given":"Hong","family":"Yan","sequence":"first","affiliation":[]},{"given":"Lijun","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xupeng","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Qingsong","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,10]]},"reference":[{"key":"14167_CR1","doi-asserted-by":"publisher","unstructured":"Agrawal A, Batra D, Parikh D (2016) Analyzing the behavior of visual question answering models. In: Proceedings of the 2016 conference on empirical methods in natural language processing. Association for Computational Linguistics, Austin, pp 1955\u20131960, DOI https:\/\/doi.org\/10.18653\/v1\/D16-1203","DOI":"10.18653\/v1\/D16-1203"},{"key":"14167_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal A, Batra D, Parikh D, Kembhavi A (2018) Don\u2019t just assume; look and answer: Overcoming priors for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4971\u20134980","DOI":"10.1109\/CVPR.2018.00522"},{"key":"14167_CR3","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"14167_CR4","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick CL, Parikh D (2015) Vqa: Visual question answering. 
In: Proceedings of the IEEE international conference on computer vision, pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"14167_CR5","doi-asserted-by":"crossref","unstructured":"Cadene R, Ben-Younes H, Cord M, Thome N (2019a) Murel: Multimodal relational reasoning for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1989\u20131998","DOI":"10.1109\/CVPR.2019.00209"},{"key":"14167_CR6","unstructured":"Cadene R, Dancette C, Cord M, Parikh D et al (2019b) Rubi: Reducing unimodal biases for visual question answering. Advances in Neural Information Processing Systems 32"},{"key":"14167_CR7","doi-asserted-by":"crossref","unstructured":"Chen L, Yan X, Xiao J, Zhang H, Pu S, Zhuang Y (2020) Counterfactual samples synthesizing for robust visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10800\u201310809","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"14167_CR8","doi-asserted-by":"publisher","unstructured":"Cho K, van Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP). Association for Computational Linguistics, Doha, pp 1724\u20131734, DOI https:\/\/doi.org\/10.3115\/v1\/D14-1179, (to appear in print)","DOI":"10.3115\/v1\/D14-1179"},{"key":"14167_CR9","doi-asserted-by":"publisher","unstructured":"Clark C, Yatskar M, Zettlemoyer L (2019) Don\u2019t take the easy way out: Ensemble based methods for avoiding known dataset biases. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP). 
Association for Computational Linguistics, China, pp 4069\u20134082, DOI https:\/\/doi.org\/10.18653\/v1\/D19-1418, (to appear in print)","DOI":"10.18653\/v1\/D19-1418"},{"key":"14167_CR10","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1016\/j.cviu.2017.10.001","volume":"163","author":"A Das","year":"2017","unstructured":"Das A, Agrawal H, Zitnick L, Parikh D, Batra D (2017) Human attention in visual question answering: Do humans and deep networks look at the same regions? Comput Vis Image Underst 163:90\u2013100","journal-title":"Comput Vis Image Underst"},{"key":"14167_CR11","doi-asserted-by":"publisher","unstructured":"Fukui A, Park D H, Yang D, Rohrbach A, Darrell T, Rohrbach M (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Proceedings of the 2016 conference on empirical methods in natural language processing. Texas, Association for Computational Linguistics, pp 457\u2013468, DOI https:\/\/doi.org\/10.18653\/v1\/D16-1044, (to appear in print)","DOI":"10.18653\/v1\/D16-1044"},{"key":"14167_CR12","first-page":"3197","volume":"33","author":"I Gat","year":"2020","unstructured":"Gat I, Schwartz I, Schwing A, Hazan T (2020) Removing bias in multi-modal classifiers: Regularization by maximizing functional entropies. Adv Neural Inf Process Syst 33:3197\u20133208","journal-title":"Adv Neural Inf Process Syst"},{"key":"14167_CR13","doi-asserted-by":"crossref","unstructured":"Goyal Y, Khot T, Summers-Stay D, Batra D, Parikh D (2017) Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6904\u20136913","DOI":"10.1109\/CVPR.2017.670"},{"key":"14167_CR14","doi-asserted-by":"publisher","unstructured":"Grand G, Belinkov Y (2019) Adversarial regularization for visual question answering: Strengths, shortcomings, and side effects. 
In: Proceedings of the second workshop on shortcomings in vision and language. Minnesota, Association for Computational Linguistics, pp 1\u201313, DOI https:\/\/doi.org\/10.18653\/v1\/W19-1801, (to appear in print)","DOI":"10.18653\/v1\/W19-1801"},{"key":"14167_CR15","doi-asserted-by":"publisher","first-page":"6730","DOI":"10.1109\/TIP.2021.3097180","volume":"30","author":"W Guo","year":"2021","unstructured":"Guo W, Zhang Y, Yang J, Yuan X (2021a) Re-attention for visual question answering. IEEE Trans Image Process 30:6730\u20136743","journal-title":"IEEE Trans Image Process"},{"key":"14167_CR16","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1109\/TIP.2021.3128322","volume":"31","author":"Y Guo","year":"2021","unstructured":"Guo Y, Nie L, Cheng Z, Tian Q, Zhang M (2021b) Loss re-scaling vqa: Revisiting the language prior problem from a class-imbalance view. IEEE Trans Image Process 31:227\u2013238","journal-title":"IEEE Trans Image Process"},{"key":"14167_CR17","doi-asserted-by":"crossref","unstructured":"Jing C, Wu Y, Zhang X, Jia Y, Wu Q (2020) Overcoming language priors in vqa via decomposed linguistic representations. In: Proceedings of the AAAI conference on artificial intelligence, vol 34, pp 11181\u201311188","DOI":"10.1609\/aaai.v34i07.6776"},{"key":"14167_CR18","doi-asserted-by":"crossref","unstructured":"Kafle K, Kanan C (2017) An analysis of visual question answering algorithms. In: Proceedings of the IEEE international conference on computer vision, pp 1965\u20131973","DOI":"10.1109\/ICCV.2017.217"},{"key":"14167_CR19","unstructured":"Kingma DP, Ba J (2015) Adam: A method for stochastic optimization. In: Bengio Y, LeCun Y (eds) 3rd international conference on learning representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings. 
arXiv:1412.6980"},{"key":"14167_CR20","doi-asserted-by":"crossref","unstructured":"KV G, Mittal A (2020) Reducing language biases in visual question answering with visually-grounded question encoder. In: European Conference on computer vision. Springer, pp 18\u201334","DOI":"10.1007\/978-3-030-58601-0_2"},{"key":"14167_CR21","doi-asserted-by":"crossref","unstructured":"Li L, Gan Z, Cheng Y, Liu J (2019) Relation-aware graph attention network for visual question answering. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10313\u201310322","DOI":"10.1109\/ICCV.2019.01041"},{"key":"14167_CR22","doi-asserted-by":"crossref","unstructured":"Liu C, Mao J, Sha F, Yuille A (2017) Attention correctness in neural image captioning. In: Proceedings of the AAAI conference on artificial intelligence, vol 31","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"14167_CR23","doi-asserted-by":"crossref","unstructured":"Malinowski M, Doersch C, Santoro A, Battaglia P (2018) Learning visual question answering by bootstrapping hard attention. In: Proceedings of the European conference on computer vision (ECCV), pp 3\u201320","DOI":"10.1007\/978-3-030-01231-1_1"},{"key":"14167_CR24","doi-asserted-by":"crossref","unstructured":"Niu Y, Tang K, Zhang H, Lu Z, Hua XS, Wen JR (2021) Counterfactual vqa: A cause-effect look at language bias. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 12700\u201312710","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"14167_CR25","doi-asserted-by":"crossref","unstructured":"Park DH, Hendricks LA, Akata Z, Rohrbach A, Schiele B, Darrell T, Rohrbach M (2018) Multimodal explanations: Justifying decisions and pointing to the evidence. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8779\u20138788","DOI":"10.1109\/CVPR.2018.00915"},{"issue":"3","key":"14167_CR26","doi-asserted-by":"publisher","first-page":"3843","DOI":"10.1007\/s11042-018-6389-3","volume":"78","author":"L Peng","year":"2019","unstructured":"Peng L, Yang Y, Bin Y, Xie N, Shen F, Ji Y, Xu X (2019) Word-to-region attention network for visual question answering. Multimed Tools Appl 78(3):3843\u20133858","journal-title":"Multimed Tools Appl"},{"key":"14167_CR27","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: Global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"14167_CR28","unstructured":"Ramakrishnan S, Agrawal A, Lee S (2018) Overcoming language priors in visual question answering with adversarial regularization. Adv Neural Inf Process Syst 31"},{"key":"14167_CR29","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in Neural Information Processing Systems 28"},{"key":"14167_CR30","doi-asserted-by":"crossref","unstructured":"Selvaraju RR, Lee S, Shen Y, Jin H, Ghosh S, Heck L, Batra D, Parikh D (2019) Taking a hint: Leveraging explanations to make vision and language models more grounded. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2591\u20132600","DOI":"10.1109\/ICCV.2019.00268"},{"key":"14167_CR31","doi-asserted-by":"crossref","unstructured":"Shrestha R, Kafle K, Kanan C (2019) Answer them all! toward universal visual question answering models. 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10472\u201310481","DOI":"10.1109\/CVPR.2019.01072"},{"key":"14167_CR32","doi-asserted-by":"publisher","unstructured":"Shrestha R, Kafle K, Kanan C (2020) A negative case analysis of visual grounding methods for VQA. In: Proceedings of the 58th annual meeting of the association for computational linguistics, Online. Association for Computational Linguistics, pp 8172\u20138181, DOI https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.727, (to appear in print)","DOI":"10.18653\/v1\/2020.acl-main.727"},{"key":"14167_CR33","doi-asserted-by":"crossref","unstructured":"Teney D, Abbasnedjad E, Hengel AVD (2020a) Learning what makes a difference from counterfactual examples and gradient supervision. In: European conference on computer vision. Springer, pp 580\u2013599","DOI":"10.1007\/978-3-030-58607-2_34"},{"key":"14167_CR34","first-page":"407","volume":"33","author":"D Teney","year":"2020","unstructured":"Teney D, Abbasnejad E, Kafle K, Shrestha R, Kanan C, Van Den Hengel A (2020b) On the value of out-of-distribution testing: an example of goodhart\u2019s law. Adv Neural Inf Process Syst 33:407\u2013 417","journal-title":"Adv Neural Inf Process Syst"},{"key":"14167_CR35","doi-asserted-by":"crossref","unstructured":"Teney D, Abbasnejad E, van den Hengel A (2021) Unshuffling data for improved generalization in visual question answering. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 1417\u20131427","DOI":"10.1109\/ICCV48922.2021.00145"},{"issue":"3","key":"14167_CR36","doi-asserted-by":"publisher","first-page":"2921","DOI":"10.1007\/s11042-018-6097-z","volume":"78","author":"AS Toor","year":"2019","unstructured":"Toor AS, Wechsler H, Nappi M (2019) Question action relevance and editing for visual question answering. 
Multimed Tools Appl 78(3):2921\u20132935","journal-title":"Multimed Tools Appl"},{"key":"14167_CR37","unstructured":"Wu J, Mooney R (2019) Self-critical reasoning for robust visual question answering. Adv Neural Inf Process Syst 32"},{"key":"14167_CR38","doi-asserted-by":"crossref","unstructured":"Yang Z, He X, Gao J, Deng L, Smola A (2016) Stacked attention networks for image question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 21\u201329","DOI":"10.1109\/CVPR.2016.10"},{"key":"14167_CR39","doi-asserted-by":"publisher","unstructured":"Zhu X, Mao Z, Liu C, Zhang P, Wang B, Zhang Y (2020) Overcoming language priors with self-supervised learning for visual question answering. In: Bessiere C (ed) Proceedings of the twenty-ninth international joint conference on artificial intelligence, IJCAI 2020, ijcai.org, pp 1083\u20131089, DOI https:\/\/doi.org\/10.24963\/ijcai.2020\/151, (to appear in print)","DOI":"10.24963\/ijcai.2020\/151"},{"issue":"11","key":"14167_CR40","doi-asserted-by":"publisher","first-page":"16247","DOI":"10.1007\/s11042-020-08790-0","volume":"80","author":"X Zhu","year":"2021","unstructured":"Zhu X, Mao Z, Chen Z, Li Y, Wang Z, Wang B (2021) Object-difference drived graph convolutional networks for visual question answering. 
Multimed Tools Appl 80(11):16247\u201316265","journal-title":"Multimed Tools Appl"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-14167-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-022-14167-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-14167-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,15]],"date-time":"2023-04-15T09:31:37Z","timestamp":1681551097000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-022-14167-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11,10]]},"references-count":40,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2023,5]]}},"alternative-id":["14167"],"URL":"https:\/\/doi.org\/10.1007\/s11042-022-14167-2","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,11,10]]},"assertion":[{"value":"6 July 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 May 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 November 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article 
History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interests"}}]}}