{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,25]],"date-time":"2026-05-25T18:03:22Z","timestamp":1779732202304,"version":"3.53.1"},"reference-count":58,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.neunet.2026.109154","type":"journal-article","created":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T15:50:42Z","timestamp":1779292242000},"page":"109154","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Focus on the essentials: Learning to attend to the most critical information for visual question answering"],"prefix":"10.1016","volume":"203","author":[{"given":"Kun","family":"Zeng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5313-6134","authenticated-orcid":false,"given":"Zhixin","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.109154_bib0001","series-title":"Proceedings of the 2016 Conference on empirical methods in natural language processing","first-page":"1955","article-title":"Analyzing the behavior of visual question answering models","author":"Agrawal","year":"2016"},{"key":"10.1016\/j.neunet.2026.109154_bib0002","series-title":"Proceedings of the IEEE Conference on computer vision and pattern recognition","first-page":"4971","article-title":"Don\u2019t just assume; look and answer: Overcoming priors for visual question answering","author":"Agrawal","year":"2018"},{"key":"10.1016\/j.neunet.2026.109154_bib0003","series-title":"Proceedings of the IEEE Conference on computer vision and pattern recognition","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"10.1016\/j.neunet.2026.109154_bib0004","series-title":"Proceedings of the IEEE International conference on computer vision","first-page":"2425","article-title":"VQA: Visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.neunet.2026.109154_bib0005","series-title":"Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition","first-page":"11671","article-title":"RMLVQA: A margin loss approach for visual question answering with language biases","author":"Basu","year":"2023"},{"key":"10.1016\/j.neunet.2026.109154_bib0006","series-title":"Advances in neural information processing systems","first-page":"841","article-title":"RUBi: Reducing unimodal biases for visual question answering","author":"Cadene","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0007","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111129","article-title":"Enhancing robust VQA via contrastive and self-supervised learning","volume":"159","author":"Cao","year":"2025","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.neunet.2026.109154_bib0008","series-title":"Proceedings of the European conference on computer vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"key":"10.1016\/j.neunet.2026.109154_bib0009","series-title":"Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition","first-page":"10800","article-title":"Counterfactual samples synthesizing for robust visual question answering","author":"Chen","year":"2020"},{"issue":"11","key":"10.1016\/j.neunet.2026.109154_bib0010","first-page":"13218","article-title":"Counterfactual samples synthesizing and training for robust visual question answering","volume":"45","author":"Chen","year":"2023","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.109154_bib0011","series-title":"Proceedings of the European Conference on computer vision","first-page":"95","article-title":"Rethinking data augmentation for robust visual question answering","author":"Chen","year":"2022"},{"key":"10.1016\/j.neunet.2026.109154_bib0012","series-title":"Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition","first-page":"11681","article-title":"Generative bias for robust visual question answering","author":"Cho","year":"2023"},{"key":"10.1016\/j.neunet.2026.109154_bib0013","series-title":"Proceedings of the 2014 Conference on empirical methods in natural language processing","first-page":"1724","article-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation","author":"Cho","year":"2014"},{"key":"10.1016\/j.neunet.2026.109154_bib0014","series-title":"Proceedings of the 2019 Conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing","first-page":"4069","article-title":"Don\u2019T take the easy way out: Ensemble based methods for avoiding known dataset biases","author":"Clark","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0015","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"1554","article-title":"Beyond question-based biases: Assessing multimodal shortcut learning in visual question answering","author":"Dancette","year":"2021"},{"key":"10.1016\/j.neunet.2026.109154_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition","first-page":"4690","article-title":"ArcFace: Additive angular margin loss for deep face recognition","author":"Deng","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0017","series-title":"Proceedings of the 2019 Conference of the North American chapter of the association for computational linguistics: human language technologies","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0018","unstructured":"Dubey, A., Jauhri, A., Pandey, A., Kadian, A., Al-Dahle, A., Letman, A., Mathur, A., Schelten, A., Yang, A., Fan, A. et al. (2024). The Llama 3 herd of models. arXiv e-prints, arXiv: 2407.21783."},{"key":"10.1016\/j.neunet.2026.109154_bib0019","series-title":"Advances in neural information processing systems","first-page":"3197","article-title":"Removing bias in multi-modal classifiers: Regularization by maximizing functional entropies","author":"Gat","year":"2020"},{"key":"10.1016\/j.neunet.2026.109154_bib0020","series-title":"Proceedings of the IEEE Conference on computer vision and pattern recognition","first-page":"6904","article-title":"Making the v in VQA matter: Elevating the role of image understanding in visual question answering","author":"Goyal","year":"2017"},{"key":"10.1016\/j.neunet.2026.109154_bib0021","series-title":"Proceedings of the 33rd ACM International conference on information and knowledge management","first-page":"3767","article-title":"Beyond language bias: Overcoming multimodal shortcut and distribution biases for robust visual question answering","author":"Gu","year":"2024"},{"key":"10.1016\/j.neunet.2026.109154_bib0022","series-title":"Proceedings of the 2006\u202fIEEE Computer society conference on computer vision and pattern recognition","first-page":"1735","article-title":"Dimensionality reduction by learning an invariant mapping","author":"Hadsell","year":"2006"},{"key":"10.1016\/j.neunet.2026.109154_bib0023","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"1584","article-title":"Greedy gradient ensemble for robust visual question answering","author":"Han","year":"2021"},{"issue":"8","key":"10.1016\/j.neunet.2026.109154_bib0024","doi-asserted-by":"crossref","first-page":"9789","DOI":"10.1109\/TPAMI.2023.3240337","article-title":"General greedy de-bias learning","volume":"45","author":"Han","year":"2023","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"10","key":"10.1016\/j.neunet.2026.109154_bib0025","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3673902","article-title":"Hccl: Hierarchical counterfactual contrastive learning for robust visual question answering","volume":"20","author":"Hao","year":"2024","journal-title":"ACM Transactions on Multimedia Computing Communications and Applications"},{"key":"10.1016\/j.neunet.2026.109154_bib0026","doi-asserted-by":"crossref","first-page":"40","DOI":"10.1016\/j.neunet.2022.02.007","article-title":"Possibilistic classification by support vector networks","volume":"149","author":"Hao","year":"2022","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109154_bib0027","unstructured":"Honnibal, M., & Montani, I. (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing. To appear, 7 (1), 411\u2013420."},{"key":"10.1016\/j.neunet.2026.109154_bib0028","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106710","article-title":"Exploring refined dual visual features cross-combination for image captioning","volume":"180","author":"Hu","year":"2024","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109154_bib0029","series-title":"Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition","first-page":"6700","article-title":"GQA: A new dataset for real-world visual reasoning and compositional question answering","author":"Hudson","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0030","series-title":"Proceedings of the IEEE International conference on computer vision","first-page":"1965","article-title":"An analysis of visual question answering algorithms","author":"Kafle","year":"2017"},{"key":"10.1016\/j.neunet.2026.109154_bib0031","series-title":"Advances in neural information processing systems","first-page":"21798","article-title":"Hard negative mixing for contrastive learning","author":"Kalantidis","year":"2020"},{"key":"10.1016\/j.neunet.2026.109154_bib0032","series-title":"2021\u202fIEEE\/CVF Conference on computer vision and pattern recognition","first-page":"2775","article-title":"Roses are red, violets are blue... but should VQA expect them to?","author":"Kervadec","year":"2020"},{"key":"10.1016\/j.neunet.2026.109154_bib0033","series-title":"Proceedings of the 2021 Conference on empirical methods in natural language processing","first-page":"6346","article-title":"Discovering the unknown knowns: Turning implicit knowledge in the dataset into explicit training examples for visual question answering","author":"Kil","year":"2021"},{"key":"10.1016\/j.neunet.2026.109154_bib0034","series-title":"Proceedings of the IEEE\/CVF Winter conference on applications of computer vision","first-page":"3001","article-title":"Efficient counterfactual debiasing for visual question answering","author":"Kolling","year":"2022"},{"key":"10.1016\/j.neunet.2026.109154_bib0035","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: Connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.neunet.2026.109154_bib0036","series-title":"Proceedings of the 2020 Conference on empirical methods in natural language processing","first-page":"3285","article-title":"Learning to contrast the counterfactual samples for robust visual question answering","author":"Liang","year":"2020"},{"issue":"4","key":"10.1016\/j.neunet.2026.109154_bib0037","first-page":"1","article-title":"Answer questions with right image regions: A visual attention regularization approach","volume":"18","author":"Liu","year":"2022","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications"},{"key":"10.1016\/j.neunet.2026.109154_bib0038","series-title":"Proceedings of the china national conference on Chinese computational linguistics","first-page":"471","article-title":"A robustly optimized BERT pre-training approach with post-training","author":"Liu","year":"2021"},{"key":"10.1016\/j.neunet.2026.109154_bib0039","series-title":"Advances in neural information processing systems","first-page":"16292","article-title":"Introspective distillation for robust question answering","author":"Niu","year":"2021"},{"key":"10.1016\/j.neunet.2026.109154_bib0040","doi-asserted-by":"crossref","first-page":"3405","DOI":"10.1109\/TMM.2021.3097502","article-title":"Suppressing biased samples for robust VQA","volume":"24","author":"Ouyang","year":"2021","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109154_bib0041","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106560","article-title":"Robust visual question answering via polarity enhancement and contrast","volume":"179","author":"Peng","year":"2024","journal-title":"Neural Networks"},{"key":"10.1016\/j.neunet.2026.109154_bib0042","series-title":"Proceedings of the 2014 Conference on empirical methods in natural language processing","first-page":"1532","article-title":"Glove: Global vectors for word representation","author":"Pennington","year":"2014"},{"key":"10.1016\/j.neunet.2026.109154_bib0043","series-title":"Proceedings of the IEEE International conference on multimedia and expo","first-page":"1","article-title":"RankVQA: Answer re-ranking for visual question answering","author":"Qiao","year":"2020"},{"key":"10.1016\/j.neunet.2026.109154_bib0044","series-title":"Advances in neural information processing systems","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"Ren","year":"2015"},{"key":"10.1016\/j.neunet.2026.109154_bib0045","doi-asserted-by":"crossref","unstructured":"R\u00f6sch, P. J., Oswald, N., Geierhos, M., & Libovick\u1ef3, J. (2024). Enhancing conceptual understanding in multimodal contrastive learning through hard negative samples. In Proceedings of the 3rd Workshop on Advances in Language and Vision Research (pp. 102\u2013115).","DOI":"10.18653\/v1\/2024.alvr-1.9"},{"key":"10.1016\/j.neunet.2026.109154_bib0046","series-title":"Proceedings of the IEEE\/CVF International conference on computer vision","first-page":"2591","article-title":"Taking a hint: Leveraging explanations to make vision and language models more grounded","author":"Selvaraju","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0047","series-title":"Proceedings of the 59th Annual meeting of the association for computational linguistics and the 11th International joint conference on natural language processing","first-page":"4101","article-title":"Check it again: Progressive visual question answering via visual entailment","author":"Si","year":"2021"},{"key":"10.1016\/j.neunet.2026.109154_bib0048","series-title":"Proceedings of the conference on empirical methods in natural language processing","first-page":"6650","article-title":"Towards robust visual question answering: Making the most of biased samples via contrastive learning","author":"Si","year":"2022"},{"key":"10.1016\/j.neunet.2026.109154_bib0049","series-title":"Proceedings of the 2019 Conference on empirical methods in natural language processing","first-page":"5100","article-title":"LXMERT: Learning cross-modality encoder representations from transformers","author":"Tan","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0050","doi-asserted-by":"crossref","first-page":"8609","DOI":"10.1109\/TMM.2024.3380259","article-title":"Cross modality bias in visual question answering: A causal view with possible worlds VQA","volume":"26","author":"Vosoughi","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.neunet.2026.109154_bib0051","series-title":"Advances in neural information processing systems","first-page":"5776","article-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","author":"Wang","year":"2020"},{"key":"10.1016\/j.neunet.2026.109154_bib0052","series-title":"Advances in neural information processing systems","first-page":"3784","article-title":"Debiased visual question answering from feature and sample perspectives","author":"Wen","year":"2021"},{"key":"10.1016\/j.neunet.2026.109154_bib0053","series-title":"Advances in neural information processing systems","first-page":"1","article-title":"Self-critical reasoning for robust visual question answering","author":"Wu","year":"2019"},{"key":"10.1016\/j.neunet.2026.109154_bib0054","unstructured":"Yang, A., Li, A., Yang, B., Zhang, B., Hui, B., Zheng, B., Yu, B., Gao, C., Huang, C., Lv, C. et al. (2025). Qwen3 technical report. arXiv: 2505.09388."},{"key":"10.1016\/j.neunet.2026.109154_bib0055","series-title":"Proceedings of the IEEE Conference on computer vision and pattern recognition","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"key":"10.1016\/j.neunet.2026.109154_bib0056","series-title":"Proceedings of the IEEE Conference on computer vision and pattern recognition","first-page":"5014","article-title":"Yin and Yang: Balancing and answering binary visual questions","author":"Zhang","year":"2016"},{"key":"10.1016\/j.neunet.2026.109154_bib0057","doi-asserted-by":"crossref","first-page":"85980","DOI":"10.1109\/ACCESS.2023.3304415","article-title":"Overcoming language priors via shuffling language bias for robust visual question answering","volume":"11","author":"Zhao","year":"2023","journal-title":"IEEE Access"},{"key":"10.1016\/j.neunet.2026.109154_bib0058","series-title":"Proceedings of the 33rd ACM International conference on information and knowledge management","first-page":"3602","article-title":"MV-BART: Multi-view BART for multi-modal sarcasm detection","author":"Zhuang","year":"2024"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006155?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026006155?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,25]],"date-time":"2026-05-25T17:12:52Z","timestamp":1779729172000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026006155"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":58,"alternative-id":["S0893608026006155"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109154","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Focus on the essentials: Learning to attend to the most critical information for visual question answering","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.109154","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"109154"}}