{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T20:52:14Z","timestamp":1743108734870,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":31,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819785018"},{"type":"electronic","value":"9789819785025"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8502-5_34","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:03:04Z","timestamp":1730383384000},"page":"481-495","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["BIVL-Net: Bidirectional Vision-Language Guidance for\u00a0Visual Question Answering"],"prefix":"10.1007","author":[{"given":"Cong","family":"Han","sequence":"first","affiliation":[]},{"given":"Feifei","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"34_CR1","doi-asserted-by":"crossref","unstructured":"Abbasnejad, E., Teney, D., Parvaneh, A., Shi, J., Hengel, A.v.d.: Counterfactual vision and language learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10044\u201310054 (2020)","DOI":"10.1109\/CVPR42600.2020.01006"},{"key":"34_CR2","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Batra, D., Parikh, D., Kembhavi, A.: Don\u2019t just assume; look and answer: overcoming priors for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4971\u20134980 (2018)","DOI":"10.1109\/CVPR.2018.00522"},{"key":"34_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"34_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: visual question answering. In: Proceedings of the IEEE International on Conference Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"34_CR5","doi-asserted-by":"crossref","unstructured":"Basu, A., Addepalli, S., Babu, R.V.: Rmlvqa: a margin loss approach for visual question answering with language biases. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11671\u201311680 (2023)","DOI":"10.1109\/CVPR52729.2023.01123"},{"key":"34_CR6","doi-asserted-by":"crossref","unstructured":"Ben-Younes, H., Cadene, R., Thome, N., Cord, M.: Block: bilinear superdiagonal fusion for visual question answering and visual relationship detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 8102\u20138109 (2019)","DOI":"10.1609\/aaai.v33i01.33018102"},{"key":"34_CR7","doi-asserted-by":"crossref","unstructured":"Cadene, R., Ben-Younes, H., Cord, M., Thome, N.: Murel: multimodal relational reasoning for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1989\u20131998 (2019)","DOI":"10.1109\/CVPR.2019.00209"},{"key":"34_CR8","unstructured":"Cadene, R., Dancette, C., Cord, M., Parikh, D., et\u00a0al.: Rubi: reducing unimodal biases for visual question answering. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"34_CR9","doi-asserted-by":"crossref","unstructured":"Chen, L., Yan, X., Xiao, J., Zhang, H., Pu, S., Zhuang, Y.: Counterfactual samples synthesizing for robust visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10800\u201310809 (2020)","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"34_CR10","doi-asserted-by":"crossref","unstructured":"Cho, J.W., Kim, D.J., Ryu, H., Kweon, I.S.: Generative bias for robust visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11681\u201311690 (2023)","DOI":"10.1109\/CVPR52729.2023.01124"},{"key":"34_CR11","doi-asserted-by":"crossref","unstructured":"Chua, T.S., Tang, J., Hong, R., Li, H., Luo, Z., Zheng, Y.: Nus-wide: a real-world web image database from national university of singapore. In: Proceedings of the ACM International Conference on Image and Video Retrieval, pp.\u00a01\u20139 (2009)","DOI":"10.1145\/1646396.1646452"},{"key":"34_CR12","doi-asserted-by":"crossref","unstructured":"Clark, C., Yatskar, M., Zettlemoyer, L.: Don\u2019t take the easy way out: ensemble based methods for avoiding known dataset biases (2019). arXiv:1909.03683","DOI":"10.18653\/v1\/D19-1418"},{"key":"34_CR13","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding (2016). arXiv:1606.01847","DOI":"10.18653\/v1\/D16-1044"},{"key":"34_CR14","doi-asserted-by":"crossref","unstructured":"Gokhale, T., Banerjee, P., Baral, C., Yang, Y.: Mutant: a training paradigm for out-of-distribution generalization in visual question answering (2020). arXiv:2009.08566","DOI":"10.18653\/v1\/2020.emnlp-main.63"},{"key":"34_CR15","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in vqa matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"34_CR16","doi-asserted-by":"crossref","unstructured":"Guo, Y., Nie, L., Cheng, Z., Ji, F., Zhang, J., Del\u00a0Bimbo, A.: Adavqa: overcoming language priors with adapted margin cosine loss (2021). arXiv:2105.01993","DOI":"10.24963\/ijcai.2021\/98"},{"key":"34_CR17","doi-asserted-by":"crossref","unstructured":"Han, X., Wang, S., Su, C., Huang, Q., Tian, Q.: Greedy gradient ensemble for robust visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1584\u20131593 (2021)","DOI":"10.1109\/ICCV48922.2021.00161"},{"key":"34_CR18","doi-asserted-by":"crossref","unstructured":"Jing, C., Wu, Y., Zhang, X., Jia, Y., Wu, Q.: Overcoming language priors in vqa via decomposed linguistic representations. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11181\u201311188 (2020)","DOI":"10.1609\/aaai.v34i07.6776"},{"key":"34_CR19","doi-asserted-by":"crossref","unstructured":"Kv, G., Mittal, A.: Reducing language biases in visual question answering with visually-grounded question encoder. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XIII 16, pp. 18\u201334. Springer (2020)","DOI":"10.1007\/978-3-030-58601-0_2"},{"key":"34_CR20","doi-asserted-by":"crossref","unstructured":"Liang, Z., Jiang, W., Hu, H., Zhu, J.: Learning to contrast the counterfactual samples for robust visual question answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 3285\u20133292 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.265"},{"key":"34_CR21","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"34_CR22","doi-asserted-by":"crossref","unstructured":"Niu, Y., Tang, K., Zhang, H., Lu, Z., Hua, X.S., Wen, J.R.: Counterfactual vqa: a cause-effect look at language bias. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12700\u201312710 (2021)","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"34_CR23","first-page":"16292","volume":"34","author":"Y Niu","year":"2021","unstructured":"Niu, Y., Zhang, H.: Introspective distillation for robust question answering. Adv. Neural. Inf. Process. Syst. 34, 16292\u201316304 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"34_CR24","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"34_CR25","unstructured":"Ramakrishnan, S., Agrawal, A., Lee, S.: Overcoming language priors in visual question answering with adversarial regularization. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"issue":"11","key":"34_CR26","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"34_CR27","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Lee, S., Shen, Y., Jin, H., Ghosh, S., Heck, L., Batra, D., Parikh, D.: Taking a hint: Leveraging explanations to make vision and language models more grounded. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2591\u20132600 (2019)","DOI":"10.1109\/ICCV.2019.00268"},{"key":"34_CR28","doi-asserted-by":"crossref","unstructured":"Teney, D., Anderson, P., He, X., Van Den\u00a0Hengel, A.: Tips and tricks for visual question answering: learnings from the 2017 challenge. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4223\u20134232 (2018)","DOI":"10.1109\/CVPR.2018.00444"},{"key":"34_CR29","first-page":"3784","volume":"34","author":"Z Wen","year":"2021","unstructured":"Wen, Z., Xu, G., Tan, M., Wu, Q., Wu, Q.: Debiased visual question answering from feature and sample perspectives. Adv. Neural. Inf. Process. Syst. 34, 3784\u20133796 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"34_CR30","unstructured":"Wu, J., Mooney, R.: Self-critical reasoning for robust visual question answering. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"34_CR31","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8502-5_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:22:16Z","timestamp":1730384536000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8502-5_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819785018","9789819785025"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8502-5_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}