{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:28:29Z","timestamp":1771698509370,"version":"3.50.1"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"9-10","license":[{"start":{"date-parts":[[2022,6,16]],"date-time":"2022-06-16T00:00:00Z","timestamp":1655337600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,6,16]],"date-time":"2022-06-16T00:00:00Z","timestamp":1655337600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1911401"],"award-info":[{"award-number":["U1911401"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Project of Science and Technology Innovation 2030 supported by the Ministry of Science and Technology of China","award":["ZDI135-96"],"award-info":[{"award-number":["ZDI135-96"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2022,9]]},"DOI":"10.1007\/s00371-022-02524-z","type":"journal-article","created":{"date-parts":[[2022,6,16]],"date-time":"2022-06-16T15:52:12Z","timestamp":1655394732000},"page":"3097-3108","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":22,"title":["SPCA-Net: a based on spatial position relationship co-attention network for visual question answering"],"prefix":"10.1007","volume":"38","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8615-139X","authenticated-orcid":false,"given":"Feng","family":"Yan","sequence":"first","affiliation":[]},{"given":"Wushouer","family":"Silamu","sequence":"additional","affiliation":[]},{"given":"Yanbin","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yachuang","family":"Chai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,6,16]]},"reference":[{"issue":"4","key":"2524_CR1","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1109\/TPAMI.2016.2599174","volume":"39","author":"J Donahue","year":"2017","unstructured":"Donahue, J., Anne Hendricks, L., Gua-Darrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. IEEE Trans. Pattern Anal. Mach. Intell. 39(4), 677\u2013691 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2524_CR2","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A.C., Salakhutdinov, R., Zemel, R.S., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: Proceedings of ICML, Lille, FR, pp. 2048\u20132057 (2015)"},{"key":"2524_CR3","doi-asserted-by":"crossref","unstructured":"Nam, H., Ha, J.-W., Kim, J.: Dual attention networks for multimodal reasoning and matching. In: Proceedings of CVPR, Honolulu, HI, UT, pp. 2156\u20132164 (2017)","DOI":"10.1109\/CVPR.2017.232"},{"key":"2524_CR4","doi-asserted-by":"crossref","unstructured":"He, K.M., Zhang, X.Y., Ren, S.Q., Sun, J.: Deep residual learning for image recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 
770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2524_CR5","unstructured":"Sun, S.Y., Pang, J.M., Shi, J.P., Yi, S., Ouyang, W.L.: Fishnet: a versatile backbone for image, region, and pixel-level prediction. In: Advances in Neural Information Processing Systems, pp. 760\u2013770 (2018)"},{"key":"2524_CR6","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"2524_CR7","doi-asserted-by":"crossref","unstructured":"Gao, Y., Beijbom, O., Zhang, N., Darrell, T.: Compact bilinear pooling. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 317\u2013326 (2016)","DOI":"10.1109\/CVPR.2016.41"},{"key":"2524_CR8","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"2524_CR9","doi-asserted-by":"crossref","unstructured":"Nguyen, D.-K., Okatani, T.: Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. In: Proceedings of CVPR, Salt Lake City, UT, pp. 6087\u20136096 (2018)","DOI":"10.1109\/CVPR.2018.00637"},{"key":"2524_CR10","unstructured":"Kim, J.-H., Jun, J., Zhang, B.-T.: Bilinear attention networks. In: Advances in Neural Information Processing Systems (NIPS) (2018)"},{"key":"2524_CR11","doi-asserted-by":"crossref","unstructured":"Gao, P., Li, H., You, H., Jiang, Z., Lu, P., Hoi, S.C.H., Wang, X.: Dynamic fusion with intra- and inter-modality attention flow for visual question answering. In: Proceedings of CVPR, Long Beach, CA, pp. 6639\u20136648 (2019)","DOI":"10.1109\/CVPR.2019.00680"},{"key":"2524_CR12","unstructured":"Gao, P., You, H., Zhang, Z., Wang, X., Li, H.: Multi-modality latent interaction network for visual question answering. Available: https:\/\/arxiv.org\/abs\/1908.04289 (2019)"},{"key":"2524_CR13","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Cui, Y., Tao, D., Tian, Q.: Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6281\u20136290 (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"2524_CR14","unstructured":"Yu, Z., Cui, Y., Yu, J., Tao, D., Tian, Q.: Multimodal unified attention networks for vision-and-language interactions. Available: https:\/\/arxiv.org\/abs\/1908.04107 (2019)"},{"key":"2524_CR15","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 6000\u20136010 (2017)"},{"key":"2524_CR16","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39, 1137\u20131149 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"2524_CR17","doi-asserted-by":"crossref","unstructured":"Xu, H., Saenko, K.: Ask, attend and answer: exploring question-guided spatial attention for visual question answering. In: Computer Vision\u2014ECCV, vol. 2016, pp. 451\u2013466. Springer International Publishing (2016)","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"2524_CR18","doi-asserted-by":"crossref","unstructured":"Sun, Q., Fu, Y.: Stacked self-attention networks for visual question answering. In: Proceedings of the 2019 on International Conference on Multimedia Retrieval. ACM (2019)","DOI":"10.1145\/3323873.3325044"},{"key":"2524_CR19","unstructured":"Chowdhury, M.I.H., Nguyen, K., Sridharan, S., Fookes, C.: Hierarchical relational attention for video question answering. In: 2018 25th IEEE International Conference on Image Processing (ICIP). IEEE (2018)"},{"key":"2524_CR20","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv e-prints arXiv:1810.04805 (2018)"},{"key":"2524_CR21","doi-asserted-by":"crossref","unstructured":"Yu, D., Fu, J., Mei, T., Rui, Y.: Multi-level attention networks for visual question answering. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2017)","DOI":"10.1109\/CVPR.2017.446"},{"key":"2524_CR22","unstructured":"Kim, J., On, K., Kim, J., Ha, J., Zhang, B.: Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1609.02907, 2016-10-14 (2016)"},{"key":"2524_CR23","doi-asserted-by":"crossref","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 13041\u201313049, 2020-01-01 (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"2524_CR24","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: LXMERT: learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). Association for Computational Linguistics (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"2524_CR25","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: NeurIPS (2019)"},{"key":"2524_CR26","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C., Chang, K.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"2524_CR27","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: VQA: visual question answering. In: International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"2524_CR28","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, CA, USA, June, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2524_CR29","doi-asserted-by":"crossref","unstructured":"Wu, Q., Wang, P., Shen, C., Dick, A., Van\u00a0Den\u00a0Hengel, A.: Ask me anything: free-form visual question answering based on knowledge from external sources. 
In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2016)","DOI":"10.1109\/CVPR.2016.500"},{"key":"2524_CR30","doi-asserted-by":"crossref","unstructured":"Wang, P., Wu, Q., Shen, C., Dick, A., van\u00a0den\u00a0Hengel, A.: Explicit knowledge-based reasoning for visual question answering. In: Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence. International Joint Conferences on Artificial Intelligence Organization (2017)","DOI":"10.24963\/ijcai.2017\/179"},{"key":"2524_CR31","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2018","unstructured":"Wang, P., Wu, Q., Shen, C., Dick, A., van den Hengel, A.: FVQA: fact-based visual question answering. IEEE Trans. Pattern Anal. Mach. Intell. 40, 2413\u20132427 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2524_CR32","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2018","unstructured":"Wu, Q., Shen, C., Wang, P., Dick, A., van den Hengel, A.: Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans. Pattern Anal. Mach. Intell. 40, 1367\u20131381 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2524_CR33","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L., Shamma, D.A., Bernstein, M.S., Fei-Fei, L.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"2524_CR34","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"2524_CR35","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: 2017 IEEE International Conference on Computer Vision (ICCV). IEEE (2017)","DOI":"10.1109\/ICCV.2017.202"},{"issue":"12","key":"2524_CR36","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/TNNLS.2018.2817340","volume":"29","author":"Z Yu","year":"2018","unstructured":"Yu, Z., Yu, J., Xiang, C., Fan, J., Tao, D.: Beyond bilinear: generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans. Neural Netw. Learn. Syst. 29(12), 5947\u20135959 (2018)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"2524_CR37","doi-asserted-by":"crossref","unstructured":"Cadene, R., Ben-Younes, H., Cord, M., Thome, N.: Murel: multimodal relational reasoning for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1989\u20131998 (2019)","DOI":"10.1109\/CVPR.2019.00209"},{"key":"2524_CR38","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1109\/TPAMI.2020.3004830","volume":"44","author":"L Peng","year":"2020","unstructured":"Peng, L., Yang, Y., Wang, Z., Huang, Z., Shen, H.T.: MRA-NET: improving VQA via multi-modal relation attention network. IEEE Trans. Pattern Anal. Mach. Intell. 44, 318\u2013329 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"issue":"3","key":"2524_CR39","doi-asserted-by":"publisher","first-page":"1045","DOI":"10.3390\/s22031045","volume":"22","author":"F Yan","year":"2022","unstructured":"Yan, F., Silamu, W., Li, Y.: Deep modular bilinear attention network for visual question answering. Sensors 22(3), 1045 (2022). https:\/\/doi.org\/10.3390\/s22031045","journal-title":"Sensors"},{"key":"2524_CR40","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, A., Darrell, T., Saenko, K.: Language-conditioned graph networks for relational reasoning. In: Proceedings of the 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Long Beach, CA, USA, June, pp. 10294\u201310303 (2019)","DOI":"10.1109\/ICCV.2019.01039"},{"key":"2524_CR41","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wang, K., Yu, M., et\u00a0al.: Interpretable visual reasoning via induced symbolic space. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, Seattle, WA, USA, June, pp. 1878\u20131887 (2020)","DOI":"10.1109\/ICCV48922.2021.00189"},{"key":"2524_CR42","first-page":"1","volume":"2022","author":"P Zhang","year":"2022","unstructured":"Zhang, P., Lan, H., Khan, M.A.: Multiple context learning networks for visual question answering. Sci. Program. 2022, 1\u201311 (2022)","journal-title":"Sci. Program."},{"key":"2524_CR43","doi-asserted-by":"publisher","first-page":"106639","DOI":"10.1016\/j.knosys.2020.106639","volume":"212","author":"W Zhang","year":"2021","unstructured":"Zhang, W., Yu, J., Wang, Y., et al.: Multimodal deep fusion for image question answering. Knowl. Based Syst. 212, 106639 (2021)","journal-title":"Knowl. Based Syst."}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-022-02524-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-022-02524-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-022-02524-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,2,8]],"date-time":"2023-02-08T16:17:16Z","timestamp":1675873036000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-022-02524-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,16]]},"references-count":43,"journal-issue":{"issue":"9-10","published-print":{"date-parts":[[2022,9]]}},"alternative-id":["2524"],"URL":"https:\/\/doi.org\/10.1007\/s00371-022-02524-z","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,6,16]]},"assertion":[{"value":"10 May 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 June 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}