{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:43:35Z","timestamp":1740123815696,"version":"3.37.3"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2021,11,25]],"date-time":"2021-11-25T00:00:00Z","timestamp":1637798400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,11,25]],"date-time":"2021-11-25T00:00:00Z","timestamp":1637798400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61672246"],"award-info":[{"award-number":["61672246"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100013139","name":"Humanities and Social Science Fund of Ministry of Education of China","doi-asserted-by":"crossref","award":["21YJC870002"],"award-info":[{"award-number":["21YJC870002"]}],"id":[{"id":"10.13039\/501100013139","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61272068"],"award-info":[{"award-number":["61272068"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["61672254"],"award-info":[{"award-number":["61672254"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Program for Hust Academic Frontier Youth Team"},{"DOI":"10.13039\/501100003819","name":"Natural Science Foundation of Hubei Province","doi-asserted-by":"publisher","award":["2020CFB492"],"award-info":[{"award-number":["2020CFB492"]}],"id":[{"id":"10.13039\/501100003819","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102159"],"award-info":[{"award-number":["62102159"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["World Wide Web"],"published-print":{"date-parts":[[2022,7]]},"DOI":"10.1007\/s11280-021-00976-2","type":"journal-article","created":{"date-parts":[[2021,11,25]],"date-time":"2021-11-25T08:06:39Z","timestamp":1637827599000},"page":"1607-1623","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-level, multi-modal interactions for visual question answering over text in images"],"prefix":"10.1007","volume":"25","author":[{"given":"Jincai","family":"Chen","sequence":"first","affiliation":[]},{"given":"Sheng","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jiangfeng","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Fuhao","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Yuan-Fang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Ping","family":"Lu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,11,25]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Anderson, P, He, X, Buehler, C, Teney, D, Johnson, M, Gould, S, Zhang, L: Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18-22, 2018. http:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Anderson_Bottom-Up_and_Top-Down_CVPR_2018_paper.html, pp 6077\u20136086. IEEE Computer Society (2018)","key":"976_CR1","DOI":"10.1109\/CVPR.2018.00636"},{"doi-asserted-by":"crossref","unstructured":"Antol, S, Agrawal, A, Lu, J, Mitchell, M, Batra, D, Lawrence Zitnick, C, Parikh, D: Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2425\u20132433 (2015)","key":"976_CR2","DOI":"10.1109\/ICCV.2015.279"},{"doi-asserted-by":"crossref","unstructured":"Antol, S, Agrawal, A, Lu, J, Mitchell, M, Batra, D, Zitnick, CL, Parikh, D: VQA: visual question answering. In: 2015 IEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, December 7-13, 2015, pp 2425\u20132433. IEEE Computer Society (2015)","key":"976_CR3","DOI":"10.1109\/ICCV.2015.279"},{"doi-asserted-by":"crossref","unstructured":"Ben-younes, H, Cadene, R, Cord, M, Thome, N: Mutan: Multimodal tucker fusion for visual question answering. In: Proc. IEEE Int. Conf. Computer Vision (ICCV), pp 2631\u20132639 (2017)","key":"976_CR4","DOI":"10.1109\/ICCV.2017.285"},{"doi-asserted-by":"crossref","unstructured":"Biten, AF, Tito, R, Mafla, A, G\u00f3mez, L, Rusi\u00f1ol, M, Mathew, M, Jawahar, CV, Valveny, E, Karatzas, D: ICDAR 2019 competition on scene text visual question answering. CoRR abs\/1907.00490, 1907.00490(2019)","key":"976_CR5","DOI":"10.1109\/ICCV.2019.00439"},{"key":"976_CR6","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski, P, Grave, E, Joulin, A, Mikolov, T: Enriching word vectors with subword information. TACL 5, 135\u2013146 (2017). https:\/\/transacl.org\/ojs\/index.php\/tacl\/article\/view\/999","journal-title":"TACL"},{"doi-asserted-by":"crossref","unstructured":"Chen, Z, Lu, H, Tian, S, Qiu, J, Kamiya, T, Serikawa, S, Xu, L: Construction of a hierarchical feature enhancement network and its application in fault recognition. IEEE Transactions on Industrial Informatics (2020)","key":"976_CR7","DOI":"10.1109\/TII.2020.3021688"},{"unstructured":"Devlin, J, Chang, M-W, Lee, K, Toutanova, K: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J, Doran, C, Solorio, T (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and Short Papers). https:\/\/www.aclweb.org\/anthology\/N19-1423\/, pp 4171\u20134186. Association for Computational Linguistics (2019)","key":"976_CR8"},{"doi-asserted-by":"crossref","unstructured":"Gao, P, Jiang, Z, You, H, Lu, P, Hoi, SCH, Wang, X, Li, H: Dynamic fusion with intra- and inter-modality attention flow for visual question answering. In: IEEE conference on computer vision and pattern recognition, CVPR 2019, long beach, ca, usa, june 16-20, 2019. http:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Gao_Dynamic_Fusion_With_Intra-_and_Inter-Modality_Attention_Flow_for_Visual_CVPR_2019_paper.html, pp 6639\u20136648. Computer Vision Foundation \/ IEEE (2019)","key":"976_CR9","DOI":"10.1109\/CVPR.2019.00680"},{"doi-asserted-by":"crossref","unstructured":"Goyal, Y, Khot, T, Summers-Stay, D, Batra, D, Parikh, D: Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6904\u20136913 (2017)","key":"976_CR10","DOI":"10.1109\/CVPR.2017.670"},{"doi-asserted-by":"crossref","unstructured":"Gu, J, Lu, Z, Li, H, Li, VOK: Incorporating copying mechanism in sequence-to-sequence learning. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics, ACL 2016, August 7-12, 2016, Berlin, Germany, Volume 1: Long Papers. https:\/\/www.aclweb.org\/anthology\/P16-1154\/. The Association for Computer Linguistics (2016)","key":"976_CR11","DOI":"10.18653\/v1\/P16-1154"},{"doi-asserted-by":"crossref","unstructured":"Jiang, T, Zeng, J, Zhou, K, Huang, P, Yang, T: Lifelong disk failure prediction via gan-based anomaly detection. In: 2019 IEEE 37th International Conference on Computer Design (ICCD), pp 199\u2013207, IEEE (2019)","key":"976_CR12","DOI":"10.1109\/ICCD46524.2019.00033"},{"unstructured":"Jiang, Y, Natarajan, V, Chen, X, Rohrbach, M, Batra, D, Parikh, D: Pythia v0.1: the winning entry to the VQA challenge 2018. CoRR abs\/1807.09956, 1807.09956 (2018)","key":"976_CR13"},{"doi-asserted-by":"crossref","unstructured":"Johnson, J, Hariharan, B, van der Maaten, L, Fei-Fei, L, Lawrence Zitnick, C, Girshick, R: Clevr: A diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 2901\u20132910 (2017)","key":"976_CR14","DOI":"10.1109\/CVPR.2017.215"},{"unstructured":"Kahou, SE, Michalski, V, Atkinson, A, K\u00e1d\u00e1r, A, Trischler, A, Bengio, Y: Figureqa: An annotated figure dataset for visual reasoning. arXiv:1710.07300 (2017)","key":"976_CR15"},{"unstructured":"Kim, J-H, Jun, J, Zhang, B-T: Bilinear attention networks. In: Bengio, S, Wallach, HM, Larochelle, H, Grauman, K, Cesa-Bianchi, N, Garnett, R (eds.) Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, 3-8 December 2018, Montr\u00e9al, Canada. http:\/\/papers.nips.cc\/paper\/7429-bilinear-attention-networks, pp 1571\u20131581 (2018)","key":"976_CR16"},{"issue":"1","key":"976_CR17","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R, Zhu, Y, Groth, O, Johnson, J, Hata, K, Kravitz, J, Chen, S, Kalantidis, Y, Li, L-J, Shamma, DA, Bernstein, MS, Fei-Fei, L: Visual genome: Connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"unstructured":"Kuznetsova, A, Rom, H, Alldrin, N, Uijlings, JRR, Krasin, I, Pont-Tuset, J, Kamali, S, Popov, S, Malloci, M, Duerig, T, Ferrari, V: The open images dataset V4: unified image classification, object detection, and visual relationship detection at scale. CoRR abs\/1811.00982, 1811.00982 (2018)","key":"976_CR18"},{"unstructured":"Lin, Y, Zhao, H, Li, Y, Wang, D: Dcd zju, textvqa challenge 2019 winner. https:\/\/visualqa.org\/workshop.html (2019)","key":"976_CR19"},{"issue":"4","key":"976_CR20","doi-asserted-by":"publisher","first-page":"2315","DOI":"10.1109\/JIOT.2017.2737479","volume":"5","author":"H Lu","year":"2017","unstructured":"Lu, H, Li, Y, Mu, S, Wang, D, Kim, H, Serikawa, S: Motor anomaly detection for unmanned aerial vehicles using reinforcement learning. IEEE internet of things journal 5(4), 2315\u20132322 (2017)","journal-title":"IEEE internet of things journal"},{"doi-asserted-by":"crossref","unstructured":"Lu, H, Zhang, M, Xu, X, Li, Y, Shen, HT: Deep fuzzy hashing network for efficient image retrieval. IEEE Trans. Fuzzy Syst. (2020)","key":"976_CR21","DOI":"10.1109\/TFUZZ.2020.2984991"},{"doi-asserted-by":"crossref","unstructured":"Lu, H, Zhang, Y, Li, Y, Jiang, C, Abbas, H: User-oriented virtual mobile network resource management for vehicle communications. IEEE Trans. Intell. Transp. Syst. (2020)","key":"976_CR22","DOI":"10.1109\/TITS.2020.2991766"},{"key":"976_CR23","doi-asserted-by":"publisher","first-page":"304","DOI":"10.1016\/j.future.2018.10.041","volume":"93","author":"X Ma","year":"2019","unstructured":"Ma, X, Zeng, J, Peng, L, Fortino, G, Zhang, Y: Modeling multi-aspects within one opinionated sentence simultaneously for aspect-level sentiment analysis. Futur. Gener. Comput. Syst. 93, 304\u2013311 (2019)","journal-title":"Futur. Gener. Comput. Syst."},{"unstructured":"Malinowski, M, Fritz, M: A multi-world approach to question answering about real-world scenes based on uncertain input. In: Advances in neural information processing systems, pp 1682\u20131690 (2014)","key":"976_CR24"},{"unstructured":"Malinowski, M, Fritz, M: Towards a visual turing challenge. CoRR abs\/1410.8027, 1410.8027 (2014)","key":"976_CR25"},{"doi-asserted-by":"crossref","unstructured":"Nguyen, D-K, Okatani, T: Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. In: 2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18-22, 2018. http:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Nguyen_Improved_Fusion_of_CVPR_2018_paper.html, pp 6087\u20136096. IEEE Computer Society (2018)","key":"976_CR26","DOI":"10.1109\/CVPR.2018.00637"},{"unstructured":"Paszke, A, Gross, S, Chintala, S, Chanan, G, Yang, E, DeVito, Z, Lin, Z, Desmaison, A, Antiga, L, Lerer, A: Automatic differentiation in pytorch (2017)","key":"976_CR27"},{"doi-asserted-by":"crossref","unstructured":"Pennington, J, Socher, R, Manning, CD: Glove: Global vectors for word representation. In: Moschitti, A, Pang, B, Daelemans, W (eds.) Proceedings of the 2014 conference on empirical methods in natural language processing, EMNLP 2014, october 25-29, 2014, doha, qatar, A meeting of sigdat, a special interest group of the ACL. https:\/\/www.aclweb.org\/anthology\/D14-1162\/, pp 1532\u20131543. ACL (2014)","key":"976_CR28","DOI":"10.3115\/v1\/D14-1162"},{"unstructured":"Ren, M, Kiros, R, Zemel, R: Exploring models and data for image question answering. In: Advances in neural information processing systems, pp 2953\u20132961 (2015)","key":"976_CR29"},{"unstructured":"Ren, S, He, K, Girshick, RB, Sun, J: Faster R-CNN: towards real-time object detection with region proposal networks. In: Cortes, C, Lawrence, ND, Lee, DD, Sugiyama, M, Garnett, R (eds.) Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada. http:\/\/papers.nips.cc\/paper\/5638-faster-r-cnn-towards-real-time-object-detection-with-region-proposal-networks, pp 91\u201399 (2015)","key":"976_CR30"},{"doi-asserted-by":"crossref","unstructured":"Singh, A, Natarajan, V, Shah, M, Jiang, Y, Chen, X, Batra, D, Parikh, D, Rohrbach, M: Towards VQA models that can read. In: IEEE conference on computer vision and pattern recognition, CVPR 2019, long beach, ca, usa, june 16-20, 2019. http:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Singh_Towards_VQA_Models_That_Can_Read_CVPR_2019_paper.html, pp 8317\u20138326. Computer Vision Foundation \/ IEEE (2019)","key":"976_CR31","DOI":"10.1109\/CVPR.2019.00851"},{"unstructured":"submission, A: Msft_vti. https:\/\/evalai.cloudcv.org\/web\/challenges\/challenge-page\/224\/","key":"976_CR32"},{"doi-asserted-by":"crossref","unstructured":"Suhr, A, Lewis, M, Yeh, J, Artzi, Y: A corpus of natural language for visual reasoning. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp 217\u2013223 (2017)","key":"976_CR33","DOI":"10.18653\/v1\/P17-2034"},{"doi-asserted-by":"crossref","unstructured":"Tan, H, Bansal, M: LXMERT: learning cross-modality encoder representations from transformers. In: Inui, K, Jiang, J, Ng, V, Wan, X (eds.) Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, November 3-7, 2019, pp 5099\u20135110. Association for Computational Linguistics (2019)","key":"976_CR34","DOI":"10.18653\/v1\/D19-1514"},{"unstructured":"Vaswani, A, Shazeer, N, Parmar, N, Uszkoreit, J, Jones, L, Gomez, AN, Kaiser, L, Polosukhin, I: Attention is all you need. In: Guyon, I, von Luxburg, U, Bengio, S, Wallach, HM, Fergus, R, Vishwanathan, SVN, Garnett, R (eds.) Advances in neural information processing systems 30: Annual conference on neural information processing systems 2017, 4-9 december 2017, long beach, ca, USA. http:\/\/papers.nips.cc\/paper\/7181-attention-is-all-you-need, pp 5998\u20136008 (2017)","key":"976_CR35"},{"key":"976_CR36","doi-asserted-by":"publisher","first-page":"102369","DOI":"10.1016\/j.apor.2020.102369","volume":"104","author":"P Wang","year":"2020","unstructured":"Wang, P, Wang, D, Zhang, X, Li, X, Peng, T, Lu, H, Tian, X: Numerical and experimental study on the maneuverability of an active propeller control based wave glider. Applied Ocean Research 104, 102369 (2020)","journal-title":"Applied Ocean Research"},{"issue":"10","key":"976_CR37","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2017","unstructured":"Wang, P, Wu, Q, Shen, C, Dick, A, Van Den Hengel, A: Fvqa: Fact-based visual question answering. IEEE transactions on pattern analysis and machine intelligence 40(10), 2413\u20132427 (2017)","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"doi-asserted-by":"crossref","unstructured":"Xu, K, Wang, Z, Shi, J, Li, H, Zhang, QC: A2-net: Molecular structure estimation from cryo-em density volumes. In: The Thirty-Third AAAI Conference on Artificial Intelligence, AAAI 2019, The Thirty-First Innovative Applications of Artificial Intelligence Conference, IAAI 2019, The Ninth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2019, Honolulu, Hawaii, USA, January 27 - February 1, 2019, pp 1230\u20131237. AAAI Press (2019)","key":"976_CR38","DOI":"10.1609\/aaai.v33i01.33011230"},{"doi-asserted-by":"crossref","unstructured":"Yang, Z, He, X, Gao, J, Deng, L, Smola, A: Stacked attention networks for image question answering. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","key":"976_CR39","DOI":"10.1109\/CVPR.2016.10"},{"key":"976_CR40","doi-asserted-by":"publisher","first-page":"295","DOI":"10.1016\/j.neucom.2019.07.085","volume":"366","author":"J Zeng","year":"2019","unstructured":"Zeng, J, Ma, X, Zhou, K: Photo-realistic face age progression\/regression using a single generative adversarial network. Neurocomputing 366, 295\u2013304 (2019)","journal-title":"Neurocomputing"},{"key":"976_CR41","doi-asserted-by":"publisher","first-page":"362","DOI":"10.1016\/j.future.2018.03.047","volume":"86","author":"K Zhou","year":"2018","unstructured":"Zhou, K, Zeng, J, Liu, Y, Zou, F: Deep sentiment hashing for text retrieval in social ciot. Futur. Gener. Comput. Syst. 86, 362\u2013371 (2018)","journal-title":"Futur. Gener. Comput. Syst."}],"container-title":["World Wide Web"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11280-021-00976-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11280-021-00976-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11280-021-00976-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,26]],"date-time":"2022-07-26T08:15:49Z","timestamp":1658823349000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11280-021-00976-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,25]]},"references-count":41,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2022,7]]}},"alternative-id":["976"],"URL":"https:\/\/doi.org\/10.1007\/s11280-021-00976-2","relation":{},"ISSN":["1386-145X","1573-1413"],"issn-type":[{"type":"print","value":"1386-145X"},{"type":"electronic","value":"1573-1413"}],"subject":[],"published":{"date-parts":[[2021,11,25]]},"assertion":[{"value":"14 January 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 November 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 November 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}