{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T17:33:08Z","timestamp":1771954388901,"version":"3.50.1"},"reference-count":74,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,6,24]],"date-time":"2021-06-24T00:00:00Z","timestamp":1624492800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,6,24]],"date-time":"2021-06-24T00:00:00Z","timestamp":1624492800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["IJDAR"],"published-print":{"date-parts":[[2021,9]]},"DOI":"10.1007\/s10032-021-00378-0","type":"journal-article","created":{"date-parts":[[2021,6,24]],"date-time":"2021-06-24T21:02:26Z","timestamp":1624568546000},"page":"251-268","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["EAML: ensemble self-attention-based mutual learning network for document image classification"],"prefix":"10.1007","volume":"24","author":[{"given":"Souhail","family":"Bakkali","sequence":"first","affiliation":[]},{"given":"Zuheng","family":"Ming","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0123-439X","authenticated-orcid":false,"given":"Micka\u00ebl","family":"Coustaty","sequence":"additional","affiliation":[]},{"given":"Mar\u00e7al","family":"Rusi\u00f1ol","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,6,24]]},"reference":[{"key":"378_CR1","doi-asserted-by":"crossref","unstructured":"Afzal, M., Capobianco, S., Malik, M., Marinai, S., Breuel, T., Dengel, A., Liwicki, M.: Deepdocclassifier: Document classification with deep convolutional neural network. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 1111\u20131115 (2015)","DOI":"10.1109\/ICDAR.2015.7333933"},{"key":"378_CR2","doi-asserted-by":"crossref","unstructured":"Afzal, M., K\u00f6lsch, A., Ahmed, S., Liwicki, M.: Cutting the error by half: Investigation of very deep CNN and advanced training strategies for document image classification. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol. 1, pp. 883\u2013888 (2017)","DOI":"10.1109\/ICDAR.2017.149"},{"key":"378_CR3","doi-asserted-by":"crossref","unstructured":"Afzal, M., Pastor-Pellicer, J., Shafait, F., Breuel, T., Dengel, A., Liwicki, M.: Document image binarization using LSTM: a sequence learning approach. In: HIP \u201915 (2015)","DOI":"10.1145\/2809544.2809561"},{"key":"378_CR4","doi-asserted-by":"publisher","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00636","DOI":"10.1109\/CVPR.2018.00636"},{"key":"378_CR5","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/PL00010904","volume":"4","author":"E Appiani","year":"2001","unstructured":"Appiani, E., Cesarini, F., Colla, A., Diligenti, M., Gori, M., Marinai, S., Soda, G.: Automatic document classification and indexing in high-volume applications. Int. J. Doc. Anal. Recogn. 4, 69\u201383 (2001)","journal-title":"Int. J. Doc. Anal. Recogn."},{"key":"378_CR6","doi-asserted-by":"crossref","unstructured":"Asim, M., Khan, M.U.G., Malik, M., Razzaque, K., Dengel, A., Ahmed, S.: Two stream deep network for document image classification. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1410\u20131416 (2019)","DOI":"10.1109\/ICDAR.2019.00227"},{"key":"378_CR7","doi-asserted-by":"crossref","unstructured":"Audebert, N., Herold, C., Slimani, K., Vidal, C.: Multimodal deep networks for text and image-based document classification. In: Joint European Conference on Machine Learning and Knowledge Discovery in Databases, pp. 427\u2013443. Springer (2019)","DOI":"10.1007\/978-3-030-43823-4_35"},{"key":"378_CR8","doi-asserted-by":"crossref","unstructured":"Augereau, O., Journet, N., Vialard, A., Domenger, J.P.: Improving classification of an industrial document image database by combining visual and textual features. In: 2014 11th IAPR International Workshop on Document Analysis Systems, pp. 314\u2013318 (2014)","DOI":"10.1109\/DAS.2014.44"},{"key":"378_CR9","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. CoRR. arXiv:1409.0473 (2015)"},{"key":"378_CR10","doi-asserted-by":"publisher","unstructured":"Bakkali, S., Ming, Z., Coustaty, M., Rusi\u00f1ol, M.: Cross-modal deep networks for document image classification. In: 2020 IEEE International Conference on Image Processing (ICIP), pp. 2556\u20132560 (2020). https:\/\/doi.org\/10.1109\/ICIP40778.2020.9191268","DOI":"10.1109\/ICIP40778.2020.9191268"},{"key":"378_CR11","doi-asserted-by":"publisher","unstructured":"Bakkali, S., Ming, Z., Coustaty, M., Rusi\u00f1ol, M.: Visual and textual deep feature fusion for document image classification. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), pp. 2394\u20132403 (2020). https:\/\/doi.org\/10.1109\/CVPRW50498.2020.00289","DOI":"10.1109\/CVPRW50498.2020.00289"},{"key":"378_CR12","doi-asserted-by":"crossref","unstructured":"Byun, Y., Lee, Y.: Form classification using DP matching. In: SAC \u201900 (2000)","DOI":"10.1145\/335603.335611"},{"key":"378_CR13","unstructured":"Chen, K., Wang, J., Chen, L.C., Gao, H., Xu, W., Nevatia, R.: Abc-CNN: an attention based convolutional neural network for visual question answering. arXiv:1511.05960 (2016)"},{"key":"378_CR14","first-page":"1","volume":"10","author":"N Chen","year":"2006","unstructured":"Chen, N., Blostein, D.: A survey of document image classification: problem statement, classifier architecture and performance evaluation. Int. J. Doc. Anal. Recogn. (IJDAR) 10, 1\u201316 (2006)","journal-title":"Int. J. Doc. Anal. Recogn. (IJDAR)"},{"key":"378_CR15","unstructured":"Csurka, G., Larlus, D., Gordo, A., Almaz\u00e1n, J.: What is the right way to represent document images? arXiv:1603.01076 (2016)"},{"key":"378_CR16","doi-asserted-by":"publisher","unstructured":"Das, A., Roy, S., Bhattacharya, U., Parui, S.K.: Document image classification with intra-domain transfer learning and stacked generalization of deep convolutional neural networks. In: 2018 24th International Conference on Pattern Recognition (ICPR), pp. 3180\u20133185 (2018). https:\/\/doi.org\/10.1109\/ICPR.2018.8545630","DOI":"10.1109\/ICPR.2018.8545630"},{"key":"378_CR17","unstructured":"Dauphinee, T., Patel, N., Rashidi, M.M.: Modular multimodal architecture for document classification. arXiv:1912.04376 (2019)"},{"key":"378_CR18","doi-asserted-by":"crossref","unstructured":"Dengel, A., Dubiel, F.: Clustering and classification of document structure-a machine learning approach. In: Proceedings of 3rd International Conference on Document Analysis and Recognition, vol. 2, pp. 587\u2013591 (1995)","DOI":"10.1109\/ICDAR.1995.601965"},{"key":"378_CR19","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (2019)"},{"key":"378_CR20","unstructured":"DeVries, T., Taylor, G.W.: Improved regularization of convolutional neural networks with cutout. arXiv:1708.04552 (2017)"},{"issue":"12138","key":"378_CR21","first-page":"387","volume":"2020","author":"J Ferrando","year":"2020","unstructured":"Ferrando, J., Dom\u00ednguez, J.L., Torres, J., Garc\u00eda, R., Garc\u00eda, D., Garrido, D., Cortada, J., Valero, M.: Improving accuracy and speeding up document image classification through parallel systems. Comput. Sci.\u2014ICCS 2020(12138), 387\u2013400 (2020)","journal-title":"Comput. Sci.\u2014ICCS"},{"key":"378_CR22","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv:1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"378_CR23","doi-asserted-by":"crossref","unstructured":"Gallo, I., Calefati, A., Nawaz, S., Janjua, M.K.: Image and encoded text fusion for multi-modal classification. In: 2018 Digital Image Computing: Techniques and Applications (DICTA), pp. 1\u20137 (2018)","DOI":"10.1109\/DICTA.2018.8615789"},{"key":"378_CR24","doi-asserted-by":"crossref","unstructured":"Hao, L., Gao, L., Yi, X., Tang, Z.: A table detection method for pdf documents based on convolutional neural networks. In: 2016 12th IAPR Workshop on Document Analysis Systems (DAS), pp. 287\u2013292 (2016)","DOI":"10.1109\/DAS.2016.23"},{"key":"378_CR25","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Ufkes, A., Derpanis, K.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 991\u2013995 (2015)","DOI":"10.1109\/ICDAR.2015.7333910"},{"key":"378_CR26","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"378_CR27","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"378_CR28","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"378_CR29","doi-asserted-by":"crossref","unstructured":"Kang, L., Kumar, J., Ye, P., Li, Y., Doermann, D.: Convolutional neural networks for document image classification. In: 2014 22nd International Conference on Pattern Recognition, pp. 3168\u20133172 (2014)","DOI":"10.1109\/ICPR.2014.546"},{"key":"378_CR30","doi-asserted-by":"publisher","unstructured":"Kang, L., Kumar, J., Ye, P., Li, Y., Doermann, D.: Convolutional neural networks for document image classification. In: 2014 22nd International Conference on Pattern Recognition, pp. 3168\u20133172 (2014). https:\/\/doi.org\/10.1109\/ICPR.2014.546","DOI":"10.1109\/ICPR.2014.546"},{"key":"378_CR31","unstructured":"Kim, J.H., Jun, J., Zhang, B.T.: Bilinear attention networks. In: Bengio, S., Wallach, H., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol.\u00a031, pp. 1564\u20131574. Curran Associates, Inc., Red Hook (2018). https:\/\/proceedings.neurips.cc\/paper\/2018\/file\/96ea64f3a1aa2fd00c72faacf0cb8ac9-Paper.pdf"},{"key":"378_CR32","doi-asserted-by":"crossref","unstructured":"K\u00f6lsch, A., Afzal, M., Ebbecke, M., Liwicki, M.: Real-time document image classification using deep cnn and extreme learning machines. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol. 1, pp. 1318\u20131323 (2017)","DOI":"10.1109\/ICDAR.2017.217"},{"key":"378_CR33","doi-asserted-by":"crossref","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: CACM (2017)","DOI":"10.1145\/3065386"},{"key":"378_CR34","doi-asserted-by":"publisher","first-page":"119","DOI":"10.1016\/j.patrec.2013.10.030","volume":"43","author":"J Kumar","year":"2014","unstructured":"Kumar, J., Ye, P., Doermann, D.: Structural similarity for document image classification and retrieval. Pattern Recogn. Lett. 43, 119\u2013126 (2014)","journal-title":"Pattern Recogn. Lett."},{"key":"378_CR35","doi-asserted-by":"crossref","unstructured":"Lai, S., Xu, L., Liu, K., Zhao, J.: Recurrent convolutional neural networks for text classification. In: AAAI (2015)","DOI":"10.1609\/aaai.v29i1.9513"},{"issue":"11","key":"378_CR36","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y Lecun","year":"1998","unstructured":"Lecun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998). https:\/\/doi.org\/10.1109\/5.726791","journal-title":"Proc. IEEE"},{"key":"378_CR37","doi-asserted-by":"crossref","unstructured":"Lee, K.H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"378_CR38","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4653\u20134661 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"378_CR39","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. arXiv:1606.00061 (2016)"},{"key":"378_CR40","unstructured":"Mikolov, T., Chen, K., Corrado, G.S., Dean, J.: Efficient estimation of word representations in vector space. CoRR. arXiv:1301.3781 (2013)"},{"key":"378_CR41","unstructured":"Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., Joulin, A.: Advances in pre-training distributed word representations. arXiv:1712.09405 (2018)"},{"key":"378_CR42","doi-asserted-by":"crossref","unstructured":"Nguyen, D.K., Okatani, T.: Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6087\u20136096 (2018)","DOI":"10.1109\/CVPR.2018.00637"},{"key":"378_CR43","doi-asserted-by":"crossref","unstructured":"Noce, L., Gallo, I., Zamberletti, A., Calefati, A.: Embedded textual content for document image classification with convolutional neural networks. In: DocEng \u201916 (2016)","DOI":"10.1145\/2960811.2960814"},{"key":"378_CR44","doi-asserted-by":"crossref","unstructured":"Pastor-Pellicer, J., Afzal, M., Liwicki, M., Bleda, M.J.: Complete system for text line extraction using convolutional neural networks and watershed transform. In: 2016 12th IAPR Workshop on Document Analysis Systems (DAS), pp. 30\u201335 (2016)","DOI":"10.1109\/DAS.2016.58"},{"key":"378_CR45","doi-asserted-by":"crossref","unstructured":"Pastor-Pellicer, J., Boquera, S.E., Zamora-Mart\u00ednez, F., Afzal, M.Z., Bleda, M.J.C.: Insights on the use of convolutional neural networks for document image binarization. In: IWANN (2015)","DOI":"10.1007\/978-3-319-19222-2_10"},{"key":"378_CR46","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: Global vectors for word representation. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"378_CR47","doi-asserted-by":"crossref","unstructured":"Peters, M.E., Neumann, M., Iyyer, M., Gardner, M., Clark, C., Lee, K., Zettlemoyer, L.: Deep contextualized word representations. arXiv:1802.05365 (2018)","DOI":"10.18653\/v1\/N18-1202"},{"key":"378_CR48","unstructured":"Qian, J., Wang, W., Wang, D.: A novel approach for online handwriting recognition of tibetan characters. International Multi-Conference of Engineers and Computer Scientists 2010, 2010-03-17 to 2010-03-19 (2010)"},{"key":"378_CR49","unstructured":"Ramachandran, P., Parmar, N., Vaswani, A., Bello, I., Levskaya, A., Shlens, J.: Stand-alone self-attention in vision models. NeurIPS (2019)"},{"key":"378_CR50","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M.S., Berg, A., Fei-Fei, L.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115, 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"378_CR51","doi-asserted-by":"crossref","unstructured":"Seuret, M., Alberti, M., Liwicki, M., Ingold, R.: PCA-initialized deep neural networks applied to document image analysis. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol. 1, pp. 877\u2013882 (2017)","DOI":"10.1109\/ICDAR.2017.148"},{"key":"378_CR52","unstructured":"Sierra, S., Gonz\u00e1lez, F.A.: Combining textual and visual representations for multimodal author profiling: notebook for pan at CLEF 2018. In: CLEF (2018)"},{"key":"378_CR53","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. CoRR. arXiv:1409.1556 (2015)"},{"key":"378_CR54","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Ioffe, S., Vanhoucke, V., Alemi, A.A.: Inception-v4, inception-resnet and the impact of residual connections on learning. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"378_CR55","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A.: Going deeper with convolutions. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"378_CR56","doi-asserted-by":"publisher","unstructured":"Tensmeyer, C., Martinez, T.: Analysis of convolutional neural networks for document image classification. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 388\u2013393 (2017). https:\/\/doi.org\/10.1109\/ICDAR.2017.71","DOI":"10.1109\/ICDAR.2017.71"},{"key":"378_CR57","doi-asserted-by":"crossref","unstructured":"Ul-Hasan, A., Afzal, M., Shafait, F., Liwicki, M., Breuel, T.: A sequence learning approach for multiple script identification. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 1046\u20131050 (2015)","DOI":"10.1109\/ICDAR.2015.7333921"},{"key":"378_CR58","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. arXiv:1706.03762 (2017)"},{"key":"378_CR59","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R.B., Gupta, A., He, K.: Non-local neural networks. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"378_CR60","doi-asserted-by":"publisher","unstructured":"Wang, Y., Yang, H., Qian, X., Ma, L., Lu, J., Li, B., Fan, X.: Position focused attention network for image-text matching. In: Proceedings of the 18th International Joint Conference on Artificial Intelligence, IJCAI-19, pp. 3792\u20133798. International Joint Conferences on Artificial Intelligence Organization (2019). https:\/\/doi.org\/10.24963\/ijcai.2019\/526","DOI":"10.24963\/ijcai.2019\/526"},{"key":"378_CR61","doi-asserted-by":"crossref","unstructured":"Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: pre-training of text and layout for document image understanding. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (2020)","DOI":"10.1145\/3394486.3403172"},{"key":"378_CR62","doi-asserted-by":"crossref","unstructured":"Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Flor\u00eancio, D., Zhang, C., Che, W., Zhang, M., Zhou, L.: Layoutlmv2: multi-modal pre-training for visually-rich document understanding. arXiv:2012.14740 (2020)","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"378_CR63","doi-asserted-by":"publisher","first-page":"107329","DOI":"10.1016\/j.sigpro.2019.107329","volume":"167","author":"S Yan","year":"2020","unstructured":"Yan, S., Xie, Y., Wu, F., Smith, J., Lu, W., Zhang, B.: Image captioning via hierarchical attention mechanism and policy gradient optimization. Signal Process. 167, 107329 (2020)","journal-title":"Signal Process."},{"key":"378_CR64","doi-asserted-by":"publisher","unstructured":"Yang, F., Peng, X., Ghosh, G., Shilon, R., Ma, H., Moore, E., Predovic, G.: Exploring deep multimodal fusion of text and photo for hate speech classification. In: Proceedings of the 3rd Workshop on Abusive Language Online, pp. 11\u201318. Association for Computational Linguistics, Florence, Italy (2019). https:\/\/doi.org\/10.18653\/v1\/W19-3502. https:\/\/www.aclweb.org\/anthology\/W19-3502","DOI":"10.18653\/v1\/W19-3502"},{"key":"378_CR65","doi-asserted-by":"crossref","unstructured":"Yang, X., Yumer, E., Asente, P., Kraley, M., Kifer, D., Giles, C.L.: Learning to extract semantic structure from documents using multimodal fully convolutional neural networks. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4342\u20134351 (2017)","DOI":"10.1109\/CVPR.2017.462"},{"key":"378_CR66","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., Le, Q.V.: Xlnet: generalized autoregressive pretraining for language understanding. In: NeurIPS (2019)"},{"key":"378_CR67","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"378_CR68","unstructured":"Yu, Z., Cui, Y., Yu, J., Tao, D., Tian, Q.: Multimodal unified attention networks for vision-and-language interactions. arXiv:1908.04107 (2019)"},{"key":"378_CR69","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.202"},{"issue":"12","key":"378_CR70","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/TNNLS.2018.2817340","volume":"29","author":"Z Yu","year":"2018","unstructured":"Yu, Z., Yu, J., Xiang, C., Fan, J., Tao, D.: Beyond bilinear: generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans. Neural Netw. Learn. Syst. 29(12), 5947\u20135959 (2018). https:\/\/doi.org\/10.1109\/TNNLS.2018.2817340","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"378_CR71","doi-asserted-by":"crossref","unstructured":"Zahavy, T., Magnani, A., Krishnan, A., Mannor, S.: Is a picture worth a thousand words? a deep multi-modal fusion architecture for product classification in e-commerce. AAAI (2018)","DOI":"10.1609\/aaai.v32i1.11419"},{"key":"378_CR72","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xiang, T., Hospedales, T.M., Lu, H.: Deep mutual learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4320\u20134328 (2018)","DOI":"10.1109\/CVPR.2018.00454"},{"key":"378_CR73","doi-asserted-by":"crossref","unstructured":"Zhao, H., Jia, J., Koltun, V.: Exploring self-attention for image recognition. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10073\u201310082 (2020)","DOI":"10.1109\/CVPR42600.2020.01009"},{"key":"378_CR74","unstructured":"Zhou, B., Tian, Y., Sukhbaatar, S., Szlam, A., Fergus, R.: Simple baseline for visual question answering. arXiv:1512.02167 (2015)"}],"container-title":["International Journal on Document Analysis and Recognition (IJDAR)"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-021-00378-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10032-021-00378-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10032-021-00378-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,2]],"date-time":"2024-09-02T16:06:59Z","timestamp":1725293219000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10032-021-00378-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,24]]},"references-count":74,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2021,9]]}},"alternative-id":["378"],"URL":"https:\/\/doi.org\/10.1007\/s10032-021-00378-0","relation":{},"ISSN":["1433-2833","1433-2825"],"issn-type":[{"value":"1433-2833","type":"print"},{"value":"1433-2825","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,6,24]]},"assertion":[{"value":"18 November 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 April 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 May 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 June 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}