{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T15:55:48Z","timestamp":1775145348693,"version":"3.50.1"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T00:00:00Z","timestamp":1689984000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T00:00:00Z","timestamp":1689984000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 61602161,61772180"],"award-info":[{"award-number":["No. 61602161,61772180"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Hubei Province Science and Technology Support Project","award":["Grant No: 2020BAB012"],"award-info":[{"award-number":["Grant No: 2020BAB012"]}]},{"name":"The Fundamental Research Funds for the Research Fund of Hubei University of Technology","award":["HBUT: 2021046"],"award-info":[{"award-number":["HBUT: 2021046"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s00530-023-01133-7","type":"journal-article","created":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T10:01:56Z","timestamp":1690020116000},"page":"3599-3608","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["Hierarchical multiples self-attention mechanism for multi-modal analysis"],"prefix":"10.1007","volume":"29","author":[{"given":"Wu","family":"Jun","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6514-8055","authenticated-orcid":false,"given":"Zhu","family":"Tianliang","sequence":"additional","affiliation":[]},{"given":"Zhu","family":"Jiahui","sequence":"additional","affiliation":[]},{"given":"Li","family":"Tianyi","sequence":"additional","affiliation":[]},{"given":"Wang","family":"Chunzhi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,22]]},"reference":[{"key":"1133_CR1","doi-asserted-by":"publisher","unstructured":"Chen, M., Wang, S., Liang, P.P., Baltru\u0161aitis, T., Zadeh, A., Morency, L.-P.: Multimodal sentiment analysis with word-level fusion and reinforcement learning. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction. ICMI \u201917, pp. 163\u2013171. Association for Computing Machinery, New York, NY, USA (2017). https:\/\/doi.org\/10.1145\/3136755.3136801","DOI":"10.1145\/3136755.3136801"},{"key":"1133_CR2","unstructured":"Tay, Y., Dehghani, M., Rao, J., Fedus, W., Abnar, S., Chung, H.W., Narang, S., Yogatama, D., Vaswani, A., Metzler, D.: Scale efficiently: Insights from pre-training and fine-tuning transformers. CoRR abs\/2109.10686 (2021)"},{"key":"1133_CR3","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., Wang, R., Mahajan, D.: Predet: Large-scale weakly supervised pre-training for detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 
2865\u20132875 (2021)","DOI":"10.1109\/ICCV48922.2021.00286"},{"key":"1133_CR4","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-020-00672-7","author":"A Kumar","year":"2022","unstructured":"Kumar, A., Sachdeva, N.: Multi-input integrative learning using deep neural networks and transfer learning for cyberbullying detection in real-time code-mix data. Multimed. Syst. (2022). https:\/\/doi.org\/10.1007\/s00530-020-00672-7","journal-title":"Multimed. Syst."},{"key":"1133_CR5","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-00993-9","author":"X Li","year":"2022","unstructured":"Li, X., Ma, S., Shan, L.: Multi-window transformer parallel fusion feature pyramid network for pedestrian orientation detection. Multimed. Syst. (2022). https:\/\/doi.org\/10.1007\/s00530-022-00993-9","journal-title":"Multimed. Syst."},{"issue":"6","key":"1133_CR6","doi-asserted-by":"publisher","first-page":"2133","DOI":"10.1007\/s00530-020-00731-z","volume":"28","author":"NEH Ben Chaabene","year":"2022","unstructured":"Ben Chaabene, N.E.H., Bouzeghoub, A., Guetari, R., Ghezala, H.H.B.: Deep learning methods for anomalies detection in social networks using multidimensional networks and multimodal data: A survey. Multimed. Syst. 28(6), 2133\u20132143 (2022). https:\/\/doi.org\/10.1007\/s00530-020-00731-z","journal-title":"Multimed. Syst."},{"key":"1133_CR7","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-01025-2","author":"L Rei","year":"2022","unstructured":"Rei, L., Mladenic, D., Dorozynski, M., Rottensteiner, F., Schleider, T., Troncy, R., Lozano, J.S., Salvatella, M.G.: Multimodal metadata assignment for cultural heritage artifacts. Multimed. Syst. (2022). https:\/\/doi.org\/10.1007\/s00530-022-01025-2","journal-title":"Multimed. Syst."},{"key":"1133_CR8","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1707.07250","author":"A Zadeh","year":"2017","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., Morency, L.-P.: Tensor Fusion Network for Multimodal Sentiment Analysis. arXiv preprint (2017). https:\/\/doi.org\/10.48550\/arXiv.1707.07250","journal-title":"arXiv preprint"},{"key":"1133_CR9","doi-asserted-by":"crossref","unstructured":"Sahay, S., Okur, E., Kumar, S.H., Nachman, L.: Low rank fusion based transformers for multimodal sequences. CoRR abs\/2007.02038 (2020)","DOI":"10.18653\/v1\/2020.challengehml-1.4"},{"key":"1133_CR10","doi-asserted-by":"publisher","first-page":"308","DOI":"10.1016\/j.ins.2020.07.049","volume":"544","author":"Y Zhou","year":"2021","unstructured":"Zhou, Y., Li, J., Chen, H., Wu, Y., Wu, J., Chen, L.: A spatiotemporal hierarchical attention mechanism-based model for multi-step station-level crowd flow prediction. Inform. Sci. 544, 308\u2013324 (2021). https:\/\/doi.org\/10.1016\/j.ins.2020.07.049","journal-title":"Inform. Sci."},{"key":"1133_CR11","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (2019)"},{"key":"1133_CR12","doi-asserted-by":"publisher","DOI":"10.1109\/UBMK50275.2020.9219384","author":"F Demirkiran","year":"2020","unstructured":"Demirkiran, F., \u00c7ayir, A., \u00dcnal, U., Da\u011f, H.: Website category classification using fine-tuned bert language model. Int. Conf. Comput. Sci. Eng. (2020). https:\/\/doi.org\/10.1109\/UBMK50275.2020.9219384","journal-title":"Int. Conf. Comput. Sci. 
Eng."},{"issue":"11","key":"1133_CR13","doi-asserted-by":"publisher","first-page":"10223","DOI":"10.1007\/s12652-020-02791-5","volume":"12","author":"S Madichetty","year":"2021","unstructured":"Madichetty, S., Muthukumarasamy, S., Jayadev, P.: Multi-modal classification of twitter data during disasters for humanitarian response. J. Ambient. Intell. Humaniz. Comput. 12(11), 10223\u201310237 (2021). https:\/\/doi.org\/10.1007\/s12652-020-02791-5","journal-title":"J. Ambient. Intell. Humaniz. Comput."},{"key":"1133_CR14","first-page":"101","volume-title":"2020 2nd symposium on signal processing systems SSPS 2020","author":"Y Zhang","year":"2020","unstructured":"Zhang, Y., Wang, Y., Wang, X., Zou, B., Xie, H.: Text-based decision fusion model for detecting depression. In: 2020 2nd symposium on signal processing systems SSPS 2020, pp. 101\u2013106. Association for Computing Machinery, NY, USA (2020)"},{"key":"1133_CR15","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9860014","author":"W Zou","year":"2022","unstructured":"Zou, W., Ding, J., Wang, C.: Utilizing bert intermediate layers for multimodal sentiment analysis. IEEE Int. Conf. Multimed. Export (2022). https:\/\/doi.org\/10.1109\/ICME52920.2022.9860014","journal-title":"IEEE Int. Conf. Multimed. Export"},{"key":"1133_CR16","doi-asserted-by":"publisher","first-page":"94557","DOI":"10.1109\/ACCESS.2021.3092735","volume":"9","author":"S Lee","year":"2021","unstructured":"Lee, S., Han, D.K., Ko, H.: Multimodal emotion recognition fusion analysis adapting bert with heterogeneous feature unification. IEEE Access 9, 94557\u201394572 (2021). https:\/\/doi.org\/10.1109\/ACCESS.2021.3092735","journal-title":"IEEE Access"},{"issue":"1","key":"1133_CR17","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/s41598-022-13072-w","volume":"12","author":"K Agarwal","year":"2022","unstructured":"Agarwal, K., Choudhury, S., Tipirneni, S., Mukherjee, P., Ham, C., Tamang, S., Baker, M., Tang, S., Kocaman, V., Gevaert, O.: Preparing for the next pandemic via transfer learning from existing diseases with hierarchical multi-modal bert: a study on covid-19 outcome prediction. Sci. Rep. 12(1), 1\u201313 (2022). https:\/\/doi.org\/10.1038\/s41598-022-13072-w","journal-title":"Sci. Rep."},{"key":"1133_CR18","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.115708","volume":"186","author":"Z Lei","year":"2021","unstructured":"Lei, Z., Ul Haq, A., Zeb, A., Suzauddola, M., Zhang, D.: Is the suggested food your desired?: Multi-modal recipe recommendation with demand-based knowledge graph. Expert Syst. Appl. 186, 115708 (2021). https:\/\/doi.org\/10.1016\/j.eswa.2021.115708","journal-title":"Expert Syst. Appl."},{"key":"1133_CR19","doi-asserted-by":"crossref","unstructured":"Khare, Y., Bagal, V., Mathew, M., Devi, A., Priyakumar, U.D., Jawahar, C.V.: MMBERT: multimodal BERT pretraining for improved medical VQA. CoRR abs\/2104.01394 (2021)","DOI":"10.1109\/ISBI48211.2021.9434063"},{"key":"1133_CR20","unstructured":"Huang, Z., Zeng, Z., Liu, B., Fu, D., Fu, J.: Pixel-bert: Aligning image pixels with text by deep multi-modal transformers. CoRR abs\/2004.00849 (2020)"},{"key":"1133_CR21","doi-asserted-by":"publisher","first-page":"691","DOI":"10.1007\/978-3-031-19833-5_40","volume-title":"Computer vision - ECCV 2022","author":"Y Ge","year":"2022","unstructured":"Ge, Y., Ge, Y., Liu, X., Wang, J., Wu, J., Shan, Y., Qie, X., Luo, P.: Miles: Visual bert pre-training with injected language semantics for video-text retrieval. 
In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer vision - ECCV 2022, pp. 691\u2013708. Springer, Cham (2022)"},{"key":"1133_CR22","unstructured":"Zhang, Z., Ma, J., Zhou, C., Men, R., Li, Z., Ding, M., Tang, J., Zhou, J., Yang, H.: UFC-BERT: unifying multi-modal controls for conditional image synthesis. CoRR abs\/2105.14211 (2021)"},{"key":"1133_CR23","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L.u., Polosukhin, I.: Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 30. Curran Associates, Inc., ??? (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"1133_CR24","unstructured":"Akbari, H., Yuan, L., Qian, R., Chuang, W., Chang, S., Cui, Y., Gong, B.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. CoRR abs\/2104.11178 (2021)"},{"key":"1133_CR25","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1145\/3371382.3378261","volume-title":"Companion of the 2020 ACM\/IEEE International Conference on Human-Robot Interaction","author":"Y Li","year":"2020","unstructured":"Li, Y., Zhao, T., Shen, X.: Attention-based multimodal fusion for estimating human emotion in real-world hri. In: Companion of the 2020 ACM\/IEEE International Conference on Human-Robot Interaction, pp. 340\u2013342. Association for Computing Machinery, NY, USA (2020)"},{"key":"1133_CR26","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1133_CR27","first-page":"521","volume-title":"CM-BERT cross-modal BERT for text-audio sentiment analysis","author":"K Yang","year":"2020","unstructured":"Yang, K., Xu, H., Gao, K.: CM-BERT cross-modal BERT for text-audio sentiment analysis, pp. 521\u2013528. Association for Computing Machinery, New York, NY, USA (2020)"},{"key":"1133_CR28","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1016\/j.neucom.2022.07.035","volume":"506","author":"D Kim","year":"2022","unstructured":"Kim, D., Kang, P.: Cross-modal distillation with audio-text fusion for fine-grained emotion classification using bert and wav2vec 2.0. Neurocomputing 506, 168\u2013183 (2022). https:\/\/doi.org\/10.1016\/j.neucom.2022.07.035","journal-title":"Neurocomputing"},{"key":"1133_CR29","doi-asserted-by":"publisher","unstructured":"Boukabous, M., Azizi, M.: Multimodal sentiment analysis using audio and text for crime detection. In: 2022 2nd International Conference on Innovative Research in Applied Science, Engineering and Technology (IRASET), pp. 1\u20135 (2022). https:\/\/doi.org\/10.1109\/IRASET52964.2022.9738175","DOI":"10.1109\/IRASET52964.2022.9738175"},{"key":"1133_CR30","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., G\u00fcl\u00e7ehre, \u00c7., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. 
CoRR abs\/1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01133-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-023-01133-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01133-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,16]],"date-time":"2023-11-16T11:04:43Z","timestamp":1700132683000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-023-01133-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,22]]},"references-count":30,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["1133"],"URL":"https:\/\/doi.org\/10.1007\/s00530-023-01133-7","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7,22]]},"assertion":[{"value":"16 March 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 July 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 July 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors declared no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"We promise that our studies have no ethical issues.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics approval"}}]}}
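
The record above is the raw response of the Crossref REST API for DOI 10.1007/s00530-023-01133-7. As a minimal sketch of how such a record can be retrieved and consumed programmatically, the Python snippet below fetches the same work from the public https://api.crossref.org/works/{DOI} endpoint and prints a few of the fields visible above (title, journal, authors, references). It assumes the third-party `requests` library is installed, and the `mailto` contact address in the User-Agent header is a placeholder, not a real address.

```python
import requests

# DOI of the work whose Crossref record is shown above.
DOI = "10.1007/s00530-023-01133-7"
URL = f"https://api.crossref.org/works/{DOI}"

# Crossref asks polite clients to identify themselves with a contact
# address; the mailto here is a placeholder.
HEADERS = {"User-Agent": "metadata-check/0.1 (mailto:you@example.org)"}

resp = requests.get(URL, headers=HEADERS, timeout=30)
resp.raise_for_status()

# The payload mirrors the record above: {"status": "ok", ..., "message": {...}}.
work = resp.json()["message"]

print(work["title"][0])            # article title
print(work["container-title"][0])  # "Multimedia Systems"
print(work["DOI"], work["volume"], work.get("issue"))
print("cited by:", work["is-referenced-by-count"])  # may differ from 13 at fetch time

# Authors are objects with "given"/"family" name parts.
for author in work.get("author", []):
    print(" ", author.get("given", ""), author.get("family", ""))

# Each of the 30 references carries a key and, where deposited, a DOI.
for ref in work.get("reference", [])[:5]:
    print(" ", ref["key"], ref.get("DOI", "(no DOI deposited)"))
```

The same parsing applies to a record loaded from a saved file with `json.load`, since the structure is identical to the response body.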