{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T16:53:19Z","timestamp":1771260799954,"version":"3.50.1"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T00:00:00Z","timestamp":1732060800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T00:00:00Z","timestamp":1732060800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272178"],"award-info":[{"award-number":["62272178"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1007\/s00521-024-10678-1","type":"journal-article","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T09:54:54Z","timestamp":1732096494000},"page":"1491-1508","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-level Multi-task representation learning with adaptive fusion for multimodal sentiment analysis"],"prefix":"10.1007","volume":"37","author":[{"given":"Chuanbo","family":"Zhu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Min","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haomin","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sheng","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Han","family":"Liang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jincai","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,20]]},"reference":[{"key":"10678_CR1","unstructured":"Ouyang L, Wu J, Jiang X, et\u00a0al (2022) Training language models to follow instructions with human feedback. arXiv:2203.02155"},{"key":"10678_CR2","unstructured":"Touvron H, Lavril T, Izacard G, et\u00a0al (2023) Llama: open and efficient foundation language models. arXiv:2302.13971"},{"key":"10678_CR3","unstructured":"Bubeck S, Chandrasekaran V, Eldan R, et\u00a0al (2023) Sparks of artificial general intelligence: early experiments with gpt-4. arXiv:2303.12712"},{"key":"10678_CR4","unstructured":"OpenAI (2023) Gpt-4 technical report. arXiv:2303.08774"},{"key":"10678_CR5","unstructured":"Zhao W, Zhao Y, Lu X, et\u00a0al (2023) Is chatgpt equipped with emotional dialogue capabilities? arXiv:2304.09582"},{"key":"10678_CR6","doi-asserted-by":"crossref","unstructured":"Al-Qablan TA, Mohd\u00a0Noor MH, Al-Betar MA, et\u00a0al (2023) A survey on sentiment analysis and its applications. Neural Computing and Applications, pp 1\u201335","DOI":"10.1007\/s00521-023-08941-y"},{"key":"10678_CR7","doi-asserted-by":"publisher","unstructured":"Sun H, Wang H, Liu J, et\u00a0al (2022) Cubemlp: An mlp-based model for multimodal sentiment analysis and depression estimation. In: Proceedings of the 30th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201922, pp 3722\u20133729. https:\/\/doi.org\/10.1145\/3503161.3548025","DOI":"10.1145\/3503161.3548025"},{"issue":"2","key":"10678_CR8","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltrusaitis","year":"2019","unstructured":"Baltrusaitis T, Ahuja C, Morency LP (2019) Multimodal machine learning: a survey and taxonomy. IEEE Trans Pattern Anal Mach Intell 41(2):423\u2013443. https:\/\/doi.org\/10.1109\/TPAMI.2018.2798607","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10678_CR9","doi-asserted-by":"publisher","unstructured":"Hazarika D, Zimmermann R, Poria S (2020) Misa: modality-invariant and -specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201920, pp 1122\u20131131. https:\/\/doi.org\/10.1145\/3394171.3413678","DOI":"10.1145\/3394171.3413678"},{"key":"10678_CR10","doi-asserted-by":"publisher","unstructured":"Liu Y, Yuan Z, Mao H, et\u00a0al (2022) Make acoustic and visual cues matter: Ch-sims v2.0 dataset and av-mixup consistent module. In: Proceedings of the 2022 International Conference on Multimodal Interaction. Association for Computing Machinery, New York, NY, USA, ICMI \u201922, pp 247\u2013258. https:\/\/doi.org\/10.1145\/3536221.3556630","DOI":"10.1145\/3536221.3556630"},{"key":"10678_CR11","doi-asserted-by":"publisher","unstructured":"Yu W, Xu H, Yuan Z et al (2021) Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. Proceedings of the AAAI Conference on Artificial Intelligence 35(12):10790\u201310797. https:\/\/doi.org\/10.1609\/aaai.v35i12.17289","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"10678_CR12","doi-asserted-by":"publisher","unstructured":"Han W, Chen H, Poria S (2021) Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, pp 9180\u20139192. https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.723","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"10678_CR13","doi-asserted-by":"publisher","unstructured":"Han W, Chen H, Gelbukh A, et\u00a0al (2021) Bi-bimodal modality fusion for correlation-controlled multimodal sentiment analysis. In: Proceedings of the 2021 International Conference on Multimodal Interaction. Association for Computing Machinery, New York, NY, USA, ICMI \u201921, pp 6\u201315. https:\/\/doi.org\/10.1145\/3462244.3479919","DOI":"10.1145\/3462244.3479919"},{"key":"10678_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107676","volume":"235","author":"T Wu","year":"2022","unstructured":"Wu T, Peng J, Zhang W et al (2022) Video sentiment analysis with bimodal information-augmented multi-head attention. Knowl Based Syst 235:107676. https:\/\/doi.org\/10.1016\/j.knosys.2021.107676","journal-title":"Knowl Based Syst"},{"key":"10678_CR15","doi-asserted-by":"publisher","unstructured":"Yang K, Xu H, Gao K (2020) CM-BERT: cross-modal BERT for text-audio sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201920, pp 521\u2013528. https:\/\/doi.org\/10.1145\/3394171.3413690","DOI":"10.1145\/3394171.3413690"},{"key":"10678_CR16","doi-asserted-by":"publisher","unstructured":"Yang D, Huang S, Kuang H, et\u00a0al (2022) Disentangled representation learning for multimodal emotion recognition. In: Proceedings of the 30th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201922, pp 1642\u20131651. https:\/\/doi.org\/10.1145\/3503161.3547754","DOI":"10.1145\/3503161.3547754"},{"key":"10678_CR17","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2023.119125","volume":"641","author":"Z Tang","year":"2023","unstructured":"Tang Z, Xiao Q, Zhou X et al (2023) Learning discriminative multi-relation representations for multimodal sentiment analysis. Inf Sci 641:119125. https:\/\/doi.org\/10.1016\/j.ins.2023.119125","journal-title":"Inf Sci"},{"key":"10678_CR18","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1016\/j.ins.2023.01.116","volume":"628","author":"J Wang","year":"2023","unstructured":"Wang J, Wang S, Lin M et al (2023) Learning speaker-independent multimodal representation for sentiment analysis. Inf Sci 628:208\u2013225. https:\/\/doi.org\/10.1016\/j.ins.2023.01.116","journal-title":"Inf Sci"},{"key":"10678_CR19","doi-asserted-by":"publisher","unstructured":"Williams J, Kleinegesse S, Comanescu R, et\u00a0al (2018) Recognizing emotions in video using multimodal DNN feature fusion. In: Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML). Association for Computational Linguistics, Melbourne, Australia, pp 11\u201319. https:\/\/doi.org\/10.18653\/v1\/W18-3302","DOI":"10.18653\/v1\/W18-3302"},{"key":"10678_CR20","doi-asserted-by":"publisher","unstructured":"Yu W, Xu H, Meng F, et\u00a0al (2020) CH-SIMS: A Chinese multimodal sentiment analysis dataset with fine-grained annotation of modality. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics, Online, pp 3718\u20133727. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.343","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"10678_CR21","doi-asserted-by":"publisher","first-page":"184","DOI":"10.1016\/j.inffus.2018.06.003","volume":"46","author":"Y Ma","year":"2019","unstructured":"Ma Y, Hao Y, Chen M et al (2019) Audio-visual emotion fusion (avef): a deep efficient weighted approach. Inf Fusion 46:184\u2013192. https:\/\/doi.org\/10.1016\/j.inffus.2018.06.003","journal-title":"Inf Fusion"},{"key":"10678_CR22","doi-asserted-by":"publisher","unstructured":"Zadeh A, Chen M, Poria S, et\u00a0al (2017) Tensor fusion network for multimodal sentiment analysis. In: Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Copenhagen, Denmark, pp 1103\u20131114. https:\/\/doi.org\/10.18653\/v1\/D17-1115","DOI":"10.18653\/v1\/D17-1115"},{"key":"10678_CR23","doi-asserted-by":"crossref","unstructured":"Liu Z, Shen Y, Lakshminarasimhan VB, et\u00a0al (2018) Efficient low-rank multimodal fusion with modality-specific factors. CoRR abs\/1806.00064. arXiv:1806.00064","DOI":"10.18653\/v1\/P18-1209"},{"key":"10678_CR24","doi-asserted-by":"publisher","unstructured":"Jin T, Huang S, Li Y, et\u00a0al (2020) Dual low-rank multimodal fusion. In: Findings of the Association for Computational Linguistics: EMNLP 2020. Association for Computational Linguistics, Online, pp 377\u2013387. https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.35","DOI":"10.18653\/v1\/2020.findings-emnlp.35"},{"key":"10678_CR25","doi-asserted-by":"publisher","unstructured":"Fu Z, Liu F, Xu Q, et\u00a0al (2022) Nhfnet: a non-homogeneous fusion network for multimodal sentiment analysis. In: 2022 IEEE International Conference on Multimedia and Expo (ICME), pp 1\u20136. https:\/\/doi.org\/10.1109\/ICME52920.2022.9859836","DOI":"10.1109\/ICME52920.2022.9859836"},{"key":"10678_CR26","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1016\/j.inffus.2022.11.022","volume":"92","author":"K Kim","year":"2023","unstructured":"Kim K, Park S (2023) Aobert: all-modalities-in-one bert for multimodal sentiment analysis. Inf Fusion 92:37\u201345. https:\/\/doi.org\/10.1016\/j.inffus.2022.11.022","journal-title":"Inf Fusion"},{"key":"10678_CR27","doi-asserted-by":"crossref","unstructured":"Zadeh A, Liang PP, Poria S, et\u00a0al (2018) Multi-attention recurrent network for human communication comprehension. In: Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence and Thirtieth Innovative Applications of Artificial Intelligence Conference and Eighth AAAI Symposium on Educational Advances in Artificial Intelligence. AAAI Press, AAAI\u201918\/IAAI\u201918\/EAAI\u201918","DOI":"10.1609\/aaai.v32i1.12024"},{"key":"10678_CR28","doi-asserted-by":"crossref","unstructured":"Zadeh A, Liang PP, Mazumder N, et\u00a0al (2018) Memory fusion network for multi-view sequential learning. arXiv: 1802.00927","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"10678_CR29","doi-asserted-by":"publisher","unstructured":"Bagher\u00a0Zadeh A, Liang PP, Poria S, et\u00a0al (2018) Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Association for Computational Linguistics, Melbourne, Australia, pp 2236\u20132246, https:\/\/doi.org\/10.18653\/v1\/P18-1208","DOI":"10.18653\/v1\/P18-1208"},{"key":"10678_CR30","doi-asserted-by":"publisher","unstructured":"Wang Y, Shen Y, Liu Z et al (2019) Words can shift: Dynamically adjusting word representations using nonverbal behaviors. In: Proceedings of the AAAI Conference on Artificial Intelligence 33(01):7216\u20137223. https:\/\/doi.org\/10.1609\/aaai.v33i01.33017216","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"10678_CR31","doi-asserted-by":"publisher","unstructured":"Devlin J, Chang MW, Lee K, et\u00a0al (2019) BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). Association for Computational Linguistics, Minneapolis, Minnesota, pp 4171\u20134186, https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"10678_CR32","doi-asserted-by":"publisher","unstructured":"Kumar A, Vepa J (2020) Gated mechanism for attention based multi modal sentiment analysis. In: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 4477\u20134481. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053012","DOI":"10.1109\/ICASSP40776.2020.9053012"},{"key":"10678_CR33","doi-asserted-by":"publisher","unstructured":"Wang Z, Wan Z, Wan X (2020) Transmodality: an end2end fusion method with transformer for multimodal sentiment analysis. In: Proceedings of the Web Conference 2020. Association for Computing Machinery, New York, NY, USA, WWW \u201920, pp 2514\u20132520. https:\/\/doi.org\/10.1145\/3366423.3380000","DOI":"10.1145\/3366423.3380000"},{"key":"10678_CR34","doi-asserted-by":"crossref","unstructured":"Sahay S, Okur E, Kumar SH, et\u00a0al (2020) Low rank fusion based transformers for multimodal sequences. arXiv:2007.02038","DOI":"10.18653\/v1\/2020.challengehml-1.4"},{"issue":"10","key":"10678_CR35","doi-asserted-by":"publisher","first-page":"13803","DOI":"10.1609\/aaai.v34i10.7173","volume":"34","author":"AB Harish","year":"2020","unstructured":"Harish AB, Sadat F (2020) Trimodal attention module for multimodal sentiment analysis (student abstract). Proc AAAI Conf Artif Intell 34(10):13803\u201313804. https:\/\/doi.org\/10.1609\/aaai.v34i10.7173","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"10678_CR36","unstructured":"Zadeh A, Mao C, Shi K, et\u00a0al (2019) Factorized multimodal transformer for multimodal sequential learning. arXiv:1911.09826"},{"key":"10678_CR37","doi-asserted-by":"crossref","unstructured":"Tsai YH, Bai S, Liang PP, et\u00a0al (2019) Multimodal transformer for unaligned multimodal language sequences. CoRR abs\/1906.00295. arXiv:1906.00295","DOI":"10.18653\/v1\/P19-1656"},{"key":"10678_CR38","doi-asserted-by":"publisher","unstructured":"Rahman W, Hasan MK, Lee S, et\u00a0al (2020) Integrating multimodal information in large pretrained transformers. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics, Online, pp 2359\u20132369. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.214","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"10678_CR39","doi-asserted-by":"crossref","unstructured":"Mai S, Sun Y, Hu H (2022) Curriculum learning meets weakly supervised multimodal correlation learning. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Abu Dhabi, United Arab Emirates, pp 3191\u20133203. https:\/\/aclanthology.org\/2022.emnlp-main.209","DOI":"10.18653\/v1\/2022.emnlp-main.209"},{"key":"10678_CR40","doi-asserted-by":"publisher","first-page":"542","DOI":"10.1016\/j.inffus.2022.11.003","volume":"91","author":"S Mai","year":"2023","unstructured":"Mai S, Sun Y, Zeng Y et al (2023) Excavating multimodal correlation for representation learning. Inf Fusion 91:542\u2013555. https:\/\/doi.org\/10.1016\/j.inffus.2022.11.003","journal-title":"Inf Fusion"},{"issue":"05","key":"10678_CR41","doi-asserted-by":"publisher","first-page":"8992","DOI":"10.1609\/aaai.v34i05.6431","volume":"34","author":"Z Sun","year":"2020","unstructured":"Sun Z, Sarma P, Sethares W et al (2020) Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. Proc AAAI Conf Artif Intell 34(05):8992\u20138999. https:\/\/doi.org\/10.1609\/aaai.v34i05.6431","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"10678_CR42","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2022.3195739","author":"T Zhang","year":"2022","unstructured":"Zhang T, Li S, Chen B et al (2022) Aia-net: adaptive interactive attention network for text-audio emotion recognition. IEEE Trans Cybern. https:\/\/doi.org\/10.1109\/TCYB.2022.3195739","journal-title":"IEEE Trans Cybern"},{"key":"10678_CR43","doi-asserted-by":"publisher","first-page":"2015","DOI":"10.1109\/TASLP.2022.3178204","volume":"30","author":"B Yang","year":"2022","unstructured":"Yang B, Wu L, Zhu J et al (2022) Multimodal sentiment analysis with two-phase multi-task learning. IEEE\/ACM Trans Audio Speech Lang Process 30:2015\u20132024. https:\/\/doi.org\/10.1109\/TASLP.2022.3178204","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"10678_CR44","doi-asserted-by":"publisher","unstructured":"Lin R, Hu H (2022) Multimodal contrastive learning via uni-modal coding and cross-modal prediction for multimodal sentiment analysis. In: Findings of the Association for Computational Linguistics: EMNLP 2022. Association for Computational Linguistics, Abu Dhabi, United Arab Emirates, pp 511\u2013523. https:\/\/doi.org\/10.18653\/v1\/2022.findings-emnlp.36","DOI":"10.18653\/v1\/2022.findings-emnlp.36"},{"key":"10678_CR45","doi-asserted-by":"publisher","unstructured":"Shankar S (2022) Multimodal fusion via cortical network inspired losses. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Association for Computational Linguistics, Dublin, Ireland, pp 1167\u20131178. https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.83","DOI":"10.18653\/v1\/2022.acl-long.83"},{"key":"10678_CR46","doi-asserted-by":"publisher","unstructured":"Yang D, Kuang H, Huang S, et\u00a0al (2022) Learning modality-specific and -agnostic representations for asynchronous multimodal language sequences. In: Proceedings of the 30th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201922, pp 1708\u20131717. https:\/\/doi.org\/10.1145\/3503161.3547755","DOI":"10.1145\/3503161.3547755"},{"issue":"8","key":"10678_CR47","doi-asserted-by":"publisher","first-page":"9100","DOI":"10.1609\/aaai.v36i8.20895","volume":"36","author":"Y Zhang","year":"2022","unstructured":"Zhang Y, Chen M, Shen J et al (2022) Tailor versatile multi-modal learning for multi-label emotion recognition. Proc AAAI Conf Artif Intell 36(8):9100\u20139108. https:\/\/doi.org\/10.1609\/aaai.v36i8.20895","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"10678_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.119240","volume":"213","author":"Y Zeng","year":"2023","unstructured":"Zeng Y, Li Z, Tang Z et al (2023) Heterogeneous graph convolution based on in-domain self-supervision for multimodal sentiment analysis. Expert Syst Appl 213:119240. https:\/\/doi.org\/10.1016\/j.eswa.2022.119240","journal-title":"Expert Syst Appl"},{"key":"10678_CR49","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3141237","author":"Z Lian","year":"2022","unstructured":"Lian Z, Liu B, Tao J (2022) Smin: semi-supervised multi-modal interaction network for conversational emotion recognition. IEEE Trans Affect Comput. https:\/\/doi.org\/10.1109\/TAFFC.2022.3141237","journal-title":"IEEE Trans Affect Comput"},{"key":"10678_CR50","doi-asserted-by":"publisher","unstructured":"Li Z, Zhou Y, Liu Y, et\u00a0al (2023) QAP: A quantum-inspired adaptive-priority-learning model for multimodal emotion recognition. In: Rogers A, Boyd-Graber J, Okazaki N (eds) Findings of the Association for Computational Linguistics: ACL 2023. Association for Computational Linguistics, Toronto, Canada, pp 12191\u201312204. https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.772","DOI":"10.18653\/v1\/2023.findings-acl.772"},{"key":"10678_CR51","doi-asserted-by":"crossref","unstructured":"Zhao B, Cui Q, Song R, et\u00a0al (2022) Decoupled knowledge distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 11953\u201311962","DOI":"10.1109\/CVPR52688.2022.01165"},{"key":"10678_CR52","unstructured":"Vaswani A, Shazeer N, Parmar N, et\u00a0al (2017) Attention is all you need. In: Guyon I, Luxburg UV, Bengio S, et\u00a0al (eds) Advances in Neural Information Processing Systems, vol\u00a030. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"issue":"6","key":"10678_CR53","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh A, Zellers R, Pincus E et al (2016) Multimodal sentiment intensity analysis in videos: facial gestures and verbal messages. IEEE Intell Syst 31(6):82\u201388. https:\/\/doi.org\/10.1109\/MIS.2016.94","journal-title":"IEEE Intell Syst"},{"key":"10678_CR54","doi-asserted-by":"publisher","unstructured":"Mao H, Yuan Z, Xu H, et\u00a0al (2022) M-SENA: An integrated platform for multimodal sentiment analysis. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations. Association for Computational Linguistics, Dublin, Ireland, pp 204\u2013213. https:\/\/doi.org\/10.18653\/v1\/2022.acl-demo.20","DOI":"10.18653\/v1\/2022.acl-demo.20"},{"key":"10678_CR55","doi-asserted-by":"crossref","unstructured":"Mcfee B, Raffel C, Liang D, et\u00a0al (2015) librosa: audio and music signal analysis in python. In: Python in Science Conference","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"10678_CR56","doi-asserted-by":"publisher","unstructured":"Eyben F, W\u00f6llmer M, Schuller B (2010) Opensmile: The munich versatile and fast open-source audio feature extractor. In: Proceedings of the 18th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201910, pp 1459\u20131462. https:\/\/doi.org\/10.1145\/1873951.1874246","DOI":"10.1145\/1873951.1874246"},{"key":"10678_CR57","doi-asserted-by":"publisher","unstructured":"Degottex G, Kane J, Drugman T, et\u00a0al (2014) Covarep - a collaborative voice analysis repository for speech technologies. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 960\u2013964. https:\/\/doi.org\/10.1109\/ICASSP.2014.6853739","DOI":"10.1109\/ICASSP.2014.6853739"},{"issue":"10","key":"10678_CR58","doi-asserted-by":"publisher","first-page":"1499","DOI":"10.1109\/LSP.2016.2603342","volume":"23","author":"K Zhang","year":"2016","unstructured":"Zhang K, Zhang Z, Li Z et al (2016) Joint face detection and alignment using multitask cascaded convolutional networks. IEEE Signal Process Lett 23(10):1499\u20131503. https:\/\/doi.org\/10.1109\/LSP.2016.2603342","journal-title":"IEEE Signal Process Lett"},{"key":"10678_CR59","doi-asserted-by":"publisher","unstructured":"Baltrusaitis T, Zadeh A, Lim YC, et\u00a0al (2018) Openface 2.0: Facial behavior analysis toolkit. In: 2018 13th IEEE International Conference on Automatic Face Gesture Recognition (FG 2018), pp 59\u201366. https:\/\/doi.org\/10.1109\/FG.2018.00019","DOI":"10.1109\/FG.2018.00019"},{"key":"10678_CR60","doi-asserted-by":"publisher","unstructured":"Tao R, Pan Z, Das RK, et\u00a0al (2021) Is someone speaking? exploring long-term temporal features for audio-visual active speaker detection. In: Proceedings of the 29th ACM International Conference on Multimedia. Association for Computing Machinery, New York, NY, USA, MM \u201921, pp 3927\u20133935. https:\/\/doi.org\/10.1145\/3474085.3475587","DOI":"10.1145\/3474085.3475587"},{"issue":"11","key":"10678_CR61","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten L, Hinton G (2008) Visualizing data using t-sne. J Mach Learn Res 9(11):2579","journal-title":"J Mach Learn Res"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-10678-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-024-10678-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-10678-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T04:32:10Z","timestamp":1737693130000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-024-10678-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,20]]},"references-count":61,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2025,1]]}},"alternative-id":["10678"],"URL":"https:\/\/doi.org\/10.1007\/s00521-024-10678-1","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,20]]},"assertion":[{"value":"1 September 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 October 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 November 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}