{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T12:46:46Z","timestamp":1742993206500,"version":"3.40.3"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031844560"},{"type":"electronic","value":"9783031844577"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-84457-7_23","type":"book-chapter","created":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T16:50:30Z","timestamp":1741020630000},"page":"370-386","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Exploring Foundation Model Fusion Effectiveness and\u00a0Explainability for\u00a0Stylistic Analysis of\u00a0Emotional Podcast Data"],"prefix":"10.1007","author":[{"given":"Arnab","family":"Das","sequence":"first","affiliation":[]},{"given":"Carlos","family":"Franzreb","sequence":"additional","affiliation":[]},{"given":"Tim","family":"Polzehl","sequence":"additional","affiliation":[]},{"given":"Sebastian","family":"M\u00f6ller","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,4]]},"reference":[{"issue":"10","key":"23_CR1","doi-asserted-by":"publisher","first-page":"1533","DOI":"10.1109\/TASLP.2014.2339736","volume":"22","author":"O Abdel-Hamid","year":"2014","unstructured":"Abdel-Hamid, O., Mohamed, A., Jiang, H., Deng, L., Penn, G., Yu, D.: Convolutional neural networks for speech recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 22(10), 1533\u20131545 (2014)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"23_CR2","unstructured":"Adebayo, J., Gilmer, J., Muelly, M., Goodfellow, I., Hardt, M., Kim, B.: Sanity checks for saliency maps. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"23_CR3","doi-asserted-by":"crossref","unstructured":"Asgari, M., Kiss, G., Van\u00a0Santen, J., Shafran, I., Song, X.: Automatic measurement of affective valence and arousal in speech. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 965\u2013969. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6853740"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Atmaja, B.T., Hamada, Y., Akagi, M.: Predicting valence and arousal by aggregating acoustic features for acoustic-linguistic information fusion. In: 2020 IEEE Region 10 Conference (TENCON), pp. 1081\u20131085. IEEE (2020)","DOI":"10.1109\/TENCON50793.2020.9293899"},{"key":"23_CR5","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: Advances in Neural Information Processing Systems, vol. 33, pp. 12449\u201312460 (2020)"},{"key":"23_CR6","unstructured":"Balestriero, R., et\u00a0al.: A cookbook of self-supervised learning. 
arXiv preprint arXiv:2304.12210 (2023)"},{"key":"23_CR7","unstructured":"Barrault, L., et\u00a0al.: Seamlessm4t-massively multilingual & multimodal machine translation. arXiv preprint arXiv:2308.11596 (2023)"},{"issue":"1","key":"23_CR8","doi-asserted-by":"publisher","first-page":"418","DOI":"10.1016\/j.jfranklin.2023.11.038","volume":"361","author":"S Becker","year":"2024","unstructured":"Becker, S., Vielhaben, J., Ackermann, M., M\u00fcller, K.-R., Lapuschkin, S., Samek, W.: Audiomnist: exploring explainable artificial intelligence for audio analysis on a simple benchmark. J. Franklin Inst. 361(1), 418\u2013428 (2024)","journal-title":"J. Franklin Inst."},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Burges, C., et al.: Learning to rank using gradient descent. In: Proceedings of the 22nd International Conference on Machine Learning, pp. 89\u201396 (2005)","DOI":"10.1145\/1102351.1102363"},{"issue":"16","key":"23_CR10","first-page":"31","volume":"101","author":"P Chandrasekar","year":"2014","unstructured":"Chandrasekar, P., Chapaneri, S., Jayaswal, D.: Emotion recognition from speech using discriminative features. Int. J. Comput. Appl. 101(16), 31\u201336 (2014)","journal-title":"Int. J. Comput. Appl."},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"Chang, J., Scherer, S.: Learning representations of emotional speech with deep convolutional generative adversarial networks. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2746\u20132750. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952656"},{"issue":"6","key":"23_CR12","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"16","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Top. Signal Process. 16(6), 1505\u20131518 (2022)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"23_CR13","volume-title":"Real-Time Speech and Music Classification by Large Audio Feature Space Extraction","author":"F Eyben","year":"2015","unstructured":"Eyben, F.: Real-Time Speech and Music Classification by Large Audio Feature Space Extraction. Springer, Cham (2015)"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Gong, Y., Khurana, S., Karlinsky, L., Glass, J.: Whisper-at: noise-robust automatic speech recognizers are also strong general audio event taggers. arXiv preprint arXiv:2307.03183 (2023)","DOI":"10.21437\/Interspeech.2023-2193"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Gulati, A., et\u00a0al.: Conformer: convolution-augmented transformer for speech recognition. arXiv preprint arXiv:2005.08100 (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"23_CR16","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"W-N Hsu","year":"2021","unstructured":"Hsu, W.-N., Bolte, B., Tsai, Y.-H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"23_CR17","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. 
arXiv preprint arXiv:1412.6980 (2014)"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Lawrence, I., Lin, K.: A concordance correlation coefficient to evaluate reproducibility. Biometrics 255\u2013268 (1989)","DOI":"10.2307\/2532051"},{"key":"23_CR19","doi-asserted-by":"crossref","unstructured":"Lim, W., Jang, D., Lee, T.: Speech emotion recognition using convolutional and recurrent neural networks. In: 2016 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA), pp. 1\u20134. IEEE (2016)","DOI":"10.1109\/APSIPA.2016.7820699"},{"key":"23_CR20","doi-asserted-by":"crossref","unstructured":"Liu, S., et al.: Audio self-supervised learning: a survey. Patterns 3(12) (2022)","DOI":"10.1016\/j.patter.2022.100616"},{"issue":"4","key":"23_CR21","doi-asserted-by":"publisher","first-page":"471","DOI":"10.1109\/TAFFC.2017.2736999","volume":"10","author":"R Lotfian","year":"2017","unstructured":"Lotfian, R., Busso, C.: Building naturalistic emotionally balanced speech corpus by retrieving emotional speech from existing podcast recordings. IEEE Trans. Affect. Comput. 10(4), 471\u2013483 (2017)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"23_CR22","doi-asserted-by":"crossref","unstructured":"Luengo, I., Navas, E., Hern\u00e1ez, I., S\u00e1nchez, J.: Automatic emotion recognition using prosodic parameters. In: Ninth European Conference on Speech Communication and Technology. Citeseer (2005)","DOI":"10.21437\/Interspeech.2005-324"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Mirsamadi, S., Barsoum, E., Zhang, C.: Automatic speech emotion recognition using recurrent neural networks with local attention. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2227\u20132231. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952552"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Nogueiras, A., Moreno, A., Bonafonte, A., Mari\u00f1o, J.B.: Speech emotion recognition using hidden Markov models. In: Seventh European Conference on Speech Communication and Technology (2001)","DOI":"10.21437\/Eurospeech.2001-627"},{"issue":"4","key":"23_CR25","doi-asserted-by":"publisher","first-page":"603","DOI":"10.1016\/S0167-6393(03)00099-2","volume":"41","author":"TL Nwe","year":"2003","unstructured":"Nwe, T.L., Foo, S.W., De Silva, L.C.: Speech emotion recognition using hidden Markov models. Speech Commun. 41(4), 603\u2013623 (2003)","journal-title":"Speech Commun."},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Parthasarathy, S., Busso, C.: Jointly predicting arousal, valence and dominance with multi-task learning. In: Interspeech, vol. 2017, pp. 1103\u20131107 (2017)","DOI":"10.21437\/Interspeech.2017-1494"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Pasad, A., Chou, J.-C., Livescu, K.: Layer-wise analysis of a self-supervised speech representation model. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), pp. 914\u2013921. IEEE (2021)","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"23_CR28","unstructured":"Pastor, E., Koudounas, A., Attanasio, G., Hovy, D., Baralis, E.: Explaining speech classification models via word-level audio segments and paralinguistic features. In: Graham, Y., Purver, M. (eds.) Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), St. Julian\u2019s, Malta, March 2024, pp. 2221\u20132238. 
Association for Computational Linguistics (2024)"},{"key":"23_CR29","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"23_CR30","doi-asserted-by":"crossref","unstructured":"Rahurkar, M.A., Hansen, J.H.L., Meyerhoff, J., Saviolakis, G., Koenig, M.: Frequency band analysis for stress detection using a teager energy operator based feature. In: INTERSPEECH (2002)","DOI":"10.21437\/ICSLP.2002-555"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Schuller, B., Arsic, D., Wallhoff, F., Rigoll, G.: Emotion recognition in the noise applying large acoustic feature sets (2006)","DOI":"10.21437\/SpeechProsody.2006-150"},{"key":"23_CR32","unstructured":"Schuller, B., W\u00f6llmer, M., Eyben, F., Rigoll, G.: Prosodic, spectral or voice quality? Feature type relevance for the discrimination of emotion pairs (2009)"},{"key":"23_CR33","doi-asserted-by":"crossref","unstructured":"Shen, P., Changjun, Z., Chen, X.: Automatic speech emotion recognition using support vector machine. In: Proceedings of 2011 International Conference on Electronic and Mechanical Engineering and Information Technology, vol.\u00a02, pp. 621\u2013625. IEEE (2011)","DOI":"10.1109\/EMEIT.2011.6023178"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Srinivasan, S., Huang, Z., Kirchhoff, K.: Representation learning through cross-modal conditional teacher-student training for speech emotion recognition. In: 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2022, pp. 6442\u20136446. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747754"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Tian, L., Lai, C., Moore, J.: Polarity and intensity: the two aspects of sentiment analysis. In: Zadeh, A., Liang, P.P., Morency, L.-P., Poria, S., Cambria, E., Scherer, S. (eds.) Proceedings of Grand Challenge and Workshop on Human Multimodal Language (Challenge-HML), Melbourne, Australia, July 2018, pp. 40\u201347. Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/W18-3306"},{"key":"23_CR36","doi-asserted-by":"crossref","unstructured":"Truong, K.P., van Leeuwen, D.A., Neerincx, M.A., Jong, F.M.: Arousal and valence prediction in spontaneous emotional speech: felt versus perceived emotion (2009)","DOI":"10.21437\/Interspeech.2009-583"},{"issue":"9","key":"23_CR37","doi-asserted-by":"publisher","first-page":"1162","DOI":"10.1016\/j.specom.2006.04.003","volume":"48","author":"D Ververidis","year":"2006","unstructured":"Ververidis, D., Kotropoulos, C.: Emotional speech recognition: resources, features, and methods. Speech Commun. 48(9), 1162\u20131181 (2006)","journal-title":"Speech Commun."},{"key":"23_CR38","doi-asserted-by":"publisher","first-page":"10745","DOI":"10.1109\/TPAMI.2023.3263585","volume":"45","author":"J Wagner","year":"2023","unstructured":"Wagner, J., et al.: Dawn of the transformer era in speech emotion recognition: closing the valence gap. IEEE Trans. Pattern Anal. Mach. Intell. 45, 10745\u201310759 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Wang, K., Zhao, Y., Dong, Q., Ko, T., Wang, M.: MOSPC: MOS prediction based on pairwise comparison. 
arXiv preprint arXiv:2306.10493 (2023)","DOI":"10.18653\/v1\/2023.acl-short.132"},{"key":"23_CR40","unstructured":"Wang, Y., Boumadane, A., Heba, A.: A fine-tuned wav2vec 2.0\/hubert benchmark for speech emotion recognition, speaker verification and spoken language understanding. arXiv preprint arXiv:2111.02735 (2021)"},{"key":"23_CR41","doi-asserted-by":"publisher","first-page":"292","DOI":"10.3389\/fpsyg.2013.00292","volume":"4","author":"F Weninger","year":"2013","unstructured":"Weninger, F., Eyben, F., Schuller, B.W., Mortillaro, M., Scherer, K.R.: On the acoustics of emotion in audio: what speech, music, and sound have in common. Front. Psychol. 4, 292 (2013)","journal-title":"Front. Psychol."},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Wu, X., Bell, P., Rajan, A.: Can we trust explainable AI methods on ASR? An evaluation on phoneme recognition. In: 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2024, pp. 10296\u201310300 (2024)","DOI":"10.1109\/ICASSP48485.2024.10445989"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Yildirim, S., et al.: An acoustic study of emotions expressed in speech. In: Eighth International Conference on Spoken Language Processing (2004)","DOI":"10.21437\/Interspeech.2004-242"},{"key":"23_CR44","doi-asserted-by":"crossref","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision \u2013 ECCV 2014, pp. 818\u2013833. Springer, Cham (2014)","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"23_CR45","doi-asserted-by":"crossref","unstructured":"Zheng, W.Q., Yu, J.S., Zou, Y.X.: An experimental study of speech emotion recognition based on deep convolutional neural networks. In: 2015 International Conference on Affective Computing and Intelligent Interaction (ACII), pp. 827\u2013831. 
IEEE (2015)","DOI":"10.1109\/ACII.2015.7344669"}],"container-title":["Lecture Notes in Networks and Systems","Advances in Information and Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-84457-7_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,3]],"date-time":"2025-03-03T16:50:46Z","timestamp":1741020646000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-84457-7_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031844560","9783031844577"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-84457-7_23","relation":{},"ISSN":["2367-3370","2367-3389"],"issn-type":[{"type":"print","value":"2367-3370"},{"type":"electronic","value":"2367-3389"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"4 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"FICC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Future of Information and Communication Conference","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Berlin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ficc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/saiconference.com\/FICC","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
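
The record above has the shape of a Crossref work record ("status": "ok", "message-type": "work", with the metadata under "message"). Below is a minimal sketch of how such a record could be retrieved and a few of its fields read. It assumes the public Crossref REST API at https://api.crossref.org and the third-party requests package; both are assumptions for illustration, not part of the record itself. The field names (title, author, container-title, DOI, references-count, published) are taken directly from the record above.

import requests

# DOI taken from the record above.
DOI = "10.1007/978-3-031-84457-7_23"

# Assumption: the public Crossref REST API serves the same payload shown above.
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the work metadata sits under the "message" key

# Read a few of the fields present in the record above.
print(work["title"][0])                                               # chapter title
print(", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", [])))
print("; ".join(work.get("container-title", [])))                     # series / proceedings title
print("DOI:", work["DOI"], "| references:", work.get("references-count"))
print("Published:", work.get("published", {}).get("date-parts"))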