{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T03:24:01Z","timestamp":1742959441574,"version":"3.40.3"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031780134"},{"type":"electronic","value":"9783031780141"}],"license":[{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T00:00:00Z","timestamp":1732233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78014-1_10","type":"book-chapter","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T12:25:24Z","timestamp":1732191924000},"page":"122-137","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Utilizing Speaker Models and\u00a0Topic Markers for\u00a0Emotion Recognition in\u00a0Dialogues"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8992-9654","authenticated-orcid":false,"given":"Olesia","family":"Makhnytkina","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7010-1585","authenticated-orcid":false,"given":"Yuri","family":"Matveev","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5213-8477","authenticated-orcid":false,"given":"Alexander","family":"Zubakov","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7494-8329","authenticated-orcid":false,"given":"Anton","family":"Matveev","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,22]]},"reference":[{"key":"10_CR1","unstructured":"Mikolov, T., Yih, W.-T., Zweig, G.: Linguistic regularities in continuous space word representations. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 746\u2013751. Association for Computational Linguistics (2013). https:\/\/aclanthology.org\/N13-1090"},{"key":"10_CR2","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching Word Vectors with Subword Information (Version 2) (2016). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1607.04606","DOI":"10.1162\/tacl_a_00051"},{"key":"10_CR3","doi-asserted-by":"publisher","unstructured":"Pennington, J., Socher, R., Manning, C.: GloVe: global vectors for word representation. In Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP). Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP). Association for Computational Linguistics (2014). https:\/\/doi.org\/10.3115\/v1\/d14-1162","DOI":"10.3115\/v1\/d14-1162"},{"key":"10_CR4","doi-asserted-by":"publisher","unstructured":"Peters, M.E., et al.: Deep contextualized word representations (Version 2) (2018). arXiv. 
https:\/\/doi.org\/10.48550\/ARXIV.1802.05365","DOI":"10.48550\/ARXIV.1802.05365"},{"key":"10_CR5","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (Version 2) (2018). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1810.04805","DOI":"10.48550\/ARXIV.1810.04805"},{"key":"10_CR6","unstructured":"Liu, Y., et al.: RoBERTa: A Robustly Optimized BERT Pretraining Approach (Version 1) (2019). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1907.11692"},{"key":"10_CR7","unstructured":"Raffel, C., et al.: Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer (Version 4) (2019). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1910.10683"},{"key":"10_CR8","doi-asserted-by":"publisher","unstructured":"Lian, Z., Liu, B., Tao, J.: SMIN: semi-supervised multi-modal interaction network for conversational emotion recognition. In: IEEE Transactions on Affective Computing (Vol. 14, Issue 3, pp. 2415\u20132429). Institute of Electrical and Electronics Engineers (IEEE) (2023). https:\/\/doi.org\/10.1109\/taffc.2022.3141237","DOI":"10.1109\/taffc.2022.3141237"},{"key":"10_CR9","doi-asserted-by":"publisher","unstructured":"Arumugam, B., Bhattacharjee, S. D., Yuan, J.: Multimodal attentive learning for real-time explainable emotion recognition in conversations. In 2022 IEEE International Symposium on Circuits and Systems (ISCAS) (Vol. 2, pp. 1210\u20131214). 2022 IEEE International Symposium on Circuits and Systems (ISCAS). IEEE (2022). https:\/\/doi.org\/10.1109\/iscas48785.2022.9938005","DOI":"10.1109\/iscas48785.2022.9938005"},{"key":"10_CR10","doi-asserted-by":"publisher","unstructured":"Ho, N.-H., Yang, H.-J., Kim, S.-H., Lee, G.: Multimodal approach of speech emotion recognition using multi-level multi-head fusion attention-based recurrent neural network. In: IEEE Access (Vol. 8, pp. 61672\u201361686). Institute of Electrical and Electronics Engineers (IEEE) (2020). https:\/\/doi.org\/10.1109\/access.2020.2984368","DOI":"10.1109\/access.2020.2984368"},{"key":"10_CR11","doi-asserted-by":"publisher","unstructured":"Xu, Y., Xu, H., Zou, J.: HGFM: a hierarchical grained and feature model for acoustic emotion recognition. In ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (Vol. 8, pp. 6499\u20136503). ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE (2020). https:\/\/doi.org\/10.1109\/icassp40776.2020.9053039","DOI":"10.1109\/icassp40776.2020.9053039"},{"key":"10_CR12","doi-asserted-by":"publisher","unstructured":"Oliveira, J., Praca, I.: On the usage of pre-trained speech recognition deep layers to detect emotions. In: IEEE Access (Vol. 9, pp. 9699\u20139705). Institute of Electrical and Electronics Engineers (IEEE) (2021). https:\/\/doi.org\/10.1109\/access.2021.3051083","DOI":"10.1109\/access.2021.3051083"},{"key":"10_CR13","doi-asserted-by":"publisher","unstructured":"Degottex, G., Kane, J., Drugman, T., Raitio, T., Scherer, S.: COVAREP; a collaborative voice analysis repository for speech technologies. In 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). ICASSP 2014 - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE (2014). 
https:\/\/doi.org\/10.1109\/icassp.2014.6853739","DOI":"10.1109\/icassp.2014.6853739"},{"key":"10_CR14","doi-asserted-by":"publisher","unstructured":"Eyben, F., W\u00f6llmer, M., Schuller, B.: OpenSMILE. In: Proceedings of the 18th ACM International Conference on Multimedia. MM \u201910: ACM Multimedia Conference. ACM (2010). https:\/\/doi.org\/10.1145\/1873951.1874246","DOI":"10.1145\/1873951.1874246"},{"key":"10_CR15","doi-asserted-by":"publisher","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: Unsupervised Pre-training for Speech Recognition (Version 4) (2019). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1904.05862","DOI":"10.48550\/ARXIV.1904.05862"},{"key":"10_CR16","doi-asserted-by":"publisher","unstructured":"Simonyan, K., Zisserman, A.: Very Deep Convolutional Networks for Large-Scale Image Recognition (Version 6) (2014). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1409.1556","DOI":"10.48550\/ARXIV.1409.1556"},{"key":"10_CR17","doi-asserted-by":"publisher","unstructured":"Matveev, A., Matveev, Y., Frolova, O., Nikolaev, A., Lyakso, E.: A neural network architecture for children\u2019s audio-visual emotion recognition. In: Mathematics (Vol. 11, Issue 22, p. 4573). MDPI AG (2023). https:\/\/doi.org\/10.3390\/math11224573","DOI":"10.3390\/math11224573"},{"key":"10_CR18","doi-asserted-by":"publisher","unstructured":"Meng, H., Yan, T., Yuan, F., Wei, H.: Speech emotion recognition from 3D Log-Mel spectrograms with deep learning network. In: IEEE Access (Vol. 7, pp. 125868-125881). Institute of Electrical and Electronics Engineers (IEEE) (2019). https:\/\/doi.org\/10.1109\/access.2019.2938007","DOI":"10.1109\/access.2019.2938007"},{"key":"10_CR19","doi-asserted-by":"publisher","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations (Version 6) (2018). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1810.02508","DOI":"10.48550\/ARXIV.1810.02508"},{"key":"10_CR20","doi-asserted-by":"publisher","unstructured":"Schneider, S., Baevski, A., Collobert, R., Auli, M.: wav2vec: Unsupervised Pre-training for Speech Recognition (Version 4) (2019). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1904.05862","DOI":"10.48550\/ARXIV.1904.05862"},{"key":"10_CR21","doi-asserted-by":"publisher","unstructured":"Ta, B.T., Nguyen, T.L., Dang, D.S., Le, N.M., Do, V.H.: Improving speech emotion recognition via fine-tuning ASR with speaker information. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC) (Vol. 38, pp. 1\u20136). 2022 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). IEEE (2022). https:\/\/doi.org\/10.23919\/apsipaasc55919.2022.9980214","DOI":"10.23919\/apsipaasc55919.2022.9980214"},{"key":"10_CR22","doi-asserted-by":"publisher","unstructured":"Ulgen, I.R., Du, Z., Busso, C., Sisman, B.: Revealing Emotional Clusters in Speaker Embeddings: A Contrastive Learning Strategy for Speech Emotion Recognition (2024). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.2401.11017","DOI":"10.48550\/ARXIV.2401.11017"},{"key":"10_CR23","doi-asserted-by":"publisher","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. In: IEEE Journal of Selected Topics in Signal Processing (Vol. 16, Issue 6, pp. 1505\u20131518). Institute of Electrical and Electronics Engineers (IEEE) (2022). 
https:\/\/doi.org\/10.1109\/jstsp.2022.3188113","DOI":"10.1109\/jstsp.2022.3188113"},{"key":"10_CR24","doi-asserted-by":"publisher","unstructured":"Ghosal, D., Majumder, N., Gelbukh, A., Mihalcea, R., Poria, S.: COSMIC: COmmonSense knowledge for eMotion Identification in Conversations (Version 1) (2020). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.2010.02795","DOI":"10.48550\/ARXIV.2010.02795"},{"key":"10_CR25","doi-asserted-by":"publisher","unstructured":"Zhu, L., Pergola, G., Gui, L., Zhou, D., He, Y.: Topic-driven and knowledge-aware transformer for dialogue emotion detection. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.125","DOI":"10.18653\/v1\/2021.acl-long.125"},{"key":"10_CR26","doi-asserted-by":"publisher","unstructured":"Sap, M., et al.: ATOMIC: An Atlas of Machine Commonsense for If-Then Reasoning (Version 3) (2018). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.1811.00146","DOI":"10.48550\/ARXIV.1811.00146"},{"key":"10_CR27","doi-asserted-by":"publisher","unstructured":"Lian, Z., Liu, B., Tao, J.: CTNet: conversational transformer network for emotion recognition. In: IEEE\/ACM Transactions on Audio, Speech, and Language Processing (Vol. 29, pp. 985\u20131000). Institute of Electrical and Electronics Engineers (IEEE) (2021). https:\/\/doi.org\/10.1109\/taslp.2021.3049898","DOI":"10.1109\/taslp.2021.3049898"},{"key":"10_CR28","doi-asserted-by":"publisher","unstructured":"Huang, X., et al.: Emotion detection for conversations based on reinforcement learning framework. In: IEEE MultiMedia (Vol. 28, Issue 2, pp. 76\u201385). Institute of Electrical and Electronics Engineers (IEEE) (2021). https:\/\/doi.org\/10.1109\/mmul.2021.3065678","DOI":"10.1109\/mmul.2021.3065678"},{"key":"10_CR29","doi-asserted-by":"publisher","unstructured":"Ma, H., Wang, J., Lin, H., Zhang, B., Zhang, Y., Xu, B.: A transformer-based model with self-distillation for multimodal emotion recognition in conversations. In: IEEE Transactions on Multimedia (Vol. 26, pp. 776\u2013788). Institute of Electrical and Electronics Engineers (IEEE) (2024). https:\/\/doi.org\/10.1109\/tmm.2023.3271019","DOI":"10.1109\/tmm.2023.3271019"},{"key":"10_CR30","doi-asserted-by":"publisher","unstructured":"Ren, M., Huang, X., Liu, J., Liu, M., Li, X., Liu, A.-A.: MALN: multimodal adversarial learning network for conversational emotion recognition. In: IEEE Transactions on Circuits and Systems for Video Technology (Vol. 33, Issue 11, pp. 6965\u20136980). Institute of Electrical and Electronics Engineers (IEEE) (2023). https:\/\/doi.org\/10.1109\/tcsvt.2023.3273577","DOI":"10.1109\/tcsvt.2023.3273577"},{"key":"10_CR31","doi-asserted-by":"publisher","unstructured":"Guo, L., Wang, L., Dang, J., Fu, Y., Liu, J., Ding, S.: Emotion recognition with multimodal transformer fusion framework based on acoustic and lexical information. In: IEEE MultiMedia (Vol. 29, Issue 2, pp. 94\u2013103). Institute of Electrical and Electronics Engineers (IEEE) (2022). 
https:\/\/doi.org\/10.1109\/mmul.2022.3161411","DOI":"10.1109\/mmul.2022.3161411"},{"key":"10_CR32","doi-asserted-by":"publisher","unstructured":"Xu, C., Gao, Y.: Multi-modal transformer with multi-head attention for emotion recognition. In: 2023 IEEE International Conference on Sensors, Electronics and Computer Engineering (ICSECE) (pp. 826\u2013831). IEEE (2023). https:\/\/doi.org\/10.1109\/icsece58870.2023.10263303","DOI":"10.1109\/icsece58870.2023.10263303"},{"key":"10_CR33","doi-asserted-by":"publisher","unstructured":"Hou, M., Zhang, Z., Lu, G.: Multi-modal emotion recognition with self-guided modality calibration. In ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 4688\u20134692). ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE (2022). https:\/\/doi.org\/10.1109\/icassp43922.2022.9747859","DOI":"10.1109\/icassp43922.2022.9747859"},{"key":"10_CR34","doi-asserted-by":"publisher","unstructured":"Zhong, P., Wang, D., Miao, C.: Knowledge-Enriched Transformer for Emotion Detection in Textual Conversations (Version 2) (2019). arXiv https:\/\/doi.org\/10.48550\/ARXIV.1909.10681","DOI":"10.48550\/ARXIV.1909.10681"},{"key":"10_CR35","doi-asserted-by":"publisher","unstructured":"Li, J., Zhang, M., Ji, D., Liu, Y.: Multi-Task Learning with Auxiliary Speaker Identification for Conversational Emotion Recognition (Version 2) (2020). arXiv https:\/\/doi.org\/10.48550\/ARXIV.2003.01478","DOI":"10.48550\/ARXIV.2003.01478"},{"key":"10_CR36","doi-asserted-by":"publisher","unstructured":"Kim, T., Vossen, P.: EmoBERTa: Speaker-Aware Emotion Recognition in Conversation with RoBERTa (Version 1) (2021). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.2108.12009","DOI":"10.48550\/ARXIV.2108.12009"},{"key":"10_CR37","doi-asserted-by":"publisher","unstructured":"Son, J., Kim, J., Lim, J., Lim, H.: GRASP: Guiding model with RelAtional Semantics using Prompt for Dialogue Relation Extraction (Version 4) (2022). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.2208.12494","DOI":"10.48550\/ARXIV.2208.12494"},{"key":"10_CR38","doi-asserted-by":"publisher","unstructured":"Hu, G., Lin, T.-E., Zhao, Y., Lu, G., Wu, Y., Li, Y.: UniMSE: Towards Unified Multimodal Sentiment Analysis and Emotion Recognition (Version 1) (2022). arXiv. https:\/\/doi.org\/10.48550\/ARXIV.2211.11256","DOI":"10.48550\/ARXIV.2211.11256"},{"key":"10_CR39","doi-asserted-by":"publisher","unstructured":"Ma, H., Wang, J., Lin, H., Zhang, B., Zhang, Y., Xu, B.: A transformer-based model with self-distillation for multimodal emotion recognition in conversations. In: IEEE Transactions on Multimedia (Vol. 26, pp. 776\u2013788). Institute of Electrical and Electronics Engineers (IEEE) (2024). 
https:\/\/doi.org\/10.1109\/tmm.2023.3271019","DOI":"10.1109\/tmm.2023.3271019"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78014-1_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T13:05:37Z","timestamp":1732194337000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78014-1_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,22]]},"ISBN":["9783031780134","9783031780141"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78014-1_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,22]]},"assertion":[{"value":"22 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Belgrade","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Serbia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/specom2024.ftn.uns.ac.rs\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}