{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T00:52:34Z","timestamp":1775263954935,"version":"3.50.1"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031483080","type":"print"},{"value":"9783031483097","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-48309-7_7","type":"book-chapter","created":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T20:03:21Z","timestamp":1700597001000},"page":"79-93","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Aggregation Strategies of\u00a0Wav2vec 2.0 Embeddings for\u00a0Computational Paralinguistic Tasks"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3914-2036","authenticated-orcid":false,"given":"Mercedes","family":"Vetr\u00e1b","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2864-6466","authenticated-orcid":false,"given":"G\u00e1bor","family":"Gosztolya","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,11,22]]},"reference":[{"key":"7_CR1","unstructured":"Baevski, A., Auli, M., Conneau, A.: Wav2vec 2.0: learning the structure of speech from raw audio (2020). 
https:\/\/ai.meta.com\/blog\/wav2vec-20-learning-the-structure-of-speech-from-raw-audio\/"},{"key":"7_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/1961189.1961199","volume":"2","author":"CC Chang","year":"2011","unstructured":"Chang, C.C., Lin, C.J.: LIBSVM: a library for support vector machines. ACM Trans. Intell. Syst. Technol. 2, 1\u201327 (2011). https:\/\/doi.org\/10.1145\/1961189.1961199","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"7_CR3","doi-asserted-by":"publisher","unstructured":"Chen, J., Ye, J., Tang, F., Zhou, J.: Automatic detection of Alzheimer\u2019s Disease using spontaneous speech only. In: Proceedings of the Interspeech 2021, pp. 3830\u20133834 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-2002","DOI":"10.21437\/Interspeech.2021-2002"},{"key":"7_CR4","doi-asserted-by":"publisher","unstructured":"Conneau, A., Baevski, A., Collobert, R., Mohamed, A., Auli, M.: Unsupervised Cross-lingual Representation Learning for Speech Recognition (2020). https:\/\/doi.org\/10.48550\/ARXIV.2006.13979","DOI":"10.48550\/ARXIV.2006.13979"},{"key":"7_CR5","doi-asserted-by":"publisher","unstructured":"Egas-L\u00f3pez, J.V., Gosztolya, G.: Deep Neural Network Embeddings for the estimation of the degree of sleepiness. In: IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP, pp. 7288\u20137292 (2021). https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9413589","DOI":"10.1109\/ICASSP39728.2021.9413589"},{"key":"7_CR6","doi-asserted-by":"publisher","unstructured":"Egas-L\u00f3pez, J.V., Kiss, G., Sztah\u00f3, D., Gosztolya, G.: Automatic assessment of the degree of clinical depression from speech using X-Vectors. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 8502\u20138506 (2022). 
https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746068","DOI":"10.1109\/ICASSP43922.2022.9746068"},{"key":"7_CR7","doi-asserted-by":"publisher","unstructured":"Egas-L\u00f3pez, J.V., Vetr\u00e1b, M., T\u00f3th, L., Gosztolya, G.: identifying conflict escalation and primates by using ensemble x-vectors and fisher vector features. In: Proceedings of the Interspeech 2021, pp. 476\u2013480 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-1173","DOI":"10.21437\/Interspeech.2021-1173"},{"key":"7_CR8","doi-asserted-by":"publisher","first-page":"7","DOI":"10.12700\/APH.17.6.2020.6.1","volume":"17","author":"G Gosztolya","year":"2020","unstructured":"Gosztolya, G.: Using the Fisher vector representation for audio-based emotion recognition. Acta Polytechnica Hungarica 17, 7\u201323 (2020)","journal-title":"Acta Polytechnica Hungarica"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Gosztolya, G., T\u00f3th, L., Svindt, V., B\u00f3na, J., Hoffmann, I.: Using acoustic deep neural network embeddings to detect multiple sclerosis from speech. In: Proceedings of ICASSP, pp. 6927\u20136931 (2022)","DOI":"10.1109\/ICASSP43922.2022.9746856"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Gosztolya, G., Beke, A., Neuberger, T.: Differentiating laughter types via HMM\/DNN and probabilistic sampling. In: Speech and Computer, SPECOM 2019. vol. 11658, pp. 122\u2013132 (2019)","DOI":"10.1007\/978-3-030-26061-3_13"},{"key":"7_CR11","doi-asserted-by":"publisher","unstructured":"Grezes, F., Richards, J., Rosenberg, A.: Let me finish: automatic conflict detection using speaker overlap. In: Proceedings of the Interspeech 2013, pp. 200\u2013204 (2013). https:\/\/doi.org\/10.21437\/Interspeech.2013-67","DOI":"10.21437\/Interspeech.2013-67"},{"key":"7_CR12","unstructured":"Grosman, J.: Fine-tuned XLSR-53 large model for speech recognition in German (2021). 
https:\/\/huggingface.co\/jonatasgrosman\/wav2vec2-large-xlsr-53-german"},{"key":"7_CR13","doi-asserted-by":"publisher","first-page":"1590","DOI":"10.1109\/TASL.2008.2002085","volume":"16","author":"KJ Han","year":"2008","unstructured":"Han, K.J., Kim, S., Narayanan, S.S.: Strategies to improve the robustness of Agglomerative Hierarchical Clustering under data source variation for speaker diarization. IEEE Trans. Audio Speech Lang. Process. 16, 1590\u20131601 (2008). https:\/\/doi.org\/10.1109\/TASL.2008.2002085","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"7_CR14","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MSP.2012.2205597","volume":"29","author":"G Hinton","year":"2012","unstructured":"Hinton, G., et al.: Deep Neural Networks for Acoustic Modeling in Speech Recognition: the shared views of four research groups. IEEE Signal Process. Mag. 29, 82\u201397 (2012). https:\/\/doi.org\/10.1109\/MSP.2012.2205597","journal-title":"IEEE Signal Process. Mag."},{"key":"7_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.3389\/fninf.2021.578369","volume":"15","author":"L Jeancolas","year":"2021","unstructured":"Jeancolas, L., et al.: X-Vectors: new quantitative biomarkers for early Parkinson\u2019s Disease detection from speech. Front. Neuroinform. 15, 1\u201318 (2021). https:\/\/doi.org\/10.3389\/fninf.2021.578369","journal-title":"Front. Neuroinform."},{"key":"7_CR16","doi-asserted-by":"publisher","unstructured":"Kadiri, S., Kethireddy, R., Alku, P.: Parkinson\u2019s Disease detection from speech using Single Frequency Filtering Cepstral Coefficients. In: Proceedings of the Interspeech 2020, pp. 4971\u20134975 (2020). https:\/\/doi.org\/10.21437\/Interspeech.2020-3197","DOI":"10.21437\/Interspeech.2020-3197"},{"key":"7_CR17","doi-asserted-by":"publisher","unstructured":"Kaya, H., Karpov, A., Salah, A.: Fisher vectors with cascaded normalization for paralinguistic analysis. In: Proceedings of the Interspeech 2015, pp. 
909\u2013913 (2015). https:\/\/doi.org\/10.21437\/Interspeech.2015-193","DOI":"10.21437\/Interspeech.2015-193"},{"key":"7_CR18","unstructured":"Krajewski, J., Schieder, S., Batliner, A.: Description of the upper respiratory tract infection corpus (urtic). In: Proceedings of the Interspeech 2017 (2017)"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Lin, W.W., Mak, M.W.: Wav2spk: a simple DNN architecture for learning speaker embeddings from waveforms. In: Proceedings of Interspeech, pp. 3211\u20133215 (2020)","DOI":"10.21437\/Interspeech.2020-1287"},{"key":"7_CR20","doi-asserted-by":"publisher","unstructured":"Metze, F., Batliner, A., Eyben, F., Polzehl, T., Schuller, B., Steidl, S.: Emotion recognition using imperfect speech recognition. In: Proceedings of the Interspeech 2010, pp. 478\u2013481 (2010). https:\/\/doi.org\/10.21437\/Interspeech.2010-202","DOI":"10.21437\/Interspeech.2010-202"},{"key":"7_CR21","doi-asserted-by":"publisher","unstructured":"Mustaqeem, Kwon, S.: CLSTM: deep feature-based speech emotion recognition using the hierarchical ConvLSTM network. Mathematics 8, 1\u201319 (2020). https:\/\/doi.org\/10.3390\/math8122133","DOI":"10.3390\/math8122133"},{"key":"7_CR22","doi-asserted-by":"publisher","unstructured":"Oflazoglu, C., Yildirim, S.: Recognizing emotion from Turkish speech using acoustic features. In: EURASIP Journal on Audio Speech and Music Processing 2013 (2013). https:\/\/doi.org\/10.1186\/1687-4722-2013-26","DOI":"10.1186\/1687-4722-2013-26"},{"key":"7_CR23","doi-asserted-by":"publisher","unstructured":"Pappagari, R., et al.: Automatic detection and assessment of Alzheimer Disease using speech and language technologies in low-resource scenarios. In: Proceedings of the Interspeech 2021, pp. 3825\u20133829 (2021). 
https:\/\/doi.org\/10.21437\/Interspeech.2021-1850","DOI":"10.21437\/Interspeech.2021-1850"},{"key":"7_CR24","doi-asserted-by":"publisher","unstructured":"P\u00e9rez-Toro, P., et al.: Alzheimer\u2019s detection from English to Spanish using acoustic and linguistic embeddings. In: Proceedings of Interspeech 2022, pp. 2483\u20132487 (2022). https:\/\/doi.org\/10.21437\/Interspeech.2022-10883","DOI":"10.21437\/Interspeech.2022-10883"},{"key":"7_CR25","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1515\/jee-2017-0001","volume":"68","author":"J P\u0159ibil","year":"2017","unstructured":"P\u0159ibil, J., P\u0159ibilov\u00e1, A., Matou\u0161ek, J.: GMM-based speaker age and gender classification in Czech and Slovak. J. Electr. Eng. 68, 3\u201312 (2017). https:\/\/doi.org\/10.1515\/jee-2017-0001","journal-title":"J. Electr. Eng."},{"key":"7_CR26","doi-asserted-by":"publisher","unstructured":"Schuller, B., Steidl, S., Batliner, A.: The INTERSPEECH 2009 emotion challenge. In: Proceedings of the Interspeech 2009, pp. 312\u2013315 (2009). https:\/\/doi.org\/10.21437\/Interspeech.2009-103","DOI":"10.21437\/Interspeech.2009-103"},{"key":"7_CR27","doi-asserted-by":"publisher","unstructured":"Schuller, B., et al.: The INTERSPEECH 2017 computational paralinguistics challenge: addressee, cold & snoring. In: Proceedings of the Interspeech 2017, pp. 3442\u20133446 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-43","DOI":"10.21437\/Interspeech.2017-43"},{"key":"7_CR28","doi-asserted-by":"publisher","unstructured":"Schuller, B., et al.: The INTERSPEECH 2015 computational paralinguistics challenge: Nativeness, Parkinson\u2019s & eating condition. In: Proceedings of the Interspeech 2015, pp. 478\u2013482 (2015). 
https:\/\/doi.org\/10.21437\/Interspeech.2015-179","DOI":"10.21437\/Interspeech.2015-179"},{"key":"7_CR29","doi-asserted-by":"publisher","unstructured":"Schuller, B.W., et al.: The INTERSPEECH 2019 computational paralinguistics challenge: Styrian dialects, continuous sleepiness, baby sounds & orca activity. In: Proceedings of the Interspeech 2019, pp. 2378\u20132382 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1122","DOI":"10.21437\/Interspeech.2019-1122"},{"key":"7_CR30","doi-asserted-by":"publisher","unstructured":"Sheikh, S.A., Sahidullah, M., Hirsch, F., Ouni, S.: Introducing ECAPA-TDNN and Wav2Vec2.0 Embeddings to Stuttering Detection (2022). https:\/\/doi.org\/10.48550\/ARXIV.2204.01564","DOI":"10.48550\/ARXIV.2204.01564"},{"key":"7_CR31","doi-asserted-by":"publisher","unstructured":"Snyder, D., Garcia-Romero, D., Sell, G., Povey, D., Khudanpur, S.: X-Vectors: robust DNN embeddings for speaker verification. In: IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP, pp. 5329\u20135333 (2018). https:\/\/doi.org\/10.1109\/ICASSP.2018.8461375","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"7_CR32","unstructured":"Steidl, S.: Automatic classification of emotion related user states in spontaneous children\u2019s speech. Logos-Verlag Berlin, Germany (2009). https:\/\/d-nb.info\/992551641"},{"key":"7_CR33","doi-asserted-by":"crossref","unstructured":"Tzirakis, P., Zhang, J., Schuller, B.W.: End-to-end speech emotion recognition using deep neural networks. In: 2018 IEEE international Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5089\u20135093 (2018)","DOI":"10.1109\/ICASSP.2018.8462677"},{"key":"7_CR34","doi-asserted-by":"publisher","unstructured":"Van Segbroeck, M., et al.: Classification of cognitive load from speech using an i-vector framework. In: Proceedings of the Interspeech 2014, pp. 751\u2013755 (2014). 
https:\/\/doi.org\/10.21437\/Interspeech.2014-114","DOI":"10.21437\/Interspeech.2014-114"},{"key":"7_CR35","unstructured":"Vetr\u00e1b, M., Gosztolya, G.: Speech emotion detection from a Hungarian database with the Bag-of-Audio-Words technique (in Hungarian). In: Proceedings of MSZNY, pp. 265\u2013274. Szeged (2019)"},{"key":"7_CR36","doi-asserted-by":"publisher","first-page":"5208","DOI":"10.3390\/s23115208","volume":"23","author":"M Vetr\u00e1b","year":"2023","unstructured":"Vetr\u00e1b, M., Gosztolya, G.: Using hybrid HMM\/DNN embedding extractor models in computational paralinguistic tasks. Sensors 23, 5208 (2023)","journal-title":"Sensors"},{"key":"7_CR37","doi-asserted-by":"publisher","unstructured":"Vetr\u00e1b, M., et al.: Using spectral sequence-to-sequence autoencoders to assess mild cognitive impairment. In: IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP, pp. 6467\u20136471 (2022). https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9746148","DOI":"10.1109\/ICASSP43922.2022.9746148"},{"key":"7_CR38","doi-asserted-by":"publisher","unstructured":"V\u00e1squez-Correa, J., Orozco-Arroyave, J.R., N\u00f6th, E.: Convolutional Neural Network to model articulation impairments in patients with Parkinson\u2019s Disease. In: Proceedings of the Interspeech 2017, pp. 314\u2013318 (2017). https:\/\/doi.org\/10.21437\/Interspeech.2017-1078","DOI":"10.21437\/Interspeech.2017-1078"},{"key":"7_CR39","doi-asserted-by":"publisher","unstructured":"Wagner, J., Schiller, D., Seiderer, A., Andre, E.: Deep learning in paralinguistic recognition tasks: are hand-crafted features still relevant? In: Interspeech, pp. 147\u2013151 (2018). https:\/\/doi.org\/10.21437\/Interspeech.2018-1238","DOI":"10.21437\/Interspeech.2018-1238"},{"key":"7_CR40","first-page":"9","volume":"33","author":"W Wang","year":"2008","unstructured":"Wang, W., Lu, P., Yan, Y.: An improved hierarchical speaker clustering. 
Acta Acustica 33, 9\u201314 (2008)","journal-title":"Acta Acustica"},{"key":"7_CR41","doi-asserted-by":"publisher","unstructured":"Zhao, Z., Bao, Z., Zhang, Z., Cummins, N., Wang, H., Schuller, B.: Attention-enhanced connectionist temporal classification for discrete speech emotion recognition. In: Proceedings of the Interspeech 2019, pp. 206\u2013210 (2019). https:\/\/doi.org\/10.21437\/Interspeech.2019-1649","DOI":"10.21437\/Interspeech.2019-1649"}],"container-title":["Lecture Notes in Computer Science","Speech and Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-48309-7_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,21]],"date-time":"2023-11-21T20:10:01Z","timestamp":1700597401000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-48309-7_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031483080","9783031483097"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-48309-7_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"22 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SPECOM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Speech and Computer","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dharwad","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 November 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"specom2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iitdh.ac.in\/specom-2023\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"174","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"94","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the 
conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"54% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}