{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:06:26Z","timestamp":1776888386929,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3551876.3554801","type":"proceedings-article","created":{"date-parts":[[2022,9,28]],"date-time":"2022-09-28T22:17:21Z","timestamp":1664403441000},"page":"89-94","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Transformer-based Non-Verbal Emotion Recognition"],"prefix":"10.1145","author":[{"given":"Lorenzo","family":"Vaiani","sequence":"first","affiliation":[{"name":"Politecnico di Torino, Turin, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alkis","family":"Koudounas","sequence":"additional","affiliation":[{"name":"Politecnico di Torino, Turin, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Moreno","family":"La Quatra","sequence":"additional","affiliation":[{"name":"Politecnico di Torino, Turin, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Luca","family":"Cagliero","sequence":"additional","affiliation":[{"name":"Politecnico di Torino, Turin, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Paolo","family":"Garza","sequence":"additional","affiliation":[{"name":"Politecnico di Torino, Turin, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elena","family":"Baralis","sequence":"additional","affiliation":[{"name":"Politecnico di Torino, Turin, Italy"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1002\/eng2.12189"},{"key":"e_1_3_2_2_2_1","unstructured":"Alexei Baevski Michael Auli and Abdelrahman Mohamed. 2019. Effectiveness of self-supervised pre-training for speech recognition. https:\/\/doi.org\/10.48550\/ ARXIV.1911.03912  Alexei Baevski Michael Auli and Abdelrahman Mohamed. 2019. Effectiveness of self-supervised pre-training for speech recognition. https:\/\/doi.org\/10.48550\/ ARXIV.1911.03912"},{"key":"#cr-split#-e_1_3_2_2_3_1.1","unstructured":"Alexei Baevski Wei-Ning Hsu Qiantong Xu Arun Babu Jiatao Gu and Michael Auli. 2022. data2vec: A General Framework for Self-supervised Learning in Speech Vision and Language. https:\/\/doi.org\/10.48550\/ARXIV.2202.03555 10.48550\/ARXIV.2202.03555"},{"key":"#cr-split#-e_1_3_2_2_3_1.2","unstructured":"Alexei Baevski Wei-Ning Hsu Qiantong Xu Arun Babu Jiatao Gu and Michael Auli. 2022. data2vec: A General Framework for Self-supervised Learning in Speech Vision and Language. https:\/\/doi.org\/10.48550\/ARXIV.2202.03555"},{"key":"e_1_3_2_2_4_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rylwJxrYDS","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski , Steffen Schneider , and Michael Auli . 2020 . vq-wav2vec: SelfSupervised Learning of Discrete Speech Representations . In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rylwJxrYDS Alexei Baevski, Steffen Schneider, and Michael Auli. 2020. vq-wav2vec: SelfSupervised Learning of Discrete Speech Representations. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rylwJxrYDS"},{"key":"e_1_3_2_2_5_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems. 12449--12460","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski , Henry Zhou , Abdelrahman Mohamed , and Michael Auli . 2020 . wav2vec 2.0: a framework for self-supervised learning of speech representations . In Proceedings of the 34th International Conference on Neural Information Processing Systems. 12449--12460 . Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: a framework for self-supervised learning of speech representations. In Proceedings of the 34th International Conference on Neural Information Processing Systems. 12449--12460."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"#cr-split#-e_1_3_2_2_7_1.1","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao Jian Wu Long Zhou Shuo Ren Yanmin Qian Yao Qian Jian Wu Michael Zeng Xiangzhan Yu and Furu Wei. 2021. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. https:\/\/doi.org\/10.48550\/ARXIV.2110.13900 10.48550\/ARXIV.2110.13900"},{"key":"#cr-split#-e_1_3_2_2_7_1.2","doi-asserted-by":"crossref","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao Jian Wu Long Zhou Shuo Ren Yanmin Qian Yao Qian Jian Wu Michael Zeng Xiangzhan Yu and Furu Wei. 2021. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. https:\/\/doi.org\/10.48550\/ARXIV.2110.13900","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the 37th International Conference on Machine Learning (Proceedings of Machine Learning Research","author":"Chen Ting","year":"2020","unstructured":"Ting Chen , Simon Kornblith , Mohammad Norouzi , and Geoffrey Hinton . 2020 . A Simple Framework for Contrastive Learning of Visual Representations . In Proceedings of the 37th International Conference on Machine Learning (Proceedings of Machine Learning Research , Vol. 119), Hal Daum\u00e9 III and Aarti Singh (Eds.). PMLR, 1597--1607. https:\/\/proceedings.mlr.press\/v119\/chen20j.html Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A Simple Framework for Contrastive Learning of Visual Representations. In Proceedings of the 37th International Conference on Machine Learning (Proceedings of Machine Learning Research, Vol. 119), Hal Daum\u00e9 III and Aarti Singh (Eds.). PMLR, 1597--1607. https:\/\/proceedings.mlr.press\/v119\/chen20j.html"},{"key":"e_1_3_2_2_9_1","volume-title":"Childers and Kuang chieh Wu","author":"Donald","year":"1991","unstructured":"Donald G. Childers and Kuang chieh Wu . 1991 . Gender recognition from speech. Part II: Fine analysis. The Journal of the Acoustical Society of America 90 4 Pt 1 (1991), 1841--56. Donald G. Childers and Kuang chieh Wu. 1991. Gender recognition from speech. Part II: Fine analysis. The Journal of the Acoustical Society of America 90 4 Pt 1 (1991), 1841--56."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACII.2017.8273615"},{"key":"#cr-split#-e_1_3_2_2_11_1.1","unstructured":"Yu-An Chung and James Glass. 2019. Generative Pre-Training for Speech with Autoregressive Predictive Coding. https:\/\/doi.org\/10.48550\/ARXIV.1910.12607 10.48550\/ARXIV.1910.12607"},{"key":"#cr-split#-e_1_3_2_2_11_1.2","doi-asserted-by":"crossref","unstructured":"Yu-An Chung and James Glass. 2019. Generative Pre-Training for Speech with Autoregressive Predictive Coding. https:\/\/doi.org\/10.48550\/ARXIV.1910.12607","DOI":"10.1109\/ICASSP40776.2020.9054438"},{"key":"e_1_3_2_2_12_1","volume-title":"Glass","author":"Chung Yu-An","year":"2019","unstructured":"Yu-An Chung , Wei-Ning Hsu , Hao Tang , and James R . Glass . 2019 . An Unsupervised Autoregressive Model for Speech Representation Learning. In INTERSPEECH. Yu-An Chung, Wei-Ning Hsu, Hao Tang, and James R. Glass. 2019. An Unsupervised Autoregressive Model for Speech Representation Learning. In INTERSPEECH."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00779-020-01389-0"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/10.846676"},{"key":"e_1_3_2_2_15_1","volume-title":"Momentum Contrast for Unsupervised Visual Representation Learning. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE Computer Society, 9726--9735","author":"He Kaiming","year":"2020","unstructured":"Kaiming He , Haoqi Fan , Yuxin Wu , Saining Xie , and Ross Girshick . 2020 . Momentum Contrast for Unsupervised Visual Representation Learning. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE Computer Society, 9726--9735 . Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross Girshick. 2020. Momentum Contrast for Unsupervised Visual Representation Learning. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE Computer Society, 9726--9735."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3076364"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"#cr-split#-e_1_3_2_2_18_1.1","doi-asserted-by":"crossref","unstructured":"Kun-Yi Huang Chung-Hsien Wu Qian-Bei Hong Ming-Hsiang Su and Yi-Hsuan Chen. 2019. Speech Emotion Recognition Using Deep Neural Network Considering Verbal and Nonverbal Speech Sounds. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 5866--5870. https:\/\/doi.org\/10.1109\/ICASSP.2019.8682283 10.1109\/ICASSP.2019.8682283","DOI":"10.1109\/ICASSP.2019.8682283"},{"key":"#cr-split#-e_1_3_2_2_18_1.2","doi-asserted-by":"crossref","unstructured":"Kun-Yi Huang Chung-Hsien Wu Qian-Bei Hong Ming-Hsiang Su and Yi-Hsuan Chen. 2019. Speech Emotion Recognition Using Deep Neural Network Considering Verbal and Nonverbal Speech Sounds. In ICASSP 2019 - 2019 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP). 5866--5870. https:\/\/doi.org\/10.1109\/ICASSP.2019.8682283","DOI":"10.1109\/ICASSP.2019.8682283"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2936124"},{"key":"e_1_3_2_2_20_1","volume-title":"Data Augmenting Contrastive Learning of Speech Representations in the Time Domain. In 2021 IEEE Spoken Language Technology Workshop (SLT). 215--222","author":"Kharitonov Eugene","year":"2021","unstructured":"Eugene Kharitonov , Morgane Rivi\u00e8re , Gabriel Synnaeve , Lior Wolf , PierreEmmanuel Mazar\u00e9 , Matthijs Douze , and Emmanuel Dupoux . 2021 . Data Augmenting Contrastive Learning of Speech Representations in the Time Domain. In 2021 IEEE Spoken Language Technology Workshop (SLT). 215--222 . https: \/\/doi.org\/10.1109\/SLT48900.2021.9383605 10.1109\/SLT48900.2021.9383605 Eugene Kharitonov, Morgane Rivi\u00e8re, Gabriel Synnaeve, Lior Wolf, PierreEmmanuel Mazar\u00e9, Matthijs Douze, and Emmanuel Dupoux. 2021. Data Augmenting Contrastive Learning of Speech Representations in the Time Domain. In 2021 IEEE Spoken Language Technology Workshop (SLT). 215--222. https: \/\/doi.org\/10.1109\/SLT48900.2021.9383605"},{"key":"e_1_3_2_2_21_1","volume-title":"Lin (Eds.)","volume":"33","author":"Khosla Prannay","year":"2020","unstructured":"Prannay Khosla , Piotr Teterwak , Chen Wang , Aaron Sarna , Yonglong Tian , Phillip Isola , Aaron Maschinot , Ce Liu , and Dilip Krishnan . 2020 . Supervised Contrastive Learning. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H . Lin (Eds.) , Vol. 33 . Curran Associates, Inc. , 18661--18673. https:\/\/proceedings.neurips.cc\/paper\/ 2020\/file\/d89a66c7c80a29b1bdbab0f2a1a94af8-Paper.pdf Prannay Khosla, Piotr Teterwak, Chen Wang, Aaron Sarna, Yonglong Tian, Phillip Isola, Aaron Maschinot, Ce Liu, and Dilip Krishnan. 2020. Supervised Contrastive Learning. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 18661--18673. https:\/\/proceedings.neurips.cc\/paper\/ 2020\/file\/d89a66c7c80a29b1bdbab0f2a1a94af8-Paper.pdf"},{"key":"e_1_3_2_2_22_1","unstructured":"Tomi Kinnunen Jaime Lorenzo-Trueba Junichi Yamagishi Tomoki Toda Daisuke Saito Fernando Villavicencio and Zhenhua Ling. 2018. The Voice Conversion Challenge 2018: database and results. (2018).  Tomi Kinnunen Jaime Lorenzo-Trueba Junichi Yamagishi Tomoki Toda Daisuke Saito Fernando Villavicencio and Zhenhua Ling. 2018. The Voice Conversion Challenge 2018: database and results. (2018)."},{"key":"e_1_3_2_2_23_1","volume-title":"S2VC: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations. arXiv preprint arXiv:2104.02901","author":"Lin Yist Y","year":"2021","unstructured":"Jheng-hao Lin, Yist Y Lin , Chung-Ming Chien , and Hung-yi Lee. 2021. S2VC: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations. arXiv preprint arXiv:2104.02901 ( 2021 ). Jheng-hao Lin, Yist Y Lin, Chung-Ming Chien, and Hung-yi Lee. 2021. S2VC: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations. arXiv preprint arXiv:2104.02901 (2021)."},{"key":"e_1_3_2_2_24_1","first-page":"2693","article-title":"Unspeech: Unsupervised Speech Context Embeddings","volume":"2018","author":"Milde Benjamin","year":"2018","unstructured":"Benjamin Milde and Chris Biemann . 2018 . Unspeech: Unsupervised Speech Context Embeddings . Proc. Interspeech 2018 (2018), 2693 -- 2697 . Benjamin Milde and Chris Biemann. 2018. Unspeech: Unsupervised Speech Context Embeddings. Proc. Interspeech 2018 (2018), 2693--2697.","journal-title":"Proc. Interspeech"},{"key":"#cr-split#-e_1_3_2_2_25_1.1","unstructured":"Abdelrahman Mohamed Hung-yi Lee Lasse Borgholt Jakob D. Havtorn Joakim Edin Christian Igel Katrin Kirchhoff Shang-Wen Li Karen Livescu Lars Maal\u00f8e Tara N. Sainath and Shinji Watanabe. 2022. Self-Supervised Speech Representation Learning: A Review. https:\/\/doi.org\/10.48550\/ARXIV.2205.10643 10.48550\/ARXIV.2205.10643"},{"key":"#cr-split#-e_1_3_2_2_25_1.2","doi-asserted-by":"crossref","unstructured":"Abdelrahman Mohamed Hung-yi Lee Lasse Borgholt Jakob D. Havtorn Joakim Edin Christian Igel Katrin Kirchhoff Shang-Wen Li Karen Livescu Lars Maal\u00f8e Tara N. Sainath and Shinji Watanabe. 2022. Self-Supervised Speech Representation Learning: A Review. https:\/\/doi.org\/10.48550\/ARXIV.2205.10643","DOI":"10.1109\/JSTSP.2022.3207050"},{"key":"#cr-split#-e_1_3_2_2_26_1.1","unstructured":"Aaron van den Oord Yazhe Li and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. https:\/\/doi.org\/10.48550\/ARXIV.1807.03748 10.48550\/ARXIV.1807.03748"},{"key":"#cr-split#-e_1_3_2_2_26_1.2","unstructured":"Aaron van den Oord Yazhe Li and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. https:\/\/doi.org\/10.48550\/ARXIV.1807.03748"},{"key":"e_1_3_2_2_27_1","first-page":"161","article-title":"Learning Problem-Agnostic Speech Representations from Multiple Self-Supervised Tasks","volume":"2019","author":"Pascual Santiago","year":"2019","unstructured":"Santiago Pascual , Mirco Ravanelli , Joan Serr\u00e0 , Antonio Bonafonte , and Yoshua Bengio . 2019 . Learning Problem-Agnostic Speech Representations from Multiple Self-Supervised Tasks . Proc. Interspeech 2019 (2019), 161 -- 165 . Santiago Pascual, Mirco Ravanelli, Joan Serr\u00e0, Antonio Bonafonte, and Yoshua Bengio. 2019. Learning Problem-Agnostic Speech Representations from Multiple Self-Supervised Tasks. Proc. Interspeech 2019 (2019), 161--165.","journal-title":"Proc. Interspeech"},{"key":"#cr-split#-e_1_3_2_2_28_1.1","doi-asserted-by":"crossref","unstructured":"Leonardo Pepino Pablo Riera and Luciana Ferrer. 2021. Emotion Recognition from Speech Using Wav2vec 2.0 Embeddings. https:\/\/doi.org\/10.48550\/ARXIV. 2104.03502 10.48550\/ARXIV","DOI":"10.21437\/Interspeech.2021-703"},{"key":"#cr-split#-e_1_3_2_2_28_1.2","doi-asserted-by":"crossref","unstructured":"Leonardo Pepino Pablo Riera and Luciana Ferrer. 2021. Emotion Recognition from Speech Using Wav2vec 2.0 Embeddings. https:\/\/doi.org\/10.48550\/ARXIV. 2104.03502","DOI":"10.21437\/Interspeech.2021-703"},{"key":"e_1_3_2_2_29_1","volume-title":"The Role of Pitch and Timbre in Voice Gender Categorization. Frontiers in Psychology 3","author":"Pernet Cyril","year":"2012","unstructured":"Cyril Pernet and Pascal Belin . 2012. The Role of Pitch and Timbre in Voice Gender Categorization. Frontiers in Psychology 3 ( 2012 ). https:\/\/doi.org\/10.3389\/fpsyg. 2012.00023 10.3389\/fpsyg Cyril Pernet and Pascal Belin. 2012. The Role of Pitch and Timbre in Voice Gender Categorization. Frontiers in Psychology 3 (2012). https:\/\/doi.org\/10.3389\/fpsyg. 2012.00023"},{"key":"#cr-split#-e_1_3_2_2_30_1.1","doi-asserted-by":"crossref","unstructured":"Mirco Ravanelli Jianyuan Zhong Santiago Pascual Pawel Swietojanski Joao Monteiro Jan Trmal and Yoshua Bengio. 2020. Multi-task self-supervised learning for Robust Speech Recognition. https:\/\/doi.org\/10.48550\/ARXIV.2001.09239 10.48550\/ARXIV.2001.09239","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"#cr-split#-e_1_3_2_2_30_1.2","doi-asserted-by":"crossref","unstructured":"Mirco Ravanelli Jianyuan Zhong Santiago Pascual Pawel Swietojanski Joao Monteiro Jan Trmal and Yoshua Bengio. 2020. Multi-task self-supervised learning for Robust Speech Recognition. https:\/\/doi.org\/10.48550\/ARXIV.2001.09239","DOI":"10.1109\/ICASSP40776.2020.9053569"},{"key":"e_1_3_2_2_31_1","volume-title":"Contrastive Learning of General-Purpose Audio Representations. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 3875--3879","author":"Saeed Aaqib","year":"2021","unstructured":"Aaqib Saeed , David Grangier , and Neil Zeghidour . 2021 . Contrastive Learning of General-Purpose Audio Representations. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 3875--3879 . https: \/\/doi.org\/10.1109\/ICASSP39728.2021.9413528 10.1109\/ICASSP39728.2021.9413528 Aaqib Saeed, David Grangier, and Neil Zeghidour. 2021. Contrastive Learning of General-Purpose Audio Representations. In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 3875--3879. https: \/\/doi.org\/10.1109\/ICASSP39728.2021.9413528"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1002\/9781118706664"},{"key":"e_1_3_2_2_34_1","volume-title":"2004 IEEE International Conference on Acoustics, Speech, and Sig","author":"Schuller B.","unstructured":"B. Schuller , G. Rigoll , and M. Lang . 2004. Speech emotion recognition combining acoustic features and linguistic information in a hybrid support vector machinebelief network architecture . In 2004 IEEE International Conference on Acoustics, Speech, and Sig B. Schuller, G. Rigoll, and M. Lang. 2004. Speech emotion recognition combining acoustic features and linguistic information in a hybrid support vector machinebelief network architecture. In 2004 IEEE International Conference on Acoustics, Speech, and Sig"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551591"},{"key":"e_1_3_2_2_36_1","volume-title":"Emotion recognition from audio signals using Support Vector Machine. In 2015 IEEE recent advances in intelligent computational systems (RAICS)","author":"Sinith MS","unstructured":"MS Sinith , E Aswathi , TM Deepa , CP Shameema , and Shiny Rajan . 2015. Emotion recognition from audio signals using Support Vector Machine. In 2015 IEEE recent advances in intelligent computational systems (RAICS) . IEEE , 139--144. MS Sinith, E Aswathi, TM Deepa, CP Shameema, and Shiny Rajan. 2015. Emotion recognition from audio signals using Support Vector Machine. In 2015 IEEE recent advances in intelligent computational systems (RAICS). IEEE, 139--144."},{"key":"e_1_3_2_2_37_1","volume-title":"CURL: Contrastive Unsupervised Representations for Reinforcement Learning. https:\/\/doi.org\/10. 48550\/ARXIV.2004.04136","author":"Srinivas Aravind","year":"2020","unstructured":"Aravind Srinivas , Michael Laskin , and Pieter Abbeel . 2020 . CURL: Contrastive Unsupervised Representations for Reinforcement Learning. https:\/\/doi.org\/10. 48550\/ARXIV.2004.04136 Aravind Srinivas, Michael Laskin, and Pieter Abbeel. 2020. CURL: Contrastive Unsupervised Representations for Reinforcement Learning. https:\/\/doi.org\/10. 48550\/ARXIV.2004.04136"},{"key":"e_1_3_2_2_38_1","volume-title":"Titze and Daniel w. Martin","author":"Ingo","year":"1994","unstructured":"Ingo R. Titze and Daniel w. Martin . 1994 . Principles of voice production. Ingo R. Titze and Daniel w. Martin. 1994. Principles of voice production."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3184066.3184086"},{"key":"e_1_3_2_2_40_1","volume-title":"22nd Annual Conference of the International Speech Communication Association, Brno","author":"Tseng Wei-Cheng","year":"2021","unstructured":"Wei-Cheng Tseng , Chien-yu Huang, Wei-Tsung Kao , Yist Y. Lin , and Hung-yi Lee. 2021 . Utilizing Self-Supervised Representations for MOS Prediction. In Interspeech 2021 , 22nd Annual Conference of the International Speech Communication Association, Brno , Czechia, 30 August - 3 September 2021, Hynek Hermansky, Honza Cernock\u00fd, Luk\u00e1s Burget, Lori Lamel, Odette Scharenborg, and Petr Motl\u00edcek (Eds.). ISCA, 2781--2785. https:\/\/doi.org\/10.21437\/Interspeech. 2021--2013 10.21437\/Interspeech.2021--2013 Wei-Cheng Tseng, Chien-yu Huang, Wei-Tsung Kao, Yist Y. Lin, and Hung-yi Lee. 2021. Utilizing Self-Supervised Representations for MOS Prediction. In Interspeech 2021, 22nd Annual Conference of the International Speech Communication Association, Brno, Czechia, 30 August - 3 September 2021, Hynek Hermansky, Honza Cernock\u00fd, Luk\u00e1s Burget, Lori Lamel, Odette Scharenborg, and Petr Motl\u00edcek (Eds.). ISCA, 2781--2785. https:\/\/doi.org\/10.21437\/Interspeech.2021--2013"},{"key":"e_1_3_2_2_41_1","unstructured":"A\u00e4ron van den Oord Oriol Vinyals and Koray Kavukcuoglu. 2017. Neural Discrete Representation Learning. In NIPS.  A\u00e4ron van den Oord Oriol Vinyals and Koray Kavukcuoglu. 2017. Neural Discrete Representation Learning. In NIPS."},{"key":"e_1_3_2_2_42_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Lukasz Kaiser , and Illia Polosukhin . 2017. Attention is all you need. Advances in neural information processing systems 30 ( 2017 ). Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_2_43_1","volume-title":"Exploring the Effectiveness of Self-supervised Learning and Classifier Chains in Emotion Recognition of Nonverbal Vocalizations. arXiv preprint arXiv:2206.10695","author":"Xin Detai","year":"2022","unstructured":"Detai Xin , Shinnosuke Takamichi , and Hiroshi Saruwatari . 2022. Exploring the Effectiveness of Self-supervised Learning and Classifier Chains in Emotion Recognition of Nonverbal Vocalizations. arXiv preprint arXiv:2206.10695 ( 2022 ). Detai Xin, Shinnosuke Takamichi, and Hiroshi Saruwatari. 2022. Exploring the Effectiveness of Self-supervised Learning and Classifier Chains in Emotion Recognition of Nonverbal Vocalizations. arXiv preprint arXiv:2206.10695 (2022)."},{"key":"#cr-split#-e_1_3_2_2_44_1.1","doi-asserted-by":"crossref","unstructured":"Shu-wen Yang Po-Han Chi Yung-Sung Chuang Cheng-I Jeff Lai Kushal Lakhotia Yist Y. Lin Andy T. Liu Jiatong Shi Xuankai Chang Guan-Ting Lin TzuHsien Huang Wei-Cheng Tseng Ko-tik Lee Da-Rong Liu Zili Huang Shuyan Dong Shang-Wen Li Shinji Watanabe Abdelrahman Mohamed and Hung-yi Lee. 2021. SUPERB: Speech processing Universal PERformance Benchmark. https:\/\/doi.org\/10.48550\/ARXIV.2105.01051 10.48550\/ARXIV.2105.01051","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"#cr-split#-e_1_3_2_2_44_1.2","doi-asserted-by":"crossref","unstructured":"Shu-wen Yang Po-Han Chi Yung-Sung Chuang Cheng-I Jeff Lai Kushal Lakhotia Yist Y. Lin Andy T. Liu Jiatong Shi Xuankai Chang Guan-Ting Lin TzuHsien Huang Wei-Cheng Tseng Ko-tik Lee Da-Rong Liu Zili Huang Shuyan Dong Shang-Wen Li Shinji Watanabe Abdelrahman Mohamed and Hung-yi Lee. 2021. SUPERB: Speech processing Universal PERformance Benchmark. https:\/\/doi.org\/10.48550\/ARXIV.2105.01051","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.matpr.2022.01.169"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal","acronym":"MM '22","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3551876.3554801","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3551876.3554801","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:16Z","timestamp":1750186816000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3551876.3554801"}},"subtitle":["Exploring Model Portability across Speakers' Genders"],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":54,"alternative-id":["10.1145\/3551876.3554801","10.1145\/3551876"],"URL":"https:\/\/doi.org\/10.1145\/3551876.3554801","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}