{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,9]],"date-time":"2026-05-09T00:49:49Z","timestamp":1778287789041,"version":"3.51.4"},"reference-count":34,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Speech Communication"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.specom.2026.103355","type":"journal-article","created":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T16:47:14Z","timestamp":1769186834000},"page":"103355","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Classification of phonation types in singing and speaking voice using self-supervised learning models"],"prefix":"10.1016","volume":"178","author":[{"given":"Prathamesh Parasharam","family":"Patil","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7987-1735","authenticated-orcid":false,"given":"Kiran Reddy","family":"Mittapalle","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Paavo","family":"Alku","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.specom.2026.103355_b1","series-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","author":"Abadi","year":"2015"},{"key":"10.1016\/j.specom.2026.103355_b2","series-title":"Eighth Annual Conference of the International Speech Communication Association (Interspeech 2007)","first-page":"2513","article-title":"Comparison of multiple voice source parameters in different phonation types","author":"Airas","year":"2007"},{"issue":"2","key":"10.1016\/j.specom.2026.103355_b3","doi-asserted-by":"crossref","first-page":"701","DOI":"10.1121\/1.1490365","article-title":"Normalized amplitude quotient for parametrization of the glottal flow","volume":"112","author":"Alku","year":"2002","journal-title":"J. Acoust. Soc. Am."},{"key":"10.1016\/j.specom.2026.103355_b4","series-title":"Advances in Neural Information Processing Systems 33 (NeurIPS 2020)","first-page":"12449","article-title":"Wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020"},{"issue":"6","key":"10.1016\/j.specom.2026.103355_b5","doi-asserted-by":"crossref","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","article-title":"WavLM: Large-scale self-supervised pre-training for full stack speech processing","volume":"16","author":"Chen","year":"2022","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"10.1016\/j.specom.2026.103355_b6","series-title":"Keras","author":"Chollet","year":"2015"},{"key":"10.1016\/j.specom.2026.103355_b7","series-title":"2021 IEEE Automatic Speech Recognition and Understanding Workshop","first-page":"31","article-title":"W2V-BERT: Combining contrastive learning and masked language modeling for self-supervised speech pre-training","author":"Chung","year":"2021"},{"issue":"3","key":"10.1016\/j.specom.2026.103355_b8","doi-asserted-by":"crossref","first-page":"273","DOI":"10.1023\/A:1022627411411","article-title":"Support-vector networks","volume":"20","author":"Cortes","year":"1995","journal-title":"Mach. Learn."},{"key":"10.1016\/j.specom.2026.103355_b9","series-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"issue":"1\u20132","key":"10.1016\/j.specom.2026.103355_b10","doi-asserted-by":"crossref","first-page":"189","DOI":"10.1016\/S0167-6393(02)00082-1","article-title":"The role of voice quality in communicating emotion, mood and attitude","volume":"40","author":"Gobl","year":"2003","journal-title":"Speech Commun."},{"issue":"4","key":"10.1016\/j.specom.2026.103355_b11","doi-asserted-by":"crossref","first-page":"769","DOI":"10.1044\/jshr.3704.769","article-title":"Acoustic correlates of breathy vocal quality","volume":"37","author":"Hillenbrand","year":"1994","journal-title":"J. Speech, Lang. Hear. Res."},{"issue":"6","key":"10.1016\/j.specom.2026.103355_b12","doi-asserted-by":"crossref","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","article-title":"HuBERT: Self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu","year":"2021","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103355_b13","series-title":"Speech Prosody 2004, International Conference","first-page":"729","article-title":"Politeness and voice quality\u2014The alternative method to measure aspiration noise","author":"Ito","year":"2004"},{"key":"10.1016\/j.specom.2026.103355_b14","doi-asserted-by":"crossref","first-page":"33","DOI":"10.1016\/j.specom.2020.02.004","article-title":"Analysis and classification of phonation types in speech and singing voice","volume":"118","author":"Kadiri","year":"2020","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2026.103355_b15","series-title":"Proc. Interspeech 2018","first-page":"441","article-title":"Analysis and detection of phonation modes in singing voice using excitation source features and single frequency filtering cepstral coefficients (SFFCC)","author":"Kadiri","year":"2018"},{"key":"10.1016\/j.specom.2026.103355_b16","series-title":"Interspeech","first-page":"177","article-title":"Identifying regions of non-modal phonation using features of the wavelet transform","author":"Kane","year":"2011"},{"issue":"6","key":"10.1016\/j.specom.2026.103355_b17","doi-asserted-by":"crossref","first-page":"1170","DOI":"10.1109\/TASL.2013.2245653","article-title":"Wavelet maxima dispersion for breathy to tense voice discrimination","volume":"21","author":"Kane","year":"2013","journal-title":"IEEE Trans. Audio, Speech, Lang. Process."},{"key":"10.1016\/j.specom.2026.103355_b18","series-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"10.1016\/j.specom.2026.103355_b19","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Voc2vec: A foundation model for non-verbal vocalization","author":"Koudounas","year":"2025"},{"issue":"6","key":"10.1016\/j.specom.2026.103355_b20","doi-asserted-by":"crossref","DOI":"10.1121\/10.0026241","article-title":"Classification of phonation types in singing voice using wavelet scattering network-based features","volume":"4","author":"Mittapalle","year":"2024","journal-title":"JASA Express Lett."},{"key":"10.1016\/j.specom.2026.103355_b21","doi-asserted-by":"crossref","DOI":"10.1016\/j.jvoice.2024.11.016","article-title":"Tunable Q-factor wavelet transform-based features in the classification of phonation types in the singing and speaking voice","author":"Mittapalle","year":"2024","journal-title":"J. Voice"},{"key":"10.1016\/j.specom.2026.103355_b22","doi-asserted-by":"crossref","first-page":"35","DOI":"10.1016\/j.specom.2021.12.001","article-title":"Glottal flow characteristics in vowels produced by speakers with heart failure","volume":"137","author":"Mittapalle","year":"2022","journal-title":"Speech Commun."},{"key":"10.1016\/j.specom.2026.103355_b23","series-title":"2015 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"5206","article-title":"Librispeech: An ASR corpus based on public domain audio books","author":"Panayotov","year":"2015"},{"key":"10.1016\/j.specom.2026.103355_b24","series-title":"ICASSP 2021 \u2013 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"6024","article-title":"Layer-wise analysis of encoders in end-to-end speech recognition","author":"Pasad","year":"2021"},{"key":"10.1016\/j.specom.2026.103355_b25","first-page":"2825","article-title":"Scikit-learn: Machine learning in python","volume":"12","author":"Pedregosa","year":"2011","journal-title":"J. Mach. Learn. Res."},{"issue":"2","key":"10.1016\/j.specom.2026.103355_b26","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1080\/09298215.2013.821496","article-title":"Breathy, resonant, pressed-automatic detection of phonation mode from audio recordings of singing","volume":"42","author":"Proutskova","year":"2013","journal-title":"J. New Music Res."},{"key":"10.1016\/j.specom.2026.103355_b27","series-title":"Interspeech","first-page":"150","article-title":"Automatic classification of phonation modes in singing voice: Towards singing style characterisation and application to ethnomusicological recordings","author":"Rouas","year":"2016"},{"key":"10.1016\/j.specom.2026.103355_b28","series-title":"The Science of Singing Voice","author":"Sundberg","year":"1987"},{"issue":"1","key":"10.1016\/j.specom.2026.103355_b29","doi-asserted-by":"crossref","first-page":"4","DOI":"10.1016\/j.jvoice.2020.03.018","article-title":"Objective characterization of phonation type using amplitude of flow glottogram pulse and of voice source fundamental","volume":"36","author":"Sundberg","year":"2022","journal-title":"J. Voice"},{"key":"10.1016\/j.specom.2026.103355_b30","series-title":"Principles of Voice Production","author":"Titze","year":"1994"},{"issue":"1\u20133","key":"10.1016\/j.specom.2026.103355_b31","doi-asserted-by":"crossref","first-page":"120","DOI":"10.1159\/000021519","article-title":"Voice problems at work: A challenge for occupational safety and health arrangement","volume":"52","author":"Vilkman","year":"2000","journal-title":"Folia Phoniatr. et Logop."},{"key":"10.1016\/j.specom.2026.103355_b32","series-title":"Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","first-page":"38","article-title":"Transformers: State-of-the-art natural language processing","author":"Wolf","year":"2020"},{"key":"10.1016\/j.specom.2026.103355_b33","series-title":"SUPERB: Speech processing universal PERformance benchmark","author":"Yang","year":"2021"},{"key":"10.1016\/j.specom.2026.103355_b34","series-title":"Interspeech","first-page":"1849","article-title":"Voice quality and F0 cues for affect expression: Implications for synthesis","author":"Yanushevskaya","year":"2005"}],"container-title":["Speech Communication"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000038?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167639326000038?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T09:52:09Z","timestamp":1771926729000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167639326000038"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":34,"alternative-id":["S0167639326000038"],"URL":"https:\/\/doi.org\/10.1016\/j.specom.2026.103355","relation":{},"ISSN":["0167-6393"],"issn-type":[{"value":"0167-6393","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Classification of phonation types in singing and speaking voice using self-supervised learning models","name":"articletitle","label":"Article Title"},{"value":"Speech Communication","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.specom.2026.103355","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103355"}}