{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T07:35:36Z","timestamp":1780644936053,"version":"3.54.1"},"reference-count":35,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icasspw65056.2025.11011159","type":"proceedings-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T17:05:14Z","timestamp":1748365514000},"page":"1-5","source":"Crossref","is-referenced-by-count":4,"title":["Generative Data Augmentation Challenge: Zero-Shot Speech Synthesis for Personalized Speech Enhancement"],"prefix":"10.1109","author":[{"given":"Jae-Sung","family":"Bae","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Anastasia","family":"Kuznetsova","sequence":"additional","affiliation":[{"name":"Indiana University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dinesh","family":"Manocha","sequence":"additional","affiliation":[{"name":"University of Maryland"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"John","family":"Hershey","sequence":"additional","affiliation":[{"name":"Google Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Trausti","family":"Kristjansson","sequence":"additional","affiliation":[{"name":"Amazon Lab126"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minje","family":"Kim","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. on Machine Learning (ICLR)","author":"Radford"},{"key":"ref2","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. Int. Conf. on Machine Learning (ICLR)","author":"Kim"},{"key":"ref3","article-title":"YourTTS: Towards zero-shot multi-speaker TTS and zero-shot voice conversion for everyone","volume-title":"Proc. Int. Conf. on Machine Learning (ICML)","author":"Casanova"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"ref5","article-title":"Neural codec language models are zero-shot text to speech synthesizers","volume-title":"CoRR","volume":"abs\/2301.02111","author":"Wang","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-599"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747120"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096971"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096601"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-019-0197-0"},{"key":"ref11","article-title":"Effective data augmentation with diffusion models","volume-title":"Proc. Int. Conf. on Learning Representations (ICLR)","author":"Trabucco"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446098"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094777"},{"key":"ref15","article-title":"Utilizing TTS synthesized data for efficient development of keyword spotting model","volume-title":"CoRR","volume":"abs\/2407.18879","author":"Park","year":"2024"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/IEEECONF59524.2023.10476833"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref18","article-title":"MUSAN: A music, speech, and noise corpus","volume-title":"CoRR","volume":"abs\/1510.08484","author":"Snyder","year":"2015"},{"key":"ref19","article-title":"SpeechBrain: A general-purpose speech toolkit","volume-title":"CoRR","volume":"abs\/2106.04624","author":"Ravanelli","year":"2021"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2111"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-299"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389671"},{"key":"ref23","article-title":"NaturalSpeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","volume-title":"Proc. Int. Conf. on Machine Learning (ICML)","author":"Ju"},{"key":"ref24","article-title":"The T05 system for the VoiceMOS challenge 2024: Transfer learning from deep image classifier to naturalness MOS prediction of high-quality synthetic speech","volume-title":"CoRR","volume":"abs\/2409.09305","author":"Baba","year":"2024"},{"key":"ref25","article-title":"The VoiceMOS challenge 2024: Beyond speech quality prediction","volume-title":"CoRR","volume":"abs\/2409.07001","author":"Huang","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495701"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.393"},{"key":"ref30","article-title":"Better speech synthesis through scaling","volume-title":"CoRR","volume":"abs\/2305.07243","author":"Betker","year":"2023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2915167"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3181782"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"ref35","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. on Learning Representations (ICLR)","author":"Kingma"}],"event":{"name":"2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11010992\/11010997\/11011159.pdf?arnumber=11011159","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T04:53:05Z","timestamp":1748407985000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11011159\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":35,"URL":"https:\/\/doi.org\/10.1109\/icasspw65056.2025.11011159","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}