{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T04:04:32Z","timestamp":1768017872049,"version":"3.49.0"},"reference-count":38,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]},{"name":"IMDA, Singapore"},{"name":"National LLM Funding Initiative"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Signal Process. Lett."],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/lsp.2025.3528359","type":"journal-article","created":{"date-parts":[[2025,1,13]],"date-time":"2025-01-13T20:30:45Z","timestamp":1736800245000},"page":"776-780","source":"Crossref","is-referenced-by-count":1,"title":["PRESENT: Zero-Shot Text-to-Prosody Control"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9607-0756","authenticated-orcid":false,"given":"Perry","family":"Lam","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2528-273X","authenticated-orcid":false,"given":"Huayun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Infocomm Research, A*STAR, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0872-5877","authenticated-orcid":false,"given":"Nancy F.","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Infocomm Research, A*STAR, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8078-3305","authenticated-orcid":false,"given":"Berrak","family":"Sisman","sequence":"additional","affiliation":[{"name":"Johns Hopkins University, Baltimore, MD, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dorien","family":"Herremans","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(90)90021-Z"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref3","article-title":"Neural pitch-shifting and time-stretching with controllable LPCNet","author":"Morrison","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.876123"},{"key":"ref5","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2018"},{"key":"ref6","first-page":"3331","article-title":"Chive: Varying prosody in speech synthesis with a linguistically driven dynamic hierarchical conditional variational network","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kenter","year":"2019"},{"key":"ref7","article-title":"Speech synthesis markup language (SSML) version 1.1","author":"Shuang","year":"2010"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-465"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-384"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413864"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097074"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1757"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2024.3402088"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2679"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3027619"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1590"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1821"},{"key":"ref18","article-title":"Speak foreign languages with your own voice: Cross-lingual neural codec language modeling","author":"Zhang","year":"2023"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/575"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053520"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683501"},{"key":"ref23","article-title":"Improved prosodic clustering for multispeaker and speaker-independent phoneme-level prosody control","volume-title":"Proc. Speech Comput.:","volume":"12997","author":"Tsiakoulis","year":"2021"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2022.11.006"},{"key":"ref25","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. 9th Int. Conf. Learn. Representations","author":"Ren","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-307"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.26"},{"key":"ref28","first-page":"780","article-title":"Phoneme alignment: An exploration","volume-title":"Proc. 48th Annu. Meeting Assoc. Comput. Linguistics","author":"Jiampojamarn","year":"2010"},{"key":"ref29","article-title":"INTO-TTS: Intonation template based prosody control system","author":"Lee","year":"2022"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/SpeechProsody.2002-79"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10294"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1500"},{"key":"ref34","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. 40th Int. Conf. Mach. Learn.","author":"Radford","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095751"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.3758\/BRM.42.4.1096"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1177\/0098628316677643"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-9996"}],"container-title":["IEEE Signal Processing Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/97\/10802935\/10838710.pdf?arnumber=10838710","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,13]],"date-time":"2025-06-13T17:54:27Z","timestamp":1749837267000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10838710\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/lsp.2025.3528359","relation":{},"ISSN":["1070-9908","1558-2361"],"issn-type":[{"value":"1070-9908","type":"print"},{"value":"1558-2361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}