{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:11:44Z","timestamp":1775200304033,"version":"3.50.1"},"reference-count":23,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434790","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["Controllable Singing Voice Synthesis using Phoneme-Level Energy Sequence"],"prefix":"10.1109","author":[{"given":"Yerin","family":"Ryu","sequence":"first","affiliation":[{"name":"Korea University,Department of Artificial Intelligence,Seoul,Korea"}]},{"given":"Inseop","family":"Shin","sequence":"additional","affiliation":[{"name":"Korea University,Department of Artificial Intelligence,Seoul,Korea"}]},{"given":"Chanwoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea University,Department of Artificial Intelligence,Seoul,Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3394769"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.268"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34571"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2015.2424572"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399026"},{"key":"ref6","article-title":"MIDI-DDSP: Detailed control of musical performance via hierarchical modeling","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Wu"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747664"},{"key":"ref9","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end textto-speech","volume-title":"International Conference on Machine Learning.","author":"Kim"},{"key":"ref10","doi-asserted-by":"crossref","article-title":"RMSSinger: Realistic-Music-Score based Singing Voice Synthesis","author":"He","DOI":"10.18653\/v1\/2023.findings-acl.16"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447981"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-678"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681642"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446786"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1720"},{"key":"ref16","first-page":"68406851","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref17","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Ren"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10585"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414043"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP55362.2022.9948936"},{"key":"ref21","article-title":"HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Jungil","year":"2020"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0034"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27774"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434790.pdf?arnumber=11434790","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:48Z","timestamp":1775192388000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434790\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":23,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434790","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}