{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:27:04Z","timestamp":1775230024774,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,9,26]],"date-time":"2022-09-26T00:00:00Z","timestamp":1664150400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,9,26]],"date-time":"2022-09-26T00:00:00Z","timestamp":1664150400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,9,26]]},"DOI":"10.1109\/mmsp55362.2022.9948936","type":"proceedings-article","created":{"date-parts":[[2022,11,22]],"date-time":"2022-11-22T21:39:16Z","timestamp":1669153156000},"page":"1-6","source":"Crossref","is-referenced-by-count":4,"title":["Singing Voice Synthesis with Vibrato Modeling and Latent Energy Representation"],"prefix":"10.1109","author":[{"given":"Yingjie","family":"Song","sequence":"first","affiliation":[{"name":"School of Communication and Information Engineering, Shanghai University,Shanghai,China"}]},{"given":"Wei","family":"Song","sequence":"additional","affiliation":[{"name":"JD AI Research,Beijing,China"}]},{"given":"Wei","family":"Zhang","sequence":"additional","affiliation":[{"name":"JD AI Research,Beijing,China"}]},{"given":"Zhengchen","family":"Zhang","sequence":"additional","affiliation":[{"name":"JD AI Research,Beijing,China"}]},{"given":"Dan","family":"Zeng","sequence":"additional","affiliation":[{"name":"School of Communication and Information Engineering, Shanghai University,Shanghai,China"}]},{"given":"Zhi","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Communication and Information Engineering, Shanghai University,Shanghai,China"}]},{"given":"Yang","family":"Yu","sequence":"additional","affiliation":[{"name":"Shanghai Conservatory of Music,Shanghai,China"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2019.8903099"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1761"},{"key":"ref33","article-title":"Attention Is All You Need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref32","first-page":"425","article-title":"Singing-voice Synthesis Using ANN Vibrato-parameter Models","volume":"30","author":"gu","year":"2014","journal-title":"J Inf Sci Eng"},{"key":"ref31","first-page":"23","article-title":"A Study of Vibrato Features to Control Singing Voices","author":"migita","year":"0","journal-title":"Proc ICA"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1121\/1.383733"},{"key":"ref37","article-title":"Adam: A Method for Stochastic Optimization","author":"kingma","year":"2014","journal-title":"ArXiv Preprint"},{"key":"ref36","article-title":"Deep Learning using Rectified Linear Units (ReLu)","author":"agarap","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref35","first-page":"448","article-title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift","author":"ioffe","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2013.6694316"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP48831.2020.9287168"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1862"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2016.7813352"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2008.4665203"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2002.1203295"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.apacoust.2003.07.005"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2006.876756"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054582"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2015.2424572"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1563"},{"key":"ref19","article-title":"Fast and Flexible Neural Audio Synthesis","author":"hantrakul","year":"2019","journal-title":"ISMIR"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414348"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053944"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414727"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-36"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403249"},{"key":"ref29","first-page":"291","article-title":"Replicability and accuracy of pitch patterns in professional singers","author":"sundberg","year":"1996","journal-title":"Vocal fold physiology controlling complexity and chaos"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref7","article-title":"Recent development of the hmm-based singing voice synthesis system-sinsy","author":"oura","year":"0","journal-title":"Seventh ISCA Workshop on Speech Synthesis"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683154"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414043"},{"key":"ref9","article-title":"FastSpeech 2: Fast and High-Quality End-to-End Text to Speech","author":"ren","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.2307\/3681274"},{"key":"ref22","article-title":"Generating singing voice expression contours based on unit selection","author":"umbert","year":"0","journal-title":"Proc SMAC"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-07467-2_27"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054707"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2015.04.001"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362104"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362077"}],"event":{"name":"2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)","location":"Shanghai, China","start":{"date-parts":[[2022,9,26]]},"end":{"date-parts":[[2022,9,28]]}},"container-title":["2022 IEEE 24th International Workshop on Multimedia Signal Processing (MMSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9948698\/9948704\/09948936.pdf?arnumber=9948936","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,12]],"date-time":"2022-12-12T19:54:37Z","timestamp":1670874877000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9948936\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,26]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/mmsp55362.2022.9948936","relation":{},"subject":[],"published":{"date-parts":[[2022,9,26]]}}}