{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:19:18Z","timestamp":1776885558885,"version":"3.51.2"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,6,6]]},"DOI":"10.1109\/icassp39728.2021.9414499","type":"proceedings-article","created":{"date-parts":[[2021,5,13]],"date-time":"2021-05-13T19:53:45Z","timestamp":1620935625000},"page":"5694-5698","source":"Crossref","is-referenced-by-count":7,"title":["End-to-End Text-to-Speech Using Latent Duration Based on VQ-VAE"],"prefix":"10.1109","author":[{"given":"Yusuke","family":"Yasuda","sequence":"first","affiliation":[{"name":"National Institute of Informatics,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"National Institute of Informatics,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junichi","family":"Yamagishi","sequence":"additional","affiliation":[{"name":"National Institute of Informatics,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref31","article-title":"Wavenet: A generative model for raw audio","author":"van den oord","year":"2016"},{"key":"ref30","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"Proc 
ICLR"},{"key":"ref10","article-title":"DurIAN: Duration informed attention network for multimodal synthesis","author":"yu","year":"2019"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2123"},{"key":"ref12","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"2020","journal-title":"CoRR"},{"key":"ref13","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2014","journal-title":"Proc ICLR"},{"key":"ref14","first-page":"6306","article-title":"Neural discrete representation learning","author":"van den oord","year":"2017","journal-title":"Proc NIPS"},{"key":"ref15","article-title":"Glow-TTS: A generative flow for text-to-speech via monotonic alignment search","author":"kim","year":"2020","journal-title":"CoRR"},{"key":"ref16","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2004-460","article-title":"Hidden semi-markov model based speech synthesis","author":"zen","year":"2004","journal-title":"Proc INTERSPEECH"},{"key":"ref17","article-title":"Deep encoder-decoder models for unsupervised learning of controllable speech synthesis","author":"henter","year":"2018"},{"key":"ref18","first-page":"6699","article-title":"Generating diverse and natural text-to-speech samples using a quantized fine-grained vae and autoregressive prosody prior","author":"sun","year":"2020","journal-title":"Proc ICASSP"},{"key":"ref19","article-title":"Discretalk: Text-to-speech as a machine translation problem","author":"hayashi","year":"2020"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2571"},{"key":"ref4","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"2015","journal-title":"Proc NIPS"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/1143844.1143891"},{"key":"ref3","article-title":"Neural machine translation by jointly learning to align and 
translate","author":"bahdanau","year":"2015","journal-title":"Proc ICLR"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054106"},{"key":"ref29","doi-asserted-by":"crossref","first-page":"4006","DOI":"10.21437\/Interspeech.2017-1452","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang","year":"2017","journal-title":"Proc INTERSPEECH"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1972"},{"key":"ref8","first-page":"3165","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"Proc NIPS"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-38"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054119"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683277"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1198"},{"key":"ref24","first-page":"157","article-title":"A vector quantized variational autoencoder (VQ-VAE) autoregressive neural f0 model for statistical parametric speech synthesis","volume":"28","author":"wang","year":"2020","journal-title":"IEEE\/ACM Trans ASLP"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3232"},{"key":"ref26","article-title":"End-to-end text-to-speech using latent duration based on vq-vae","author":"yasuda","year":"2020","journal-title":"CoRR"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1615"}],"event":{"name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Toronto, ON, 
Canada","start":{"date-parts":[[2021,6,6]]},"end":{"date-parts":[[2021,6,11]]}},"container-title":["ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9413349\/9413350\/09414499.pdf?arnumber=9414499","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,27]],"date-time":"2022-12-27T08:27:03Z","timestamp":1672129623000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9414499\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,6]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/icassp39728.2021.9414499","relation":{},"subject":[],"published":{"date-parts":[[2021,6,6]]}}}