{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:16:17Z","timestamp":1776881777931,"version":"3.51.2"},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,16]],"date-time":"2023-12-16T00:00:00Z","timestamp":1702684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,16]]},"DOI":"10.1109\/asru57964.2023.10389791","type":"proceedings-article","created":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:38:40Z","timestamp":1705689520000},"page":"1-7","source":"Crossref","is-referenced-by-count":11,"title":["Transduce and Speak: Neural Transducer for Text-To-Speech with Semantic Token Prediction"],"prefix":"10.1109","author":[{"given":"Minchan","family":"Kim","sequence":"first","affiliation":[{"name":"Seoul National University,Department of Electrical and Computer Engineering and INMC,Seoul,South Korea"}]},{"given":"Myeonghun","family":"Jeong","sequence":"additional","affiliation":[{"name":"Seoul National University,Department of Electrical and Computer Engineering and INMC,Seoul,South Korea"}]},{"given":"Byoung Jin","family":"Choi","sequence":"additional","affiliation":[{"name":"Seoul National University,Department of Electrical and Computer Engineering and INMC,Seoul,South Korea"}]},{"given":"Dongjune","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University,Department of Electrical and Computer Engineering and INMC,Seoul,South Korea"}]},{"given":"Nam Soo","family":"Kim","sequence":"additional","affiliation":[{"name":"Seoul National University,Department of Electrical and Computer Engineering and INMC,Seoul,South Korea"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00618"},{"key":"ref4","article-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech","volume-title":"International Conference on Learning Representations","author":"Ren"},{"key":"ref5","first-page":"5530","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"International Conference on Machine Learning. PMLR","author":"Kim"},{"key":"ref6","first-page":"8599","article-title":"Grad-tts: A diffusion probabilistic model for text-to-speech","volume-title":"International Conference on Machine Learning. PMLR","author":"Popov"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref8","first-page":"8067","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume":"33","author":"Kim","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","article-title":"Sequence transduction with recurrent neural networks","author":"Graves","year":"2012","journal-title":"Representation Learning Worksop"},{"key":"ref10","article-title":"Initial investigation of an encoder-decoder end-to-end tts framework using marginalization of monotonic hard latent alignments","author":"Yasuda","year":"2019","journal-title":"arXiv preprint arXiv:1908.11535"},{"key":"ref11","first-page":"6621","article-title":"Speech-t: Transducer for text to speech and beyond","volume":"34","author":"Chen","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-225"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10340"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2020-3015"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-329"},{"key":"ref19","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023","journal-title":"arXiv preprint arXiv:2301.02111"},{"key":"ref20","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2022","journal-title":"arXiv preprint arXiv:2212.04356"},{"key":"ref21","article-title":"Speechbrain: A general-purpose speech toolkit","author":"Ravanelli","year":"2021","journal-title":"arXiv preprint arXiv:2106.04624"}],"event":{"name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Taipei, Taiwan","start":{"date-parts":[[2023,12,16]]},"end":{"date-parts":[[2023,12,20]]}},"container-title":["2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10388490\/10389614\/10389791.pdf?arnumber=10389791","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,23]],"date-time":"2024-01-23T16:59:29Z","timestamp":1706029169000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10389791\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,16]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/asru57964.2023.10389791","relation":{},"subject":[],"published":{"date-parts":[[2023,12,16]]}}}