{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:16:46Z","timestamp":1780355806391,"version":"3.54.1"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10887855","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T13:52:43Z","timestamp":1741787563000},"page":"1-5","source":"Crossref","is-referenced-by-count":4,"title":["Accelerating Codec-based Speech Synthesis with Multi-Token Prediction and Speculative Decoding"],"prefix":"10.1109","author":[{"given":"Tan Dat","family":"Nguyen","sequence":"first","affiliation":[{"name":"Korea Advanced Institute of Science and Technology,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ji-Hoon","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jeongsoo","family":"Choi","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shukjae","family":"Choi","sequence":"additional","affiliation":[{"name":"42dot Inc,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinseok","family":"Park","sequence":"additional","affiliation":[{"name":"42dot Inc,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Younglo","family":"Lee","sequence":"additional","affiliation":[{"name":"42dot Inc,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology,South Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"A survey on neural speech synthesis","author":"Tan","year":"2021"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1609.03499"},{"key":"ref3","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","volume-title":"Proc. ICML","author":"Skerry-Ryan"},{"key":"ref4","article-title":"Fastspeech: Fast, robust and controllable text to speech","author":"Ren","year":"2019","journal-title":"NeurIPS"},{"key":"ref5","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"Proc. ICML","author":"Kim"},{"key":"ref6","article-title":"Gradtts: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. ICML","author":"Popov"},{"key":"ref7","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3530270"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49660.2025.10890943"},{"key":"ref10","article-title":"Vall-e r: Robust and efficient zero-shot text-to-speech synthesis via monotonic alignment","author":"Han","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.673"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34703"},{"key":"ref13","article-title":"Cosyvoice: A scalable multilingual zeroshot text-to-speech synthesizer based on supervised semantic tokens","author":"Du","year":"2024"},{"key":"ref14","article-title":"Soundstorm: Efficient parallel audio generation","author":"Borsos","year":"2023"},{"key":"ref15","article-title":"Speechtokenizer: Unified speech tokenizer for speech large language models","author":"Zhang","year":"2023"},{"key":"ref16","article-title":"Uniaudio: An audio foundation model toward universal audio generation","volume-title":"Proc. ICML","author":"Yang"},{"key":"ref17","article-title":"Spoken question answering and speech continuation using spectrogram-powered LLM","volume-title":"Proc. ICLR","author":"Nachmani"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref19","article-title":"Listen, think, and understand","volume-title":"Proc. ICLR","author":"Gong"},{"key":"ref20","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref22","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref23","article-title":"Naturalspeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","author":"Ju","year":"2024","journal-title":"ICML"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/icassp48485.2024.10447120"},{"key":"ref26","article-title":"Common voice: A massively-multilingual speech corpus","author":"Ardila","year":"2020","journal-title":"LREC"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref28","article-title":"Vocos: Closing the gap between time-domain and fourier-based neural vocoders for high-quality audio synthesis","volume-title":"Proc. ICLR","author":"Siuzdak"},{"key":"ref29","article-title":"Base tts: Lessons from building a billion-parameter text-to-speech model on 100k hours of data","author":"\u0141ajszczak","year":"2024"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-845"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447251"},{"key":"ref32","article-title":"Audiopalm: A large language model that can speak and listen","author":"Rubenstein","year":"2023"},{"key":"ref33","article-title":"Uniaudio 1.5: Large language model-driven audio codec is a few-shot audio task learner","author":"Yang","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/577"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612061"},{"key":"ref36","article-title":"Spectral codecs: Spectrogram-based audio codecs for high quality speech synthesis","author":"Langman","year":"2024"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i24.34761"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2024.3506286"},{"key":"ref39","article-title":"Wavtokenizer: an efficient acoustic discrete codec tokenizer for audio language modeling","author":"Ji","year":"2024"},{"key":"ref40","article-title":"Clam-tts: Improving neural codec language model for zero-shot text-to-speech","author":"Kim","year":"2024","journal-title":"ICLR"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-67199-4_101002"},{"key":"ref42","article-title":"Hqvae: Hierarchical discrete representation learning with variational bayes","author":"Takida","year":"2024","journal-title":"Transactions on Machine Learning Research"},{"key":"ref43","article-title":"Fast inference from transformers via speculative decoding","volume-title":"Proc. ICML","author":"Leviathan"},{"key":"ref44","article-title":"Accelerating large language model decoding with speculative sampling","author":"Chen","year":"2023"},{"key":"ref45","article-title":"Medusa: Simple llm inference acceleration framework with multiple decoding heads","volume-title":"Proc. ICML","author":"Cai"},{"key":"ref46","article-title":"Better & faster large language models via multi-token prediction","volume-title":"Proc. ICML","author":"Gloeckle"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-439"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10887855.pdf?arnumber=10887855","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:23:50Z","timestamp":1774416230000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10887855\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10887855","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}