{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T06:15:34Z","timestamp":1774419334272,"version":"3.50.1"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001321","name":"National Research Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001321","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002551","name":"Seoul National University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002551","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002551","name":"Seoul National University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002551","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10888403","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T17:15:02Z","timestamp":1741799702000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["TokenSynth: A Token-based Neural Synthesizer for Instrument Cloning and Text-to-Instrument"],"prefix":"10.1109","author":[{"given":"Kyungsu","family":"Kim","sequence":"first","affiliation":[{"name":"Seoul National University,Music and Audio Research Group (MARG),Department of Intelligence and Information"}]},{"given":"Junghyun","family":"Koo","sequence":"additional","affiliation":[{"name":"Seoul National University,Music and Audio Research Group (MARG),Department of Intelligence and Information"}]},{"given":"Sungho","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University,Music and Audio Research Group (MARG),Department of Intelligence and Information"}]},{"given":"Haesun","family":"Joung","sequence":"additional","affiliation":[{"name":"Seoul National University,Music and Audio Research Group (MARG),Department of Intelligence and Information"}]},{"given":"Kyogu","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University,Music and Audio Research Group (MARG),Department of Intelligence and Information"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref2","article-title":"Audiogen: Textually guided audio generation","volume-title":"The Eleventh International Conference on Learning Representations, ICLR 2023","author":"Kreuk"},{"key":"ref3","article-title":"Simple and controllable music generation","volume":"36","author":"Copet","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref4","article-title":"Musiclm: Generating music from text","author":"Agostinelli","year":"2023"},{"key":"ref5","first-page":"21450","article-title":"Audioldm: Text-to-audio generation with latent diffusion models","volume-title":"International Conference on Machine Learning","author":"Liu"},{"key":"ref6","article-title":"Diffwave: A versatile diffusion model for audio synthesis","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Kong"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3626235"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref10","article-title":"High fidelity neural audio compression","volume":"2023","author":"D\u00e9fossez","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref11","article-title":"High-fidelity audio compression with improved rvqgan","volume":"36","author":"Kumar","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399026"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ojsp.2025.3534686"},{"key":"ref14","first-page":"1068","article-title":"Neural audio synthesis of musical notes with wavenet autoencoders","volume-title":"International Conference on Machine Learning","author":"Engel"},{"key":"ref15","article-title":"Gansynth: Adversarial neural audio synthesis","volume-title":"7th International Conference on Learning Representations, ICLR 2019","author":"Engel"},{"key":"ref16","article-title":"DDSP: differentiable digital signal processing","volume-title":"8th International Conference on Learning Representations, ICLR 2020","author":"Engel"},{"key":"ref17","article-title":"Sing: Symbol-to-instrument neural generator","volume":"31","author":"D\u00e9fossez","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683596"},{"key":"ref19","first-page":"254","article-title":"Neural waveshaping synthesis","volume-title":"Proceedings of the 22nd International Society for Music Information Retrieval Conference, ISMIR 2021","author":"Hayes"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746940"},{"key":"ref21","first-page":"608","article-title":"DDX7: differentiable FM synthesis of musical instrument sounds","volume-title":"Proceedings of the 23rd International Society for Music Information Retrieval Conference, ISMIR 2022","author":"Caspe"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3389\/frsip.2023.1284100"},{"key":"ref23","article-title":"Neural music instrument cloning from few samples","volume-title":"25th International Conference on Digital Audio Effects (DAFx20in22)","author":"Jonason"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097250"},{"key":"ref25","article-title":"Instrumentgen: Generating sample-based musical instruments from text","author":"Nercessian","year":"2023"},{"key":"ref26","article-title":"Generating sample-based musical instruments using neural audio codec language models","author":"Nercessian","year":"2024"},{"key":"ref27","volume-title":"Learning-based methods for comparing sequences, with applications to audio-to-midi alignment and matching","author":"Raffel","year":"2016"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097162"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref30","article-title":"Classifier-free diffusion guidance","author":"Ho","year":"2022"},{"key":"ref31","article-title":"Neural discrete representation learning","volume":"30","author":"Van Den Oord","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/83.480761"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01123"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.17487\/rfc3003"},{"key":"ref35","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref36","article-title":"MT3: multi-task multitrack music transcription","volume-title":"The Tenth International Conference on Learning Representations, ICLR 2022","author":"Gardner"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096458"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"key":"ref39","article-title":"PyTorch Lightning","author":"Falcon","year":"2019"},{"key":"ref40","article-title":"Mixed precision training","volume-title":"6th International Conference on Learning Representations, ICLR 2018","author":"Micikevicius"},{"key":"ref41","first-page":"2014","article-title":"Mir eval: A transparent implementation of common mir metrics","volume":"10","author":"Raffel","year":"2014","journal-title":"ISMIR"},{"key":"ref42","first-page":"17612","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","volume":"35","author":"Liang","year":"2022","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10888403.pdf?arnumber=10888403","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:22:20Z","timestamp":1774416140000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10888403\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10888403","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}