{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:43:10Z","timestamp":1763192590109,"version":"3.45.0"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228078","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Efficient and Fast Generative-Based Singing Voice Separation using a Latent Diffusion Model"],"prefix":"10.1109","author":[{"given":"Gen\u00eds","family":"Plaja-Roglans","sequence":"first","affiliation":[{"name":"Music.AI,Salt Lake City,Utah,United States"}]},{"given":"Yun-Ning","family":"Hung","sequence":"additional","affiliation":[{"name":"Music.AI,Salt Lake City,Utah,United States"}]},{"given":"Xavier","family":"Serra","sequence":"additional","affiliation":[{"name":"Universitat Pompeu Fabra,Music Technology Group,Barcelona,Spain"}]},{"given":"Igor","family":"Pereira","sequence":"additional","affiliation":[{"name":"Music.AI,Salt Lake City,Utah,United States"}]}],"member":"263","reference":[{"key":"ref1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"33th Advances in Neural Information Processing Systems (NeurIPS)","author":"Ho"},{"article-title":"Mo\u00fbsai: Text-to-music generation with long-context latent diffusion","year":"2023","author":"Schneider","key":"ref2"},{"key":"ref3","first-page":"12 652","article-title":"Fast timing-conditioned latent audio diffusion","volume-title":"Int. Conf. on Machine Learning (ICML)","author":"Evans"},{"article-title":"Universal speech enhancement with score-based diffusion","year":"2022","author":"Serr\u00e0","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095637"},{"article-title":"Multi-source diffusion models for simultaneous music generation and separation","volume-title":"The 12th Int. Conf. on Learning Representations","author":"Mariani","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-36"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889421"},{"key":"ref10","first-page":"745","article-title":"Singing voice separation with deep U-Net convolutional networks","volume-title":"18th Int. Society for Music Information Retrieval Conf. (ISMIR)","author":"Jansson"},{"article-title":"Carnatic singing voice separation using cold diffusion on training data with bleeding","volume-title":"24th Int. Society for Music Information Retrieval Conf. (ISMIR)","author":"Plaja-Roglans","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.5334\/tismir.171"},{"article-title":"Investigating U-Nets with various intermediate blocks for spectrogram-based singing voice separation","volume-title":"21th Int. Society for Music Information Retrieval Conf. (ISMIR)","author":"Choi","key":"ref13"},{"article-title":"KUIELab-MDX-Net: a two-stream neural network for music demixing","year":"2021","author":"Kim","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3219355"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10096956"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446843"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/EUSIPCO.2016.7760550"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746530"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3115"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053024"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"article-title":"High fidelity neural audio compression","year":"2022","author":"D\u00e9fossez","key":"ref23"},{"article-title":"Unsupervised source separation via bayesian inference in the latent domain","year":"2021","author":"Mancusi","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889006"},{"article-title":"Parallel and Flexible Sampling from Autoregressive Models via Langevin Dynamics","volume-title":"Int. Conf. on Learning Representations (ICLR)","author":"Jayaram","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095310"},{"key":"ref28","article-title":"Zero-shot duet singing voices separation with diffusion models","author":"Yu","year":"2023","journal-title":"Sound Demixing Workshop (SDX)"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2019.8937170"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2024.106762"},{"article-title":"Combining audio control and style transfer using latent diffusion","volume-title":"25th Int. Society for Music Information Retrieval Conf. (ISMIR)","author":"Demerl\u00e9","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2021.3129994"},{"article-title":"High-Fidelity Audio Compression with Improved RVQGAN","volume-title":"37th Advances in Neural Information Processing Systems (NeurIPS)","author":"Kumar","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888065"},{"key":"ref35","first-page":"1526","article-title":"From discrete tokens to high-fidelity audio using multi-band diffusion","volume":"36","author":"Roman","year":"2023","journal-title":"Advances in Neural Information Processing Systems (NeurIPS"},{"article-title":"Denoising diffusion implicit models","volume-title":"Int. Conf. on Learning Representations (ICLR)","author":"Song","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/MP.2006.1664069"},{"article-title":"MUSDB18: a corpus for music separation","year":"2017","author":"Rafii","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"article-title":"Diffwave: A versatile diffusion model for audio synthesis","volume-title":"9th Int. Conf. on Learning Representations (ICLR)","author":"Kong","key":"ref40"},{"article-title":"Progressive distillation for fast sampling of diffusion models","volume-title":"Int. Conf. on Learning Representations (ICLR)","author":"Salimans","key":"ref41"},{"article-title":"Diffusion Models Beat GANs on Image Synthesis","volume-title":"35th Advances on Neural Information Processing Systems (NeurIPS)","author":"Dhariwal","key":"ref42"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93764-9_28"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2023.3271145"},{"article-title":"Hybrid spectrogram and waveform source separation","year":"2021","author":"D\u00e9fossez","key":"ref45"},{"article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","volume-title":"Int. Conf. on Machine Learning (ICML)","author":"Liu","key":"ref46"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.5334\/jors.187"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228078.pdf?arnumber=11228078","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:39:15Z","timestamp":1763192355000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228078\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228078","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}