{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:37:33Z","timestamp":1763192253222,"version":"3.45.0"},"reference-count":43,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000266","name":"Engineering and Physical Sciences Research Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000266","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230934","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Towards Reliable Objective Evaluation Metrics for Generative Singing Voice Separation Models"],"prefix":"10.1109","author":[{"given":"Paul A.","family":"Bereuter","sequence":"first","affiliation":[{"name":"University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria"}]},{"given":"Benjamin","family":"Stahl","sequence":"additional","affiliation":[{"name":"University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria"}]},{"given":"Mark D.","family":"Plumbley","sequence":"additional","affiliation":[{"name":"University of Surrey,Centre for Vision, Speech and Signal Processing (CVSSP),UK"}]},{"given":"Alois","family":"Sontacchi","sequence":"additional","affiliation":[{"name":"University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74494-8_69"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93764-9_28"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.5334\/tismir.171"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2005.858005"},{"key":"ref5","first-page":"21 450","article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","volume-title":"Proc. ICML","author":"Liu"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3285241"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683130"},{"key":"ref8","first-page":"31 841","article-title":"Multi-source diffusion models for simultaneous music generation and separation","volume-title":"Proc. ICLR","author":"Mariani"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889421"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3252272"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462091"},{"volume-title":"A Matlab toolbox that computes perceptually motivated objective measures for the evaluation of audio source separation","year":"2012","author":"Vincent","key":"ref12"},{"article-title":"ViSQOL: The virtual speech quality objective listener","volume-title":"Proc. IWAENC","author":"Hines","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref16","first-page":"12 181","article-title":"MERT: Acoustic music understanding model with large-scale self-supervised training","volume-title":"Proc. ICLR","author":"LI"},{"key":"ref17","first-page":"111","article-title":"Music2Latent: Consistency autoencoders for latent audio compression","volume-title":"Proc. ISMIR","author":"Pasini"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446663"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA58266.2023.10248049"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888007"},{"article-title":"Meta Audiobox Aesthetics: Unified automatic quality assessment for speech, music, and sound","year":"2025","author":"Tjandra","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-325"},{"article-title":"SingMOS: An extensive open-source singing voice dataset for MOS prediction","year":"2024","author":"Tang","key":"ref24"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.3389\/frsip.2021.808395"},{"article-title":"Hybrid spectrogram and waveform source separation","volume-title":"Proceedings of the ISMIR 2021 Workshop on Music Source Separation","author":"D\u00e9fossez","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446843"},{"key":"ref29","first-page":"454","article-title":"Mel-RoFormer for vocal separation and vocal melody transcription","volume-title":"Proc. ISMIR","author":"Wang"},{"key":"ref30","article-title":"Mel-Band-Roformer-Vocal-Model","volume-title":"GitHub repository","author":"Jensen","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-153"},{"article-title":"BigVGAN: A universal neural vocoder with large-scale training","volume-title":"Proc. ICLR","author":"gil Lee","key":"ref32"},{"article-title":"The MUSDB18 corpus for music separation","year":"2017","author":"Rafii","key":"ref33"},{"key":"ref34","first-page":"619","article-title":"MoisesDB: A dataset for source separation beyond 4-stems","volume-title":"Proc. ISMIR","author":"Pereira"},{"article-title":"ITU-T Recommendation P.808: subjective evaluation of speech quality with a crowdsourcing approach","volume-title":"Tech. Rep.","year":"2021","key":"ref35"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2665"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.5334\/jors.187"},{"volume-title":"EBU R 128: Loudness normalisation and permitted maximum level of audio signals","year":"2023","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21105\/joss.04101"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747473"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX48832.2020.9123150"},{"article-title":"auraloss: Audio focused loss functions in PyTorch","volume-title":"Digital Music Research Network One-day Workshop (DMRN+15)","author":"Steinmetz","key":"ref42"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.2307\/1412159"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230934.pdf?arnumber=11230934","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:32:31Z","timestamp":1763191951000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230934\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230934","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}