{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T10:24:46Z","timestamp":1766053486933,"version":"3.48.0"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,29]],"date-time":"2025-10-29T00:00:00Z","timestamp":1761696000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,29]],"date-time":"2025-10-29T00:00:00Z","timestamp":1761696000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001663","name":"Volkswagen Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001663","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,29]]},"DOI":"10.1109\/is264627.2025.11284630","type":"proceedings-article","created":{"date-parts":[[2025,12,16]],"date-time":"2025-12-16T18:30:42Z","timestamp":1765909842000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["An Octave-based Multi-Resolution CQT Architecture for Diffusion-based Audio Generation"],"prefix":"10.1109","author":[{"given":"Maur\u00edcio do V. M.","family":"da Costa","sequence":"first","affiliation":[{"name":"University of Osnabr\u00fcck,MTDML, IMM,Osnabr\u00fcck,Germany"}]},{"given":"Eloi","family":"Moliner","sequence":"additional","affiliation":[{"name":"Aalto University,Acoustics Lab, DICE,Espoo,Finland"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3138870"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.5334\/tismir.251"},{"key":"ref3","first-page":"4352","article-title":"Adversarial generation of time-frequency features with application in audio synthesis","volume-title":"Proc. ICML. 
PMLR","author":"Marafioti"},{"article-title":"Adversarial audio synthesis","volume-title":"Proc. ICLR","author":"Donahue","key":"ref4"},{"article-title":"Gansynth: Adversarial neural audio synthesis","volume-title":"Proc. ICLR","author":"Engel","key":"ref5"},{"article-title":"Timbretron: A wavenet (cyclegan (cqt (audio))) pipeline for musical timbre transfer","volume-title":"International Conference on Learning Representations","author":"Huang","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3285241"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095637"},{"key":"ref9","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"article-title":"Score-based generative modeling through stochastic differential equations","volume-title":"Proc. ICLR","author":"Song","key":"ref10"},{"key":"ref11","first-page":"26565","article-title":"Elucidating the design space of diffusion-based generative models","volume":"35","author":"Karras","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref12","first-page":"8599","article-title":"Gradtts: A diffusion probabilistic model for text-to-speech","volume-title":"Proc. ICML","author":"Popov"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-469"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2024.3445871"},{"key":"ref15","first-page":"21450","article-title":"Audioldm: Text-to-audio generation with latent diffusion models","volume-title":"Proc. 
ICML","author":"Liu"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888461"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.17743\/jaes.2022.0129"},{"article-title":"Wavegrad: Estimating gradients for waveform generation","volume-title":"International Conference on Learning Representations","author":"Chen","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096760"},{"key":"ref20","article-title":"Crash: Raw audio score-based generative modeling for controllable high-resolution drum sound synthesis","author":"Rouard","year":"2021","journal-title":"ISMIR"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-301"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399026"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01042"},{"article-title":"Mo\u00fbsai: Text-to-music generation with long-context latent diffusion","year":"2023","author":"Schneider","key":"ref24"},{"article-title":"Diffa-riff: Musical accompaniment co-creation via latent diffusion models","year":"2024","author":"Nistal","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"article-title":"Generative modelling in latent space","year":"2025","author":"Dieleman","key":"ref27"},{"article-title":"A survey on diffusion models for inverse problems","year":"2024","author":"Daras","key":"ref28"},{"article-title":"A2sb: Audio-to-audio schrodinger bridges","year":"2025","author":"Kong","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3574988"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1162\/NECO_a_00142"},{"article-title":"Flow matching for generative modeling","volume-title":"The Eleventh International Conference on Learning Representations","author":"Lipman","key":"ref32"},{"author":"Velasco","key":"ref33","article-title":"Constructing an 
invertible constant-q transform with non-stationary gabor frames"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2234114"},{"key":"ref35","first-page":"7537","article-title":"Fourier features let networks learn high frequency functions in low dimensional domains","volume-title":"Proc. NeurIPS","volume":"33","author":"Tancik"},{"article-title":"Fma: A dataset for music analysis","volume-title":"18th International Society for Music Information Retrieval Conference (ISMIR)","author":"Defferrard","key":"ref36"},{"key":"ref37","doi-asserted-by":"crossref","DOI":"10.1145\/3474085.3475437","article-title":"Multi-singer: Fast multi-singer singing voice vocoder with a large-scale corpus","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia (ACM MM)","author":"Huang"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref39","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP48485.2024.10446663","article-title":"Adapting frechet audio distance for generative music evaluation","volume-title":"Proc. 
ICASSP","author":"Gui"}],"event":{"name":"2025 IEEE 6th International Symposium on the Internet of Sounds (IS2)","start":{"date-parts":[[2025,10,29]]},"location":"L'Aquila, Italy","end":{"date-parts":[[2025,10,31]]}},"container-title":["2025 IEEE 6th International Symposium on the Internet of Sounds (IS2)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11283123\/11283645\/11284630.pdf?arnumber=11284630","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T10:20:09Z","timestamp":1766053209000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11284630\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,29]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/is264627.2025.11284630","relation":{},"subject":[],"published":{"date-parts":[[2025,10,29]]}}}