{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:37:37Z","timestamp":1763192257546,"version":"3.45.0"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230970","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["SpecMaskFoley: Steering Pretrained Spectral Masked Generative Transformer Toward Synchronized Video-to-audio Synthesis via ControlNet"],"prefix":"10.1109","author":[{"given":"Zhi","family":"Zhong","sequence":"first","affiliation":[{"name":"Sony Group Corporation,Japan"}]},{"given":"Akira","family":"Takahashi","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Japan"}]},{"given":"Shuyang","family":"Cui","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Japan"}]},{"given":"Keisuke","family":"Toyama","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Japan"}]},{"given":"Shusuke","family":"Takahashi","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Japan"}]},{"given":"Yuki","family":"Mitsufuji","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Japan"}]}],"member":"263","reference":[{"article-title":"Audiogen: Textually guided audio 
generation","year":"2022","author":"Kreuk","key":"ref1"},{"article-title":"Make-an-audio 2: Temporal-enhanced text-to-audio generation","year":"2023","author":"Huang","key":"ref2"},{"article-title":"Specmaskgit: Masked generative modeling of audio spectrograms for efficient audio synthesis and beyond","year":"2024","author":"Comunit\u00e0","key":"ref3"},{"article-title":"Soundctm: Uniting score-based and consistency models for text-to-sound generation","year":"2024","author":"Saito","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888461"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02691"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"article-title":"Visual echoes: A simple unified transformer for audio-visual generation","year":"2024","author":"Yang","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29475"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00683"},{"article-title":"Fr\u00e9chet audio distance: A metric for evaluating music enhancement algorithms","year":"2018","author":"Kilgour","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448489"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref14","first-page":"11127","article-title":"Uni-controlnet: All-in-one control to text-to-image diffusion models","volume-title":"Proc. 
NeurIPS 2023","volume":"36","author":"Zhao"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i17.33934"},{"article-title":"Foleycrafter: Bring silent videos to life with lifelike and synchronized sounds","year":"2024","author":"Zhang","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3213"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890587"},{"key":"ref19","first-page":"128118","article-title":"Frieren: Efficient video-to-audio generation network with rectified flow matching","volume-title":"Proc. NeurIPS 2024","volume":"37","author":"Wang"},{"key":"ref20","first-page":"20371","article-title":"Mavil: Masked audio-video learners","volume-title":"Proc. NeurIPS 2023","volume":"36","author":"Huang"},{"key":"ref21","first-page":"48855","article-title":"Diff-foley: Synchronized video-to-audio synthesis with latent diffusion models","volume-title":"Proc. NeurIPS 2023","volume":"36","author":"Luo"},{"article-title":"Stable-v2a: Synthesis of synchronized sound effects with temporal and semantic controls","year":"2024","author":"Gramaccioni","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890403"},{"key":"ref24","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. ICML 2021","author":"Radford"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01103"},{"article-title":"Eva-clip: Improved training techniques for clip at scale","year":"2023","author":"Sun","key":"ref26"},{"article-title":"Music foundation model as generic booster for music downstream tasks","year":"2024","author":"Liao","key":"ref27"},{"article-title":"High fidelity neural audio compression","year":"2022","author":"D\u00e9fossez","key":"ref28"},{"key":"ref29","first-page":"27980","article-title":"High-fidelity audio compression with improved rvqgan","volume-title":"Proc. 
NeurIPS 2023","volume":"36","author":"Kumar"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.336"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA58266.2023.10248171"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00213"},{"article-title":"Audioldm: Text-to-audio generation with latent diffusion models","year":"2023","author":"Liu","key":"ref34"},{"article-title":"Pixart-\u03b4: Fast and controllable image generation with latent consistency models","year":"2024","author":"Chen","key":"ref35"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49660.2025.10888146"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890309"},{"article-title":"Classifier-free diffusion guidance","year":"2022","author":"Ho","key":"ref38"},{"article-title":"Muse: Text-to-image generation via masked generative transformers","year":"2023","author":"Chang","key":"ref39"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446088"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-227"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref46","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"NeurIPS","volume":"33","author":"Kong"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"}],"event":{"name":"2025 IEEE Workshop on Applications of 
Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230970.pdf?arnumber=11230970","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:32:42Z","timestamp":1763191962000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230970\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230970","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}