{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T19:48:14Z","timestamp":1768074494456,"version":"3.49.0"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228639","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-9","source":"Crossref","is-referenced-by-count":2,"title":["A Simple but Strong Baseline for Sounding Video Generation: Effective Adaptation of Audio and Video Diffusion Models for Joint Generation"],"prefix":"10.1109","author":[{"given":"Masato","family":"Ishii","sequence":"first","affiliation":[{"name":"Sony AI,Tokyo,Japan"}]},{"given":"Akio","family":"Hayakawa","sequence":"additional","affiliation":[{"name":"Sony AI,Tokyo,Japan"}]},{"given":"Takashi","family":"Shibuya","sequence":"additional","affiliation":[{"name":"Sony AI,Tokyo,Japan"}]},{"given":"Yuki","family":"Mitsufuji","sequence":"additional","affiliation":[{"name":"Sony AI \/ Sony Group Corp.,Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3626235"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3262180"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"ref5","article-title":"Any-to-any generation via composable diffusion","volume":"36","author":"Tang","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref6","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref7","article-title":"Denoising diffusion implicit models","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Song"},{"key":"ref8","first-page":"36 479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","article-title":"Video diffusion models","volume-title":"ICLR Workshop on Deep Generative Models for Highly Structured Data","author":"Ho"},{"key":"ref10","article-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning","author":"Guo","year":"2023"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref12","article-title":"Auto-encoding variational bayes","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Kingma"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref14","article-title":"Wavegrad: Estimating gradients for waveform generation","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Chen"},{"key":"ref15","article-title":"Diffwave: A versatile diffusion model for audio synthesis","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Kong"},{"key":"ref16","first-page":"21 450","article-title":"AudioLDM: Text-to-audio generation with latent diffusion models","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Liu"},{"key":"ref17","article-title":"Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Huang"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.264"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009820"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3005033"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12329"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3177894"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.336"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00240"},{"key":"ref25","article-title":"Diff-foley: Synchronized video-to-audio synthesis with latent diffusion models","volume":"36","author":"Luo","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref26","article-title":"Diffava: Personalized text-to-audio generation with visual alignment","author":"Mo","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00940"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447063"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29475"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4068"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref32","article-title":"Scaling autoregressive video models","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Weissenborn"},{"key":"ref33","article-title":"Videogpt: Video generation using vq-vae and transformers","author":"Yan","year":"2021"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28486"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_1"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680612"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0378"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00683"},{"key":"ref40","first-page":"13 213","article-title":"simple diffusion: End-to-end diffusion for high resolution images","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Hoogeboom"},{"key":"ref41","article-title":"Analog bits: Generating discrete data using diffusion models with self-conditioning","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Chen"},{"key":"ref42","article-title":"Llava-next: A strong zero-shot video understanding model","author":"Zhang","year":"2024"},{"key":"ref43","article-title":"Towards accurate generative models of video: A new metric & challenges","author":"Unterthiner","year":"2018"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"key":"ref46","article-title":"Classifier-free diffusion guidance","volume-title":"NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications","author":"Ho"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_3"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","location":"Rome, Italy","start":{"date-parts":[[2025,6,30]]},"end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228639.pdf?arnumber=11228639","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:27:54Z","timestamp":1763191674000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228639\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228639","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}