{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:23:30Z","timestamp":1763191410476,"version":"3.45.0"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11229067","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["FoleyGRAM: Video-to-Audio Generation with GRAM-Aligned Multimodal Encoders"],"prefix":"10.1109","author":[{"given":"Riccardo F.","family":"Gramaccioni","sequence":"first","affiliation":[{"name":"Sapienza University of Rome,Dept. Information Engineering, Electronics and Telecommunications (DIET),Italy"}]},{"given":"Christian","family":"Marinoni","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. Information Engineering, Electronics and Telecommunications (DIET),Italy"}]},{"given":"Eleonora","family":"Grassucci","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. Information Engineering, Electronics and Telecommunications (DIET),Italy"}]},{"given":"Giordano","family":"Cicchetti","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. Information Engineering, Electronics and Telecommunications (DIET),Italy"}]},{"given":"Aurelio","family":"Uncini","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. Information Engineering, Electronics and Telecommunications (DIET),Italy"}]},{"given":"Danilo","family":"Comminiello","sequence":"additional","affiliation":[{"name":"Sapienza University of Rome,Dept. Information Engineering, Electronics and Telecommunications (DIET),Italy"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399026"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447063"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00374"},{"article-title":"Semantically consistent video-to-audio generation using multimodal language large model","year":"2024","author":"Chen","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096353"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3177894"},{"article-title":"Relative representations enable zero-shot latent space communication","volume-title":"International Conference on Learning Representations","author":"Moschella","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1814"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"article-title":"LanguageBind: Extending video-language pretraining to n-modality by language-based semantic alignment","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zhu","key":"ref10"},{"key":"ref11","article-title":"Gramian multimodal representation learning and alignment","author":"Cicchetti","year":"2025","journal-title":"ICLR"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096023"},{"key":"ref13","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning (ICML)","volume":"139","author":"Radford"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009820"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.336"},{"key":"ref16","first-page":"48855","article-title":"Diff-Foley: Synchronized video-to-audio synthesis with latent diffusion models","volume":"36","author":"Luo","year":"2023","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00240"},{"article-title":"Stable-V2A: Synthesis of synchronized sound effects with temporal and semantic controls","year":"2024","author":"Gramaccioni","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447380"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i8.26153"},{"key":"ref22","article-title":"VAST: A vision-audio-subtitle-text omni-modality foundation model and dataset","author":"Chen","year":"2023","journal-title":"Neural Information Processing Systems (NeurIPS)"},{"article-title":"Representation learning with contrastive predictive coding","year":"2018","author":"van den Oord","key":"ref23"},{"volume-title":"Matrix theory","year":"1959","author":"Gantmacher","key":"ref24"},{"article-title":"EVA-CLIP: Improved training techniques for clip at scale","year":"2023","author":"Sun","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/vl\/N19-142"},{"key":"ref27","first-page":"5178","article-title":"BEATs: Audio pre-training with acoustic tokenizers","volume-title":"International Conference on Machine Learning","author":"Chen"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49660.2025.10888461"},{"article-title":"Fast timing-conditioned latent audio diffusion","year":"2024","author":"Evans","key":"ref29"},{"article-title":"PixArt-\u03b1: Fast training of diffusion transformer for photorealistic text-to-image synthesis","volume-title":"International Conference on Learning Representations (ICLR)","author":"Chen","key":"ref30"},{"article-title":"Long-form music generation with latent diffusion","year":"2024","author":"Evans","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.264"},{"article-title":"Stable-v2a: Synthesis of synchronized sound effects with temporal and semantic controls","year":"2024","author":"Gramaccioni","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO63174.2024.10714935"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72986-7_17"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3597477"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11229067.pdf?arnumber=11229067","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:20:26Z","timestamp":1763191226000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11229067\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11229067","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}