{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:48:53Z","timestamp":1777657733254,"version":"3.51.4"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"crossref","award":["2022ZD0161600"],"award-info":[{"award-number":["2022ZD0161600"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"crossref","award":["62376237"],"award-info":[{"award-number":["62376237"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Shenzhen Science and Technology Program","award":["ZDSYS20230626091302006"],"award-info":[{"award-number":["ZDSYS20230626091302006"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s11263-025-02649-3","type":"journal-article","created":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T09:59:03Z","timestamp":1767779943000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["FoleyCrafter: Bring Silent Videos to Life with Lifelike and Synchronized Sounds"],"prefix":"10.1007","volume":"134","author":[{"given":"Yiming","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yicheng","family":"Gu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3596-5163","authenticated-orcid":false,"given":"Yanhong","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Zhening","family":"Xing","sequence":"additional","affiliation":[]},{"given":"Yuancheng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhizheng","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Bin","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Kai","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,7]]},"reference":[{"key":"2649_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al. (2023). Gpt-4 technical report. arXiv:2303.08774 ."},{"key":"2649_CR2","doi-asserted-by":"publisher","DOI":"10.4324\/9780203766880","volume-title":"The Foley grail: The art of performing sound for film, games, and animation","author":"VT Ament","year":"2014","unstructured":"Ament, V. T. (2014). The Foley grail: The art of performing sound for film, games, and animation. Routledge."},{"key":"2649_CR3","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., & Zisserman, A. (2020). Vggsound: A large-scale audio-visual dataset. In ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 
721\u2013725. IEEE.","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"2649_CR4","unstructured":"Chen, J., Zhu, D., Shen, X., Li, X., Liu, Z., Zhang, P., Krishnamoorthi, R., Chandra, V., Xiong, Y., & Elhoseiny, M. (2023). Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv:2310.09478 ."},{"key":"2649_CR5","doi-asserted-by":"crossref","unstructured":"Chen, K., Zhang, C., Fang, C., Wang, Z., Bui, T., & Nevatia, R. (2018). Visually indicated sound generation by perceptually optimized classification. In Proceedings of the European Conference on Computer Vision (ECCV) Workshops, pp. 0\u20130.","DOI":"10.1007\/978-3-030-11024-6_43"},{"key":"2649_CR6","doi-asserted-by":"publisher","first-page":"8292","DOI":"10.1109\/TIP.2020.3009820","volume":"29","author":"P Chen","year":"2020","unstructured":"Chen, P., Zhang, Y., Tan, M., Xiao, H., Huang, D., & Gan, C. (2020). Generating visually aligned sound from videos. IEEE Transactions on Image Processing,29, 8292\u20138302.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2649_CR7","doi-asserted-by":"crossref","unstructured":"Cherti, M., Beaumont, R., Wightman, R., Wortsman, M., Ilharco, G., Gordon, C., Schuhmann, C., Schmidt, L., & Jitsev, J. (2023). Reproducible scaling laws for contrastive language-image learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132829.","DOI":"10.1109\/CVPR52729.2023.00276"},{"issue":"70","key":"2649_CR8","first-page":"1","volume":"25","author":"HW Chung","year":"2024","unstructured":"Chung, H. W., Hou, L., Longpre, S., Zoph, B., Tay, Y., Fedus, W., Li, Y., Wang, X., Dehghani, M., Brahma, S., et al. (2024). Scaling instruction-finetuned language models. Journal of Machine Learning Research,25(70), 1\u201353.","journal-title":"Journal of Machine Learning Research"},{"key":"2649_CR9","doi-asserted-by":"crossref","unstructured":"Chung, Y., Lee, J., & Nam, J. (2024). T-foley: A controllable waveform-domain diffusion model for temporal-event-guided foley sound synthesis. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 6820\u20136824. IEEE.","DOI":"10.1109\/ICASSP48485.2024.10447380"},{"key":"2649_CR10","doi-asserted-by":"crossref","unstructured":"Comunit\u00e0, M., Gramaccioni, R.F., Postolache, E., Rodol\u00e0, E., Comminiello, D., & Reiss, J.D. (2024). Syncfusion: Multimodal onset-synchronized video-to-audio foley synthesis. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 936\u2013940. IEEE.","DOI":"10.1109\/ICASSP48485.2024.10447063"},{"key":"2649_CR11","unstructured":"contributors, W. (2024). Foley (filmmaking). Accessed on May 11, 2024."},{"key":"2649_CR12","doi-asserted-by":"crossref","unstructured":"Dong, H.W., Liu, X., Pons, J., Bhattacharya, G., Pascual, S., Serr\u00e0, J., Berg-Kirkpatrick, T., & McAuley, J. (2023). Clipsonic: Text-to-audio synthesis with unlabeled videos and pretrained language-vision models. In 2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), pp. 1\u20135. IEEE.","DOI":"10.1109\/WASPAA58266.2023.10248160"},{"key":"2649_CR13","doi-asserted-by":"crossref","unstructured":"Du, Y., Chen, Z., Salamon, J., Russell, B., & Owens, A. (2023). Conditional generation of audio from video via foley analogies. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
2426\u20132436.","DOI":"10.1109\/CVPR52729.2023.00240"},{"key":"2649_CR14","unstructured":"Freesound Project. (2024). Freesound. https:\/\/freesound.org\/. Accessed: 2024-04-12."},{"key":"2649_CR15","doi-asserted-by":"crossref","unstructured":"Ghosal, D., Majumder, N., Mehrish, A., & Poria, S. (2023). Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv:2304.13731.","DOI":"10.1145\/3581783.3612348"},{"key":"2649_CR16","doi-asserted-by":"crossref","unstructured":"Girdhar, R., El-Nouby, A., Liu, Z., Singh, M., Alwala, K.V., Joulin, A., & Misra, I. (2023). Imagebind: One embedding space to bind them all. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15180\u201315190.","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"2649_CR17","doi-asserted-by":"publisher","first-page":"18153","DOI":"10.1609\/aaai.v38i16.29773","volume":"38","author":"Z Guo","year":"2024","unstructured":"Guo, Z., Mao, J., Tao, R., Yan, L., Ouchi, K., Liu, H., & Wang, X. (2024). Audio generation with multiple conditional diffusion model. In Proceedings of the AAAI Conference on Artificial Intelligence,38, 18153\u201318161.","journal-title":"In Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2649_CR18","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., & Hochreiter, S. (2017). Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems\u00a030 ."},{"key":"2649_CR19","unstructured":"Huang, R., Huang, J., Yang, D., Ren, Y., Liu, L., Li, M., Ye, Z., Liu, J., Yin, X., & Zhao Z. (2023). Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. In International Conference on Machine Learning, pp. 13916\u201313932. PMLR."},{"key":"2649_CR20","doi-asserted-by":"crossref","unstructured":"Iashin, V., & Rahtu, E. (2021). Taming visually guided sound generation. In The 32st British Machine Vision Virtual Conference. BMVA Press.","DOI":"10.5244\/C.35.336"},{"key":"2649_CR21","unstructured":"Iashin, V., Xie, W., Rahtu, E., & Zisserman, A. (2022). Sparse in space and time: Audio-visual synchronisation with trainable selectors. arXiv:2210.07055 ."},{"key":"2649_CR22","doi-asserted-by":"crossref","unstructured":"Ishii, M., Hayakawa, A., Shibuya, T., & Mitsufuji, Y. (2024). A simple but strong baseline for sounding video generation: Effective adaptation of audio and video diffusion models for joint generation. arXiv:2409.17550.","DOI":"10.1109\/IJCNN64981.2025.11228639"},{"key":"2649_CR23","doi-asserted-by":"crossref","unstructured":"Jeong, Y., Kim, Y., Chun, S., & Lee, J. (2024). Read, watch and scream! sound generation from text and video. arXiv:2407.05551.","DOI":"10.1609\/aaai.v39i17.33934"},{"key":"2649_CR24","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Tian, Y., Yang, L., Bertasius, G., & Wang, H. (2024). Vmas: Video-to-music generation via semantic alignment in web music videos. arXiv:2409.07450.","DOI":"10.1109\/WACV61041.2025.00120"},{"key":"2649_CR25","unstructured":"Liu, H., Chen, Z., Yuan, Y., Mei, X., Liu, X., Mandic, D., Wang, W., & Plumbley, M.D. (2023). Audioldm: text-to-audio generation with latent diffusion models. In Proceedings of the 40th International Conference on Machine Learning, pp. 21450\u201321474."},{"key":"2649_CR26","doi-asserted-by":"crossref","unstructured":"Liu, H., Tian, Q., Yuan, Y., Liu, X., Mei, X., Kong, Q., Wang, Y., Wang, W., Wang, Y., & Plumbley, M.D. (2023). 
Audioldm 2: Learning holistic audio generation with self-supervised pretraining. arXiv:2308.05734.","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"2649_CR27","doi-asserted-by":"publisher","first-page":"101337","DOI":"10.52202\/079017-3213","volume":"37","author":"X Liu","year":"2024","unstructured":"Liu, X., Su, K., & Shlizerman, E. (2024). Tell what you hear from what you see-video to audio generation through text. Advances in Neural Information Processing Systems,37, 101337\u2013101366.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2649_CR28","unstructured":"Luo, S., Yan, C., Hu, C., & Zhao, H. (2023). Diff-foley: Synchronized video-to-audio synthesis with latent diffusion models. Advances in Neural Information Processing Systems\u00a036 ."},{"key":"2649_CR29","unstructured":"Mo, S., Shi, J., & Tian, Y. (2024). Text-to-audio generation synchronized with videos. arXiv:2403.07938."},{"key":"2649_CR30","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P.,\u00a0McDermott, J., Torralba, A., Adelson, E.H., & Freeman, W.T. (2016). Visually indicated sounds. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 2405\u20132413.","DOI":"10.1109\/CVPR.2016.264"},{"key":"2649_CR31","doi-asserted-by":"crossref","unstructured":"Pascual, S.,\u00a0Yeh, C.,\u00a0Tsiamas, I., &\u00a0Serr\u00e0, J. (2024). Masked generative video-to-audio transformers with enhanced synchronicity. arXiv:2407.10387.","DOI":"10.1007\/978-3-031-73021-4_15"},{"key":"2649_CR32","unstructured":"Radford, A., Kim, J.W.,\u00a0Hallacy, C.,\u00a0Ramesh, A.,\u00a0Goh, G.,\u00a0Agarwal, S.,\u00a0Sastry, G., Askell, A.,\u00a0Mishkin, P.,\u00a0Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pp. 8748\u20138763. PMLR."},{"key":"2649_CR33","doi-asserted-by":"crossref","unstructured":"Rombach, R.,\u00a0Blattmann, A.,\u00a0Lorenz, D.,\u00a0Esser, P., &\u00a0Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2649_CR34","doi-asserted-by":"crossref","unstructured":"Sheffer, R., &\u00a0Adi, Y. (2023). I hear your true colors: Image guided audio generation. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE.","DOI":"10.1109\/ICASSP49357.2023.10096023"},{"key":"2649_CR35","unstructured":"Su, K.,\u00a0Liu, X., &\u00a0Shlizerman, E. (2024), 21\u201327. From vision to audio and beyond: A unified model for audio-visual representation and generation. In R.\u00a0Salakhutdinov, Z.\u00a0Kolter, K.\u00a0Heller, A.\u00a0Weller, N.\u00a0Oliver, J.\u00a0Scarlett, and F.\u00a0Berkenkamp (Eds.), Proceedings of the 41st International Conference on Machine Learning, Volume 235 of Proceedings of Machine Learning Research, pp. 46804\u201346822. PMLR."},{"key":"2649_CR36","unstructured":"Tang, Z.,\u00a0Yang, Z.,\u00a0Zhu, C.,\u00a0Zeng, M., &\u00a0Bansal, M. (2024). Any-to-any generation via composable diffusion. Advances in Neural Information Processing Systems\u00a036 ."},{"key":"2649_CR37","unstructured":"Vaswani, A.,\u00a0Shazeer, N.,\u00a0Parmar, N.,\u00a0Uszkoreit, J.,\u00a0Jones, L., Gomez, A.N., Kaiser,\u0141., &\u00a0Polosukhin, I. (2017). Attention is all you need. 
Advances in neural information processing systems\u00a030."},{"key":"2649_CR38","doi-asserted-by":"crossref","unstructured":"Wang, H., Ma, J., Pascual, S., & Cartwright, R., & Cai, W. (2024). V2a-mapper: A lightweight solution for vision-to-audio generation by connecting foundation models. In Proceedings of the AAAI Conference on Artificial Intelligence,38, 15492\u201315501.","DOI":"10.1609\/aaai.v38i14.29475"},{"key":"2649_CR39","unstructured":"Wang, Y.,\u00a0Guo, W.,\u00a0Huang, R.,\u00a0Huang, J.,\u00a0Wang, Z.,\u00a0You, F.,\u00a0Li, R., &\u00a0Zhao, Z. (2024). Frieren: Efficient video-to-audio generation with rectified flow matching. arXiv:2406.00320."},{"key":"2649_CR40","doi-asserted-by":"crossref","unstructured":"Wu, H.H.,\u00a0Seetharaman, P.,\u00a0Kumar, K., & Bello, J.P. (2022). Wav2clip: Learning robust audio representations from clip. In ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4563\u20134567. IEEE.","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"2649_CR41","doi-asserted-by":"crossref","unstructured":"Xie, Z.,\u00a0Xu, X.,\u00a0Wu, Z., &\u00a0Wu, M. (2024). Picoaudio: Enabling precise timestamp and frequency controllability of audio events in text-to-audio generation. arXiv:2407.02869.","DOI":"10.1109\/ICASSP49660.2025.10890827"},{"key":"2649_CR42","doi-asserted-by":"crossref","unstructured":"Xie, Z.,\u00a0Yu, S.,\u00a0Li, M.,\u00a0He, Q.,\u00a0Chen, C., & Jiang, Y.G. (2024). Sonicvisionlm: Playing sound with vision language models. CVPR.","DOI":"10.1109\/CVPR52733.2024.02537"},{"key":"2649_CR43","doi-asserted-by":"crossref","unstructured":"Xing, Y.,\u00a0He, Y.,\u00a0Tian, Z.,\u00a0Wang, X., &\u00a0Chen, Q. (2024). Seeing and hearing: Open-domain visual-audio generation with diffusion latent aligners. In CVPR.","DOI":"10.1109\/CVPR52733.2024.00683"},{"key":"2649_CR44","doi-asserted-by":"crossref","unstructured":"Xue, J.,\u00a0Deng, Y.,\u00a0Gao, Y., &\u00a0Li, Y. (2024). Auffusion: Leveraging the power of diffusion and large language models for text-to-audio generation. arXiv:2401.01044.","DOI":"10.1109\/TASLP.2024.3485485"},{"key":"2649_CR45","doi-asserted-by":"crossref","unstructured":"Yariv, G., Gat, I., Benaim, S., Wolf, L., & Schwartz, I., & Adi, Y. (2024). Diverse and aligned audio-to-video generation via text-to-video model adaptation. In Proceedings of the AAAI Conference on Artificial Intelligence,38, 6639\u20136647.","DOI":"10.1609\/aaai.v38i7.28486"},{"key":"2649_CR46","unstructured":"Ye, H.,\u00a0Zhang, J.,\u00a0Liu, S.,\u00a0Han, X., &\u00a0Yang, W. (2023). Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv:2308.06721."},{"key":"2649_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, L.,\u00a0Mo, S.,\u00a0Zhang, Y., &\u00a0Morgado, P. (2024). Audio-synchronized visual animation. arXiv:2403.05659.","DOI":"10.1007\/978-3-031-72940-9_1"},{"key":"2649_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, L.,\u00a0Rao, A., &\u00a0Agrawala, M. (2023). Adding conditional control to text-to-image diffusion models. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2649_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, Y.,\u00a0Wang, Z.,\u00a0Fang, C.,\u00a0Bui, T., & Berg, T.L. (2018). Visual to sound: Generating natural sound for videos in the wild. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 
3550\u20133558.","DOI":"10.1109\/CVPR.2018.00374"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02649-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02649-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02649-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T15:39:50Z","timestamp":1771601990000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02649-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":49,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["2649"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02649-3","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1]]},"assertion":[{"value":"14 March 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 October 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"46"}}
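Provenance note: the metadata above has the shape of a work record from the public Crossref REST API, where the payload sits under a top-level "message" key. As a minimal sketch of how such a record can be fetched and summarized, assuming the api.crossref.org endpoint and the third-party requests package (neither of which is named in the record itself):

```python
# Minimal sketch: fetch a Crossref work record and print a one-line citation.
# Assumes the public Crossref REST API and the `requests` package.
import requests

DOI = "10.1007/s11263-025-02649-3"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]          # payload lives under "message"

title = work["title"][0]               # titles are returned as a list
authors = ", ".join(
    f'{a.get("given", "")} {a.get("family", "")}'.strip()
    for a in work.get("author", [])
)
journal = work["container-title"][0]
year = work["issued"]["date-parts"][0][0]  # date-parts is a list of [y, m, d]

print(f"{authors} ({year}). {title}. {journal}. https://doi.org/{work['DOI']}")
```

For heavier use, Crossref's API etiquette asks clients to send a descriptive User-Agent header with a contact address; the sketch above omits that for brevity.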