{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:13:34Z","timestamp":1777655614656,"version":"3.51.4"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730207","type":"print"},{"value":"9783031730214","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73021-4_15","type":"book-chapter","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T09:20:20Z","timestamp":1732094420000},"page":"247-264","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Masked Generative Video-to-Audio Transformers with\u00a0Enhanced Synchronicity"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8365-7387","authenticated-orcid":false,"given":"Santiago","family":"Pascual","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5132-0858","authenticated-orcid":false,"given":"Chunghsin","family":"Yeh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1049-2515","authenticated-orcid":false,"given":"Ioannis","family":"Tsiamas","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1303-6558","authenticated-orcid":false,"given":"Joan","family":"Serr\u00e0","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"key":"15_CR1","unstructured":"Agostinelli, A., et\u00a0al.: MusicLM: generating music from text. arXiv preprint arXiv:2301.11325 (2023)"},{"key":"15_CR2","unstructured":"Besnier, V., Chen, M.: A Pytorch reproduction of masked generative image transformer. arXiv preprint arXiv:2310.14400 (2023)"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Borsos, Z., et al.: AudioLM: a language modeling approach to audio generation. In: IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2023)","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"15_CR4","unstructured":"Borsos, Z., Sharifi, M., Vincent, D., Kharitonov, E., Zeghidour, N., Tagliasacchi, M.: SoundStorm: efficient parallel audio generation. arXiv preprint arXiv:2305.09636 (2023)"},{"key":"15_CR5","unstructured":"Chang, H., et\u00a0al.: Muse: text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Chang, H., Zhang, H., Jiang, L., Liu, C., Freeman, W.T.: MaskGIT: masked generative image transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11315\u201311325 (2022)","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"15_CR7","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Audio-visual synchronisation in the wild. arXiv preprint arXiv:2112.04432 (2021)"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: VGGSound: a large-scale audio-visual dataset. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 721\u2013725. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Chen, L., Srivastava, S., Duan, Z., Xu, C.: Deep cross-modal audio-visual generation. In: Proceedings of the on Thematic Workshops of ACM Multimedia 2017, pp. 349\u2013357 (2017)","DOI":"10.1145\/3126686.3126723"},{"key":"15_CR10","doi-asserted-by":"publisher","first-page":"8292","DOI":"10.1109\/TIP.2020.3009820","volume":"29","author":"P Chen","year":"2020","unstructured":"Chen, P., Zhang, Y., Tan, M., Xiao, H., Huang, D., Gan, C.: Generating visually aligned sound from videos. IEEE Trans. Image Process. 29, 8292\u20138302 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"15_CR11","unstructured":"Chen, S., et al.: BEATs: audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058 (2022)"},{"key":"15_CR12","unstructured":"Copet, J., et al.: Simple and controllable music generation. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"15_CR13","unstructured":"D\u00e9fossez, A., Copet, J., Synnaeve, G., Adi, Y.: High fidelity neural audio compression. arXiv preprint arXiv:2210.13438 (2022)"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Dong, H.W., et al.: CLIPSonic: text-to-audio synthesis with unlabeled videos and pretrained language-vision models. In: 2023 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/WASPAA58266.2023.10248160"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Foote, J.: Automatic audio segmentation using a measure of audio novelty. In: Proceedings of the IEEE International Conference on Multimedia and Expo (ICME), vol.\u00a01, pp. 452\u2013455. IEEE (2000)","DOI":"10.1109\/ICME.2000.869637"},{"key":"15_CR17","unstructured":"Garcia, H.F., Seetharaman, P., Kumar, R., Pardo, B.: VampNet: music generation via masked acoustic token modeling. arXiv preprint arXiv:2307.04686 (2023)"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Gui, A., Gamper, H., Braun, S., Emmanouilidou, D.: Adapting frechet audio distance for generative music evaluation. arXiv preprint arXiv:2311.01616 (2023)","DOI":"10.1109\/ICASSP48485.2024.10446663"},{"key":"15_CR20","doi-asserted-by":"crossref","unstructured":"Hao, W., Zhang, Z., Guan, H.: CMCGAN: a uniform framework for cross-modal visual-audio mutual generation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.12329"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Hershey, S., et\u00a0al.: CNN architectures for large-scale audio classification. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 131\u2013135. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"15_CR22","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"15_CR23","unstructured":"Iashin, V., Rahtu, E.: Taming visually guided sound generation. arXiv preprint arXiv:2110.08791 (2021)"},{"key":"15_CR24","unstructured":"Iashin, V., Xie, W., Rahtu, E., Zisserman, A.: Sparse in space and time: audio-visual synchronisation with trainable selectors. arXiv preprint arXiv:2210.07055 (2022)"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Jeong, Y., et al.: The power of sound (TPoS): audio reactive video generation with stable diffusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7822\u20137832 (2023)","DOI":"10.1109\/ICCV51070.2023.00719"},{"key":"15_CR26","unstructured":"Kalchbrenner, N., et al.: Efficient neural audio synthesis. In: International Conference on Machine Learning, pp. 2410\u20132419. PMLR (2018)"},{"key":"15_CR27","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Kilgour, K., Zuluaga, M., Roblek, D., Sharifi, M.: Fr\u00e9chet audio distance: a metric for evaluating music enhancement algorithms. arXiv preprint arXiv:1812.08466 (2018)","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"15_CR29","unstructured":"Kreuk, F., et al.: AudioGen: textually guided audio generation. arXiv preprint arXiv:2209.15352 (2022)"},{"key":"15_CR30","unstructured":"Kumar, R., Seetharaman, P., Luebs, A., Kumar, I., Kumar, K.: High-fidelity audio compression with improved RVQGAN. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Liu, H., Chen, K., Tian, Q., Wang, W., Plumbley, M.D.: AudioSR: versatile audio super-resolution at scale. arXiv preprint arXiv:2309.07314 (2023)","DOI":"10.1109\/ICASSP48485.2024.10447246"},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: AudioLDM 2: learning holistic audio generation with self-supervised pretraining. arXiv preprint arXiv:2308.05734 (2023)","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"15_CR33","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"15_CR34","unstructured":"Luo, S., Yan, C., Hu, C., Zhao, H.: Diff-Foley: synchronized video-to-audio synthesis with latent diffusion models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"15_CR35","unstructured":"Mehri, S., et al.: SampleRNN: an unconditional end-to-end neural audio generation model. arXiv preprint arXiv:1612.07837 (2016)"},{"key":"15_CR36","unstructured":"Mei, X., et al.: FoleyGen: visually-guided audio generation. arXiv preprint arXiv:2309.10537 (2023)"},{"key":"15_CR37","unstructured":"Oord, A., et\u00a0al.: Parallel WaveNet: fast high-fidelity speech synthesis. In: International Conference on Machine Learning, pp. 3918\u20133926. PMLR (2018)"},{"key":"15_CR38","unstructured":"van den Oord, A., et al.: WaveNet: a generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"15_CR40","doi-asserted-by":"crossref","unstructured":"Pascual, S., Bhattacharya, G., Yeh, C., Pons, J., Serr\u00e0, J.: Full-band general audio synthesis with score-based diffusion. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096760"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"15_CR42","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"15_CR43","doi-asserted-by":"crossref","unstructured":"Ruan, L., et al.: MM-Diffusion: learning multi-modal diffusion models for joint audio and video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10219\u201310228 (2023)","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"15_CR44","doi-asserted-by":"crossref","unstructured":"Sheffer, R., Adi, Y.: I hear your true colors: image guided audio generation. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096023"},{"key":"15_CR45","doi-asserted-by":"publisher","unstructured":"Taubert, S.: Mean-opinion-score (2023). https:\/\/doi.org\/10.5281\/zenodo.8238259. https:\/\/github.com\/stefantaubert\/mean-opinion-score","DOI":"10.5281\/zenodo.8238259"},{"key":"15_CR46","unstructured":"Tsiamas, I., Pascual, S., Yeh, C., Serr\u00e0, J.: Sequential contrastive audio-visual learning. arXiv preprint arXiv:2407.05782 (2024)"},{"key":"15_CR47","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Wang, H., Ma, J., Pascual, S., Cartwright, R., Cai, W.: V2A-Mapper: a lightweight solution for vision-to-audio generation by connecting foundation models. arXiv preprint arXiv:2308.09300 (2023)","DOI":"10.1609\/aaai.v38i14.29475"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Wu, H.H., Seetharaman, P., Kumar, K., Bello, J.P.: Wav2CLIP: learning robust audio representations from CLIP. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4563\u20134567. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"15_CR50","doi-asserted-by":"crossref","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 305\u2013321 (2018)","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"15_CR51","doi-asserted-by":"crossref","unstructured":"Yariv, G., Gat, I., Benaim, S., Wolf, L., Schwartz, I., Adi, Y.: Diverse and aligned audio-to-video generation via text-to-video model adaptation. arXiv preprint arXiv:2309.16429 (2023)","DOI":"10.1609\/aaai.v38i7.28486"},{"key":"15_CR52","doi-asserted-by":"crossref","unstructured":"Wu, Y., Chen, K., Zhang, T., Hui, Y., Berg-Kirkpatrick, T., Dubnov, S.: Large-scale contrastive language-audio pretraining with feature fusion and keyword-to-caption augmentation. arXiv preprint arXiv:2211.06687 (2024)","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"15_CR53","doi-asserted-by":"publisher","first-page":"495","DOI":"10.1109\/TASLP.2021.3129994","volume":"30","author":"N Zeghidour","year":"2021","unstructured":"Zeghidour, N., Luebs, A., Omran, A., Skoglund, J., Tagliasacchi, M.: SoundStream: an end-to-end neural audio codec. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 495\u2013507 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"15_CR54","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Ma, W.C., Torralba, A.: The sound of motions. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1735\u20131744 (2019)","DOI":"10.1109\/ICCV.2019.00182"},{"key":"15_CR55","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 570\u2013586 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"15_CR56","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., Berg, T.L.: Visual to sound: generating natural sound for videos in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3550\u20133558 (2018)","DOI":"10.1109\/CVPR.2018.00374"},{"key":"15_CR57","unstructured":"Ziv, A., et al.: Masked audio generation using a single non-autoregressive transformer. arXiv preprint arXiv:2401.04577 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73021-4_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T09:45:39Z","timestamp":1732095939000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73021-4_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"ISBN":["9783031730207","9783031730214"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73021-4_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"21 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}