{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T19:05:17Z","timestamp":1762110317029,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":100,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729881"},{"type":"electronic","value":"9783031729898"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72989-8_2","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:02:04Z","timestamp":1729875724000},"page":"20-40","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Self-Supervised Audio-Visual Soundscape Stylization"],"prefix":"10.1007","author":[{"given":"Tingle","family":"Li","sequence":"first","affiliation":[]},{"given":"Renhao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Po-Yao","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Owens","sequence":"additional","affiliation":[]},{"given":"Gopala","family":"Anumanchipalli","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"issue":"12","key":"2_CR1","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 609\u2013617 (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"2_CR3","unstructured":"Bau, D., et al.: Paint by word. arXiv preprint arXiv:2103.10951 (2021)"},{"key":"2_CR4","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., Gao, R., Calamia, P., Grauman, K.: Visual acoustic matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18858\u201318868 (2022)","DOI":"10.1109\/CVPR52688.2022.01829"},{"key":"2_CR6","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Audio-visual synchronisation in the wild. arXiv preprint arXiv:2112.04432 (2021)"},{"key":"2_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Localizing visual sounds the hard way. In: Proceedings of the Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: Vggsound: A large-scale audio-visual dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 721\u2013725. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"2_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Z., Qian, S., Owens, A.: Sound localization from motion: jointly learning sound direction and camera rotation. arXiv preprint arXiv:2303.11329 (2023)","DOI":"10.1109\/ICCV51070.2023.00726"},{"key":"2_CR10","unstructured":"Corporation, B.B.: BBC Sound Effects (2017). https:\/\/sound-effects.bbcrewind.co.uk\/search"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Cramer, A.L., Wu, H.H., Salamon, J., Bello, J.P.: Look, listen, and learn more: design choices for deep audio embeddings. In: ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3852\u20133856. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Doersch, C., Zisserman, A.: Multi-task self-supervised visual learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2051\u20132060 (2017)","DOI":"10.1109\/ICCV.2017.226"},{"key":"2_CR13","unstructured":"Donahue, C., et\u00a0al.: Singsong: generating musical accompaniments from singing. arXiv preprint arXiv:2301.12662 (2023)"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Dong, H., Yu, S., Wu, C., Guo, Y.: Semantic image synthesis via adversarial learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5706\u20135714 (2017)","DOI":"10.1109\/ICCV.2017.608"},{"key":"2_CR15","unstructured":"Du, C., et al.: On uni-modal feature learning in supervised multi-modal learning. In: International Conference on Machine Learning, pp. 8632\u20138656. PMLR (2023)"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Du, Y., Chen, Z., Salamon, J., Russell, B., Owens, A.: Conditional generation of audio from video via foley analogies. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2426\u20132436 (2023)","DOI":"10.1109\/CVPR52729.2023.00240"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Elizalde, B., Deshmukh, S., Al\u00a0Ismail, M., Wang, H.: Clap learning audio concepts from natural language supervision. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Ephrat, A., et al.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. ACM Trans. Graph. (TOG) 37(4) (2016)","DOI":"10.1145\/3197517.3201357"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Ephrat, A., Peleg, S.: Vid2speech: speech reconstruction from silent video. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5095\u20135099. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7953127"},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Feng, C., Chen, Z., Owens, A.: Self-supervised video forensics by audio-visual anomaly detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10491\u201310503 (2023)","DOI":"10.1109\/CVPR52729.2023.01011"},{"key":"2_CR22","doi-asserted-by":"publisher","first-page":"829","DOI":"10.1109\/TASLP.2021.3133208","volume":"30","author":"E Fonseca","year":"2021","unstructured":"Fonseca, E., Favory, X., Pons, J., Font, F., Serra, X.: Fsd50k: an open dataset of human-labeled sound events. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 829\u2013852 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"2_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"758","DOI":"10.1007\/978-3-030-58621-8_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Gan","year":"2020","unstructured":"Gan, C., Huang, D., Chen, P., Tenenbaum, J.B., Torralba, A.: Foley music: learning to generate music from videos. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 758\u2013775. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_44"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Gao, R., Feris, R., Grauman, K.: Learning to separate object sounds by watching unlabeled video. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 35\u201353 (2018)","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"2_CR25","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: 2.5 d visual sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 324\u2013333 (2019)","DOI":"10.1109\/CVPR.2019.00041"},{"key":"2_CR26","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: Imagebind: one embedding space to bind them all. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"2_CR28","doi-asserted-by":"publisher","DOI":"10.3389\/fevo.2022.894232","volume":"10","author":"E Grinfeder","year":"2022","unstructured":"Grinfeder, E., Lorenzi, C., Haupert, S., Sueur, J.: What do we mean by \u201csoundscape\"? a functional description. Front. Ecol. Evol. 10, 894232 (2022)","journal-title":"Front. Ecol. Evol."},{"key":"2_CR29","doi-asserted-by":"crossref","unstructured":"Harwath, D., Recasens, A., Sur\u00eds, D., Chuang, G., Torralba, A., Glass, J.: Jointly discovering visual objects and spoken words from raw sensory input. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 649\u2013665 (2018)","DOI":"10.1007\/978-3-030-01231-1_40"},{"key":"2_CR30","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2_CR31","doi-asserted-by":"crossref","unstructured":"Hershey, S., et\u00a0al.: Cnn architectures for large-scale audio classification. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (icassp), pp. 131\u2013135. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"2_CR32","doi-asserted-by":"crossref","unstructured":"Hertzmann, A., Jacobs, C.E., Oliver, N., Curless, B., Salesin, D.H.: Image analogies. In: Seminal Graphics Papers: Pushing the Boundaries, vol. 2, pp. 557\u2013570 (2023)","DOI":"10.1145\/3596711.3596770"},{"key":"2_CR33","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR34","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"2_CR35","first-page":"16582","volume":"34","author":"C Hu","year":"2021","unstructured":"Hu, C., et al.: Neural dubber: dubbing for videos according to scripts. Adv. Neural. Inf. Process. Syst. 34, 16582\u201316595 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR36","unstructured":"Huang, P.Y., et al.: Mavil: masked audio-video learners (2023)"},{"key":"2_CR37","unstructured":"Huang, R., et al.: Make-an-audio: text-to-audio generation with prompt-enhanced diffusion models. In: International Conference on Machine Learning (ICML) (2023)"},{"key":"2_CR38","unstructured":"Huang, S., Li, Q., Anil, C., Bao, X., Oore, S., Grosse, R.B.: Timbretron: a wavenet (cyclegan (cqt (audio))) pipeline for musical timbre transfer. arXiv preprint arXiv:1811.09620 (2018)"},{"key":"2_CR39","doi-asserted-by":"crossref","unstructured":"Huang, X., Belongie, S.: Arbitrary style transfer in real-time with adaptive instance normalization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1501\u20131510 (2017)","DOI":"10.1109\/ICCV.2017.167"},{"key":"2_CR40","doi-asserted-by":"crossref","unstructured":"Huh, J., Chalk, J., Kazakos, E., Damen, D., Zisserman, A.: Epic-sounds: a large-scale dataset of actions that sound. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096198"},{"issue":"2","key":"2_CR41","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1044\/2019_JSLHR-19-00057","volume":"63","author":"EJ Hunter","year":"2020","unstructured":"Hunter, E.J., et al.: Toward a consensus description of vocal effort, vocal load, vocal loading, and vocal fatigue. J. Speech Lang. Hear. Res. 63(2), 509\u2013532 (2020)","journal-title":"J. Speech Lang. Hear. Res."},{"key":"2_CR42","unstructured":"Iashin, V., Rahtu, E.: Taming visually guided sound generation. In: The British Machine Vision Conference (BMVC) (2021)"},{"key":"2_CR43","unstructured":"Inc., A.: Enhance speech: remove noise and echo from voice recordings (2023). https:\/\/podcast.adobe.com\/enhance"},{"key":"2_CR44","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1125\u20131134 (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"2_CR45","doi-asserted-by":"crossref","unstructured":"Kaneko, T., Kameoka, H.: Cyclegan-vc: non-parallel voice conversion using cycle-consistent adversarial networks. In: 2018 26th European Signal Processing Conference (EUSIPCO), pp. 2100\u20132104. IEEE (2018)","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"2_CR46","doi-asserted-by":"crossref","unstructured":"Kilgour, K., Zuluaga, M., Roblek, D., Sharifi, M.: Fr\u00e9chet audio distance: A reference-free metric for evaluating music enhancement algorithms. In: INTERSPEECH, pp. 2350\u20132354 (2019)","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"2_CR47","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: Audiocaps: generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 1 (Long and Short Papers), pp. 119\u2013132 (2019)"},{"key":"2_CR48","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"2_CR49","doi-asserted-by":"crossref","unstructured":"Koepke, A.S., Wiles, O., Moses, Y., Zisserman, A.: Sight to sound: an end-to-end approach for visual piano transcription. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1838\u20131842. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9053115"},{"key":"2_CR50","first-page":"17022","volume":"33","author":"J Kong","year":"2020","unstructured":"Kong, J., Kim, J., Bae, J.: Hifi-gan: generative adversarial networks for efficient and high fidelity speech synthesis. Adv. Neural. Inf. Process. Syst. 33, 17022\u201317033 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR51","doi-asserted-by":"publisher","first-page":"2880","DOI":"10.1109\/TASLP.2020.3030497","volume":"28","author":"Q Kong","year":"2020","unstructured":"Kong, Q., Cao, Y., Iqbal, T., Wang, Y., Wang, W., Plumbley, M.D.: Panns: large-scale pretrained audio neural networks for audio pattern recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 28, 2880\u20132894 (2020)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"2_CR52","unstructured":"Korbar, B., Tran, D., Torresani, L.: Cooperative learning of audio and video models from self-supervised synchronization. In: Proceedings of the Advances in Neural Information Processing Systems (2018)"},{"key":"2_CR53","unstructured":"Kreuk, F., et al.: Audiogen: textually guided audio generation. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"2_CR54","doi-asserted-by":"crossref","unstructured":"Lee, S.H., et al.: Sound-guided semantic image manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3377\u20133386 (2022)","DOI":"10.1109\/CVPR52688.2022.00337"},{"key":"2_CR55","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"2_CR56","doi-asserted-by":"crossref","unstructured":"Li, T., Lin, Q., Bao, Y., Li, M.: Atss-net: target speaker separation via attention-based neural network. In: Interspeech, pp. 1411\u20131415 (2020)","DOI":"10.21437\/Interspeech.2020-1436"},{"key":"2_CR57","doi-asserted-by":"crossref","unstructured":"Li, T., Liu, Y., Hu, C., Zhao, H.: CVC: contrastive learning for non-parallel voice conversion. In: Interspeech (2021)","DOI":"10.21437\/Interspeech.2021-137"},{"key":"2_CR58","doi-asserted-by":"publisher","unstructured":"Li, T., Liu, Y., Owens, A., Zhao, H.: Learning visual styles from audio-visual associations. In: European Conference on Computer Vision, pp. 235\u2013252. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_14","DOI":"10.1007\/978-3-031-19836-6_14"},{"key":"2_CR59","unstructured":"Liu, H., et al.: Audioldm: text-to-audio generation with latent diffusion models. In: International Conference on Machine Learning (ICML) (2023)"},{"key":"2_CR60","doi-asserted-by":"crossref","unstructured":"Lo, C.C., et al.: Mosnet: deep learning based objective assessment for voice conversion. arXiv preprint arXiv:1904.08352 (2019)","DOI":"10.21437\/Interspeech.2019-2003"},{"key":"2_CR61","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"2_CR62","unstructured":"Luo, S., Yan, C., Hu, C., Zhao, H.: Diff-foley: synchronized video-to-audio synthesis with latent diffusion models. arXiv preprint arXiv:2306.17203 (2023)"},{"issue":"5","key":"2_CR63","doi-asserted-by":"publisher","first-page":"926","DOI":"10.1016\/j.neuron.2011.06.032","volume":"71","author":"JH McDermott","year":"2011","unstructured":"McDermott, J.H., Simoncelli, E.P.: Sound texture perception via statistics of the auditory periphery: evidence from sound synthesis. Neuron 71(5), 926\u2013940 (2011)","journal-title":"Neuron"},{"key":"2_CR64","doi-asserted-by":"crossref","unstructured":"Mei, X., et al.: Wavcaps: a chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"2_CR65","unstructured":"Morgado, P., Vasconcelos, N., Langlois, T., Wang, O.: Self-supervised generation of spatial audio for 360 video. Adv. Neural Inf. Process. Syst. (2018)"},{"key":"2_CR66","doi-asserted-by":"crossref","unstructured":"Morgado, P., Vasconcelos, N., Misra, I.: Audio-visual instance discrimination with cross-modal agreement. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12475\u201312486 (2021)","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"2_CR67","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Proceedings of the European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"2_CR68","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"2_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1007\/978-3-319-46448-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Owens","year":"2016","unstructured":"Owens, A., Wu, J., McDermott, J.H., Freeman, W.T., Torralba, A.: Ambient sound provides supervision for visual learning. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 801\u2013816. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_48"},{"key":"2_CR70","doi-asserted-by":"crossref","unstructured":"Patrick, M., et al.: Space-time crop & attend: improving cross-modal video representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10560\u201310572 (2021)","DOI":"10.1109\/ICCV48922.2021.01039"},{"key":"2_CR71","doi-asserted-by":"crossref","unstructured":"Petermann, D., Wichern, G., Wang, Z.Q., Le\u00a0Roux, J.: The cocktail fork problem: three-stem audio separation for real-world soundtracks. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 526\u2013530. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746005"},{"issue":"3","key":"2_CR72","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1525\/bio.2011.61.3.6","volume":"61","author":"BC Pijanowski","year":"2011","unstructured":"Pijanowski, B.C., et al.: Soundscape ecology: the science of sound in the landscape. Bioscience 61(3), 203\u2013216 (2011)","journal-title":"Bioscience"},{"key":"2_CR73","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: Learning individual speaking styles for accurate lip to speech synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13796\u201313805 (2020)","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"2_CR74","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"2_CR75","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"2_CR76","doi-asserted-by":"crossref","unstructured":"Rix, A.W., Beerends, J.G., Hollier, M.P., Hekstra, A.P.: Perceptual evaluation of speech quality (pesq)-a new method for speech quality assessment of telephone networks and codecs. In: 2001 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No. 01CH37221), vol.\u00a02, pp. 749\u2013752. IEEE (2001)","DOI":"10.1109\/ICASSP.2001.941023"},{"key":"2_CR77","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2_CR78","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training gans. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"2_CR79","doi-asserted-by":"crossref","unstructured":"Sheffer, R., Adi, Y.: I hear your true colors: image guided audio generation. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096023"},{"key":"2_CR80","doi-asserted-by":"crossref","unstructured":"Singh, N., Mentch, J., Ng, J., Beveridge, M., Drori, I.: Image2reverb: cross-modal reverb impulse response synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 286\u2013295 (2021)","DOI":"10.1109\/ICCV48922.2021.00035"},{"key":"2_CR81","unstructured":"Somayazulu, A., Chen, C., Grauman, K.: Self-supervised visual acoustic matching. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"2_CR82","doi-asserted-by":"crossref","unstructured":"Son\u00a0Chung, J., Senior, A., Vinyals, O., Zisserman, A.: Lip reading sentences in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6447\u20136456 (2017)","DOI":"10.1109\/CVPR.2017.367"},{"key":"2_CR83","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"2_CR84","doi-asserted-by":"crossref","unstructured":"Steinmetz, C.J., Bryan, N.J., Reiss, J.D.: Style transfer of audio effects with differentiable signal processing. arXiv preprint arXiv:2207.08759 (2022)","DOI":"10.17743\/jaes.2022.0025"},{"key":"2_CR85","first-page":"29258","volume":"34","author":"K Su","year":"2021","unstructured":"Su, K., Liu, X., Shlizerman, E.: How does it sound? Adv. Neural. Inf. Process. Syst. 34, 29258\u201329273 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR86","unstructured":"Team, S.: Silero vad: pre-trained enterprise-grade voice activity detector (vad), number detector and language classifier (2021). https:\/\/github.com\/snakers4\/silero-vad"},{"key":"2_CR87","unstructured":"Ulyanov, D.: Audio texture synthesis and style transfer (2016). https:\/\/dmitryulyanov.github.io\/audio-texture-synthesis-and-style-transfer\/"},{"key":"2_CR88","unstructured":"V\u00e4lim\u00e4ki, V., Parker, J., Savioja, L., Smith, J.O., Abel, J.: More than 50 years of artificial reverberation. In: Audio Engineering Society Conference: 60th International Conference: Dreams (Dereverberation and Reverberation of Audio, Music, and Speech). Audio Engineering Society (2016)"},{"key":"2_CR89","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"2_CR90","unstructured":"Verma, P., Smith, J.O.: Neural style transfer for audio spectograms. arXiv preprint arXiv:1801.01589 (2018)"},{"key":"2_CR91","doi-asserted-by":"crossref","unstructured":"Wang, W., Tran, D., Feiszli, M.: What makes training multi-modal classification networks hard? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12695\u201312705 (2020)","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"2_CR92","unstructured":"Wang, Y., et al.: Audit: audio editing by following instructions with latent diffusion models. arXiv preprint arXiv:2304.00830 (2023)"},{"key":"2_CR93","doi-asserted-by":"publisher","first-page":"1720","DOI":"10.1109\/TASLP.2023.3268730","volume":"31","author":"D Yang","year":"2023","unstructured":"Yang, D., et al.: Diffsound: discrete diffusion model for text-to-sound generation. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 1720\u20131733 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"2_CR94","unstructured":"Yang, F., Ma, C., Zhang, J., Zhu, J., Yuan, W., Owens, A.: Touch and go: learning from human-collected vision and touch. In: Neural Information Processing Systems (NeurIPS) - Datasets and Benchmarks Track (2022)"},{"key":"2_CR95","doi-asserted-by":"crossref","unstructured":"Yang, F., Zhang, J., Owens, A.: Generating visual scenes from touch. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.02017"},{"key":"2_CR96","doi-asserted-by":"crossref","unstructured":"Yang, K., Russell, B., Salamon, J.: Telling left from right: learning spatial correspondence of sight and sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9932\u20139941 (2020)","DOI":"10.1109\/CVPR42600.2020.00995"},{"key":"2_CR97","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Ma, W.C., Torralba, A.: The sound of motions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1735\u20131744 (2019)","DOI":"10.1109\/ICCV.2019.00182"},{"key":"2_CR98","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 570\u2013586 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"2_CR99","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, Z., Fang, C., Bui, T., Berg, T.L.: Visual to sound: generating natural sound for videos in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3550\u20133558 (2018)","DOI":"10.1109\/CVPR.2018.00374"},{"key":"2_CR100","doi-asserted-by":"crossref","unstructured":"Zhu, J.Y., Park, T., Isola, P., Efros, A.A.: Unpaired image-to-image translation using cycle-consistent adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2223\u20132232 (2017)","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72989-8_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T17:09:53Z","timestamp":1729876193000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72989-8_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031729881","9783031729898"],"references-count":100,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72989-8_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}