{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:28:24Z","timestamp":1777656504417,"version":"3.51.4"},"publisher-location":"Singapore","reference-count":58,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819609598","type":"print"},{"value":"9789819609604","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,8]],"date-time":"2024-12-08T00:00:00Z","timestamp":1733616000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0960-4_8","type":"book-chapter","created":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T08:05:17Z","timestamp":1733558717000},"page":"123-139","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Language-Guided Joint Audio-Visual Editing via\u00a0One-Shot Adaptation"],"prefix":"10.1007","author":[{"given":"Susan","family":"Liang","sequence":"first","affiliation":[]},{"given":"Chao","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Yapeng","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Anurag","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,8]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, 
A.: Emerging properties in self-supervised vision transformers. In: ICCV. pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM TOG 42(4), 1\u201310 (2023)","DOI":"10.1145\/3592116"},{"key":"8_CR3","unstructured":"Chen, H., Zhang, Y., Wu, S., Wang, X., Duan, X., Zhou, Y., Zhu, W.: Disenbooth: Identity-preserving disentangled tuning for subject-driven text-to-image generation. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"8_CR4","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Vedaldi, A., Zisserman, A.: Vggsound: A large-scale audio-visual dataset. In: ICASSP. pp. 721\u2013725 (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"8_CR5","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. NeurIPS 34, 8780\u20138794 (2021)"},{"key":"8_CR6","doi-asserted-by":"crossref","unstructured":"Du, Y., Chen, Z., Salamon, J., Russell, B., Owens, A.: Conditional generation of audio from video via foley analogies. In: CVPR. pp. 2426\u20132436 (2023)","DOI":"10.1109\/CVPR52729.2023.00240"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Elizalde, B., Deshmukh, S., Al\u00a0Ismail, M., Wang, H.: Clap learning audio concepts from natural language supervision. In: ICASSP. pp.\u00a01\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Ephrat, A., Mosseri, I., Lang, O., Dekel, T., Wilson, K., Hassidim, A., Freeman, W.T., Rubinstein, M.: Looking to listen at the cocktail party: a speaker-independent audio-visual model for speech separation. ACM Trans. Graph. 
37(4), 112 (2018)","DOI":"10.1145\/3197517.3201357"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: CVPR. pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"8_CR10","unstructured":"Gal, R., Alaluf, Y., Atzmon, Y., Patashnik, O., Bermano, A.H., Chechik, G., Cohen-Or, D.: An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Ghosal, D., Majumder, N., Mehrish, A., Poria, S.: Text-to-audio generation using instruction-tuned llm and latent diffusion model. arXiv preprint arXiv:2304.13731 (2023)","DOI":"10.1145\/3581783.3612348"},{"key":"8_CR12","unstructured":"Guo, Y., Yang, C., Rao, A., Wang, Y., Qiao, Y., Lin, D., Dai, B.: Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725 (2023)"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: Audioclip: Extending clip to image, text and audio. In: ICASSP. pp. 976\u2013980 (2022)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"Hayes, T., Zhang, S., Yin, X., Pang, G., Sheng, S., Yang, H., Ge, S., Hu, Q., Parikh, D.: Mugen: A playground for video-audio-text multimodal understanding and generation. In: ECCV. pp. 431\u2013449 (2022)","DOI":"10.1007\/978-3-031-20074-8_25"},{"key":"8_CR15","unstructured":"He, Y., Xia, M., Chen, H., Cun, X., Gong, Y., Xing, J., Zhang, Y., Wang, X., Weng, C., Shan, Y., Chen, Q.: Animate-a-story: Storytelling with retrieval-augmented video generation. 
arXiv preprint arXiv:2307.06940 (2023)"},{"key":"8_CR16","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross-attention control. In: ICLR (2023)"},{"key":"8_CR17","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)"},{"key":"8_CR18","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. arXiv:2204.03458 (2022)"},{"key":"8_CR19","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., Chen, W.: Lora: Low-rank adaptation of large language models. In: ICLR (2022)"},{"key":"8_CR20","unstructured":"Hua, M., Liu, J., Ding, F., Liu, W., Wu, J., He, Q.: Dreamtuner: Single image is enough for subject-driven generation. arXiv preprint arXiv:2312.13691 (2023)"},{"key":"8_CR21","unstructured":"Huang, C., Liang, S., Tian, Y., Kumar, A., Xu, C.: Davis: High-quality audio-visual separation with generative diffusion models. arXiv preprint arXiv:2308.00122 (2023)"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Huang, C., Markovic, D., Xu, C., Richard, A.: Modeling and driving human body soundfields through acoustic primitives. arXiv preprint arXiv:2407.13083 (2024)","DOI":"10.1007\/978-3-031-72684-2_1"},{"key":"8_CR23","unstructured":"Huang, J., Ren, Y., Huang, R., Yang, D., Ye, Z., Zhang, C., Liu, J., Yin, X., Ma, Z., Zhao, Z.: Make-an-audio 2: Temporal-enhanced text-to-audio generation. arXiv preprint arXiv:2305.18474 (2023)"},{"key":"8_CR24","unstructured":"Huang, R., Huang, J., Yang, D., Ren, Y., Liu, L., Li, M., Ye, Z., Liu, J., Yin, X., Zhao, Z.: Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models. In: ICML. pp. 
13916\u201313932 (2023)"},{"key":"8_CR25","doi-asserted-by":"crossref","unstructured":"Jeong, Y., Ryoo, W., Lee, S., Seo, D., Byeon, W., Kim, S., Kim, J.: The power of sound (tpos): Audio reactive video generation with stable diffusion. arXiv preprint arXiv:2309.04509 (2023)","DOI":"10.1109\/ICCV51070.2023.00719"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., Movsisyan, A., Tadevosyan, V., Henschel, R., Wang, Z., Navasardyan, S., Shi, H.: Text2video-zero: Text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439 (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Kilgour, K., Zuluaga, M., Roblek, D., Sharifi, M.: Fr\u00e9chet audio distance: A reference-free metric for evaluating music enhancement algorithms. In: INTERSPEECH. pp. 2350\u20132354 (2019)","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"8_CR28","unstructured":"Kong, J., Kim, J., Bae, J.: Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Adv. Neural. Inf. Process. Syst. 33, 17022\u201317033 (2020)"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion. In: CVPR. pp. 1931\u20131941 (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Lee, S.H., Roh, W., Byeon, W., Yoon, S.H., Kim, C., Kim, J., Kim, S.: Sound-guided semantic image manipulation. In: CVPR. pp. 3377\u20133386 (2022)","DOI":"10.2139\/ssrn.4437061"},{"key":"8_CR31","unstructured":"Liang, S., Huang, C., Tian, Y., Kumar, A., Xu, C.: Av-nerf: Learning neural fields for real-world audio-visual scene synthesis. Adv. Neural. Inf. Process. Syst. 
36, 37472\u201337490 (2023)"},{"key":"8_CR32","unstructured":"Liang, S., Huang, C., Tian, Y., Kumar, A., Xu, C.: Neural acoustic context field: Rendering realistic room impulse response with neural fields. arXiv preprint arXiv:2309.15977 (2023)"},{"key":"8_CR33","unstructured":"Liu, H., Chen, Z., Yuan, Y., Mei, X., Liu, X., Mandic, D.P., Wang, W., Plumbley, M.D.: Audioldm: Text-to-audio generation with latent diffusion models. In: ICML. pp. 21450\u201321474 (2023)"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Liu, H., Tian, Q., Yuan, Y., Liu, X., Mei, X., Kong, Q., Wang, Y., Wang, W., Wang, Y., Plumbley, M.D.: Audioldm 2: Learning holistic audio generation with self-supervised pretraining. arXiv preprint arXiv:2308.05734 (2023)","DOI":"10.1109\/TASLP.2024.3399607"},{"key":"8_CR35","unstructured":"Luo, S., Yan, C., Hu, C., Zhao, H.: Diff-foley: Synchronized video-to-audio synthesis with latent diffusion models (2023)"},{"key":"8_CR36","unstructured":"Mo, S., Shi, J., Tian, Y.: Diffava: Personalized text-to-audio generation with visual alignment. arXiv preprint arXiv:2305.12903 (2023)"},{"key":"8_CR37","unstructured":"Nichol, A.Q., Dhariwal, P., Ramesh, A., Shyam, P., Mishkin, P., McGrew, B., Sutskever, I., Chen, M.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. In: ICML. vol.\u00a0162, pp. 16784\u201316804 (2022)"},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"Qin, C., Yu, N., Xing, C., Zhang, S., Chen, Z., Ermon, S., Fu, Y., Xiong, C., Xu, R.: Gluegen: Plug and play multi-modal encoders for x-to-image generation. arXiv preprint arXiv:2303.10056 (2023)","DOI":"10.1109\/ICCV51070.2023.02110"},{"key":"8_CR39","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML. pp. 
8748\u20138763 (2021)"},{"key":"8_CR40","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., Liu, P.J.: Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research 21(1), 5485\u20135551 (2020)"},{"key":"8_CR41","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR. pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: Convolutional networks for biomedical image segmentation. In: MICCAI. pp. 234\u2013241 (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"8_CR44","doi-asserted-by":"crossref","unstructured":"Ruan, L., Ma, Y., Yang, H., He, H., Liu, B., Fu, J., Yuan, N.J., Jin, Q., Guo, B.: Mm-diffusion: Learning multi-modal diffusion models for joint audio and video generation. In: CVPR. pp. 10219\u201310228 (2023)","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR. pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"8_CR46","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al.: Photorealistic text-to-image diffusion models with deep language understanding. 
NeurIPS 35, 36479\u201336494 (2022)"},{"key":"8_CR47","doi-asserted-by":"crossref","unstructured":"Shi, J., Xiong, W., Lin, Z., Jung, H.J.: Instantbooth: Personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411 (2023)","DOI":"10.1109\/CVPR52733.2024.00816"},{"key":"8_CR48","unstructured":"Singer, U., Polyak, A., Hayes, T., Yin, X., An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O., Parikh, D., Gupta, S., Taigman, Y.: Make-a-video: Text-to-video generation without text-video data. In: ICLR (2023)"},{"key":"8_CR49","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2020)"},{"key":"8_CR50","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2020)"},{"key":"8_CR51","doi-asserted-by":"crossref","unstructured":"Sung-Bin, K., Senocak, A., Ha, H., Owens, A., Oh, T.H.: Sound to visual scene generation by audio-to-visual latent alignment. In: CVPR. pp. 6430\u20136440 (2023)","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"8_CR52","unstructured":"Tang, Z., Yang, Z., Zhu, C., Zeng, M., Bansal, M.: Any-to-any generation via composable diffusion. arXiv preprint arXiv:2305.11846 (2023)"},{"key":"8_CR53","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. Advances in neural information processing systems 30 (2017)"},{"key":"8_CR54","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: Elite: Encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848 (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"8_CR55","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., Ge, Y., Wang, X., Lei, W., Gu, Y., Hsu, W., Shan, Y., Qie, X., Shou, M.Z.: Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. 
arXiv preprint arXiv:2212.11565 (2022)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"8_CR56","doi-asserted-by":"crossref","unstructured":"Xiao, G., Yin, T., Freeman, W.T., Durand, F., Han, S.: Fastcomposer: Tuning-free multi-subject image generation with localized attention. arXiv preprint arXiv:2305.10431 (2023)","DOI":"10.1007\/s11263-024-02227-z"},{"key":"8_CR57","doi-asserted-by":"crossref","unstructured":"Yang, D., Yu, J., Wang, H., Wang, W., Weng, C., Zou, Y., Yu, D.: Diffsound: Discrete diffusion model for text-to-sound generation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2023)","DOI":"10.1109\/TASLP.2023.3268730"},{"key":"8_CR58","doi-asserted-by":"crossref","unstructured":"Zhao, H., Gan, C., Rouditchenko, A., Vondrick, C., McDermott, J., Torralba, A.: The sound of pixels. In: ECCV. pp. 570\u2013586 (2018)","DOI":"10.1007\/978-3-030-01246-5_35"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0960-4_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,7]],"date-time":"2024-12-07T08:31:40Z","timestamp":1733560300000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0960-4_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,8]]},"ISBN":["9789819609598","9789819609604"],"references-count":58,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0960-4_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,8]]},"assertion":[{"value":"8 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter 
History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}