{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:11:24Z","timestamp":1765357884260,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755130","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"9773-9782","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["CatchPhrase: EXPrompt-Guided Encoder Adaptation for Audio-to-Image Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1428-7888","authenticated-orcid":false,"given":"Hyunwoo","family":"Oh","sequence":"first","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1941-0691","authenticated-orcid":false,"given":"SeungJu","family":"Cha","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4525-871X","authenticated-orcid":false,"given":"Kwanyoung","family":"Lee","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3373-3390","authenticated-orcid":false,"given":"Si-Woo","family":"Kim","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7231-7494","authenticated-orcid":false,"given":"Dong-Jin","family":"Kim","sequence":"additional","affiliation":[{"name":"Hanyang University, Seoul, Republic of Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"What Do You See? Enhancing Zero-Shot Image Classification with Multimodal Large Language Models. arXiv preprint arXiv:2405.15668","author":"Abdelhamed Abdelrahman","year":"2024","unstructured":"Abdelrahman Abdelhamed, Mahmoud Afifi, and Alec Go. 2024. What Do You See? Enhancing Zero-Shot Image Classification with Multimodal Large Language Models. arXiv preprint arXiv:2405.15668 (2024)."},{"key":"e_1_3_2_2_2_1","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et al. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_3_1","volume-title":"Umur Berkay Karaka\u015f, Duygu Ceylan, Erkut Erdem, and Aykut Erdem.","author":"Biner Burak Can","year":"2024","unstructured":"Burak Can Biner, Farrin Marouf Sofian, Umur Berkay Karaka\u015f, Duygu Ceylan, Erkut Erdem, and Aykut Erdem. 2024. SonicDiffusion: Audio-Driven Image Generation and Editing with Pretrained Diffusion Models. arXiv preprint arXiv:2405.00878 (2024)."},{"key":"e_1_3_2_2_4_1","unstructured":"Mikolaj Binkowski Danica J. Sutherland Michael Arbel and Arthur Gretton. 2021. Demystifying MMD GANs. arXiv:1801.01401 [stat.ML] https:\/\/arxiv.org\/abs\/1801.01401"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_2_6_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877-1901."},{"key":"e_1_3_2_2_7_1","unstructured":"SeungJu Cha Kwanyoung Lee Ye-Chan Kim Hyunwoo Oh and Dong-Jin Kim. 2025. VerbDiff: Text-Only Diffusion Models with Enhanced Interaction Awareness. arXiv:2503.16406 [cs.GR] https:\/\/arxiv.org\/abs\/2503.16406"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73039-9_11"},{"key":"e_1_3_2_2_10_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Du Chengbin","year":"2024","unstructured":"Chengbin Du, Yanxi Li, Zhongwei Qiu, and Chang Xu. 2024. Stable diffusion is unstable. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_2_14_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Hao Yaru","year":"2024","unstructured":"Yaru Hao, Zewen Chi, Li Dong, and Furu Wei. 2024. Optimizing prompts for text-to-image generation. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_15_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2018. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. arXiv:1706.08500 [cs.LG] https:\/\/arxiv.org\/abs\/1706.08500"},{"key":"e_1_3_2_2_16_1","volume-title":"Promptcap: Prompt-guided task-aware image captioning. arXiv preprint arXiv:2211.09699","author":"Hu Yushi","year":"2022","unstructured":"Yushi Hu, Hang Hua, Zhengyuan Yang, Weijia Shi, Noah A Smith, and Jiebo Luo. 2022. Promptcap: Prompt-guided task-aware image captioning. arXiv preprint arXiv:2211.09699 (2022)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446672"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754714"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3754715"},{"key":"e_1_3_2_2_20_1","unstructured":"Seungwoo Lee Chaerin Kong Donghyeon Jeon and Nojun Kwak. 2023b. AADiff: Audio-Aligned Video Synthesis with Text-to-Image Diffusion. arXiv:2305.04001 [cs.CV] https:\/\/arxiv.org\/abs\/2305.04001"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00337"},{"key":"e_1_3_2_2_22_1","unstructured":"Taegyeong Lee Jeonghun Kang Hyeonyu Kim and Taehwan Kim. 2023a. Generating Realistic Images from In-the-wild Sounds. arXiv:2309.02405 [cs.CV] https:\/\/arxiv.org\/abs\/2309.02405"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01275"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_2_25_1","first-page":"746","volume-title":"Nature","volume":"264","author":"McGurk Harry","year":"1976","unstructured":"Harry McGurk and John MacDonald. 1976. Hearing lips and seeing voices. Nature, Vol. 264, 5588 (1976), 746-748."},{"key":"e_1_3_2_2_26_1","volume-title":"Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research","author":"Mei Xinhao","year":"2024","unstructured":"Xinhao Mei, Chutong Meng, Haohe Liu, Qiuqiang Kong, Tom Ko, Chengqi Zhao, Mark D Plumbley, Yuexian Zou, and Wenwu Wang. 2024. Wavcaps: A chatgpt-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2024)."},{"key":"e_1_3_2_2_27_1","unstructured":"Sachit Menon and Carl Vondrick. 2022. Visual Classification via Description from Large Language Models. arXiv:2210.07183 [cs.CV] https:\/\/arxiv.org\/abs\/2210.07183"},{"key":"e_1_3_2_2_28_1","volume-title":"Audio-Journey: Open Domain Latent Diffusion Based Text-To-Audio Generation. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6960-6964","author":"Michaels Jackson","year":"2024","unstructured":"Jackson Michaels, Juncheng B Li, Laura Yao, Lijun Yu, Zach Wood-Doughty, and Florian Metze. 2024. Audio-Journey: Open Domain Latent Diffusion Based Text-To-Audio Generation. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6960-6964."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Liliane Momeni Mathilde Caron Arsha Nagrani Andrew Zisserman and Cordelia Schmid. 2023. Verbs in Action: Improving verb understanding in video-language models. arXiv:2304.06708 [cs.CV] https:\/\/arxiv.org\/abs\/2304.06708","DOI":"10.1109\/ICCV51070.2023.01428"},{"key":"e_1_3_2_2_30_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_2_31_1","unstructured":"Yuta Oshima Masahiro Suzuki Yutaka Matsuo and Hiroki Furuta. 2025. Inference-Time Text-to-Video Alignment with Diffusion Latent Beam Search. arXiv:2501.19252 [cs.CV] https:\/\/arxiv.org\/abs\/2501.19252"},{"key":"e_1_3_2_2_32_1","unstructured":"Long Ouyang Jeff Wu Xu Jiang Diogo Almeida Carroll L. Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray John Schulman Jacob Hilton Fraser Kelton Luke Miller Maddie Simens Amanda Askell Peter Welinder Paul Christiano Jan Leike and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. arXiv:2203.02155 [cs.CL] https:\/\/arxiv.org\/abs\/2203.02155"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01438"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02110"},{"key":"e_1_3_2_2_36_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_2_37_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_2_38_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_2_39_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_2_40_1","first-page":"3536","article-title":"Linguistic binding in diffusion models: Enhancing attribute correspondence through attention map alignment","volume":"36","author":"Rassin Royi","year":"2023","unstructured":"Royi Rassin, Eran Hirsch, Daniel Glickman, Shauli Ravfogel, Yoav Goldberg, and Gal Chechik. 2023. Linguistic binding in diffusion models: Enhancing attribute correspondence through attention map alignment. Advances in Neural Information Processing Systems, Vol. 36 (2023), 3536-3559.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_42_1","volume-title":"FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. arXiv preprint arXiv:2305.17718","author":"Rotstein Noam","year":"2023","unstructured":"Noam Rotstein, David Bensaid, Shaked Brody, Roy Ganz, and Ron Kimmel. 2023. FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions. arXiv preprint arXiv:2305.17718 (2023)."},{"key":"e_1_3_2_2_43_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al., 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, Vol. 35 (2022), 36479-36494."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_2_2_45_1","unstructured":"Kim Sung-Bin Kim Jun-Seong Junseok Ko Yewon Kim and Tae-Hyun Oh. 2024. SoundBrush: Sound as a Brush for Visual Scene Editing. arXiv:2501.00645 [cs.CV] https:\/\/arxiv.org\/abs\/2501.00645"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"e_1_3_2_2_47_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_2_48_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Guy Yariv Itai Gat Lior Wolf Yossi Adi and Idan Schwartz. 2023. AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation. arXiv:2305.13050 [cs.SD] https:\/\/arxiv.org\/abs\/2305.13050","DOI":"10.21437\/Interspeech.2023-852"},{"key":"e_1_3_2_2_52_1","unstructured":"Rushikesh Zawar Shaurya Dewan Prakanshul Saxena Yingshan Chang Andrew Luo and Yonatan Bisk. 2024. DiffusionPID: Interpreting Diffusion via Partial Information Decomposition. arXiv:2406.05191 [cs.CV] https:\/\/arxiv.org\/abs\/2406.05191"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00040"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755130","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:02:11Z","timestamp":1765310531000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755130"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":54,"alternative-id":["10.1145\/3746027.3755130","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755130","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}