{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T14:51:06Z","timestamp":1779375066801,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3770976","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:08:11Z","timestamp":1765008491000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SeeingSounds: Learning Audio-to-Visual Alignment via Text"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8367-3933","authenticated-orcid":false,"given":"Simone","family":"Carnemolla","sequence":"first","affiliation":[{"name":"University of Catania, Catania, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6721-4383","authenticated-orcid":false,"given":"Matteo","family":"Pennisi","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4402-6159","authenticated-orcid":false,"given":"Chiara Maria","family":"Russo","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2441-0982","authenticated-orcid":false,"given":"Simone","family":"Palazzo","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5135-1351","authenticated-orcid":false,"given":"Daniela","family":"Giordano","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6653-2577","authenticated-orcid":false,"given":"Concetto","family":"Spampinato","sequence":"additional","affiliation":[{"name":"University of Catania, Catania, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Xavier Amatriain. 2024. Prompt design and engineering: Introduction and advanced methods. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.14423 (2024)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","unstructured":"Sarah\u00a0H. Baum Randi\u00a0C. Martin A.\u00a0Cris Hamilton and Michael\u00a0S. Beauchamp. 2012. Multisensory speech perception without the left superior temporal sulcus. NeuroImage 62 3 (2012) 1825\u20131832. 10.1016\/j.neuroimage.2012.05.034","DOI":"10.1016\/j.neuroimage.2012.05.034"},{"key":"e_1_3_3_2_4_2","unstructured":"Burak\u00a0Can Biner Farrin\u00a0Marouf Sofian Umur\u00a0Berkay Karaka\u015f Duygu Ceylan Erkut Erdem and Aykut Erdem. 2024. SonicDiffusion: Audio-Driven Image Generation and Editing with Pretrained Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2405.00878\u00a0[cs.CV]"},{"key":"e_1_3_3_2_5_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_2_6_2","unstructured":"Arantxa Casanova Marl\u00e8ne Careil Jakob Verbeek Michal Drozdzal and Adriana Romero. 2021. Instance-Conditioned GAN. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.05070 (2021). https:\/\/arxiv.org\/abs\/2109.05070"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00396"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3126686.3126723"},{"key":"e_1_3_3_2_10_2","volume-title":"arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.01266","author":"Fanzeres Leonardo\u00a0A.","year":"2021","unstructured":"Leonardo\u00a0A. Fanzeres and Climent Nadeu. 2021. Sound-to-imagination: Unsupervised crossmodal translation using deep dense network architecture. In arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.01266."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","unstructured":"Asif\u00a0A. Ghazanfar and Charles\u00a0E. Schroeder. 2006. Is neocortex essentially multisensory?Trends in Cognitive Sciences 10 6 (2006) 278\u2013285. 10.1016\/j.tics.2006.04.008","DOI":"10.1016\/j.tics.2006.04.008"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-698"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_3_2_15_2","volume-title":"AAAI Conference on Artificial Intelligence","author":"Hao Wangli","year":"2018","unstructured":"Wangli Hao, Zhaoxiang Zhang, and He Guan. 2018. CMC-GAN: A uniform framework for cross-modal visual-audio mutual generation. In AAAI Conference on Artificial Intelligence."},{"key":"e_1_3_3_2_16_2","unstructured":"Black\u00a0Forest Labs. 2024. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_3"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4437061"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00337"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_14"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","unstructured":"Steven\u00a0R. Livingstone and Frank\u00a0A. Russo. 2018. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic multimodal set of facial and vocal expressions in North American English. PLOS ONE 13 5 (05 2018) 1\u201335. 10.1371\/journal.pone.0196391","DOI":"10.1371\/journal.pone.0196391"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.264"},{"key":"e_1_3_3_2_23_2","unstructured":"Fabrizio Pedersoli Dryden Wiebe Amin Banitalebi Yong Zhang George Tzanetakis and Kwang\u00a0M. Yi. 2022. Estimating Visual Information From Audio Through Manifold Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.02337 (2022). https:\/\/arxiv.org\/abs\/2208.02337"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02110"},{"key":"e_1_3_3_2_26_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_2_27_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J. Liu. 2020. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research 21 140 (2020) 1\u201367. http:\/\/jmlr.org\/papers\/v21\/20-074.html"},{"key":"e_1_3_3_2_28_2","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_2_29_2","first-page":"8821","volume-title":"International conference on machine learning","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821\u20138831."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_31_2","unstructured":"Vignesh Subramaniam Colin Conwell Chen Wang Gabriel Kreiman Boris Katz Ignacio Cases and Andrei Barbu. 2024. Revealing Vision-Language Integration in the Brain with Multimodal Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.14481 (2024). https:\/\/arxiv.org\/abs\/2406.14481 PMID: 38947929; PMCID: PMC11213144."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Kim Sung-Bin Arda Senocak Hyunwoo Ha Andrew Owens and Tae-Hyun Oh. 2023. Sound to Visual Scene Generation by Audio-to-Visual Latent Alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.17490 (2023). https:\/\/arxiv.org\/abs\/2303.17490","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"e_1_3_3_2_33_2","unstructured":"Zineng Tang Ziyi Yang Chenguang Zhu Michael Zeng and Mohit Bansal. 2023. Any-to-any generation via composable diffusion. Advances in Neural Information Processing Systems 36 (2023) 16083\u201316099."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682383"},{"key":"e_1_3_3_2_35_2","unstructured":"Jason Wei Xuezhi Wang Dale Schuurmans Maarten Bosma Fei Xia Ed Chi Quoc\u00a0V Le Denny Zhou et\u00a0al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022) 24824\u201324837."},{"key":"e_1_3_3_2_36_2","first-page":"1408","volume-title":"Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference)","author":"Wenjuan Han","year":"2024","unstructured":"Han Wenjuan, Wei Xiang, Cui Xingyu, Cheng Ning, Jiang Guangyuan, Qian Weinan, and Zhang Chi. 2024. Prompt Engineering 101 Prompt Engineering Guidelines from a Linguistic Perspective. In Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 1: Main Conference), Maosong Sun, Jiye Liang, Xianpei Han, Zhiyuan Liu, and Yulan He (Eds.). Chinese Information Processing Society of China, Taiyuan, China, 1408\u20131426. https:\/\/aclanthology.org\/2024.ccl-1.108\/"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28486"},{"key":"e_1_3_3_2_38_2","first-page":"5446","volume-title":"Interspeech","author":"Yariv Guy","year":"2023","unstructured":"Guy Yariv, Itai Gat, Lior Wolf, Yossi Adi, and Idan Schwartz. 2023. Audio-Token: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation. In Interspeech. 5446\u20135450."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Guy Yariv Itai Gat Lior Wolf Yossi Adi and Idan Schwartz. 2023. AudioToken: Adaptation of Text-Conditioned Diffusion Models for Audio-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.13050 (2023). https:\/\/arxiv.org\/abs\/2305.13050","DOI":"10.21437\/Interspeech.2023-852"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00374"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3770976","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:10:20Z","timestamp":1765008620000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3770976"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":39,"alternative-id":["10.1145\/3743093.3770976","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3770976","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}