{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T01:51:10Z","timestamp":1765504270722,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761366","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:18:04Z","timestamp":1762561084000},"page":"635-644","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Hearable Image: On-Device Image-Driven Sound Effect Generation for Hearing What You See"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6060-2554","authenticated-orcid":false,"given":"Deokjun","family":"Eom","sequence":"first","affiliation":[{"name":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5856-767X","authenticated-orcid":false,"given":"Nahyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5328-6610","authenticated-orcid":false,"given":"Woohyun","family":"Nam","sequence":"additional","affiliation":[{"name":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6934-9698","authenticated-orcid":false,"given":"Kyung-Rae","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7326-5646","authenticated-orcid":false,"given":"Chaebin","family":"Im","sequence":"additional","affiliation":[{"name":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3722-0659","authenticated-orcid":false,"given":"Jungwon","family":"Park","sequence":"additional","affiliation":[{"name":"Samsung Research, Samsung Electronics, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Yatong Bai Trung Dang Dung Tran Kazuhito Koishida and Somayeh Sojoudi. 2024. ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio Generation with Consistency Distillation. In INTERSPEECH."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096710"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447265"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2700"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Ho Kei Cheng Masato Ishii Akio Hayakawa Takashi Shibuya Alexander Schwing and Yuki Mitsufuji. 2025. MMAudio: Taming Multimodal Joint Training for High-Quality Video-to-Audio Synthesis. In CVPR.","DOI":"10.1109\/CVPR52734.2025.02691"},{"key":"e_1_3_2_1_7_1","first-page":"47704","volume-title":"Levine (Eds.)","volume":"36","author":"Copet Jade","year":"2023","unstructured":"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, and Alexandre Defossez. 2023a. Simple and Controllable Music Generation. In Advances in Neural Information Processing Systems, A. Oh, T. Naumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 47704-47720."},{"key":"e_1_3_2_1_8_1","first-page":"47704","article-title":"Simple and controllable music generation","volume":"36","author":"Copet Jade","year":"2023","unstructured":"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, and Alexandre D\u00e9fossez. 2023b. Simple and controllable music generation. Advances in Neural Information Processing Systems, Vol. 36 (2023), 47704-47720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888184"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612348"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_14_1","volume-title":"Generalized Huber Loss for Robust Learning and its Efficient Minimization for a Robust Statistics. ArXiv","author":"Gokcesu Kaan","year":"2021","unstructured":"Kaan Gokcesu and Hakan Gokcesu. 2021. Generalized Huber Loss for Robust Learning and its Efficient Minimization for a Robust Statistics. ArXiv, Vol. abs\/2108.12627 (2021)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29773"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"e_1_3_2_1_17_1","first-page":"131","article-title":"CNN architectures for large-scale audio classification. In 2017 ieee international conference on acoustics, speech and signal processing (icassp)","author":"Hershey Shawn","year":"2017","unstructured":"Shawn Hershey, Sourish Chaudhuri, Daniel PW Ellis, Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al., 2017. CNN architectures for large-scale audio classification. In 2017 ieee international conference on acoustics, speech and signal processing (icassp). IEEE, 131-135.","journal-title":"IEEE"},{"key":"e_1_3_2_1_18_1","volume-title":"Make-an-audio 2: Temporal-enhanced text-to-audio generation. arXiv preprint arXiv:2305.18474","author":"Huang Jiawei","year":"2023","unstructured":"Jiawei Huang, Yi Ren, Rongjie Huang, Dongchao Yang, Zhenhui Ye, Chen Zhang, Jinglin Liu, Xiang Yin, Zejun Ma, and Zhou Zhao. 2023. Make-an-audio 2: Temporal-enhanced text-to-audio generation. arXiv preprint arXiv:2305.18474 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Mobilespeech: A fast and high-fidelity framework for mobile zero-shot text-to-speech. arXiv preprint arXiv:2402.09378","author":"Ji Shengpeng","year":"2024","unstructured":"Shengpeng Ji, Ziyue Jiang, Hanting Wang, Jialong Zuo, and Zhou Zhao. 2024. Mobilespeech: A fast and high-fidelity framework for mobile zero-shot text-to-speech. arXiv preprint arXiv:2402.09378 (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"e_1_3_2_1_21_1","volume-title":"Workshop on Efficient Systems for Foundation Models@ ICML2023","author":"Kim Bo-Kyeong","year":"2023","unstructured":"Bo-Kyeong Kim, Hyoung-Kyu Song, Thibault Castells, and Shinkook Choi. 2023. Bk-sdm: Architecturally compressed stable diffusion for efficient text-to-image generation. In Workshop on Efficient Systems for Foundation Models@ ICML2023."},{"volume-title":"2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings. arXiv:http:\/\/arxiv.org\/abs\/1312","author":"Diederik","key":"e_1_3_2_1_22_1","unstructured":"Diederik P. Kingma and Max Welling. 2014. Auto-Encoding(Variational Bayes). In 2nd International Conference on Learning Representations, ICLR 2014, Banff, AB, Canada, April 14-16, 2014, Conference Track Proceedings. arXiv:http:\/\/arxiv.org\/abs\/1312.6114v10 [stat.ML]"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 34th International Conference on Neural Information Processing Systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020b. HiFi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis. In Proceedings of the 34th International Conference on Neural Information Processing Systems (Vancouver, BC, Canada) (NIPS '20). Curran Associates Inc., Red Hook, NY, USA, Article 1428, 12 pages."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_25_1","volume-title":"AudioGen: Textually Guided Audio Generation. In The Eleventh International Conference on Learning Representations.","author":"Kreuk Felix","year":"2023","unstructured":"Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre D\u00e9fossez, Jade Copet, Devi Parikh, Yaniv Taigman, and Yossi Adi. 2023. AudioGen: Textually Guided Audio Generation. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_26_1","first-page":"20662","article-title":"Snapfusion: Text-to-image diffusion model on mobile devices within two seconds","volume":"36","author":"Li Yanyu","year":"2023","unstructured":"Yanyu Li, Huan Wang, Qing Jin, Ju Hu, Pavlo Chemerys, Yun Fu, Yanzhi Wang, Sergey Tulyakov, and Jian Ren. 2023b. Snapfusion: Text-to-image diffusion model on mobile devices within two seconds. Advances in Neural Information Processing Systems, Vol. 36 (2023), 20662-20678.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","first-page":"19594","article-title":"Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models","volume":"36","author":"Li Yinghao Aaron","year":"2023","unstructured":"Yinghao Aaron Li, Cong Han, Vinay Raghavan, Gavin Mischler, and Nima Mesgarani. 2023a. Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 19594-19621.","journal-title":"Advances in Neural Information Processing Systems"},{"volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Liu Haohe","key":"e_1_3_2_1_28_1","unstructured":"Haohe Liu, Zehua Chen, Yi Yuan, Xinhao Mei, Xubo Liu, Danilo Mandic, Wenwu Wang, and Mark D. Plumbley. 2023. AudioLDM: text-to-audio generation with latent diffusion models. In Proceedings of the 40th International Conference on Machine Learning (Honolulu, Hawaii, USA) (ICML'23). JMLR.org, Article 886, 25 pages."},{"key":"e_1_3_2_1_29_1","volume-title":"Audiolcm: Text-to-audio generation with latent consistency models. arXiv preprint arXiv:2406.00356","author":"Liu Huadai","year":"2024","unstructured":"Huadai Liu, Rongjie Huang, Yang Liu, Hengyuan Cao, Jialei Wang, Xize Cheng, Siqi Zheng, and Zhou Zhao. 2024a. Audiolcm: Text-to-audio generation with latent consistency models. arXiv preprint arXiv:2406.00356 (2024)."},{"key":"e_1_3_2_1_30_1","first-page":"2871","volume-title":"Speech and Lang. Proc.","volume":"32","author":"Liu Haohe","year":"2024","unstructured":"Haohe Liu, Yi Yuan, Xubo Liu, Xinhao Mei, Qiuqiang Kong, Qiao Tian, Yuping Wang, Wenwu Wang, Yuxuan Wang, and Mark D. Plumbley. 2024b. AudioLDM 2: Learning Holistic Audio Generation With Self-Supervised Pretraining. IEEE\/ACM Trans. Audio, Speech and Lang. Proc., Vol. 32 (May 2024), 2871-2883."},{"key":"e_1_3_2_1_31_1","unstructured":"Simian Luo Yiqin Tan Longbo Huang Jian Li and Hang Zhao. 2023. Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference. arXiv:2310.04378 [cs.CV]"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681688"},{"key":"e_1_3_2_1_33_1","volume-title":"European Conference on Computer Vision. Springer, 78-96","author":"Qin Danfeng","year":"2024","unstructured":"Danfeng Qin, Chas Leichner, Manolis Delakis, Marco Fornoni, Shixin Luo, Fan Yang, Weijun Wang, Colby Banbury, Chengxi Ye, Berkin Akin, et al., 2024. MobileNetV4: universal models for the mobile ecosystem. In European Conference on Computer Vision. Springer, 78-96."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3376984"},{"key":"e_1_3_2_1_36_1","volume-title":"sheng zhao, and Jiang Bian","author":"Shen Kai","year":"2024","unstructured":"Kai Shen, Zeqian Ju, Xu Tan, Eric Liu, Yichong Leng, Lei He, Tao Qin, sheng zhao, and Jiang Bian. 2024. NaturalSpeech 2: Latent Diffusion Models are Natural and Zero-Shot Speech and Singing Synthesizers. (2024)."},{"key":"e_1_3_2_1_37_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2021. Denoising Diffusion Implicit Models.. In ICLR."},{"key":"e_1_3_2_1_38_1","volume-title":"Representation Learning with Contrastive Predictive Coding. CoRR","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. CoRR, Vol. abs\/1807.03748 (2018)."},{"key":"e_1_3_2_1_39_1","volume-title":"Frieren: Efficient Video-to-Audio Generation Network with Rectified Flow Matching. In The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Wang Yongqi","year":"2024","unstructured":"Yongqi Wang, Wenxiang Guo, Rongjie Huang, Jiawei Huang, Zehan Wang, Fuming You, Ruiqi Li, and Zhou Zhao. 2024. Frieren: Efficient Video-to-Audio Generation Network with Rectified Flow Matching. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3399026"},{"key":"e_1_3_2_1_41_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona T. Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. CoRR, Vol. abs\/2205.01068 (2022). arXiv:2205.01068"},{"key":"e_1_3_2_1_42_1","volume-title":"European Conference on Computer Vision. Springer, 225-242","author":"Zhao Yang","year":"2024","unstructured":"Yang Zhao, Yanwu Xu, Zhisheng Xiao, Haolin Jia, and Tingbo Hou. 2024. Mobilediffusion: Instant text-to-image generation on mobile devices. In European Conference on Computer Vision. Springer, 225-242."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761366","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T01:46:26Z","timestamp":1765503986000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761366"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":42,"alternative-id":["10.1145\/3746252.3761366","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761366","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}