{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:14:19Z","timestamp":1776114859389,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,29]],"date-time":"2023-10-29T00:00:00Z","timestamp":1698537600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,29]]},"DOI":"10.1145\/3586183.3606823","type":"proceedings-article","created":{"date-parts":[[2023,10,20]],"date-time":"2023-10-20T20:46:22Z","timestamp":1697834782000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Soundify: Matching Sound Effects to Video"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0116-0463","authenticated-orcid":false,"given":"David Chuan-En","family":"Lin","sequence":"first","affiliation":[{"name":"Human-Computer Interaction Institute, Carnegie Mellon University, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0005-8197","authenticated-orcid":false,"given":"Anastasis","family":"Germanidis","sequence":"additional","affiliation":[{"name":"Runway, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3809-4324","authenticated-orcid":false,"given":"Crist\u00f3bal","family":"Valenzuela","sequence":"additional","affiliation":[{"name":"Runway, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6612-4703","authenticated-orcid":false,"given":"Yining","family":"Shi","sequence":"additional","affiliation":[{"name":"Runway, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1824-0243","authenticated-orcid":false,"given":"Nikolas","family":"Martelaro","sequence":"additional","affiliation":[{"name":"Human-Computer Interaction Institute, Carnegie Mellon University, United States"}]}],"member":"320","published-online":{"date-parts":[[2023,10,29]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Retrieved","year":"2022","unstructured":"2018. Foley Techniques and Sound Effects: A Sound Design Guide. Retrieved August 15, 2022 from https:\/\/www.ipr.edu\/blogs\/sound-design-for-visual-media\/foley-sound-effects-sound-design\/"},{"key":"e_1_3_2_2_2_1","volume-title":"Commits - openai\/CLIP. Retrieved","year":"2022","unstructured":"2022. Commits - openai\/CLIP. Retrieved August 15, 2022 from https:\/\/github.com\/openai\/CLIP\/commits\/main"},{"key":"e_1_3_2_2_3_1","unstructured":"2022. Epidemic Sound. Retrieved August 15 2022 from https:\/\/www.epidemicsound.com\/"},{"key":"e_1_3_2_2_4_1","unstructured":"2022. Upwork. Retrieved August 15 2022 from https:\/\/www.upwork.com\/"},{"key":"e_1_3_2_2_5_1","unstructured":"2022. YOLOv5. Retrieved August 15 2022 from https:\/\/github.com\/ultralytics\/yolov5"},{"key":"e_1_3_2_2_6_1","volume-title":"Dance2music: Automatic dance-driven music generation. arXiv preprint arXiv:2107.06252","author":"Aggarwal Gunjan","year":"2021","unstructured":"Gunjan Aggarwal and Devi Parikh. 2021. Dance2music: Automatic dance-driven music generation. arXiv preprint arXiv:2107.06252 (2021)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_2_8_1","volume-title":"Using thematic analysis in psychology. Qualitative research in psychology 3, 2","author":"Braun Virginia","year":"2006","unstructured":"Virginia Braun and Victoria Clarke. 2006. Using thematic analysis in psychology. Qualitative research in psychology 3, 2 (2006), 77\u2013101."},{"key":"e_1_3_2_2_9_1","volume-title":"SUS-A quick and dirty usability scale. Usability evaluation in industry 189, 194","author":"John Brooke","year":"1996","unstructured":"John Brooke 1996. SUS-A quick and dirty usability scale. Usability evaluation in industry 189, 194 (1996), 4\u20137."},{"key":"e_1_3_2_2_10_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0\u20130.","author":"Chen Kan","year":"2018","unstructured":"Kan Chen, Chuanxi Zhang, Chen Fang, Zhaowen Wang, Trung Bui, and Ram Nevatia. 2018. Visually indicated sound generation by perceptually optimized classification. In Proceedings of the European Conference on Computer Vision (ECCV) Workshops. 0\u20130."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009820"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584047"},{"key":"e_1_3_2_2_13_1","volume-title":"Jukebox: A generative model for music. arXiv preprint arXiv:2005.00341","author":"Dhariwal Prafulla","year":"2020","unstructured":"Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong\u00a0Wook Kim, Alec Radford, and Ilya Sutskever. 2020. Jukebox: A generative model for music. arXiv preprint arXiv:2005.00341 (2020)."},{"key":"e_1_3_2_2_14_1","volume-title":"Gansynth: Adversarial neural audio synthesis. arXiv preprint arXiv:1902.08710","author":"Engel Jesse","year":"2019","unstructured":"Jesse Engel, Kumar\u00a0Krishna Agrawal, Shuo Chen, Ishaan Gulrajani, Chris Donahue, and Adam Roberts. 2019. Gansynth: Adversarial neural audio synthesis. arXiv preprint arXiv:1902.08710 (2019)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_3"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00398"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3005033"},{"key":"e_1_3_2_2_19_1","volume-title":"FoleyGAN: Visually Guided Generative Adversarial Network-Based Synchronous Sound Generation in Silent Videos. arXiv preprint arXiv:2107.09262","author":"Ghose Sanchita","year":"2021","unstructured":"Sanchita Ghose and John\u00a0J Prevost. 2021. FoleyGAN: Visually Guided Generative Adversarial Network-Based Synchronous Sound Generation in Silent Videos. arXiv preprint arXiv:2107.09262 (2021)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.391417"},{"key":"e_1_3_2_2_21_1","volume-title":"Advances in psychology. Vol.\u00a052","author":"Hart G","unstructured":"Sandra\u00a0G Hart and Lowell\u00a0E Staveland. 1988. Development of NASA-TLX (Task Load Index): Results of empirical and theoretical research. In Advances in psychology. Vol.\u00a052. Elsevier, 139\u2013183."},{"key":"e_1_3_2_2_22_1","volume-title":"Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761","author":"Kong Zhifeng","year":"2020","unstructured":"Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. 2020. Diffwave: A versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761 (2020)."},{"key":"e_1_3_2_2_23_1","volume-title":"Melgan: Generative adversarial networks for conditional waveform synthesis. Advances in neural information processing systems 32","author":"Kumar Kundan","year":"2019","unstructured":"Kundan Kumar, Rithesh Kumar, Thibault de Boissiere, Lucas Gestin, Wei\u00a0Zhen Teoh, Jose Sotelo, Alexandre de Br\u00e9bisson, Yoshua Bengio, and Aaron\u00a0C Courville. 2019. Melgan: Generative adversarial networks for conditional waveform synthesis. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_25_1","volume-title":"Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499","author":"van\u00a0den Oord Aaron","year":"2016","unstructured":"Aaron van\u00a0den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu. 2016. Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499 (2016)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.264"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_2_28_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2642918.2647406"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2501988.2501993"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"e_1_3_2_2_32_1","volume-title":"Grad-CAM: Why did you say that?arXiv preprint arXiv:1611.07450","author":"Selvaraju R","year":"2016","unstructured":"Ramprasaath\u00a0R Selvaraju, Abhishek Das, Ramakrishna Vedantam, Michael Cogswell, Devi Parikh, and Dhruv Batra. 2016. Grad-CAM: Why did you say that?arXiv preprint arXiv:1611.07450 (2016)."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00458"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2984511.2984561"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/1622176.1622193"},{"key":"e_1_3_2_2_36_1","first-page":"3325","article-title":"Audeo: Audio generation for a silent performance video","volume":"33","author":"Su Kun","year":"2020","unstructured":"Kun Su, Xiulong Liu, and Eli Shlizerman. 2020. Audeo: Audio generation for a silent performance video. Advances in Neural Information Processing Systems 33 (2020), 3325\u20133337.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_37_1","volume-title":"How Does it Sound?Advances in Neural Information Processing Systems 34","author":"Su Kun","year":"2021","unstructured":"Kun Su, Xiulong Liu, and Eli Shlizerman. 2021. How Does it Sound?Advances in Neural Information Processing Systems 34 (2021), 29258\u201329273."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545680"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415845"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415882"},{"key":"e_1_3_2_2_42_1","volume-title":"MidiNet: A convolutional generative adversarial network for symbolic-domain music generation. arXiv preprint arXiv:1703.10847","author":"Yang Li-Chia","year":"2017","unstructured":"Li-Chia Yang, Szu-Yu Chou, and Yi-Hsuan Yang. 2017. MidiNet: A convolutional generative adversarial network for symbolic-domain music generation. arXiv preprint arXiv:1703.10847 (2017)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_35"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00374"}],"event":{"name":"UIST '23: The 36th Annual ACM Symposium on User Interface Software and Technology","location":"San Francisco CA USA","acronym":"UIST '23","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3586183.3606823","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3586183.3606823","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:53:35Z","timestamp":1755820415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3586183.3606823"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,29]]},"references-count":44,"alternative-id":["10.1145\/3586183.3606823","10.1145\/3586183"],"URL":"https:\/\/doi.org\/10.1145\/3586183.3606823","relation":{},"subject":[],"published":{"date-parts":[[2023,10,29]]},"assertion":[{"value":"2023-10-29","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}