{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T21:14:17Z","timestamp":1776114857617,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":76,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"AnalytiXIN Faculty Fellowship"},{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2211428"],"award-info":[{"award-number":["2211428"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"NVIDIA Academic Hardware Grant"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3635636.3656189","type":"proceedings-article","created":{"date-parts":[[2024,6,22]],"date-time":"2024-06-22T06:24:22Z","timestamp":1719037462000},"page":"156-169","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["MIMOSA: Human-AI Co-Creation of Computational Spatial Audio Effects on Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7374-7453","authenticated-orcid":false,"given":"Zheng","family":"Ning","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7040-2326","authenticated-orcid":false,"given":"Zheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5620-3513","authenticated-orcid":false,"given":"Jerrick","family":"Ban","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0483-693X","authenticated-orcid":false,"given":"Kaiwen","family":"Jiang","sequence":"additional","affiliation":[{"name":"Jacobs School of Engineering, University of California San Diego, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6580-4336","authenticated-orcid":false,"given":"Ruohong","family":"Gan","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, Carnegie Mellon University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1423-4513","authenticated-orcid":false,"given":"Yapeng","family":"Tian","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Texas at Dallas, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7902-7625","authenticated-orcid":false,"given":"Toby Jia-Jun","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, University of Notre Dame, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,6,23]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/365024.365092"},{"key":"e_1_3_2_1_2_1","volume-title":"Using thematic analysis in psychology. Qualitative research in psychology 3, 2","author":"Braun Virginia","year":"2006","unstructured":"Virginia Braun and Victoria Clarke. 2006. Using thematic analysis in psychology. Qualitative research in psychology 3, 2 (2006), 77\u2013101."},{"key":"e_1_3_2_1_3_1","first-page":"44","article-title":"Directional sources and listeners in interactive sound propagation using reciprocal wave field coding","volume":"39","author":"R\u00a0Alla Chaitanya Chakravarty","year":"2020","unstructured":"Chakravarty R\u00a0Alla Chaitanya, Nikunj Raghuvanshi, Keith\u00a0W Godin, Zechen Zhang, Derek Nowrouzezahrai, and John\u00a0M Snyder. 2020. Directional sources and listeners in interactive sound propagation using reciprocal wave field coding. ACM Transactions on Graphics (TOG) 39, 4 (2020), 44\u20131.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2617588"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3461778.3462050"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3532106.3533505"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAU.1957.1166013"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.17743\/jaes.2016.0059"},{"key":"e_1_3_2_1_9_1","volume-title":"Media. In Audio Engineering Society Convention 141","author":"Dalton Robert","year":"2016","unstructured":"Robert Dalton, Jimmy Tobin, and David Grunzweig. 2016. Rondo360: Dysonics\u2019 Spatial Audio Post-Production Toolkit for 360 Media. In Audio Engineering Society Convention 141. https:\/\/www.aes.org\/e-lib\/browse.cfm?elib=18387"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.1167311"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300619"},{"key":"e_1_3_2_1_12_1","volume-title":"2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Gao Ruohan","year":"2018","unstructured":"Ruohan Gao and Kristen Grauman. 2018. 2.5D Visual Sound. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2018), 324\u2013333. https:\/\/api.semanticscholar.org\/CorpusID:54628402"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.412407"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581352"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414750"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300854"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174211"},{"key":"e_1_3_2_1_18_1","volume-title":"Music transformer: Generating music with long-term structure","author":"Huang Zhi\u00a0Anna","year":"2018","unstructured":"Cheng-Zhi\u00a0Anna Huang, Ashish Vaswani, Jakob Uszkoreit, Noam Shazeer, Curtis Hawthorne, AM Dai, MD Hoffman, and D Eck. 2018. Music transformer: Generating music with long-term structure (2018). arXiv preprint arXiv:1809.04281 (2018)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606830"},{"key":"e_1_3_2_1_20_1","volume-title":"Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth. arXiv preprint arXiv:2201.07436","author":"Kim Doyeon","year":"2022","unstructured":"Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, and Junmo Kim. 2022. Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth. arXiv preprint arXiv:2201.07436 (2022)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Qiuqiang Kong Yin Cao Turab Iqbal Yuxuan Wang Wenwu Wang and Mark\u00a0D. Plumbley. 2020. PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition. arxiv:1912.10211\u00a0[cs.SD]","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2013.2270375"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.4324\/9780203512890"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-805390-4.09987-8"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_17"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606823"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376258"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376739"},{"key":"e_1_3_2_1_30_1","volume-title":"Audio Engineering Society Conference: 2019 AES International Conference on Immersive and Interactive Audio. Audio Engineering Society.","author":"McCormack Leo","year":"2019","unstructured":"Leo McCormack and Archontis Politis. 2019. SPARTA & COMPASS: Real-time implementations of linear and parametric spatial audio reproduction and processing methods. In Audio Engineering Society Conference: 2019 AES International Conference on Immersive and Interactive Audio. Audio Engineering Society."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.37514\/PRA-B.2022.1688.2.01"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_33_1","volume-title":"Social-Implicit: Rethinking Trajectory Prediction Evaluation and The Effectiveness of Implicit Maximum Likelihood Estimation. arXiv preprint arXiv:2203.03057","author":"Mohamed Abduallah","year":"2022","unstructured":"Abduallah Mohamed, Deyao Zhu, Warren Vu, Mohamed Elhoseiny, and Christian Claudel. 2022. Social-Implicit: Rethinking Trajectory Prediction Evaluation and The Effectiveness of Implicit Maximum Likelihood Estimation. arXiv preprint arXiv:2203.03057 (2022)."},{"key":"e_1_3_2_1_34_1","first-page":"300","article-title":"Head-related transfer functions of human subjects","volume":"43","author":"M\u00f8ller Henrik","year":"1995","unstructured":"Henrik M\u00f8ller, Michael\u00a0Friis S\u00f8rensen, Dorte Hammersh\u00f8i, and Clemen\u00a0Boje Jensen. 1995. Head-related transfer functions of human subjects. Journal of the Audio Engineering Society 43, 5 (1995), 300\u2013321.","journal-title":"Journal of the Audio Engineering Society"},{"key":"e_1_3_2_1_35_1","volume-title":"Self-supervised generation of spatial audio for 360 video. Advances in neural information processing systems 31","author":"Morgado Pedro","year":"2018","unstructured":"Pedro Morgado, Nuno Nvasconcelos, Timothy Langlois, and Oliver Wang. 2018. Self-supervised generation of spatial audio for 360 video. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3476550"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126594.3126659"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/97243.97281"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642632"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581641.3584067"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1093\/acprof:oso\/9780198567424.001.0001"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174223"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/302979.303163"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/319382.319398"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2601097.2601184"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201339"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77772-2_7"},{"key":"e_1_3_2_1_48_1","volume-title":"International conference on machine learning. PMLR, 1060\u20131069","author":"Reed Scott","year":"2016","unstructured":"Scott Reed, Zeynep Akata, Xinchen Yan, Lajanugen Logeswaran, Bernt Schiele, and Honglak Lee. 2016. Generative adversarial text to image synthesis. In International conference on machine learning. PMLR, 1060\u20131069."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357236.3395474"},{"key":"e_1_3_2_1_50_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015), 91\u201399."},{"key":"e_1_3_2_1_51_1","volume-title":"Neural Synthesis of Binaural Speech From Mono Audio. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=uAX8q61EVRu","author":"Richard Alexander","year":"2021","unstructured":"Alexander Richard, Dejan Markovic, Israel\u00a0D. Gebru, Steven Krenn, Gladstone\u00a0Alexander Butler, Fernando Torre, and Yaser Sheikh. 2021. Neural Synthesis of Binaural Speech From Mono Audio. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=uAX8q61EVRu"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","unstructured":"Agnieszka Roginska and Paul Geluso. 2017. Immersive sound: The art and science of binaural and multi-channel audio. https:\/\/doi.org\/10.4324\/9781315707525","DOI":"10.4324\/9781315707525"},{"key":"e_1_3_2_1_53_1","volume-title":"Executive control of cognitive processes in task switching.Journal of experimental psychology: human perception and performance 27, 4","author":"Rubinstein S","year":"2001","unstructured":"Joshua\u00a0S Rubinstein, David\u00a0E Meyer, and Jeffrey\u00a0E Evans. 2001. Executive control of cognitive processes in task switching.Journal of experimental psychology: human perception and performance 27, 4 (2001), 763."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.4324\/9780080498195"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.4324\/9781003092919"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1028630.1028636"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC48978.2021.9565113"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/1357054.1357169"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/1622176.1622193"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00277"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3145090"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Dakuo Wang Elizabeth Churchill Pattie Maes Xiangmin Fan Ben Shneiderman Yuanchun Shi and Qianying Wang. 2020. From human-human collaboration to Human-AI collaboration: Designing AI systems that can work together with people. In Extended abstracts of the 2020 CHI conference on human factors in computing systems. 1\u20136.","DOI":"10.1145\/3334480.3381069"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445347"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2208.09579"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414774"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_51"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01523"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130838"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","unstructured":"Jing Yang and Friedemann Mattern. 2019. Audio Augmented Reality for Human-Object Interactions. In Adjunct Proceedings of the 2019 ACM International Joint Conference on Pervasive and Ubiquitous Computing and Proceedings of the 2019 ACM International Symposium on Wearable Computers (London United Kingdom) (UbiComp\/ISWC \u201919 Adjunct). Association for Computing Machinery New York NY USA 408\u2013412. https:\/\/doi.org\/10.1145\/3341162.3349302","DOI":"10.1145\/3341162.3349302"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376301"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3180308.3180360"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357236.3395433"},{"key":"e_1_3_2_1_73_1","volume-title":"Personalized HRTF Modeling Using DNN-Augmented BEM. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 451\u2013455","author":"Zhang Mengfan","year":"2021","unstructured":"Mengfan Zhang, Jui-Hsien Wang, and Doug\u00a0L James. 2021. Personalized HRTF Modeling Using DNN-Augmented BEM. In ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 451\u2013455."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606800"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606776"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_4"}],"event":{"name":"C&C '24: Creativity and Cognition","location":"Chicago IL USA","acronym":"C&C '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Creativity and Cognition"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3635636.3656189","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3635636.3656189","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:00:08Z","timestamp":1755885608000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3635636.3656189"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":76,"alternative-id":["10.1145\/3635636.3656189","10.1145\/3635636"],"URL":"https:\/\/doi.org\/10.1145\/3635636.3656189","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-06-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}