{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:01Z","timestamp":1750309501955,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681620","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"1361-1369","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["PSM: Learning Probabilistic Embeddings for Multi-scale Zero-Shot Soundscape Mapping"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7666-8603","authenticated-orcid":false,"given":"Subash","family":"Khanal","sequence":"first","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9158-4201","authenticated-orcid":false,"given":"Eric","family":"Xing","sequence":"additional","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4646-9416","authenticated-orcid":false,"given":"Srikumar","family":"Sastry","sequence":"additional","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4431-0628","authenticated-orcid":false,"given":"Aayush","family":"Dhakal","sequence":"additional","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5233-1520","authenticated-orcid":false,"given":"Zhexiao","family":"Xiong","sequence":"additional","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1211-1184","authenticated-orcid":false,"given":"Adeel","family":"Ahmad","sequence":"additional","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4242-8967","authenticated-orcid":false,"given":"Nathan","family":"Jacobs","sequence":"additional","affiliation":[{"name":"Washington University in St. Louis, Saint Louis, Missouri, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"unstructured":"[n. d.]. EOX::Maps https:\/\/tiles.maps.eox.at.","key":"e_1_3_2_1_1_1"},{"unstructured":"[n. d.]. Freesound https:\/\/freesound.org\/.","key":"e_1_3_2_1_2_1"},{"unstructured":"[n. d.]. GIS Data for the 2012 National Insect & Disease Risk Map (NIDRM) Report. https:\/\/www.fs.usda.gov\/foresthealth\/technology","key":"e_1_3_2_1_3_1"},{"unstructured":"[n. d.]. iNaturalist https:\/\/www.inaturalist.org.","key":"e_1_3_2_1_4_1"},{"unstructured":"[n. d.]. Radio aporee: Maps - sounds of the world https:\/\/aporee.org.","key":"e_1_3_2_1_5_1"},{"key":"e_1_3_2_1_6_1","volume-title":"Chatty maps: constructing sound maps of urban areas from social media data","author":"Aiello Luca Maria","year":"2016","unstructured":"Luca Maria Aiello, Rossano Schifanella, Daniele Quercia, and Francesco Aletta. 2016. Chatty maps: constructing sound maps of urban areas from social media data. Royal Society open science 3, 3 (2016), 150690."},{"unstructured":"Alex Alemi Ian Fischer Josh Dillon and Kevin Murphy. 2017. Deep Variational Information Bottleneck. In ICLR. https:\/\/arxiv.org\/abs\/1612.00410","key":"e_1_3_2_1_7_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_8_1","DOI":"10.1609\/aaai.v37i6.25819"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_9_1","DOI":"10.1109\/IGARSS47720.2021"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_10_1","DOI":"10.1145\/3394171.3413869"},{"key":"e_1_3_2_1_11_1","volume-title":"Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. arXiv preprint arXiv:2311.07919","author":"Chu Yunfei","year":"2023","unstructured":"Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, and Jingren Zhou. 2023. Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models. arXiv preprint arXiv:2311.07919 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"Improved Probabilistic Image-Text Representations. In International Conference on Learning Representations (ICLR).","author":"Chun Sanghyuk","year":"2024","unstructured":"Sanghyuk Chun. 2024. Improved Probabilistic Image-Text Representations. In International Conference on Learning Representations (ICLR)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1109\/CVPR46437.2021.00831"},{"unstructured":"Yezhen Cong Samar Khanna Chenlin Meng Patrick Liu Erik Rozi Yutong He Marshall Burke David B. Lobell and Stefano Ermon. 2022. SatMAE: Pretraining Transformers for Temporal and Multi-Spectral Satellite Imagery. In Advances in Neural Information Processing Systems Alice H. Oh Alekh Agarwal Danielle Belgrave and Kyunghyun Cho (Eds.). https:\/\/openreview.net\/forum? id=WBhqzpF6KYH","key":"e_1_3_2_1_14_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"e_1_3_2_1_16_1","volume-title":"A review of the use of psychoacoustic indicators on soundscape studies. Current Pollution Reports","author":"Engel Margret Sibylle","year":"2021","unstructured":"Margret Sibylle Engel, Andr\u00e9 Fiebig, Carmella Pfaffenbach, and Janina Fels. 2021. A review of the use of psychoacoustic indicators on soundscape studies. Current Pollution Reports (2021), 1--20."},{"unstructured":"EROS. [n. d.]. National Land Cover Database. https:\/\/www.usgs.gov\/centers\/ eros\/science\/national-land-cover-database","key":"e_1_3_2_1_17_1"},{"key":"e_1_3_2_1_18_1","volume-title":"Geneva","author":"International Organization for Standardization. 2014.","year":"2014","unstructured":"International Organization for Standardization. 2014. ISO 12913--1: 2014 acoustics? Soundscape?part 1: definition and conceptual framework. ISO, Geneva (2014)."},{"key":"e_1_3_2_1_19_1","volume-title":"Juan Miguel Barrig\u00f3n Morillas, and Guillermo Rey- Gozalo","author":"Gonz\u00e1lez David Montes","year":"2023","unstructured":"David Montes Gonz\u00e1lez, Juan Miguel Barrig\u00f3n Morillas, and Guillermo Rey- Gozalo. 2023. Effects of noise on pedestrians in urban environments where road traffic is the main source of sound. Science of the total environment 857 (2023), 159406."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_20_1","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_21_1","DOI":"10.1016\/j.jag.2022.103130"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings, Part XXIV 16","author":"Hu Di","year":"2020","unstructured":"Di Hu, Xuhong Li, Lichao Mou, Pu Jin, Dong Chen, Liping Jing, Xiaoxiang Zhu, and Dejing Dou. 2020. Cross-task transfer for geotagged audiovisual aerial scene recognition. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXIV 16. Springer, 68--84."},{"key":"e_1_3_2_1_23_1","first-page":"29406","article-title":"Learning with noisy correspondence for cross-modal matching","volume":"34","author":"Huang Zhenyu","year":"2021","unstructured":"Zhenyu Huang, Guocheng Niu, Xiao Liu, Wenbiao Ding, Xinyan Xiao, Hua Wu, and Xi Peng. 2021. Learning with noisy correspondence for cross-modal matching. Advances in Neural Information Processing Systems 34 (2021), 29406--29419.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_24_1","volume-title":"Taming Visually Guided Sound Generation. In British Machine Vision Conference (BMVC).","author":"Iashin Vladimir","year":"2021","unstructured":"Vladimir Iashin and Esa Rahtu. 2021. Taming Visually Guided Sound Generation. In British Machine Vision Conference (BMVC)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_25_1","DOI":"10.1109\/CVPR52729.2023.02228"},{"key":"e_1_3_2_1_26_1","volume-title":"Learning Tri-modal Embeddings for Zero-Shot Soundscape Mapping. In British Machine Vision Conference (BMVC).","author":"Khanal Subash","year":"2023","unstructured":"Subash Khanal, Srikumar Sastry, Aayush Dhakal, and Nathan Jacobs. 2023. Learning Tri-modal Embeddings for Zero-Shot Soundscape Mapping. In British Machine Vision Conference (BMVC)."},{"volume-title":"Soundscapes: Humans and Their Acoustic Environment","author":"Lercher Peter","unstructured":"Peter Lercher and Angel M Dzhambov. 2023. Soundscape and Health. In Soundscapes: Humans and Their Acoustic Environment. Springer, 243--276.","key":"e_1_3_2_1_27_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.1016\/j.apacoust.2020.107479"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_29_1","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_30_1","volume-title":"Soundscape mapping in environmental noise management and urban planning: case studies in two UK cities. Noise mapping 4, 1","author":"Margaritis Efstathios","year":"2017","unstructured":"Efstathios Margaritis and Jian Kang. 2017. Soundscape mapping in environmental noise management and urban planning: case studies in two UK cities. Noise mapping 4, 1 (2017), 87--103."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_31_1","DOI":"10.1109\/CVPRW56347.2022.00501"},{"key":"e_1_3_2_1_32_1","volume-title":"Joo Young Hong, and Woon-Seng Gan","author":"Ooi Kenneth","year":"2023","unstructured":"Kenneth Ooi, Zhen-Ting Ong, Karn N Watcharasupat, Bhan Lam, Joo Young Hong, and Woon-Seng Gan. 2023. ARAUS: A large-scale dataset and baseline models of affective responses to augmented urban soundscapes. IEEE Transactions on Affective Computing (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings, Part I 14","author":"Owens Andrew","year":"2016","unstructured":"Andrew Owens, JiajunWu, Josh H McDermott, William T Freeman, and Antonio Torralba. 2016. Ambient sound provides supervision for visual learning. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part I 14. Springer, 801--816."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.1016\/j.buildenv.2018.10.049"},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_37_1","DOI":"10.1109\/ICCV51070.2023.00378"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_38_1","DOI":"10.1109\/IGARSS.2018.8517977"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_39_1","DOI":"10.1145\/3474085.3475248"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_40_1","DOI":"10.1145\/3394171.3413694"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1145\/2812802"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.1109\/ICCV51070.2023.00182"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_43_1","DOI":"10.1145\/3503161.3548263"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_44_1","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"e_1_3_2_1_45_1","volume-title":"Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-CaptionAugmentation. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP.","author":"Yusong","year":"2023","unstructured":"Yusong Wu*, Ke Chen*, Tianyu Zhang*, Yuchen Hui*, Taylor Berg-Kirkpatrick, and Shlomo Dubnov. 2023. Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-CaptionAugmentation. In IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_46_1","DOI":"10.1145\/3592614"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_47_1","DOI":"10.1016\/j.compenvurbsys.2022.101915"}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"MM '24","name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681620","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681620","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681620"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":47,"alternative-id":["10.1145\/3664647.3681620","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681620","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}