{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T22:36:51Z","timestamp":1780612611474,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,20]],"date-time":"2026-06-20T00:00:00Z","timestamp":1781913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"City University of Hong Kong","award":["9610753"],"award-info":[{"award-number":["9610753"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,21]]},"DOI":"10.1145\/3745756.3809218","type":"proceedings-article","created":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:52:21Z","timestamp":1780059141000},"page":"492-505","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["ZA-SLAM: Leveraging Vision-Language Model for Zero-Shot Acoustic SLAM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4914-3949","authenticated-orcid":false,"given":"Zhuochen","family":"Yu","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9061-7423","authenticated-orcid":false,"given":"David K. Y.","family":"Yau","sequence":"additional","affiliation":[{"name":"Singapore University of Technology &amp; Design, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3896-411X","authenticated-orcid":false,"given":"Yijie","family":"Shen","sequence":"additional","affiliation":[{"name":"ShanghaiTech University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6368-9250","authenticated-orcid":false,"given":"Xiaoran","family":"Fan","sequence":"additional","affiliation":[{"name":"Google, Mountain View, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4565-5548","authenticated-orcid":false,"given":"Tao","family":"Chen","sequence":"additional","affiliation":[{"name":"Independent Researcher, Frisco, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3611-9404","authenticated-orcid":false,"given":"Qun","family":"Song","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,20]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 15th Annual International Conference on Mobile Computing and Networking. 261\u2013272","author":"Azizyan Martin","year":"2009","unstructured":"Martin Azizyan, Ionut Constandache, and Romit Roy Choudhury. 2009. Surroundsense: mobile phone localization via ambience fingerprinting. In Proceedings of the 15th Annual International Conference on Mobile Computing and Networking. 261\u2013272."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2000.832252"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1093\/ageing\/26.1.15","article-title":"Comfortable and maximum walking speed of adults aged 20\u201379 years: reference values and determinants","volume":"26","author":"Bohannon Richard W","year":"1997","unstructured":"Richard W Bohannon. 1997. Comfortable and maximum walking speed of adults aged 20\u201379 years: reference values and determinants. Age and Ageing 26, 1 (1997), 15\u201319.","journal-title":"Age and Ageing"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","first-page":"1309","DOI":"10.1109\/TRO.2016.2624754","article-title":"Past, present, and future of simultaneous localization and mapping: Toward the robust-perception age","volume":"32","author":"Cadena Cesar","year":"2017","unstructured":"Cesar Cadena, Luca Carlone, Henry Carrillo, Yasir Latif, Davide Scaramuzza, Jos\u00e9 Neira, Ian Reid, and John J Leonard. 2017. Past, present, and future of simultaneous localization and mapping: Toward the robust-perception age. IEEE Transactions on Robotics 32, 6 (2017), 1309\u20131332.","journal-title":"IEEE Transactions on Robotics"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-37456-2_14"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3666025.3699331"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2307636.2307653"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2008.08.013"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations.","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16\u00d716 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_10_1","unstructured":"ElectronicsForce. 2026. Samsung Galaxy A42 5G 128GB Unlocked White. https:\/\/electronicsforce.com\/products\/samsung-galaxy-a42-5g-128gb-unlocked-white Accessed: 2026-03-23."},{"key":"e_1_3_2_1_11_1","first-page":"226","article-title":"A density-based algorithm for discovering clusters in large spatial databases with noise","volume":"96","author":"Ester Martin","year":"1996","unstructured":"Martin Ester, Hans-Peter Kriegel, J\u00f6rg Sander, Xiaowei Xu, et al. 1996. A density-based algorithm for discovering clusters in large spatial databases with noise. In KDD, Vol. 96. 226\u2013231.","journal-title":"KDD"},{"key":"e_1_3_2_1_12_1","volume-title":"2025 IEEE International Conference on Robotics and Automation (ICRA). IEEE, 8275\u20138283","author":"Etukuru Haritheja","year":"2025","unstructured":"Haritheja Etukuru, Norihito Naka, Zijin Hu, Seungjae Lee, Julian Mehu, Aaron Edsinger, Chris Paxton, Soumith Chintala, Lerrel Pinto, and Nur Muhammad Mahi Shafiullah. 2025. Robot utility models: General policies for zero-shot deployment in new environments. In 2025 IEEE International Conference on Robotics and Automation (ICRA). IEEE, 8275\u20138283."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2018.2828321"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2015.2418205"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02291478"},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Learning Representations.","author":"Gu Xiuye","year":"2022","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2022. Open-vocabulary Object Detection via Vision and Language Knowledge Distillation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_18_1","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 976\u2013980","author":"Guzhov Andrey","year":"2022","unstructured":"Andrey Guzhov, Federico Raue, J\u00f6rn Hees, and Andreas Dengel. 2022. Audioclip: Extending clip to image, text and audio. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 976\u2013980."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3139222"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00727"},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Ubiquitous Computing. Springer, 159\u2013176","author":"Hightower Jeffrey","year":"2005","unstructured":"Jeffrey Hightower, Sunny Consolvo, Anthony LaMarca, Ian Smith, and Jeff Hughes. 2005. Learning and recognizing the places we go. In International Conference on Ubiquitous Computing. Springer, 159\u2013176."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 4904\u20134916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International Conference on Machine Learning. PMLR, 4904\u20134916."},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning. PMLR, 19730\u201319742."},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. PMLR, 12888\u201312900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888\u201312900."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3478095","article-title":"EchoSpot: Spotting your locations via acoustic sensing","volume":"5","author":"Lian Jie","year":"2021","unstructured":"Jie Lian, Jiadong Lou, Li Chen, and Xu Yuan. 2021. EchoSpot: Spotting your locations via acoustic sensing. Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies 5, 3 (2021), 1\u201321.","journal-title":"Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies"},{"key":"e_1_3_2_1_29_1","first-page":"1473","article-title":"ALCDNet: Loop closure detection based on acoustic echoes","volume":"10","author":"Liu Guangyao","year":"2024","unstructured":"Guangyao Liu, Weimeng Cui, Naizheng Jia, Yuzhang Xi, Shuyu Li, and Zhi Wang. 2024. ALCDNet: Loop closure detection based on acoustic echoes. IEEE Robotics and Automation Letters 10, 2 (2024), 1473\u20131480.","journal-title":"IEEE Robotics and Automation Letters"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1008854305733"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560905.3568510"},{"key":"e_1_3_2_1_32_1","first-page":"6634","article-title":"Indoor smartphone SLAM with acoustic echoes","volume":"23","author":"Luo Wenjie","year":"2023","unstructured":"Wenjie Luo, Qun Song, Zhenyu Yan, Rui Tan, and Guosheng Lin. 2023. Indoor smartphone SLAM with acoustic echoes. IEEE Transactions on Mobile Computing 23, 6 (2023), 6634\u20136649.","journal-title":"IEEE Transactions on Mobile Computing"},{"key":"e_1_3_2_1_33_1","volume-title":"OVO-SLAM: Open-Vocabulary Online Simultaneous Localization and Mapping. arXiv preprint arXiv:2411.15043","author":"Martins Tomas Berriel","year":"2024","unstructured":"Tomas Berriel Martins, Martin R Oswald, and Javier Civera. 2024. OVO-SLAM: Open-Vocabulary Online Simultaneous Localization and Mapping. arXiv preprint arXiv:2411.15043 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Wireless Algorithms, Systems, and Applications. Springer, 155\u2013167","author":"Meng Chuize","year":"2022","unstructured":"Chuize Meng, Shan Jiang, Mengning Wu, Xuan Xiao, Dan Tao, and Ruipeng Gao. 2022. BatMapper-Plus: Smartphone-Based Multi-level Indoor Floor Plan Construction via Acoustic Ranging and Inertial Sensing. In International Conference on Wireless Algorithms, Systems, and Applications. Springer, 155\u2013167."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2014.00587"},{"key":"e_1_3_2_1_36_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/SSRR50563.2020.9292572"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3214278"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 28492\u201328518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International Conference on Machine Learning. PMLR, 28492\u201328518."},{"key":"e_1_3_2_1_41_1","first-page":"5310","article-title":"Robust indoor location identification for smart-phones using echoes from dominant reflectors","volume":"23","author":"Ren Yanzhi","year":"2023","unstructured":"Yanzhi Ren, Siyi Li, Chen Chen, Hongbo Liu, Jiadi Yu, Yingying Chen, Haomiao Yang, and Hongwei Li. 2023. Robust indoor location identification for smart-phones using echoes from dominant reflectors. IEEE Transactions on Mobile Computing 23, 5 (2023), 5310\u20135326.","journal-title":"IEEE Transactions on Mobile Computing"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 4th Augmented Human International Conference. 89\u201395","author":"Rossi Mirco","year":"2013","unstructured":"Mirco Rossi, Julia Seiter, Oliver Amft, Seraina Buchmeier, and Gerhard Tr\u00f6ster. 2013. RoomSense: an indoor positioning system for smartphones using active sound probing. In Proceedings of the 4th Augmented Human International Conference. 89\u201395."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3372224.3380884"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3264945"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/1999995.2000011"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2789168.2790102"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2819938"},{"key":"e_1_3_2_1_48_1","volume-title":"Undesirable Effects as a Result of Short-Term Exposure to an Ultrasonic Repellent Device. Part II-Exposure of Volunteers. Assignment No. Food Chain Safety and Environment","author":"Wieringen A Van","year":"2014","unstructured":"A Van Wieringen. 2014. Undesirable Effects as a Result of Short-Term Exposure to an Ultrasonic Repellent Device. Part II-Exposure of Volunteers. Assignment No. Food Chain Safety and Environment (2014)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2016.7759302"},{"key":"e_1_3_2_1_50_1","first-page":"15028","article-title":"Cliploss and norm-based data selection methods for multimodal contrastive learning","volume":"37","author":"Wang Yiping","year":"2024","unstructured":"Yiping Wang, Yifang Chen, Wendan Yan, Alex Fang, Wenjing Zhou, Kevin Jamieson, and Simon S Du. 2024. Cliploss and norm-based data selection methods for multimodal contrastive learning. Advances in Neural Information Processing Systems 37 (2024), 15028\u201315069.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2024.3380162"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155438"},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 18th Annual International Conference on Mobile Computing and Networking. 269\u2013280","author":"Yang Zheng","year":"2012","unstructured":"Zheng Yang, Chenshu Wu, and Yunhao Liu. 2012. Locating in fingerprint space: Wireless indoor localization with little human intervention. In Proceedings of the 18th Annual International Conference on Mobile Computing and Networking. 269\u2013280."},{"key":"e_1_3_2_1_54_1","volume-title":"2024 IEEE International Conference on Robotics and Automation (ICRA). IEEE, 5111\u20135118","author":"Yin Mingsheng","year":"2024","unstructured":"Mingsheng Yin, Tao Li, Haozhe Lei, Yaqi Hu, Sundeep Rangan, and Quanyan Zhu. 2024. Zero-shot wireless indoor navigation through physics-informed reinforcement learning. In 2024 IEEE International Conference on Robotics and Automation (ICRA). IEEE, 5111\u20135118."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/1067170.1067193"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"e_1_3_2_1_57_1","volume-title":"Proceedings of the 22nd Annual International Conference on Mobile Computing and Networking. 230\u2013242","author":"Zhang Chi","year":"2016","unstructured":"Chi Zhang and Xinyu Zhang. 2016. LiTell: Robust indoor localization using un-modified light fixtures. In Proceedings of the 22nd Annual International Conference on Mobile Computing and Networking. 230\u2013242."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3081333.3081363"}],"event":{"name":"MobiSys '26: 24th Annual International Conference on Mobile Systems, Applications and Services","location":"University of Cambridge Cambridge United Kingdom","acronym":"MobiSys '26","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 24th Annual International Conference on Mobile Systems, Applications and Services"],"original-title":[],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T13:02:57Z","timestamp":1780059777000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3745756.3809218"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,20]]},"references-count":58,"alternative-id":["10.1145\/3745756.3809218","10.1145\/3745756"],"URL":"https:\/\/doi.org\/10.1145\/3745756.3809218","relation":{},"subject":[],"published":{"date-parts":[[2026,6,20]]},"assertion":[{"value":"2026-06-20","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}