{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T12:48:52Z","timestamp":1776084532074,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3713631","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T03:17:03Z","timestamp":1745464623000},"page":"1-17","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SpeechCompass: Enhancing Mobile Captioning with Diarization and Directional Guidance via Multi-Microphone Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2826-1099","authenticated-orcid":false,"given":"Artem","family":"Dementyev","sequence":"first","affiliation":[{"name":"Google Research, Mountain View, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5316-8593","authenticated-orcid":false,"given":"Dimitri","family":"Kanevsky","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2460-6456","authenticated-orcid":false,"given":"Samuel","family":"Yang","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4544-8304","authenticated-orcid":false,"given":"Mathieu","family":"Parvaix","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0302-1602","authenticated-orcid":false,"given":"Chiong","family":"Lai","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, California, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2578-3403","authenticated-orcid":false,"given":"Alex","family":"Olwal","sequence":"additional","affiliation":[{"name":"Google Research, Mountain View, California, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/GHTC.2013.6713692"},{"key":"e_1_3_3_3_3_2","unstructured":"Android. 2022. Introducing Live Transcribe. https:\/\/www.android.com\/accessibility\/live-transcribe\/. Accessed 2022-03-26."},{"key":"e_1_3_3_3_4_2","unstructured":"Android. 2022. SpeechRecognizer API Documentation). https:\/\/developer.android.com\/reference\/android\/speech\/SpeechRecognizer. Accessed 2022-10-25."},{"key":"e_1_3_3_3_5_2","doi-asserted-by":"crossref","unstructured":"Xavier Anguera Chuck Wooters and Javier Hernando. 2007. Acoustic beamforming for speaker diarization of meetings. IEEE Transactions on Audio Speech and Language Processing 15 7 (2007) 2011\u20132022.","DOI":"10.1109\/TASL.2007.902460"},{"key":"e_1_3_3_3_6_2","unstructured":"ARM. 2022. CMSIS DSP Software Library. https:\/\/www.keil.com\/pack\/doc\/CMSIS\/DSP\/html\/index.html. Accessed 2022-05-12."},{"key":"e_1_3_3_3_7_2","unstructured":"Ava. 2022. Ava Captioning Solution. https:\/\/www.ava.me\/. Accessed 2024-12-10."},{"key":"e_1_3_3_3_8_2","unstructured":"Jacob Benesty Jingdong Chen and Yiteng Huang. 2008. Conventional beamforming techniques. Microphone array signal processing (2008) 39\u201365."},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290607.3312921"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3132525.3132541"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","unstructured":"Rachel Boll Shruti Mahajan Jeanne Reis and Erin\u00a0T. Solovey. 2020. Creating Questionnaires That Align with ASL Linguistic Principles and Cultural Practices within the Deaf Community. Article 61 (2020) 4\u00a0pages. 10.1145\/3373625.3418071","DOI":"10.1145\/3373625.3418071"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/2982142.2982171"},{"key":"e_1_3_3_3_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052974"},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3308561.3353772"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","unstructured":"Chao Cai Henglin Pu Peng Wang Zhe Chen and Jun Luo. 2021. We Hear Your PACE: Passive Acoustic Localization of Multiple Walking Persons. Proc. ACM Interact. Mob. Wearable Ubiquitous Technol. 5 2 Article 55 (jun 2021) 24\u00a0pages. 10.1145\/3463510","DOI":"10.1145\/3463510"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642258"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474772"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2007.366631"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"crossref","unstructured":"Ariel Ephrat Inbar Mosseri Oran Lang Tali Dekel Kevin Wilson Avinatan Hassidim William\u00a0T Freeman and Michael Rubinstein. 2018. Looking to listen at the cocktail party: A speaker-independent audio-visual model for speech separation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.03619 (2018).","DOI":"10.1145\/3197517.3201357"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178061"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3290605.3300276"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"crossref","unstructured":"Israel\u00a0D Gebru Sileye Ba Xiaofei Li and Radu Horaud. 2017. Audio-visual speaker diarization based on spatiotemporal bayesian fusion. IEEE transactions on pattern analysis and machine intelligence 40 5 (2017) 1086\u20131099.","DOI":"10.1109\/TPAMI.2017.2648793"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3132525.3134781"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","unstructured":"Steven\u00a0M. Goodman Ping Liu Dhruv Jain Emma\u00a0J. McDonnell Jon\u00a0E. Froehlich and Leah Findlater. 2021. Toward User-Driven Sound Recognizer Personalization with People Who Are d\/Deaf or Hard of Hearing. Proc. ACM Interact. Mob. Wearable Ubiquitous Technol. 5 2 Article 63 (jun 2021) 23\u00a0pages. 10.1145\/3463501","DOI":"10.1145\/3463501"},{"key":"e_1_3_3_3_25_2","unstructured":"Google. 2018. Google Surveys Methodology. http:\/\/services.google.com\/fh\/files\/misc\/white_paper_how_google_surveys_works.pdf. Accessed 2022-03-22."},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"crossref","unstructured":"Beth\u00a0G Greene David\u00a0B Pisoni and Thomas\u00a0D Carrell. 1984. Recognition of speech spectrograms. The Journal of the Acoustical Society of America 76 1 (1984) 32\u201343.","DOI":"10.1121\/1.391035"},{"key":"e_1_3_3_3_27_2","doi-asserted-by":"crossref","unstructured":"Fran\u00e7ois Grondin and Fran\u00e7ois Michaud. 2019. Lightweight and optimized sound source localization and tracking methods for open and closed microphone array configurations. Robotics and Autonomous Systems 113 (2019) 63\u201380.","DOI":"10.1016\/j.robot.2019.01.002"},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373625.3418031"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471664"},{"key":"e_1_3_3_3_30_2","unstructured":"Pulsar Instruments. 2022. Decibel chart \u2013 decibel levels of common sounds. https:\/\/pulsarinstruments.com\/news\/decibel-chart-noise-level. Accessed 2022-07-27."},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"crossref","unstructured":"Yusuf Isik Jonathan\u00a0Le Roux Zhuo Chen Shinji Watanabe and John\u00a0R Hershey. 2016. Single-channel multi-speaker separation using deep clustering. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1607.02173 (2016).","DOI":"10.21437\/Interspeech.2016-1176"},{"key":"e_1_3_3_3_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702393"},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702393"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376758"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446297"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/SMC.2013.608"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/2973750.2985625"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"crossref","unstructured":"Charles Knapp and Glifford Carter. 1976. The generalized correlation method for estimation of time delay. IEEE transactions on acoustics speech and signal processing 24 4 (1976) 320\u2013327.","DOI":"10.1109\/TASSP.1976.1162830"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/2700648.2809843"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"publisher","DOI":"10.23919\/SOFTCOM.2017.8115577"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/HSCMA.2008.4538695"},{"key":"e_1_3_3_3_42_2","doi-asserted-by":"crossref","unstructured":"Bo Li Tara\u00a0N Sainath Ron\u00a0J Weiss Kevin\u00a0W Wilson and Michiel Bacchiani. 2016. Neural network adaptive beamforming for robust multichannel speech recognition. (2016).","DOI":"10.21437\/Interspeech.2016-173"},{"key":"e_1_3_3_3_43_2","unstructured":"LibriVox. 2022. Alice\u2019s Adventures in Wonderland by Lewis Carroll (Version 2). https:\/\/librivox.org\/alices-adventures-in-wonderland-by-lewis-carroll-4\/. Accessed 2022-07-12."},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"crossref","first-page":"4332","DOI":"10.1109\/IROS.2010.5650170","volume-title":"2010 IEEE\/RSJ International Conference on Intelligent Robots and Systems","author":"Liu Hong","year":"2010","unstructured":"Hong Liu and Miao Shen. 2010. Continuous sound source localization based on microphone array for mobile robots. In 2010 IEEE\/RSJ International Conference on Intelligent Robots and Systems. IEEE, 4332\u20134339."},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1017\/9781139051699"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"crossref","unstructured":"James\u00a0C Makous and John\u00a0C Middlebrooks. 1990. Two-dimensional sound localization by human listeners. The journal of the Acoustical Society of America 87 5 (1990) 2188\u20132200.","DOI":"10.1121\/1.399186"},{"key":"e_1_3_3_3_47_2","unstructured":"Microsoft. 2022. Translator. https:\/\/translator.microsoft.com\/. Accessed 2022-03-26."},{"key":"e_1_3_3_3_48_2","unstructured":"MiniDSP. 2022. USB Mic array. https:\/\/www.minidsp.com\/products\/usb-audio-interface\/uma-8-16-usb-mic-array. Accessed 2022-05-15."},{"key":"e_1_3_3_3_49_2","unstructured":"Kai Morich. 2022. usb-serial-for-android). https:\/\/github.com\/mik3y\/usb-serial-for-android. Accessed 2022-11-09."},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/AFRCON.2015.7331970"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415817"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"crossref","unstructured":"Tae\u00a0Jin Park Naoyuki Kanda Dimitrios Dimitriadis Kyu\u00a0J Han Shinji Watanabe and Shrikanth Narayanan. 2022. A review of speaker diarization: Recent advances with deep learning. Computer Speech & Language 72 (2022) 101317.","DOI":"10.1016\/j.csl.2021.101317"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/345910.345917"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/ROBOT.2009.5152861"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.1145\/3334480.3375039"},{"key":"e_1_3_3_3_57_2","unstructured":"Deep Sleep. 2022. Rain Sound. https:\/\/www.youtube.com\/watch?v=13EL6Mgeocc&t=3448s. Accessed 2022-07-12."},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683760"},{"key":"e_1_3_3_3_59_2","unstructured":"Speaksee. 2022. Speaksee microphone kit. https:\/\/speak-see.com\/. Accessed 2022-10-25."},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446934"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","unstructured":"Hassan Taherian and DeLiang Wang. 2024. Multi-Channel Conversational Speaker Separation via Neural Diarization. IEEE\/ACM Transactions on Audio Speech and Language Processing 32 (2024) 2467\u20132476. 10.1109\/TASLP.2024.3393726","DOI":"10.1109\/TASLP.2024.3393726"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2007.4425280"},{"key":"e_1_3_3_3_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-478350-8.50007-6"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"crossref","unstructured":"Yuteng Xiao Jihang Yin Honggang Qi Hongsheng Yin and Gang Hua. 2017. MVDR algorithm based on estimated diagonal loading for beamforming. Mathematical Problems in Engineering 2017 (2017).","DOI":"10.1155\/2017\/7904356"},{"key":"e_1_3_3_3_66_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446587"},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413803"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713631","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3713631","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T04:41:49Z","timestamp":1751604109000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3713631"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":66,"alternative-id":["10.1145\/3706598.3713631","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3713631","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}