{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T08:01:45Z","timestamp":1773734505343,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,10,20]],"date-time":"2020-10-20T00:00:00Z","timestamp":1603152000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,20]]},"DOI":"10.1145\/3379337.3415588","type":"proceedings-article","created":{"date-parts":[[2020,10,16]],"date-time":"2020-10-16T19:01:43Z","timestamp":1602874903000},"page":"1121-1131","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Direction-of-Voice (DoV) Estimation for Intuitive Speech Interaction with Smart Devices Ecosystems"],"prefix":"10.1145","author":[{"given":"Karan","family":"Ahuja","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Andy","family":"Kong","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Mayank","family":"Goel","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Chris","family":"Harrison","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2020,10,20]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Alberto Abad Dusan Macho Carlos Segura Javier Hernando and Climent Nadeu. \"Effect of head orienta-tion on the speaker localization performance in smart-room environment.\" In Ninth European Conference on Speech Communication and Technology. 2005.  Alberto Abad Dusan Macho Carlos Segura Javier Hernando and Climent Nadeu. \"Effect of head orienta-tion on the speaker localization performance in smart-room environment.\" In Ninth European Conference on Speech Communication and Technology. 2005.","DOI":"10.21437\/Interspeech.2005-89"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Alberto Abad Carlos Segura Climent Nadeu and Javier Hernando. \"Audio-based approaches to head ori-entation estimation in a smart-room.\" In Eighth An-nual Conference of the International Speech Commu-nication Association. 2007.  Alberto Abad Carlos Segura Climent Nadeu and Javier Hernando. \"Audio-based approaches to head ori-entation estimation in a smart-room.\" In Eighth An-nual Conference of the International Speech Commu-nication Association. 2007.","DOI":"10.21437\/Interspeech.2007-257"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351229"},{"key":"e_1_3_2_2_4_1","volume-title":"DAFX 04.","author":"Barry Dan","year":"2004"},{"key":"e_1_3_2_2_5_1","first-page":"315","article-title":"Considering the second peak in the GCC function for multi-source TDOA estimation with a microphone array","author":"Bechler Dirk","year":"2003","journal-title":"Pro-ceedings of the International Workshop on Acoustic Echo and Noise Control"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3264901"},{"key":"e_1_3_2_2_7_1","volume-title":"American Institute of Physics","author":"Beranek Leo L."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/800250.807503"},{"key":"e_1_3_2_2_9_1","volume-title":"Proc. IEEE International Con-ference on Acoustics, Speech and Signal Processing","author":"Brandstein M. S.","year":"1997"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967202"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Alessio Brutti Maurizio Omologo and Piergiorgio Svaizer. \"Oriented global coherence field for the esti-mation of the head orientation in smart rooms equipped with distributed microphone arrays.\" In Ninth European Conference on Speech Communica-tion and Technology. 2005.  Alessio Brutti Maurizio Omologo and Piergiorgio Svaizer. \"Oriented global coherence field for the esti-mation of the head orientation in smart rooms equipped with distributed microphone arrays.\" In Ninth European Conference on Speech Communica-tion and Technology. 2005.","DOI":"10.21437\/Interspeech.2005-745"},{"key":"e_1_3_2_2_12_1","volume-title":"IEEE, 2008","author":"Canton-Ferrer Cristian","year":"2008"},{"key":"e_1_3_2_2_13_1","volume-title":"IEEE, 2019","author":"Cervenak Rastislav","year":"2019"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Soumitro Chakrabarty and Emanu\u00ebl AP Habets. \"Multi-speaker localization using convolutional neural network trained with noise.\" arXiv preprint arXiv:1712.04276 (2017).  Soumitro Chakrabarty and Emanu\u00ebl AP Habets. \"Multi-speaker localization using convolutional neural network trained with noise.\" arXiv preprint arXiv:1712.04276 (2017).","DOI":"10.1109\/WASPAA.2017.8170010"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1296843.1296888"},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the seventh annual ACM\/IEEE international confer-ence on Human-Robot Interaction (HRI '12)","author":"Deleforge Antoine","year":"2012"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-73059-2"},{"key":"e_1_3_2_2_19_1","first-page":"49","volume-title":"\"Challenges and future perspectives in speech-sources direction of arri-val estimation and localization.\" In Direction of arri-val estimation and localization of multi-speech sources","author":"Dey Nilanjan","year":"2018"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/2209797.2210131"},{"key":"e_1_3_2_2_21_1","first-page":"2386","volume-title":"Speech and Signal Processing (ICASSP)","author":"Ferguson Eric L.","year":"2018"},{"key":"e_1_3_2_2_22_1","first-page":"77","volume-title":"\"A speaker diarization system with robust speaker localization and voice activity detection.\" In Contemporary Challenges and Solutions in Applied Artificial Intelligence","author":"Huang Yangyang","year":"2013"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.na.2009.06.089"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2360646"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"crossref","unstructured":"C. H. Knapp and G. C. Carter: 1976 The generalized correlation method for estimation of time delay IEEE Transactions on Acoustics Speech and Signal Pro-cessing ASSP-24(4) 320--327. DOI: https:\/\/doi.org\/10.1109\/TASSP.1976.1162830  C. H. Knapp and G. C. Carter: 1976 The generalized correlation method for estimation of time delay IEEE Transactions on Acoustics Speech and Signal Pro-cessing ASSP-24(4) 320--327. DOI: https:\/\/doi.org\/10.1109\/TASSP.1976.1162830","DOI":"10.1109\/TASSP.1976.1162830"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.5120\/9970-4758"},{"key":"e_1_3_2_2_27_1","volume-title":"IEEE","author":"Kwon Byoungho","year":"2009"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Avram Levi and Harvey Silverman. \"A robust meth-od to extract talker azimuth orientation using a large-aperture microphone array.\" IEEE transactions on au-dio speech and language processing 18 no. 2 (2009):  277--285. DOI: https:\/\/doi.org\/10.1109\/TASL.2009.2025793  Avram Levi and Harvey Silverman. \"A robust meth-od to extract talker azimuth orientation using a large-aperture microphone array.\" IEEE transactions on au-dio speech and language processing 18 no. 2 (2009): 277--285. DOI: https:\/\/doi.org\/10.1109\/TASL.2009.2025793","DOI":"10.1109\/TASL.2009.2025793"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00034-013-9578-3"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1017\/S0263574709990865"},{"key":"e_1_3_2_2_31_1","unstructured":"Michael I. Mandel Daniel P. Ellis and Tony Jebara. \"An EM algorithm for localizing multiple sound sources in reverberant environments.\" In Advances in neural information processing systems pp. 953--960. 2007. DOI: https:\/\/doi.org\/10.7916\/D84176FK  Michael I. Mandel Daniel P. Ellis and Tony Jebara. \"An EM algorithm for localizing multiple sound sources in reverberant environments.\" In Advances in neural information processing systems pp. 953--960. 2007. DOI: https:\/\/doi.org\/10.7916\/D84176FK"},{"key":"e_1_3_2_2_32_1","first-page":"2016","article-title":"\"On indexicality, direction of arrival of sound sources, and human-robot interaction","author":"Meza Ivan","year":"2016","journal-title":"Journal of robotics"},{"key":"e_1_3_2_2_33_1","first-page":"1","volume-title":"ITG Symposium","author":"M\u00fcller Menno","year":"2016"},{"key":"e_1_3_2_2_34_1","volume-title":"IEEE, 2009","author":"Nakajima Hirofumi","year":"2009"},{"key":"e_1_3_2_2_35_1","volume-title":"IEEE, 2011","author":"Nakamura Keisuke","year":"2011"},{"key":"e_1_3_2_2_36_1","volume-title":"IEEE, 2009","author":"Nakano Alberto Yoshihiro","year":"2009"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3025171.3025202"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1452392.1452446"},{"issue":"3","key":"e_1_3_2_2_39_1","first-page":"270","article-title":"Performance analysis of GCC-PHAT-based sound source localization for intelligent robots","volume":"2","author":"Park Beom-Chul","year":"2007","journal-title":"Journal of Korea Robotics Society"},{"key":"e_1_3_2_2_40_1","first-page":"96","volume-title":"Speech and Signal Processing (ICASSP)","author":"Pavlidi Despoina","year":"2016"},{"key":"e_1_3_2_2_41_1","volume-title":"Designing voice user interfaces: princi-ples of conversational experiences. \" O'Reilly Media","author":"Pearl Cathy","year":"2016"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174214"},{"issue":"5","key":"e_1_3_2_2_43_1","article-title":"A survey on: Voice command recognition technique","volume":"3","author":"Prabhakar Om Prakash","year":"2013","journal-title":"In-ternational Journal of Advanced Research in Comput-er Science and Software Engineering"},{"key":"e_1_3_2_2_44_1","first-page":"10","volume-title":"\"Robotic orientation towards speaker for human-robot interaction.\" In Ibero-American Conference on Artifi-cial Intelligence","author":"Rasc\u00f3n Caleb","year":"2010"},{"key":"e_1_3_2_2_45_1","unstructured":"ReSpeaker. URL: https:\/\/wiki.seeedstudio.com\/ReSpeaker-USB-Mic-Array  ReSpeaker. URL: https:\/\/wiki.seeedstudio.com\/ReSpeaker-USB-Mic-Array"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","unstructured":"Joshua M. Sachar and Harvey F. Silverman. \"A base-line algorithm for estimating talker orientation using acoustical data from a large-aperture microphone ar-ray.\" In 2004 IEEE International Conference on Acoustics Speech and Signal Processing vol. 4 pp. iv-iv. IEEE 2004. DOI: https:\/\/doi.org\/10.1109\/ICASSP.2004.1326764  Joshua M. Sachar and Harvey F. Silverman. \"A base-line algorithm for estimating talker orientation using acoustical data from a large-aperture microphone ar-ray.\" In 2004 IEEE International Conference on Acoustics Speech and Signal Processing vol. 4 pp. iv-iv. IEEE 2004. DOI: https:\/\/doi.org\/10.1109\/ICASSP.2004.1326764","DOI":"10.1109\/ICASSP.2004.1326764"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2015.2418571"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Carlos Segura Alberto Abad Javier Hernando and Climent Nadeu. \"Speaker orientation estimation based on hybridation of GCC-PHAT and HLBR.\" In Ninth Annual Conference of the International Speech Com-munication Association. 2008.  Carlos Segura Alberto Abad Javier Hernando and Climent Nadeu. \"Speaker orientation estimation based on hybridation of GCC-PHAT and HLBR.\" In Ninth Annual Conference of the International Speech Com-munication Association. 2008.","DOI":"10.21437\/Interspeech.2008-387"},{"key":"e_1_3_2_2_49_1","first-page":"1","article-title":"GCC-PHAT based head orientation estimation","author":"Segura Carlos","year":"2012","journal-title":"13th Annual Conference of International Speech Communication Association"},{"key":"e_1_3_2_2_50_1","first-page":"II-681","volume-title":"Speech and Signal Processing-ICASSP'07","volume":"2","author":"Segura Carlos"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2007.914342"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Hannu Soronen Markku Turunen and Jaakko Hakulinen. \"Voice commands in home environment-a consumer survey.\" In Ninth Annual Conference of the International Speech Communication Association. 2008.  Hannu Soronen Markku Turunen and Jaakko Hakulinen. \"Voice commands in home environment-a consumer survey.\" In Ninth Annual Conference of the International Speech Communication Association. 2008.","DOI":"10.21437\/Interspeech.2008-539"},{"key":"e_1_3_2_2_53_1","first-page":"3081","volume-title":"Speech, and Signal Processing. Proceed-ings. ICASSP99 (Cat. No. 99CH36258)","author":"Strobel Norbert","year":"1999"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Ryoichi Takashima Tetsuya Takiguchi and Yasuo Ariki. \"Estimation of talker's head orientation based on discrimination of the shape of cross-power spec-trum phase coefficients.\" In Thirteenth Annual Con-ference of the International Speech Communication Association. 2012.  Ryoichi Takashima Tetsuya Takiguchi and Yasuo Ariki. \"Estimation of talker's head orientation based on discrimination of the shape of cross-power spec-trum phase coefficients.\" In Thirteenth Annual Con-ference of the International Speech Communication Association. 2012.","DOI":"10.21437\/Interspeech.2012-403"},{"key":"e_1_3_2_2_55_1","volume-title":"IEEE, 2016","author":"Takeda Ryu","year":"2016"},{"key":"e_1_3_2_2_56_1","volume-title":"Proceedings of the 2020 ACM\/IEEE Inter-national Conference on Human-Robot Interaction (HRI ?20)","author":"Mutlu Bilge","year":"2020"},{"key":"e_1_3_2_2_57_1","unstructured":"Tobii Pro Glasses 2. URL: https:\/\/www.tobiipro.com\/product-listing\/tobii-pro-glasses-2\/  Tobii Pro Glasses 2. URL: https:\/\/www.tobiipro.com\/product-listing\/tobii-pro-glasses-2\/"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.3390\/s121013781"},{"key":"e_1_3_2_2_59_1","first-page":"187","volume-title":"Speech, and Signal Processing","author":"Wang Hong","year":"1997"},{"key":"e_1_3_2_2_60_1","volume-title":"Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems (CHI '20)","author":"Yang Jackie"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.917406"}],"event":{"name":"UIST '20: The 33rd Annual ACM Symposium on User Interface Software and Technology","location":"Virtual Event USA","acronym":"UIST '20","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 33rd Annual ACM Symposium on User Interface Software and Technology"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3379337.3415588","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3379337.3415588","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:45:05Z","timestamp":1750203905000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3379337.3415588"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,20]]},"references-count":60,"alternative-id":["10.1145\/3379337.3415588","10.1145\/3379337"],"URL":"https:\/\/doi.org\/10.1145\/3379337.3415588","relation":{},"subject":[],"published":{"date-parts":[[2020,10,20]]},"assertion":[{"value":"2020-10-20","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}