{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T22:29:23Z","timestamp":1776119363173,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"vor","delay-in-days":366,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100003382","name":"Core Research for Evolutional Science and Technology","doi-asserted-by":"publisher","award":["JPMJCR19A"],"award-info":[{"award-number":["JPMJCR19A"]}],"id":[{"id":"10.13039\/501100003382","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,4]]},"DOI":"10.1145\/3623809.3623837","type":"proceedings-article","created":{"date-parts":[[2023,12,3]],"date-time":"2023-12-03T12:50:35Z","timestamp":1701607835000},"page":"13-21","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Multimodal Voice Activity Prediction: Turn-taking Events Detection in Expert-Novice Conversation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2807-2916","authenticated-orcid":false,"given":"Kazuyo","family":"Onishi","sequence":"first","affiliation":[{"name":"Nara Institute of Science and Technology, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0548-6252","authenticated-orcid":false,"given":"Hiroki","family":"Tanaka","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6956-3803","authenticated-orcid":false,"given":"Satoshi","family":"Nakamura","sequence":"additional","affiliation":[{"name":"Nara Institute of Science and Technology, Japan"}]}],"member":"320","published-online":{"date-parts":[[2023,12,4]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00019"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136780"},{"key":"e_1_3_2_1_3_1","volume-title":"OpenPose: realtime multi-person 2D pose estimation using Part Affinity Fields","author":"Cao Zhe","year":"2021","unstructured":"Zhe Cao, Gines Hidalgo, Tomas Simon, Shih-En Wei, and Yaser Sheikh. 2021. OpenPose: realtime multi-person 2D pose estimation using Part Affinity Fields. IEEE transactions on pattern analysis and machine intelligence 43, 1 (2021), 172\u2013186."},{"key":"e_1_3_2_1_4_1","volume-title":"Using language","author":"Clark H","unstructured":"Herbert\u00a0H Clark. 1996. Using language. Cambridge university press."},{"key":"e_1_3_2_1_5_1","volume-title":"\u00a0J. Heylen","author":"de Kok Iwan","year":"2009","unstructured":"Iwan de Kok and Dirk K.\u00a0J. Heylen. 2009. Multimodal end-of-turn prediction in multi-party meetings. In ICMI-MLMI \u201909."},{"key":"e_1_3_2_1_6_1","volume-title":"Some signals and rules for taking speaking turns in conversations.Journal of personality and social psychology 23, 2","author":"Duncan Starkey","year":"1972","unstructured":"Starkey Duncan. 1972. Some signals and rules for taking speaking turns in conversations.Journal of personality and social psychology 23, 2 (1972), 283."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1159\/000090099"},{"key":"e_1_3_2_1_8_1","unstructured":"Erik Ekstedt. 2022. Continuous Conversational SSL. https:\/\/github.com\/erikekstedt\/conv_ssl."},{"key":"e_1_3_2_1_9_1","volume-title":"VAP: Voice Activity Projection. https:\/\/github.com\/ErikEkstedt\/vap_turn_taking.","author":"Ekstedt Erik","year":"2022","unstructured":"Erik Ekstedt. 2022. VAP: Voice Activity Projection. https:\/\/github.com\/ErikEkstedt\/vap_turn_taking."},{"key":"e_1_3_2_1_10_1","volume-title":"Turngpt: a transformer-based language model for predicting turn-taking in spoken dialog. arXiv preprint arXiv:2010.10874","author":"Ekstedt Erik","year":"2020","unstructured":"Erik Ekstedt and Gabriel Skantze. 2020. Turngpt: a transformer-based language model for predicting turn-taking in spoken dialog. arXiv preprint arXiv:2010.10874 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"How Much Does Prosody Help Turn-taking? Investigations using Voice Activity Projection Models. arXiv preprint arXiv:2209.05161","author":"Ekstedt Erik","year":"2022","unstructured":"Erik Ekstedt and Gabriel Skantze. 2022. How Much Does Prosody Help Turn-taking? Investigations using Voice Activity Projection Models. arXiv preprint arXiv:2209.05161 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Voice activity projection: Self-supervised learning of turn-taking events. arXiv preprint arXiv:2205.09812","author":"Ekstedt Erik","year":"2022","unstructured":"Erik Ekstedt and Gabriel Skantze. 2022. Voice activity projection: Self-supervised learning of turn-taking events. arXiv preprint arXiv:2205.09812 (2022)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-565"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-874"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2"},{"key":"e_1_3_2_1_16_1","unstructured":"Nishitha Guntakandla and Rodney\u00a0D. Nielsen. 2015. Modelling Turn-Taking in Human Conversations. In AAAI Spring Symposia."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Kohei Hara Koji Inoue Katsuya Takanashi and Tatsuya Kawahara. 2019. Turn-Taking Prediction Based on Detection of Transition Relevance Place.. In INTERSPEECH. 4170\u20134174.","DOI":"10.21437\/Interspeech.2019-1537"},{"key":"e_1_3_2_1_18_1","volume-title":"Processing language in face-to-face conversation: Questions with gestures get faster responses. Psychonomic bulletin & review 25","author":"Holler Judith","year":"2018","unstructured":"Judith Holler, Kobin\u00a0H Kendrick, and Stephen\u00a0C Levinson. 2018. Processing language in face-to-face conversation: Questions with gestures get faster responses. Psychonomic bulletin & review 25 (2018), 1900\u20131908."},{"key":"e_1_3_2_1_19_1","volume-title":"Speech driven backchannel generation using deep Q-network for enhancing engagement in human-robot interaction. arXiv preprint arXiv:1908.01618","author":"Hussain Nusrah","year":"2019","unstructured":"Nusrah Hussain, Engin Erzin, T\u00a0Metin Sezgin, and Yucel Yemez. 2019. Speech driven backchannel generation using deep Q-network for enhancing engagement in human-robot interaction. arXiv preprint arXiv:1908.01618 (2019)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2522848.2522890"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478360"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2022.774547"},{"key":"e_1_3_2_1_23_1","unstructured":"Toshihiko Itoh Norihide Kitaoka and Ryota Nishimura. 2009. Subjective experiments on influence of response timing in spoken dialogues. In Interspeech."},{"key":"e_1_3_2_1_24_1","volume-title":"Some functions of gaze-direction in social interaction. Acta psychologica 26","author":"Kendon Adam","year":"1967","unstructured":"Adam Kendon. 1967. Some functions of gaze-direction in social interaction. Acta psychologica 26 (1967), 22\u201363."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1098\/rstb.2021.0473"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340555.3353727"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5494991"},{"key":"e_1_3_2_1_28_1","volume-title":"Timing in turn-taking and its implications for processing models of language. Frontiers in psychology 6","author":"Levinson C","year":"2015","unstructured":"Stephen\u00a0C Levinson and Francisco Torreira. 2015. Timing in turn-taking and its implications for processing models of language. Frontiers in psychology 6 (2015), 731."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Tomer Meshorer and Peter\u00a0A. Heeman. 2016. Using Past Speaker Behavior to Better Predict Turn Transitions. In Interspeech.","DOI":"10.21437\/Interspeech.2016-1409"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-85483-8_18"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-20916-6_31"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947688"},{"key":"e_1_3_2_1_33_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van\u00a0den Oord Aaron","year":"2018","unstructured":"Aaron van\u00a0den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_34_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Antoine Raux Dan Bohus Brian Langner Alan\u00a0W. Black and Maxine Esk\u00e9nazi. 2006. Doing research on a deployed spoken dialogue system: one year of let\u2019s go! experience. In Interspeech.","DOI":"10.21437\/Interspeech.2006-17"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3242997"},{"key":"e_1_3_2_1_37_1","volume-title":"Advanced social interaction with agents: 8th international workshop on spoken dialog systems","author":"Ruede Robin","unstructured":"Robin Ruede, Markus M\u00fcller, Sebastian St\u00fcker, and Alex Waibel. 2019. Yeah, right, uh-huh: a deep learning backchannel predictor. In Advanced social interaction with agents: 8th international workshop on spoken dialog systems. Springer, 247\u2013258."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-746"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023458"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the SIGDIAL 2013 Conference. 384\u2013393","author":"Selfridge Ethan","year":"2013","unstructured":"Ethan Selfridge, Iker Arizmendi, Peter\u00a0A Heeman, and Jason\u00a0D Williams. 2013. Continuously predicting and processing barge-in during a live spoken dialogue task. In Proceedings of the SIGDIAL 2013 Conference. 384\u2013393."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1075\/gest.12.2.03sik"},{"key":"e_1_3_2_1_42_1","unstructured":"Gabriel Skantze. 2012. A Testbed for Examining the Timing of Feedback using a Map Task."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-5527"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101178"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.21437\/Eurospeech.2003-258"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15892-6_48"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Bekir\u00a0Berker T\u00fcrker Zana Bu\u00e7inca Engin Erzin Y\u00fccel Yemez and T\u00a0Metin Sezgin. 2017. Analysis of Engagement and User Experience with a Laughter Responsive Social Robot.. In Interspeech. 844\u2013848.","DOI":"10.21437\/Interspeech.2017-1395"},{"key":"e_1_3_2_1_48_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639673"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Nigel\u00a0G. Ward Anais\u00a0G. Rivera Karen Ward and David\u00a0G. Novick. 2005. Root causes of lost time and user stress in a simple dialog system. In Interspeech.","DOI":"10.21437\/Interspeech.2005-458"},{"key":"e_1_3_2_1_51_1","first-page":"831","article-title":"Prosody and hand gesture at turn boundaries in Swedish","volume":"2016","author":"Zellers Margaret","year":"2016","unstructured":"Margaret Zellers, David House, and Simon Alexanderson. 2016. Prosody and hand gesture at turn boundaries in Swedish. Proc. Speech Prosody 2016 (2016), 831\u2013835.","journal-title":"Proc. Speech Prosody"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267880"}],"event":{"name":"HAI '23: International Conference on Human-Agent Interaction","location":"Gothenburg Sweden","acronym":"HAI '23"},"container-title":["International Conference on Human-Agent Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623809.3623837","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3623809.3623837","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3623809.3623837","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3623809.3623837","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,6]],"date-time":"2025-10-06T19:32:38Z","timestamp":1759779158000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623809.3623837"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,4]]},"references-count":52,"alternative-id":["10.1145\/3623809.3623837","10.1145\/3623809"],"URL":"https:\/\/doi.org\/10.1145\/3623809.3623837","relation":{},"subject":[],"published":{"date-parts":[[2023,12,4]]},"assertion":[{"value":"2023-12-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}