{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T09:24:03Z","timestamp":1769505843201,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3716553.3750781","type":"proceedings-article","created":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T13:13:16Z","timestamp":1760188396000},"page":"446-455","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Predicting End-of-turn and Backchannel Based on Multimodal Voice Activity Prediction Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-3849-1656","authenticated-orcid":false,"given":"Ryo","family":"Ishii","sequence":"first","affiliation":[{"name":"Human Informatics Laboratories, NTT, Inc., Yokosuka-shi, Kanagawa, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4351-3530","authenticated-orcid":false,"given":"Shinichiro","family":"Eitoku","sequence":"additional","affiliation":[{"name":"Human Informatics Laboratories, NTT, Inc., Yokosuka-shi, Kanagawa, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4694-0462","authenticated-orcid":false,"given":"Ryota","family":"Yokoyama","sequence":"additional","affiliation":[{"name":"Human Informatics Laboratories, NTT, Inc., Yokosuka-shi, Kanagawa, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8219-2943","authenticated-orcid":false,"given":"Junichi","family":"Sawase","sequence":"additional","affiliation":[{"name":"Human Informatics Laboratories, NTT, Inc., Yokosuka-shi, Kanagawa, Japan"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414193"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Patricia\u00a0M Clancy Sandra\u00a0A Thompson Ryoko Suzuki and Hongyin Tao. 1996. The conversational use of reactive tokens in English Japanese and Mandarin. Journal of Pragmatics 26 3 (1996) 355\u2013387.","DOI":"10.1016\/0378-2166(95)00036-4"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511620539"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.268"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10955"},{"key":"e_1_3_3_2_7_2","first-page":"5190","volume-title":"INTERSPEECH","author":"Ekstedt Erik","year":"2022","unstructured":"Erik Ekstedt and Gabriel Skantze. 2022. Voice Activity Projection: Self-supervised learning of turn-taking events. In INTERSPEECH. 5190\u20135194."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","unstructured":"Judith Holler Kobin\u00a0H. Kendrick and Stephen\u00a0C. Levinson. 2018. Processing language in face-to-face conversation: Questons with gestures get faster responses. Psychonomic Bulletin & Review 6 (2018) 25. 10.3758\/s13423-017-1363-z","DOI":"10.3758\/s13423-017-1363-z"},{"key":"e_1_3_3_2_9_2","first-page":"11873","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)","author":"Inoue Koji","year":"2024","unstructured":"Koji Inoue, Bing\u2019er Jiang, Erik Ekstedt, Tatsuya Kawahara, and Gabriel Skantze. 2024. 
Multilingual Turn-taking Prediction Using Voice Activity Projection. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). 11873\u201311883."},{"key":"e_1_3_3_2_10_2","unstructured":"Koji Inoue Divesh Lala Gabriel Skantze and Tatsuya Kawahara. 2025. Yeah Un Oh: Continuous and Real-time Backchannel Prediction with Fine-tuning of Voice Activity Projection. arxiv:https:\/\/arXiv.org\/abs\/2410.15929\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2410.15929"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267866"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178385"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1145\/3125739.3125765"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-42293-571"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","unstructured":"Ryo Ishii Kazuhiro Otsuka Shiro Kumano Ryuichiro Higashinaka and Junji Tomita. 2019. Prediction of Who Will Be Next Speaker and When Using Mouth-Opening Pattern in Multi-Party Conversation. Multimodal Technologies and Interaction 3 4 (2019) 70. 10.3390\/mti3040070","DOI":"10.3390\/mti3040070"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","unstructured":"Ryo Ishii Kazuhiro Otsuka Shiro Kumano and Junji Yamamoto. 2016. Predicting of Who Will Be the Next Speaker and When Using Gaze Behavior in Multiparty Meetings. ACM Transactions on Interactive Intelligent Systems (TiiS) 6 1 (2016) 4. 10.1145\/2757284","DOI":"10.1145\/2757284"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"Ryo Ishii Kazuhiro Otsuka Shiro Kumano and Junji Yamamoto. 2016. Using Respiration to Predict Who Will Speak Next and When in Multiparty Meetings. ACM Transactions on Interactive Intelligent Systems (TiiS) 6 2 (2016) 20. 10.1145\/2946838","DOI":"10.1145\/2946838"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","unstructured":"Ryo Ishii Xutong Ren Michal Muszynski and Louis\u2011Philippe Morency. 2022. Trimodal prediction of speaking and listening willingness to help improve turn-changing modeling. Frontiers in Psychology 13 (2022) 774547. 10.3389\/fpsyg.2022.774547","DOI":"10.3389\/fpsyg.2022.774547"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3383652.3423907"},{"key":"e_1_3_3_2_20_2","first-page":"131","volume-title":"Proceedings of the 21st ACM International Conference on Intelligent Virtual Agents","author":"Ishii Ryo","year":"2021","unstructured":"Ryo Ishii, Xutong Ren, Michal Muszynski, and Louis-Philippe Morency. 2021. Multimodal and Multitask Approach to Listener\u2019s Backchannel Prediction: Can Prediction of Turn-changing and Turn-management Willingness Improve Backchannel Modeling?. In Proceedings of the 21st ACM International Conference on Intelligent Virtual Agents. 131\u2013138."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445449"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.277"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"Kristiina Jokinen Hirohisa Furukawa Masafumi Nishida and Seiichi Yamamoto. 2013. Gaze and turn-taking behavior in casual conversational interactions. ACM Transactions on Interactive Intelligent Systems (TiiS) 3 2 (2013) 12. 
10.1145\/2499474.2499481","DOI":"10.1145\/2499474.2499481"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-04380-249"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-226"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-118"},{"key":"e_1_3_3_2_27_2","unstructured":"Kobayashi Koga Shengzhe Li Akifumi Nakamachi and Toshinori Sato. 2023. LINE DistilBERT Japanese. http:\/\/github.com\/line\/LINE-DistilBERT-Japanese. (2023)."},{"key":"e_1_3_3_2_28_2","first-page":"2658","volume-title":"INTERSPEECH","author":"Kurata Fuma","year":"2023","unstructured":"Fuma Kurata, Mao Saeki, Shinya Fujie, and Yoichi Matsuyama. 2023. Multimodal turn-taking model using visual cues for end-of-utterance prediction in spoken dialogue systems. In INTERSPEECH. 2658\u20132662."},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Meng-Chen Lee and Zhigang Deng. 2024. Online Multimodal End-of-Turn Prediction for Three-party Conversations. 57\u201365.","DOI":"10.1145\/3678957.3685742"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1006"},{"key":"e_1_3_3_2_31_2","unstructured":"Camillo Lugaresi Jiuqiang Tang Hadon Nash Chris McClanahan Esha Uboweja Michael Hays Fan Zhang Chuo-Ling Chang Ming\u00a0Guang Yong Juhyun Lee Wan-Teh Chang Wei Hua Manfred Georg and Matthias Grundmann. 2019. MediaPipe: A Framework for Building Perception Pipelines. arxiv:https:\/\/arXiv.org\/abs\/1906.08172\u00a0[cs.DC] https:\/\/arxiv.org\/abs\/1906.08172"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4614-1785-9_11"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5024"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-85483-8_18"},{"key":"e_1_3_3_2_35_2","first-page":"2638","volume-title":"INTERSPEECH","author":"Muromachi Toshiki","year":"2023","unstructured":"Toshiki Muromachi and Yoshinobu Kano. 2023. Estimation of Listening Response Timing by Generative Model and Parameter Control of Response Substantialness Using Dynamic-Prompt-Tune. In INTERSPEECH. 2638\u20132642."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_2_37_2","first-page":"1689","volume-title":"Conference of the European Chapter of the Association for Computational Linguistics (EACL) Findings","author":"Park Yo-Han","year":"2024","unstructured":"Yo-Han Park, Wencke Liermann, Yong-Seok Choi, and Kong\u00a0Joo Lee. 2024. Improving backchannel prediction leveraging sequential and attentive context awareness. In Conference of the European Chapter of the Association for Computational Linguistics (EACL) Findings. 1689\u20131694."},{"key":"e_1_3_3_2_38_2","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In Proceedings of the 40th International Conference on Machine Learning. 
Article 1182, 27\u00a0pages."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.5555\/1622064.1622066"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"e_1_3_3_2_41_2","first-page":"879","volume-title":"INTERSPEECH","author":"Ruede Robin","year":"2017","unstructured":"Robin Ruede, Markus M\u00fcller, Sebastian St\u00fcker, and Alex Waibel. 2017. Enhancing backchannel prediction using word embeddings. In INTERSPEECH. 879\u2013883."},{"key":"e_1_3_3_2_42_2","first-page":"247","volume-title":"International Workshop on Spoken Dialogue Systems Technology (IWSDS)","author":"Ruede Robin","year":"2017","unstructured":"Robin Ruede, Markus M\u00fcller, Sebastian St\u00fcker, and Alex Waibel. 2017. Yeah, right, uh-huh: A deep learning backchannel predictor. In International Workshop on Spoken Dialogue Systems Technology (IWSDS). 247\u2013258."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023458"},{"key":"e_1_3_3_2_44_2","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1910.01108 (2019)."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-5527"},{"key":"e_1_3_3_2_46_2","unstructured":"Silero Team. 2024. Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD) Number Detector and Language Classifier. https:\/\/github.com\/snakers4\/silero-vad."},{"key":"e_1_3_3_2_47_2","unstructured":"Aaron van\u00a0den Oord Yazhe Li and Oriol Vinyals. 2019. Representation Learning with Contrastive Predictive Coding. arxiv:https:\/\/arXiv.org\/abs\/1807.03748\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1807.03748"},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447196"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Nigel Ward and Wataru Tsukahara. 2000. Prosodic features which cue back-channel responses in English and Japanese. Journal of pragmatics 32 8 (2000) 1177\u20131207.","DOI":"10.1016\/S0378-2166(99)00109-5"}],"event":{"name":"ICMI '25: International Conference on Multimodal Interaction","location":"Canberra Australia","acronym":"ICMI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 27th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3716553.3750781","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T22:25:21Z","timestamp":1769466321000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3716553.3750781"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":48,"alternative-id":["10.1145\/3716553.3750781","10.1145\/3716553"],"URL":"https:\/\/doi.org\/10.1145\/3716553.3750781","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
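The record above is the JSON body returned by the Crossref REST API for this DOI (https://api.crossref.org/works/10.1145/3716553.3750781). The following short Python sketch, untested and assuming only the standard library plus network access, shows how such a record could be fetched and how a few of the fields visible above (title, authors, container-title, pages, publication date) could be read back; the mailto contact address is a placeholder for Crossref's polite pool and is not part of the record itself.

# Minimal sketch (untested): fetch the same Crossref work record and print a
# few of the fields that appear in the JSON above. Only the Python standard
# library is used; the address in the "mailto" parameter is a placeholder.
import json
import urllib.parse
import urllib.request

DOI = "10.1145/3716553.3750781"
url = ("https://api.crossref.org/works/"
       + urllib.parse.quote(DOI, safe="")
       + "?mailto=you@example.org")  # placeholder contact address

with urllib.request.urlopen(url, timeout=30) as resp:
    payload = json.load(resp)

# Top-level envelope, as in the record above.
assert payload["status"] == "ok" and payload["message-type"] == "work"
work = payload["message"]

# "title" and "container-title" are lists in Crossref records, hence [0].
print(work["title"][0])
print(work["container-title"][0])
print(", ".join(a["given"] + " " + a["family"] for a in work["author"]))
print("DOI:", work["DOI"], "| pages:", work["page"],
      "| references:", work["references-count"])

# Publication dates arrive as nested "date-parts", e.g. [[2025, 10, 12]].
year, month, day = work["published"]["date-parts"][0]
print(f"Published online: {year:04d}-{month:02d}-{day:02d}")

Run against the DOI above, this would print the paper title, the ICMI '25 proceedings title, the four authors, the page range 446-455, the reference count of 48, and the 2025-10-12 online publication date recorded in the JSON.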