{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T09:06:14Z","timestamp":1769504774200,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,10,12]],"date-time":"2026-10-12T00:00:00Z","timestamp":1791763200000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["IIS 2005430"],"award-info":[{"award-number":["IIS 2005430"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3716553.3750756","type":"proceedings-article","created":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T13:13:16Z","timestamp":1760188396000},"page":"265-274","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Learning Multimodal Motion Cues for Online End-of-Turn Prediction in Multi-Party Dialogue"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9726-1153","authenticated-orcid":false,"given":"Meng-Chen","family":"Lee","sequence":"first","affiliation":[{"name":"Department of Computer Science, University of Houston, Houston, Texas, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2571-5865","authenticated-orcid":false,"given":"Zhigang","family":"Deng","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Houston, Houston, Texas, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_3_1_2_2","first-page":"2","volume-title":"Proc. NAACL workshop on adaptation in dialogue systems","author":"Bell Linda","year":"2001","unstructured":"Linda Bell, Johan Boye, and Joakim Gustafson. 2001. Real-time handling of fragmented utterances. In Proc. NAACL workshop on adaptation in dialogue systems. 2\u20138."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Atef Ben-Youssef Giovanna Varni Slim Essid and Chlo\u00e9 Clavel. 2019. On-the-fly detection of user engagement decrease in spontaneous human\u2013robot interaction using recurrent and deep neural networks. International Journal of Social Robotics 11 5 (2019) 815\u2013828.","DOI":"10.1007\/s12369-019-00591-2"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR50410.2021.00037"},{"key":"e_1_3_3_1_5_2","unstructured":"Paul Boersma and Vincent Van\u00a0Heuven. 2001. Speak and unSpeak with PRAAT. Glot International 5 9\/10 (2001) 341\u2013347."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/2070481.2070507"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Geert Br\u00f4ne Bert Oben Annelies Jehoul Jelena Vranjes and Kurt Feyaerts. 2017. Eye gaze and viewpoint in multimodal interaction management. Cognitive Linguistics 28 3 (2017) 449\u2013483.","DOI":"10.1515\/cog-2016-0119"},{"key":"e_1_3_3_1_8_2","unstructured":"Junyoung Chung Caglar Gulcehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.3555 (2014)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/1647314.1647332"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Ziedune Degutyte and Arlene Astell. 2021. The role of eye gaze in regulating turn taking in conversations: a systematized review of methods and findings. Frontiers in Psychology 12 (2021) 616471.","DOI":"10.3389\/fpsyg.2021.616471"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"crossref","unstructured":"Jan Deriu Alvaro Rodrigo Arantxa Otegi Guillermo Echegoyen Sophie Rosset Eneko Agirre and Mark Cieliebak. 2021. Survey on evaluation methods for dialogue systems. Artificial Intelligence Review 54 (2021) 755\u2013810.","DOI":"10.1007\/s10462-020-09866-x"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025644"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Nia\u00a0MM Dowell Tristan\u00a0M Nixon and Arthur\u00a0C Graesser. 2019. Group communication analysis: A computational linguistics approach for detecting sociocognitive roles in multiparty interactions. Behavior research methods 51 (2019) 1007\u20131041.","DOI":"10.3758\/s13428-018-1102-z"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Cecilia\u00a0E Ford and Sandra\u00a0A Thompson. 1996. Interactional units in conversation: Syntactic intonational and pragmatic resources for the management of turns. Studies in interactional sociolinguistics 13 (1996) 134\u2013184.","DOI":"10.1017\/CBO9780511620874.003"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"crossref","unstructured":"Agust\u00edn Gravano and Julia Hirschberg. 2011. Turn-taking cues in task-oriented dialogue. Computer Speech & Language 25 3 (2011) 601\u2013634.","DOI":"10.1016\/j.csl.2010.10.003"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/2663204.2663277"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"crossref","unstructured":"Kohei Hara Koji Inoue Katsuya Takanashi and Tatsuya Kawahara. 2018. Prediction of turn-taking using multitask learning with prediction of backchannels and fillers. Listener 162 (2018) 364.","DOI":"10.21437\/Interspeech.2018-1442"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Simon Ho Tom Foulsham and Alan Kingstone. 2015. Speaking and listening with the eyes: Gaze signaling during dyadic interactions. PloS one 10 8 (2015) e0136905.","DOI":"10.1371\/journal.pone.0136905"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2820755"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178385"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1145\/2522848.2522856"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Aobo Jin Qixin Deng and Zhigang Deng. 2020. A Live Speech-Driven Avatar-Mediated Three-Party Telepresence System: Design and Evaluation. PRESENCE: Virtual and Augmented Reality 29 (2020) 113\u2013139.","DOI":"10.1162\/pres_a_00358"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3561975.3562954"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Aobo Jin Qixin Deng Yuting Zhang and Zhigang Deng. 2019. A deep learning-based model for head and eye motion generation in three-party conversations. Proceedings of the ACM on Computer Graphics and Interactive Techniques 2 2 (2019) 1\u201319.","DOI":"10.1145\/3340250"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"crossref","unstructured":"Kristiina Jokinen Hirohisa Furukawa Masafumi Nishida and Seiichi Yamamoto. 2013. Gaze and turn-taking behavior in casual conversational interactions. ACM Transactions on Interactive Intelligent Systems (TiiS) 3 2 (2013) 1\u201330.","DOI":"10.1145\/2499474.2499481"},{"key":"e_1_3_3_1_27_2","first-page":"727","volume-title":"Interspeech","author":"Kawahara Tatsuya","year":"2012","unstructured":"Tatsuya Kawahara, Takuma Iwatate, and Katsuya Takanashi. 2012. Prediction of Turn-Taking by Combining Prosodic and Eye-Gaze Information in Poster Conversations.. In Interspeech. 727\u2013730."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/AFGR.2000.840610"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Hanae Koiso Yasuo Horiuchi Syun Tutiya Akira Ichikawa and Yasuharu Den. 1998. An analysis of turn-taking and backchannels based on prosodic and syntactic features in Japanese map task dialogs. Language and speech 41 3-4 (1998) 295\u2013321.","DOI":"10.1177\/002383099804100404"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3340555.3353727"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3678957.3685742"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652988.3673915"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3614139"},{"key":"e_1_3_3_1_34_2","unstructured":"Xian Liu Qianyi Wu Hang Zhou Yuanqi Du Wayne Wu Dahua Lin and Ziwei Liu. 2022. Audio-driven co-speech gesture video generation. Advances in Neural Information Processing Systems 35 21386\u201321399."},{"key":"e_1_3_3_1_35_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Raveesh Meena Gabriel Skantze and Joakim Gustafson. 2014. Data-driven models for timing feedback responses in a Map Task dialogue system. Computer Speech & Language 28 4 (2014) 903\u2013922.","DOI":"10.1016\/j.csl.2014.02.002"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-25554-5_46"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Hannah Pelikan and Emily Hofstetter. 2023. Managing delays in human-robot interaction. ACM Transactions on Computer-Human Interaction 30 4 (2023) 1\u201342.","DOI":"10.1145\/3569890"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3242997"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-623550-0.50008-2"},{"key":"e_1_3_3_1_41_2","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1910.01108 (2019)."},{"key":"e_1_3_3_1_42_2","first-page":"861","volume-title":"Interspeech","author":"Sato Ryo","year":"2002","unstructured":"Ryo Sato, Ryuichiro Higashinaka, Masafumi Tamoto, Mikio Nakano, and Kiyoaki Aikawa. 2002. Learning decision trees to determine turn-taking by spoken dialogue systems.. In Interspeech. 861\u2013864."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"David Schlangen. 2006. From reaction to prediction: Experiments with computational models of turn-taking. Proceedings of Interspeech (2006).","DOI":"10.21437\/Interspeech.2006-550"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Candace\u00a0L Sidner Christopher Lee Cory\u00a0D Kidd Neal Lesh and Charles Rich. 2005. Explorations in engagement for humans and robots. Artificial Intelligence 166 1-2 (2005) 140\u2013164.","DOI":"10.1016\/j.artint.2005.03.005"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-5527"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"crossref","unstructured":"Gabriel Skantze. 2021. Turn-taking in conversational systems and human-robot interaction: a review. Computer Speech & Language 67 (2021) 101178.","DOI":"10.1016\/j.csl.2020.101178"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2820749"},{"key":"e_1_3_3_1_48_2","unstructured":"Aaron Van Den\u00a0Oord Oriol Vinyals et\u00a0al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1017\/9781316848265"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981117"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01415"}],"event":{"name":"ICMI '25: International Conference on Multimodal Interaction","location":"Canberra Australia","acronym":"ICMI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 27th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3716553.3750756","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3716553.3750756","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T22:27:40Z","timestamp":1769466460000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3716553.3750756"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":51,"alternative-id":["10.1145\/3716553.3750756","10.1145\/3716553"],"URL":"https:\/\/doi.org\/10.1145\/3716553.3750756","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}