{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,12]],"date-time":"2026-07-12T06:32:10Z","timestamp":1783837930662,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202139, 61932009"],"award-info":[{"award-number":["62202139, 61932009"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The University Synergy Innovation Program of Anhui Province","award":["GXXT-2022-038"],"award-info":[{"award-number":["GXXT-2022-038"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3688988","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"11397-11403","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["DAT: Dialogue-Aware Transformer with Modality-Group Fusion for Human Engagement Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9446-249X","authenticated-orcid":false,"given":"Jia","family":"Li","sequence":"first","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7261-1677","authenticated-orcid":false,"given":"Yangchen","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4436-6613","authenticated-orcid":false,"given":"Yin","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3350-7609","authenticated-orcid":false,"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5878-0922","authenticated-orcid":false,"given":"Peng","family":"Jia","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0519-147X","authenticated-orcid":false,"given":"Yunbo","family":"Xu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9717-7416","authenticated-orcid":false,"given":"Ziqiang","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5461-3986","authenticated-orcid":false,"given":"Richang","family":"Hong","sequence":"additional","affiliation":[{"name":"School of Computer Science and Information Engineering, Hefei University of Technology, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548363"},{"key":"e_1_3_2_1_2_1","volume-title":"Openface: an open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (WACV)","author":"Baltruvsaitis Tadas","unstructured":"Tadas Baltruvsaitis, Peter Robinson, and Louis-Philippe Morency. 2016. Openface: an open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (WACV). IEEE, 1--10."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2401836.2401846"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136780"},{"key":"e_1_3_2_1_5_1","unstructured":"Ozan Caglayan Lo\u00efc Barrault and Fethi Bougares. 2016. Multimodal Attention for Neural Machine Translation. arxiv: 1609.03976 [cs.CL] https:\/\/arxiv.org\/abs\/1609.03976"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_1_7_1","volume-title":"From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos. arXiv preprint arXiv:2312.05447","author":"Chen Yin","year":"2023","unstructured":"Yin Chen, Jia Li, Shiguang Shan, Meng Wang, and Richang Hong. 2023. From static to dynamic: Adapting landmark-aware image models for facial expression recognition in videos. arXiv preprint arXiv:2312.05447 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00470"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"e_1_3_2_1_10_1","volume-title":"Finding structure in time. Cognitive science","author":"Elman Jeffrey L","year":"1990","unstructured":"Jeffrey L Elman. 1990. Finding structure in time. Cognitive science, Vol. 14, 2 (1990), 179--211."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874246"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10648-019-09514-z"},{"key":"e_1_3_2_1_13_1","volume-title":"Developing an Effective and Automated Patient Engagement Estimator for Telehealth: A Machine Learning Approach. arxiv","author":"Guhan Pooja","year":"2011","unstructured":"Pooja Guhan, Naman Awasthi, Kathryn McDonald, Kristin Bussell, Dinesh Manocha, Gloria Reeves, and Aniket Bera. 2023. Developing an Effective and Automated Patient Engagement Estimator for Telehealth: A Machine Learning Approach. arxiv: 2011.08690 [cs.CV] https:\/\/arxiv.org\/abs\/2011.08690"},{"key":"e_1_3_2_1_14_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation, Vol. 9, 8 (1997), 1735--1780."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-78114-9_19"},{"key":"e_1_3_2_1_16_1","volume-title":"Prediction and localization of student engagement in the wild. In 2018 Digital Image Computing: Techniques and Applications (DICTA)","author":"Kaur Amanjot","unstructured":"Amanjot Kaur, Aamir Mustafa, Love Mehta, and Abhinav Dhall. 2018. Prediction and localization of student engagement in the wild. In 2018 Digital Image Computing: Techniques and Applications (DICTA). IEEE, 1--8."},{"key":"e_1_3_2_1_17_1","unstructured":"Divesh Lala Koji Inoue Pierrick Milhorat and Tatsuya Kawahara. 2017. Detection of social signals for recognizing engagement in human-robot interaction. arxiv: 1709.10257 [cs.HC] https:\/\/arxiv.org\/abs\/1709.10257"},{"key":"e_1_3_2_1_18_1","volume-title":"A concordance correlation coefficient to evaluate reproducibility. Biometrics","author":"Lawrence I","year":"1989","unstructured":"I Lawrence and Kuei Lin. 1989. A concordance correlation coefficient to evaluate reproducibility. Biometrics (1989), 255--268."},{"key":"e_1_3_2_1_19_1","volume-title":"Emotion separation and recognition from a facial expression by generating the poker face with vision transformers. arXiv preprint arXiv:2207.11081","author":"Li Jia","year":"2022","unstructured":"Jia Li, Jiantao Nie, Dan Guo, Richang Hong, and Meng Wang. 2022. Emotion separation and recognition from a facial expression by generating the poker face with vision transformers. arXiv preprint arXiv:2207.11081 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3689004"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3172944.3172969"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479219"},{"key":"e_1_3_2_1_23_1","volume-title":"Albert Ali Salah, and Itir Onal Ertugrul","author":"Ning Mang","year":"2024","unstructured":"Mang Ning, Albert Ali Salah, and Itir Onal Ertugrul. 2024. Representation Learning and Identity Adversarial Training for Facial Behavior Understanding. arXiv preprint arXiv:2407.11243 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-97546-3_53"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1957656.1957781"},{"key":"e_1_3_2_1_27_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. arxiv","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. arxiv: 1908.07490 [cs.CL] https:\/\/arxiv.org\/abs\/1908.07490"},{"key":"e_1_3_2_1_28_1","volume-title":"COLD fusion: Calibrated and ordinal latent distribution fusion for uncertainty-aware multimodal emotion recognition","author":"Tellamekala Mani Kumar","year":"2023","unstructured":"Mani Kumar Tellamekala, Shahin Amiriparian, Bj\u00f6rn W Schuller, Elisabeth Andr\u00e9, Timo Giesbrecht, and Michel Valstar. 2023. COLD fusion: Calibrated and ordinal latent distribution fusion for uncertainty-aware multimodal emotion recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612873"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612852"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612852"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Amir Zadeh Minghai Chen Soujanya Poria Erik Cambria and Louis-Philippe Morency. 2017. Tensor Fusion Network for Multimodal Sentiment Analysis. arxiv: 1707.07250 [cs.CL] https:\/\/arxiv.org\/abs\/1707.07250","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77772-2_36"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688988","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3688988","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:29Z","timestamp":1750295849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3688988"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":35,"alternative-id":["10.1145\/3664647.3688988","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3688988","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}