{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T10:56:59Z","timestamp":1781521019920,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":16,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T00:00:00Z","timestamp":1781568000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,17]]},"DOI":"10.1145\/3816713.3818220","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T10:34:51Z","timestamp":1781519691000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-Modal Emotion Discrepancy Detection: Detecting When Voice Reveals Emotions That Text Conceals"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2888-7051","authenticated-orcid":false,"given":"Kaung Hset","family":"Hein","sequence":"first","affiliation":[{"name":"School of Information Technology, King Mongkut's University of Technology Thonburi, Bangkok, Bangkok, Thailand"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2384-0462","authenticated-orcid":false,"given":"Jonathan","family":"Chan","sequence":"additional","affiliation":[{"name":"School of Information Technology, King Mongkut's University of Technology Thonburi, Bangkok, Thailand"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,16]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","unstructured":"Steven\u00a0R. Livingstone and Frank\u00a0A. Russo. 2018. The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS). PLOS ONE 13 5 (2018) e0196391. 10.1371\/journal.pone.0196391","DOI":"10.1371\/journal.pone.0196391"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Amir Zadeh Rowan Zellers Eli Pincus and Louis-Philippe Morency. 2016. Multimodal Sentiment Intensity Analysis in Videos. IEEE Intelligent Systems 31 6 (2016) 82\u201388.","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_3_2_4_2","unstructured":"Alexei Baevski Yuhao Zhou Abdelrahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In NeurIPS 2020."},{"key":"e_1_3_3_2_5_2","unstructured":"Jochen Hartmann. 2022. Emotion English DistilROBErta-base. Hugging Face Model Hub. https:\/\/huggingface.co\/j-hartmann\/emotion-english-distilroberta-base"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Brian McFee et al. 2015. librosa: Audio and Music Signal Analysis in Python. In SciPy 2015. 18\u201324.","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_3_2_7_2","unstructured":"Jiachen Luo Huy Phan and Joshua Reiss. 2023. Cross-Modal Fusion Techniques for Utterance-Level Emotion Recognition from Text and Speech. In ICASSP 2023."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Soujanya Poria Erik Cambria Rajiv Bajpai and Amir Hussain. 2017. A Review of Affective Computing: From Unimodal Analysis to Multimodal Fusion. Information Fusion 37 (2017) 98\u2013125.","DOI":"10.1016\/j.inffus.2017.02.003"},{"key":"e_1_3_3_2_9_2","unstructured":"Yufei Wang and Mengyue Wu. 2024. Evaluation of Data Inconsistency for Multi-Modal Sentiment Analysis. arXiv:https:\/\/arXiv.org\/abs\/2406.03004 (2024)."},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"crossref","unstructured":"Martin Klasen Ren\u00e9 Chen and Klaus Mathiak. 2012. Multisensory Emotions: Perception Combination and Underlying Neural Processes. Reviews in the Neurosciences 23 4 (2012) 381\u2013392.","DOI":"10.1515\/revneuro-2012-0040"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","unstructured":"Thanyathorn Thanapattheerakul Katherine Mao Jacqueline Amoranto and Jonathan\u00a0H. Chan. 2018. Emotion in a Century: A Review of Emotion Recognition. In IAIT 2018. ACM 1\u20138. 10.1145\/3291280.3291788","DOI":"10.1145\/3291280.3291788"},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","unstructured":"Kaung\u00a0Myat Kyaw and Jonathan\u00a0Hoyin Chan. 2024. A Framework for Synthetic Audio Conversations Generation using Large Language Models. In IEEE\/WIC WI-IAT 2024. 355\u2013359. 10.1109\/WI-IAT62293.2024.00056","DOI":"10.1109\/WI-IAT62293.2024.00056"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Johannes Wagner et al. 2023. Dawn of the Transformer Era in Speech Emotion Recognition: Closing the Valence Gap. IEEE TPAMI 45 9 (2023) 10745\u201310759.","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"M.P.A. Ramaswamy and S.\u00a0Palaniswamy. 2024. Multimodal Emotion Recognition: A Comprehensive Review. WIREs Data Mining and Knowledge Discovery 14 6 (2024) e1563.","DOI":"10.1002\/widm.1563"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Tianqi Chen and Carlos Guestrin. 2016. XGBoost: A Scalable Tree Boosting System. In KDD 2016. 785\u2013794.","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_3_2_16_2","unstructured":"Victor Sanh Lysandre Debut Julien Chaumond and Thomas Wolf. 2019. DistilBERT a distilled version of BERT: smaller faster cheaper and lighter. arXiv:https:\/\/arXiv.org\/abs\/1910.01108 (2019)."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","unstructured":"Soujanya Poria Devamanyu Hazarika Navonil Majumder Gautam Naik Erik Cambria and Rada Mihalcea. 2019. MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations. In ACL 2019. 527\u2013536. 10.18653\/v1\/P19-1050","DOI":"10.18653\/v1\/P19-1050"}],"event":{"name":"IAIT 2026: 14th International Conference on Advances in Information Technology","location":"Bangkok Thailand","acronym":"IAIT '26"},"container-title":["Proceedings of the 14th International Conference on Advances in Information Technology"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T10:37:51Z","timestamp":1781519871000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3816713.3818220"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,16]]},"references-count":16,"alternative-id":["10.1145\/3816713.3818220","10.1145\/3816713"],"URL":"https:\/\/doi.org\/10.1145\/3816713.3818220","relation":{},"subject":[],"published":{"date-parts":[[2026,6,16]]},"assertion":[{"value":"2026-06-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}