{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T16:04:16Z","timestamp":1772553856209,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T00:00:00Z","timestamp":1696809600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CNS-2016719"],"award-info":[{"award-number":["CNS-2016719"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,9]]},"DOI":"10.1145\/3577190.3614110","type":"proceedings-article","created":{"date-parts":[[2023,10,7]],"date-time":"2023-10-07T22:30:48Z","timestamp":1696717848000},"page":"207-215","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Enhancing Resilience to Missing Data in Audio-Text Emotion Recognition with Multi-Scale Chunk Regularization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1933-1590","authenticated-orcid":false,"given":"Wei-Cheng","family":"Lin","sequence":"first","affiliation":[{"name":"Electrical and Computer Engineering, The University of Texas at Dallas, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9613-1002","authenticated-orcid":false,"given":"Lucas","family":"Goncalves","sequence":"additional","affiliation":[{"name":"Electrical and Computer Engineering, The University of Texas at Dallas, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4075-4072","authenticated-orcid":false,"given":"Carlos","family":"Busso","sequence":"additional","affiliation":[{"name":"Electrical and Computer Engineering, The University of Texas at Dallas, United States"}]}],"member":"320","published-online":{"date-parts":[[2023,10,9]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"A. Baevski Y. Zhou A. Mohamed and M. Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems (NeurIPS 2020) Vol.\u00a033. Virtual 12449\u201312460.  A. Baevski Y. Zhou A. Mohamed and M. Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. In Advances in Neural Information Processing Systems (NeurIPS 2020) Vol.\u00a033. Virtual 12449\u201312460."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2015.2493525"},{"key":"e_1_3_2_1_4_1","volume-title":"Speech and Multimodal Information. In Sixth International Conference on Multimodal Interfaces ICMI","author":"Busso C.","year":"2004","unstructured":"C. Busso , Z. Deng , S. Yildirim , M. Bulut , C.M. Lee , A. Kazemzadeh , S. Lee , U. Neumann , and S. Narayanan . 2004. Analysis of Emotion Recognition using Facial Expressions , Speech and Multimodal Information. In Sixth International Conference on Multimodal Interfaces ICMI 2004 . ACM Press, State College, PA, 205\u2013211. https:\/\/doi.org\/10.1145\/1027933.1027968 10.1145\/1027933.1027968 C. Busso, Z. Deng, S. Yildirim, M. Bulut, C.M. Lee, A. Kazemzadeh, S. Lee, U. Neumann, and S. Narayanan. 2004. Analysis of Emotion Recognition using Facial Expressions, Speech and Multimodal Information. In Sixth International Conference on Multimodal Interfaces ICMI 2004. ACM Press, State College, PA, 205\u2013211. https:\/\/doi.org\/10.1145\/1027933.1027968"},{"key":"e_1_3_2_1_5_1","volume-title":"7th International Seminar on Speech Production (ISSP","author":"Busso C.","year":"2006","unstructured":"C. Busso and S.S. Narayanan . 2006. Interplay between linguistic and affective goals in facial expression during emotional utterances. In 7th International Seminar on Speech Production (ISSP 2006 ). Ubatuba-SP, Brazil, 549\u2013556. C. Busso and S.S. Narayanan. 2006. Interplay between linguistic and affective goals in facial expression during emotional utterances. In 7th International Seminar on Speech Production (ISSP 2006). Ubatuba-SP, Brazil, 549\u2013556."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.905145"},{"key":"#cr-split#-e_1_3_2_1_7_1.1","doi-asserted-by":"crossref","unstructured":"A. Cohen I. Rimon E. Aflalo and H.H. Permuter. 2022. A study on data augmentation in voice anti-spoofing. Speech Communication 141 (June 2022) 56-67. https:\/\/doi.org\/10.1016\/j.specom.2022.04.005 10.1016\/j.specom.2022.04.005","DOI":"10.1016\/j.specom.2022.04.005"},{"key":"#cr-split#-e_1_3_2_1_7_1.2","doi-asserted-by":"crossref","unstructured":"A. Cohen I. Rimon E. Aflalo and H.H. Permuter. 2022. A study on data augmentation in voice anti-spoofing. Speech Communication 141 (June 2022) 56-67. https:\/\/doi.org\/10.1016\/j.specom.2022.04.005","DOI":"10.1016\/j.specom.2022.04.005"},{"key":"e_1_3_2_1_8_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American","author":"Devlin J.","year":"2019","unstructured":"J. Devlin , M.W. Chang , K. Lee , and K. Toutanova . 2019 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2019). Minneapolis , Minnesota , 4171\u20134186. J. Devlin, M.W. Chang, K. Lee, and K. Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2019). Minneapolis, Minnesota, 4171\u20134186."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2682899"},{"key":"e_1_3_2_1_10_1","volume-title":"Semi-Supervised Deep Generative Modelling of Incomplete Multi-Modality Emotional Data. In ACM international conference on Multimedia (MM","author":"Du C.","year":"2018","unstructured":"C. Du , C. Du , H. Wang , J. Li , W.-L. Zheng , B.-Liang Lu, and H. He . 2018 . Semi-Supervised Deep Generative Modelling of Incomplete Multi-Modality Emotional Data. In ACM international conference on Multimedia (MM 2018 ). Seoul, Republic of Korea, 108\u2013116. https:\/\/doi.org\/10.1145\/3240508.3240528 10.1145\/3240508.3240528 C. Du, C. Du, H. Wang, J. Li, W.-L. Zheng, B.-Liang Lu, and H. He. 2018. Semi-Supervised Deep Generative Modelling of Incomplete Multi-Modality Emotional Data. In ACM international conference on Multimedia (MM 2018). Seoul, Republic of Korea, 108\u2013116. https:\/\/doi.org\/10.1145\/3240508.3240528"},{"key":"e_1_3_2_1_11_1","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2022","author":"Goncalves L.","year":"2022","unstructured":"L. Goncalves and C. Busso . 2022. AuxFormer: Robust Approach to Audiovisual Emotion Recognition . In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2022 ). Singapore, 7357\u20137361. https:\/\/doi.org\/10.1109\/ICASSP43922. 2022 .9747157 10.1109\/ICASSP43922.2022.9747157 L. Goncalves and C. Busso. 2022. AuxFormer: Robust Approach to Audiovisual Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2022). Singapore, 7357\u20137361. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747157"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3216993"},{"key":"e_1_3_2_1_13_1","volume-title":"Learning Cross-modal Audiovisual Representations with Ladder Networks for Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023","author":"Goncalves L.","year":"2023","unstructured":"L. Goncalves and C. Busso . 2023 . Learning Cross-modal Audiovisual Representations with Ladder Networks for Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023 ). Rhodes island, Greece, 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357. 2023 .10096138 10.1109\/ICASSP49357.2023.10096138 L. Goncalves and C. Busso. 2023. Learning Cross-modal Audiovisual Representations with Ladder Networks for Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023). Rhodes island, Greece, 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096138"},{"key":"#cr-split#-e_1_3_2_1_14_1.1","unstructured":"L. Goncalves S.-G. Leem W.-C. Lin B. Sisman and C. Busso. 2023. Versatile Audiovisual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks. ArXiv e-prints (arXiv:2305.07216) (May 2023) 1-14. https:\/\/doi.org\/10.48550\/arXiv.2305.07216 arxiv:2305.07216\u00a0[cs.LG] 10.48550\/arXiv.2305.07216"},{"key":"#cr-split#-e_1_3_2_1_14_1.2","unstructured":"L. Goncalves S.-G. Leem W.-C. Lin B. Sisman and C. Busso. 2023. Versatile Audiovisual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks. ArXiv e-prints (arXiv:2305.07216) (May 2023) 1-14. https:\/\/doi.org\/10.48550\/arXiv.2305.07216 arxiv:2305.07216\u00a0[cs.LG]"},{"key":"e_1_3_2_1_15_1","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2019","author":"Han J.","year":"2019","unstructured":"J. Han , Z. Zhang , Z. Ren , and B. Schuller . 2019. Implicit Fusion by Joint Audiovisual Training for Emotion Recognition in Mono Modality . In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2019 ). Brighton, UK, 5861\u20135865. https:\/\/doi.org\/10.1109\/ICASSP. 2019 .8682773 10.1109\/ICASSP.2019.8682773 J. Han, Z. Zhang, Z. Ren, and B. Schuller. 2019. Implicit Fusion by Joint Audiovisual Training for Emotion Recognition in Mono Modality. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2019). Brighton, UK, 5861\u20135865. https:\/\/doi.org\/10.1109\/ICASSP.2019.8682773"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2389824"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00385"},{"key":"e_1_3_2_1_18_1","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2020","author":"Huang J.","year":"2020","unstructured":"J. Huang , J. Tao , B. Liu , Z. Lian , and M. Niu . 2020. Multimodal transformer fusion for continuous emotion recognition . In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2020 ). Barcelona, Spain, 3507\u20133511. https:\/\/doi.org\/10.1109\/ICASSP40776. 2020 .9053762 10.1109\/ICASSP40776.2020.9053762 J. Huang, J. Tao, B. Liu, Z. Lian, and M. Niu. 2020. Multimodal transformer fusion for continuous emotion recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2020). Barcelona, Spain, 3507\u20133511. https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053762"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/S1071-5819(03)00047-8"},{"key":"e_1_3_2_1_20_1","volume-title":"Self-Supervised Learning with Cross-Modal Transformers for Emotion Recognition. In IEEE Spoken Language Technology Workshop (SLT 2021","author":"Khare A.","year":"2021","unstructured":"A. Khare , S. Parthasarathy , and S. Sundaram . 2021 . Self-Supervised Learning with Cross-Modal Transformers for Emotion Recognition. In IEEE Spoken Language Technology Workshop (SLT 2021 ). Shenzhen, China, 381\u2013388. https:\/\/doi.org\/10.1109\/SLT48900. 2021 .9383618 10.1109\/SLT48900.2021.9383618 A. Khare, S. Parthasarathy, and S. Sundaram. 2021. Self-Supervised Learning with Cross-Modal Transformers for Emotion Recognition. In IEEE Spoken Language Technology Workshop (SLT 2021). Shenzhen, China, 381\u2013388. https:\/\/doi.org\/10.1109\/SLT48900.2021.9383618"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2019.102185"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2014.2304637"},{"key":"e_1_3_2_1_23_1","volume-title":"IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021","author":"Lin J.","year":"2021","unstructured":"J. Lin , Y. Wang , K. Kalgaonkar , G. Keren , D. Zhang , and C. Fuegen . 2021. A Time-Domain Convolutional Recurrent Network for Packet Loss Concealment . In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021 ). Toronto, ON, Canada, 7148\u20137152. https:\/\/doi.org\/10.1109\/ICASSP39728. 2021 .9413595 10.1109\/ICASSP39728.2021.9413595 J. Lin, Y. Wang, K. Kalgaonkar, G. Keren, D. Zhang, and C. Fuegen. 2021. A Time-Domain Convolutional Recurrent Network for Packet Loss Concealment. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021). Toronto, ON, Canada, 7148\u20137152. https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9413595"},{"key":"#cr-split#-e_1_3_2_1_24_1.1","doi-asserted-by":"crossref","unstructured":"W.-C. Lin and C. Busso. 2020. An Efficient Temporal Modeling Approach for Speech Emotion Recognition by Mapping Varied Duration Sentences into Fixed Number of Chunks. In Interspeech 2020. Shanghai China 2322-2326. https:\/\/doi.org\/10.21437\/Interspeech.2020-2636 10.21437\/Interspeech.2020-2636","DOI":"10.21437\/Interspeech.2020-2636"},{"key":"#cr-split#-e_1_3_2_1_24_1.2","doi-asserted-by":"crossref","unstructured":"W.-C. Lin and C. Busso. 2020. An Efficient Temporal Modeling Approach for Speech Emotion Recognition by Mapping Varied Duration Sentences into Fixed Number of Chunks. In Interspeech 2020. Shanghai China 2322-2326. https:\/\/doi.org\/10.21437\/Interspeech.2020-2636","DOI":"10.21437\/Interspeech.2020-2636"},{"key":"#cr-split#-e_1_3_2_1_25_1.1","doi-asserted-by":"crossref","unstructured":"W.-C. Lin and C. Busso. 2022. Chunk-Level Speech Emotion Recognition: A General Framework of Sequence-to-One Dynamic Temporal Modeling. IEEE Transactions on Affective Computing Early Access (2022). https:\/\/doi.org\/10.1109\/TAFFC.2021.3083821 10.1109\/TAFFC.2021.3083821","DOI":"10.1109\/TAFFC.2021.3083821"},{"key":"#cr-split#-e_1_3_2_1_25_1.2","doi-asserted-by":"crossref","unstructured":"W.-C. Lin and C. Busso. 2022. Chunk-Level Speech Emotion Recognition: A General Framework of Sequence-to-One Dynamic Temporal Modeling. IEEE Transactions on Affective Computing Early Access (2022). https:\/\/doi.org\/10.1109\/TAFFC.2021.3083821","DOI":"10.1109\/TAFFC.2021.3083821"},{"key":"e_1_3_2_1_26_1","volume-title":"Role of Lexical Boundary Information in Chunk-Level Segmentation for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023","author":"Lin W.-C.","year":"2023","unstructured":"W.-C. Lin and C. Busso . 2023 . Role of Lexical Boundary Information in Chunk-Level Segmentation for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023 ). Rhodes island, Greece, 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357. 2023 .10096861 10.1109\/ICASSP49357.2023.10096861 W.-C. Lin and C. Busso. 2023. Role of Lexical Boundary Information in Chunk-Level Segmentation for Speech Emotion Recognition. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023). Rhodes island, Greece, 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096861"},{"key":"e_1_3_2_1_27_1","volume-title":"Multimodal Emotion Recognition with Capsule Graph Convolutional Based Representation Fusion. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021","author":"Liu J.","year":"2021","unstructured":"J. Liu , S. Chen , L. Wang , Z. Liu , Y. Fu , L. Guo , and J. Dang . 2021 . Multimodal Emotion Recognition with Capsule Graph Convolutional Based Representation Fusion. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021 ). Toronto, ON, Canada, 6339\u20136343. https:\/\/doi.org\/10.1109\/ICASSP39728. 2021 .9413608 10.1109\/ICASSP39728.2021.9413608 J. Liu, S. Chen, L. Wang, Z. Liu, Y. Fu, L. Guo, and J. Dang. 2021. Multimodal Emotion Recognition with Capsule Graph Convolutional Based Representation Fusion. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021). Toronto, ON, Canada, 6339\u20136343. https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9413608"},{"key":"#cr-split#-e_1_3_2_1_28_1.1","unstructured":"Y. Liu M. Ott N. Goyal J. Du M. Joshi D. Chen O. Levy M. Lewis L. Zettlemoyer and V. Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. ArXiv e-prints (arXiv:1907.11692) (July 2019) 1-12. https:\/\/doi.org\/10.48550\/arXiv.1907.11692 arxiv:1907.11692\u00a0[cs.CL] 10.48550\/arXiv.1907.11692"},{"key":"#cr-split#-e_1_3_2_1_28_1.2","unstructured":"Y. Liu M. Ott N. Goyal J. Du M. Joshi D. Chen O. Levy M. Lewis L. Zettlemoyer and V. Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. ArXiv e-prints (arXiv:1907.11692) (July 2019) 1-12. https:\/\/doi.org\/10.48550\/arXiv.1907.11692 arxiv:1907.11692\u00a0[cs.CL]"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2736999"},{"key":"e_1_3_2_1_30_1","volume-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Conference on Neural Information Processing Systems (NeurIPS","author":"Lu J.","year":"2019","unstructured":"J. Lu , D. Batra , D. Parikh , and S. Lee . 2019 . ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Conference on Neural Information Processing Systems (NeurIPS 2019 ). Vancouver, BC, Canada, 1\u201311. J. Lu, D. Batra, D. Parikh, and S. Lee. 2019. ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks. In Conference on Neural Information Processing Systems (NeurIPS 2019). Vancouver, BC, Canada, 1\u201311."},{"key":"e_1_3_2_1_31_1","volume-title":"Conference on Empirical Methods in Natural Language Processing (EMNLP","author":"Luong T.","year":"2015","unstructured":"T. Luong , H. Pham , and C.D. Manning . 2015. Effective Approaches to Attention-based Neural Machine Translation . In Conference on Empirical Methods in Natural Language Processing (EMNLP 2015 ). Lisbon, Portugal, 1412\u20131421. T. Luong, H. Pham, and C.D. Manning. 2015. Effective Approaches to Attention-based Neural Machine Translation. In Conference on Empirical Methods in Natural Language Processing (EMNLP 2015). Lisbon, Portugal, 1412\u20131421."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"S. Mariooryad R. Lotfian and C. Busso. 2014. Building A Naturalistic Emotional Speech Corpus by Retrieving Expressive Behaviors From Existing Speech Corpora. In Interspeech 2014. Singapore 238\u2013242.  S. Mariooryad R. Lotfian and C. Busso. 2014. Building A Naturalistic Emotional Speech Corpus by Retrieving Expressive Behaviors From Existing Speech Corpora. In Interspeech 2014. Singapore 238\u2013242.","DOI":"10.21437\/Interspeech.2014-60"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10803-006-0251-6"},{"key":"#cr-split#-e_1_3_2_1_34_1.1","doi-asserted-by":"crossref","unstructured":"M. McAuliffe M. Socolof S. Mihuc M. Wagner and M. Sonderegger. 2017. Montreal Forced Aligner: Trainable Text-Speech Alignment Using Kaldi. In Interspeech 2017. Stockholm Sweden 498-502. https:\/\/doi.org\/10.21437\/Interspeech.2017-1386 10.21437\/Interspeech.2017-1386","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"#cr-split#-e_1_3_2_1_34_1.2","doi-asserted-by":"crossref","unstructured":"M. McAuliffe M. Socolof S. Mihuc M. Wagner and M. Sonderegger. 2017. Montreal Forced Aligner: Trainable Text-Speech Alignment Using Kaldi. In Interspeech 2017. Stockholm Sweden 498-502. https:\/\/doi.org\/10.21437\/Interspeech.2017-1386","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"e_1_3_2_1_35_1","volume-title":"AAAI Conference on Artificial Intelligence (AAAI","author":"Mittal T.","year":"2020","unstructured":"T. Mittal , U. Bhattacharya , R. Chandra , A. Bera , and D. Manocha . 2020. M3ER: Multiplicative Multimodal Emotion Recognition using Facial, Textual, and Speech Cues . In AAAI Conference on Artificial Intelligence (AAAI 2020 ), Vol.\u00a034. New York, NY, USA, 1359\u20131367. https:\/\/doi.org\/10.1609\/aaai.v34i02.5492 10.1609\/aaai.v34i02.5492 T. Mittal, U. Bhattacharya, R. Chandra, A. Bera, and D. Manocha. 2020. M3ER: Multiplicative Multimodal Emotion Recognition using Facial, Textual, and Speech Cues. In AAAI Conference on Artificial Intelligence (AAAI 2020), Vol.\u00a034. New York, NY, USA, 1359\u20131367. https:\/\/doi.org\/10.1609\/aaai.v34i02.5492"},{"key":"#cr-split#-e_1_3_2_1_36_1.1","unstructured":"M.M. Mohamed and B.W. Schuller. 2020. ConcealNet: An End-to-end Neural Network for Packet Loss Concealment in Deep Speech Emotion Recognition. ArXiv e-prints (arXiv:2005.07777) (May 2020) 1-5. https:\/\/doi.org\/10.48550\/arXiv.2005.07777 arxiv:2005.07777\u00a0[cs.AS] 10.48550\/arXiv.2005.07777"},{"key":"#cr-split#-e_1_3_2_1_36_1.2","unstructured":"M.M. Mohamed and B.W. Schuller. 2020. ConcealNet: An End-to-end Neural Network for Packet Loss Concealment in Deep Speech Emotion Recognition. ArXiv e-prints (arXiv:2005.07777) (May 2020) 1-5. https:\/\/doi.org\/10.48550\/arXiv.2005.07777 arxiv:2005.07777\u00a0[cs.AS]"},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Multimodal Interaction (ICMI","author":"Parthasarathy S.","year":"2020","unstructured":"S. Parthasarathy and S. Sundaram . 2020. Training Strategies to Handle Missing Modalities for Audio-Visual Expression Recognition . In International Conference on Multimodal Interaction (ICMI 2020 ). Utrecht, The Netherlands, 400\u2013404. https:\/\/doi.org\/10.1145\/3395035.3425202 10.1145\/3395035.3425202 S. Parthasarathy and S. Sundaram. 2020. Training Strategies to Handle Missing Modalities for Audio-Visual Expression Recognition. In International Conference on Multimodal Interaction (ICMI 2020). Utrecht, The Netherlands, 400\u2013404. https:\/\/doi.org\/10.1145\/3395035.3425202"},{"key":"e_1_3_2_1_38_1","volume-title":"AAAI Conference on Artificial Intelligence (AAAI","author":"Pham H.","year":"2019","unstructured":"H. Pham , P.P. Liang , T. Manzini , L.-P. Morency , and B. P\u00f3czos . 2019. Found in Translation: Learning Robust Joint Representations by Cyclic Translations between Modalities . In AAAI Conference on Artificial Intelligence (AAAI 2019 ), Vol.\u00a033. Honolulu, HI, USA, 6892\u20136899. https:\/\/doi.org\/10.1609\/aaai.v33i01.33016892 10.1609\/aaai.v33i01.33016892 H. Pham, P.P. Liang, T. Manzini, L.-P. Morency, and B. P\u00f3czos. 2019. Found in Translation: Learning Robust Joint Representations by Cyclic Translations between Modalities. In AAAI Conference on Artificial Intelligence (AAAI 2019), Vol.\u00a033. Honolulu, HI, USA, 6892\u20136899. https:\/\/doi.org\/10.1609\/aaai.v33i01.33016892"},{"key":"#cr-split#-e_1_3_2_1_39_1.1","unstructured":"F. Qian and J. Han. 2022. Contrastive Regularization for Multimodal Emotion Recognition Using Audio and Text. ArXiv e-prints (arXiv:2211.10885) (November 2022) 1-5. https:\/\/doi.org\/10.48550\/arXiv.2211.10885 arxiv:2211.10885\u00a0[cs.SD] 10.48550\/arXiv.2211.10885"},{"key":"#cr-split#-e_1_3_2_1_39_1.2","unstructured":"F. Qian and J. Han. 2022. Contrastive Regularization for Multimodal Emotion Recognition Using Audio and Text. ArXiv e-prints (arXiv:2211.10885) (November 2022) 1-5. https:\/\/doi.org\/10.48550\/arXiv.2211.10885 arxiv:2211.10885\u00a0[cs.SD]"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Affective Computing and Intelligent Interaction and Workshops (ACII 2009","author":"Setz C.","year":"2009","unstructured":"C. Setz , J. Schumm , C. Lorenz , B. Arnrich , and G. Tr\u00f6ster . 2009. Using ensemble classifier systems for handling missing data in emotion recognition from physiology: one step towards a practical system . In International Conference on Affective Computing and Intelligent Interaction and Workshops (ACII 2009 ). Amsterdam, Netherlands, 1\u20138. https:\/\/doi.org\/10.1109\/ACII. 2009 .5349590 10.1109\/ACII.2009.5349590 C. Setz, J. Schumm, C. Lorenz, B. Arnrich, and G. Tr\u00f6ster. 2009. Using ensemble classifier systems for handling missing data in emotion recognition from physiology: one step towards a practical system. In International Conference on Affective Computing and Intelligent Interaction and Workshops (ACII 2009). Amsterdam, Netherlands, 1\u20138. https:\/\/doi.org\/10.1109\/ACII.2009.5349590"},{"key":"#cr-split#-e_1_3_2_1_41_1.1","doi-asserted-by":"crossref","unstructured":"S. Siriwardhana T. Kaluarachchi M. Billinghurst and S. Nanayakkara. 2020. Multimodal emotion recognition with transformer-based self supervised feature fusion. IEEE Access 8 (September 2020) 176274-176285. https:\/\/doi.org\/10.1109\/ACCESS.2020.3026823 10.1109\/ACCESS.2020.3026823","DOI":"10.1109\/ACCESS.2020.3026823"},{"key":"#cr-split#-e_1_3_2_1_41_1.2","doi-asserted-by":"crossref","unstructured":"S. Siriwardhana T. Kaluarachchi M. Billinghurst and S. Nanayakkara. 2020. Multimodal emotion recognition with transformer-based self supervised feature fusion. IEEE Access 8 (September 2020) 176274-176285. https:\/\/doi.org\/10.1109\/ACCESS.2020.3026823","DOI":"10.1109\/ACCESS.2020.3026823"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2011.37"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2017.87"},{"key":"#cr-split#-e_1_3_2_1_44_1.1","doi-asserted-by":"crossref","unstructured":"A. Triantafyllopoulos G. Keren J. Wagner I. Steiner and B. Schuller. 2019. Towards Robust Speech Emotion Recognition Using Deep Residual Networks for Speech Enhancement. In Interspeech 2019. Graz Austria 1691-1695. https:\/\/doi.org\/10.21437\/Interspeech.2019-1811 10.21437\/Interspeech.2019-1811","DOI":"10.21437\/Interspeech.2019-1811"},{"key":"#cr-split#-e_1_3_2_1_44_1.2","doi-asserted-by":"crossref","unstructured":"A. Triantafyllopoulos G. Keren J. Wagner I. Steiner and B. Schuller. 2019. Towards Robust Speech Emotion Recognition Using Deep Residual Networks for Speech Enhancement. In Interspeech 2019. Graz Austria 1691-1695. https:\/\/doi.org\/10.21437\/Interspeech.2019-1811","DOI":"10.21437\/Interspeech.2019-1811"},{"key":"#cr-split#-e_1_3_2_1_45_1.1","doi-asserted-by":"crossref","unstructured":"Y.-H.H. Tsai S. Bai P.P. Liang J.Z. Kolter L.-P. Morency and R. Salakhutdinov. 2019. Multimodal Transformer for Unaligned Multimodal Language Sequences. In Association for Computational Linguistics (ACL 2019) Vol.\u00a01. Florence Italy 6558-6569. https:\/\/doi.org\/10.18653\/v1\/p19-1656 10.18653\/v1","DOI":"10.18653\/v1\/P19-1656"},{"key":"#cr-split#-e_1_3_2_1_45_1.2","doi-asserted-by":"crossref","unstructured":"Y.-H.H. Tsai S. Bai P.P. Liang J.Z. Kolter L.-P. Morency and R. Salakhutdinov. 2019. Multimodal Transformer for Unaligned Multimodal Language Sequences. In Association for Computational Linguistics (ACL 2019) Vol.\u00a01. Florence Italy 6558-6569. https:\/\/doi.org\/10.18653\/v1\/p19-1656","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_2_1_46_1","unstructured":"A. Vaswani N. Shazeer N. Parmar J. Uszkoreit L. Jones A.N. Gomez \u0141. Kaiser and I. Polosukhin. 2017. Attention is all you need. In In Advances in Neural Information Processing Systems (NIPS 2017). Long Beach CA USA 5998\u20136008.  A. Vaswani N. Shazeer N. Parmar J. Uszkoreit L. Jones A.N. Gomez \u0141. Kaiser and I. Polosukhin. 2017. Attention is all you need. In In Advances in Neural Information Processing Systems (NIPS 2017). Long Beach CA USA 5998\u20136008."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-AFFC.2011.12"},{"key":"#cr-split#-e_1_3_2_1_48_1.1","doi-asserted-by":"crossref","unstructured":"J. Wagner A. Triantafyllopoulos H. Wierstorf M. Schmitt F. Burkhardt F. Eyben and B.W. Schuller. 2023. Dawn of the Transformer Era in Speech Emotion Recognition: Closing the Valence Gap. IEEE Transactions on Pattern Analysis and Machine Intelligence Early Access (2023). https:\/\/doi.org\/10.1109\/TPAMI.2023.3263585. 10.1109\/TPAMI.2023.3263585","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"#cr-split#-e_1_3_2_1_48_1.2","doi-asserted-by":"crossref","unstructured":"J. Wagner A. Triantafyllopoulos H. Wierstorf M. Schmitt F. Burkhardt F. Eyben and B.W. Schuller. 2023. Dawn of the Transformer Era in Speech Emotion Recognition: Closing the Valence Gap. IEEE Transactions on Pattern Analysis and Machine Intelligence Early Access (2023). https:\/\/doi.org\/10.1109\/TPAMI.2023.3263585.","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"#cr-split#-e_1_3_2_1_49_1.1","doi-asserted-by":"crossref","unstructured":"Y. Wang G. Shen Y. Xu J. Li and Z. Zhao. 2021. Learning Mutual Correlation in Multimodal Transformer for Speech Emotion Recognition. In Interspeech (2021). Brno Czech Republic 4518-4522. https:\/\/doi.org\/10.21437\/Interspeech.2021-2004 10.21437\/Interspeech.2021-2004","DOI":"10.21437\/Interspeech.2021-2004"},{"key":"#cr-split#-e_1_3_2_1_49_1.2","doi-asserted-by":"crossref","unstructured":"Y. Wang G. Shen Y. Xu J. Li and Z. Zhao. 2021. Learning Mutual Correlation in Multimodal Transformer for Speech Emotion Recognition. In Interspeech (2021). Brno Czech Republic 4518-4522. https:\/\/doi.org\/10.21437\/Interspeech.2021-2004","DOI":"10.21437\/Interspeech.2021-2004"},{"key":"e_1_3_2_1_50_1","volume-title":"HuggingFace\u2019s transformers: State-of-the-art natural language processing. ArXiv e-prints (arXiv:1910.03771v5) (October","author":"Wolf T.","year":"2019","unstructured":"T. Wolf , L. Debut , V. Sanh , J. Chaumond , C. Delangue , A. Moi , P. Cistac , T. Rault , R. Louf , M. Funtowicz , J. Davison , S. Shleifer , P. von Platen , C. Ma , Y. Jernite , J. Plu , C. Xu , T. Le Scao , S. Gugger , M. Drame , and Q.\u00a0Lhoest amd A.M.\u00a0Rush. 2019. HuggingFace\u2019s transformers: State-of-the-art natural language processing. ArXiv e-prints (arXiv:1910.03771v5) (October 2019 ), 1\u20138. https:\/\/doi.org\/10.48550\/arXiv.1910.03771 arxiv:1910.03771\u00a0[cs.CL] 10.48550\/arXiv.1910.03771 T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T. Le Scao, S. Gugger, M. Drame, and Q.\u00a0Lhoest amd A.M.\u00a0Rush. 2019. HuggingFace\u2019s transformers: State-of-the-art natural language processing. ArXiv e-prints (arXiv:1910.03771v5) (October 2019), 1\u20138. https:\/\/doi.org\/10.48550\/arXiv.1910.03771 arxiv:1910.03771\u00a0[cs.CL]"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/ac49a7"},{"key":"e_1_3_2_1_52_1","volume-title":"Multimodal Speech Emotion Recognition Using Audio and Text. In IEEE Spoken Language Technology Workshop (SLT 2018","author":"Yoon S.","year":"2018","unstructured":"S. Yoon , S. Byun , and K. Jung . 2018 . Multimodal Speech Emotion Recognition Using Audio and Text. In IEEE Spoken Language Technology Workshop (SLT 2018 ),. Athens, Greece, 112\u2013118. https:\/\/doi.org\/10.1109\/SLT. 2018 .8639583 10.1109\/SLT.2018.8639583 S. Yoon, S. Byun, and K. Jung. 2018. Multimodal Speech Emotion Recognition Using Audio and Text. In IEEE Spoken Language Technology Workshop (SLT 2018),. Athens, Greece, 112\u2013118. https:\/\/doi.org\/10.1109\/SLT.2018.8639583"},{"key":"e_1_3_2_1_53_1","volume-title":"Audio\/Visual Emotion Challenge and Workshop (AVEC","author":"Zhao J.","year":"2018","unstructured":"J. Zhao , R. Li , S. Chen , and Q. Jin . 2018. Multi-modal multi-cultural dimensional continues emotion recognition in dyadic interactions . In Audio\/Visual Emotion Challenge and Workshop (AVEC 2018 ). Seoul, Republic of Korea, 65\u201372. https:\/\/doi.org\/10.1145\/3266302.3266313 10.1145\/3266302.3266313 J. Zhao, R. Li, S. Chen, and Q. Jin. 2018. Multi-modal multi-cultural dimensional continues emotion recognition in dyadic interactions. In Audio\/Visual Emotion Challenge and Workshop (AVEC 2018). Seoul, Republic of Korea, 65\u201372. https:\/\/doi.org\/10.1145\/3266302.3266313"},{"key":"e_1_3_2_1_54_1","volume-title":"International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC 2014","author":"Zheng W.-L.","year":"2014","unstructured":"W.-L. Zheng , B.-N. Dong , and B.\u00a0 L. Lu . 2014 . Multimodal emotion recognition using EEG and eye tracking data . In International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC 2014 ). Chicago, IL, USA, 5040\u20135043. https:\/\/doi.org\/10.1109\/EMBC. 2014.6944757 10.1109\/EMBC.2014.6944757 W.-L. Zheng, B.-N. Dong, and B.\u00a0L. Lu. 2014. Multimodal emotion recognition using EEG and eye tracking data. In International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC 2014). Chicago, IL, USA, 5040\u20135043. https:\/\/doi.org\/10.1109\/EMBC.2014.6944757"},{"key":"e_1_3_2_1_55_1","volume-title":"Exploiting Modality-Invariant Feature for Robust Multimodal Emotion Recognition with Missing Modalities. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023","author":"Zuo H.","year":"2023","unstructured":"H. Zuo , R. Liu , J. Zhao , G. Gao , and H. Li . 2023 . Exploiting Modality-Invariant Feature for Robust Multimodal Emotion Recognition with Missing Modalities. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023 ). Rhodes Island, Greece, 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357. 2023 .10095836 10.1109\/ICASSP49357.2023.10095836 H. Zuo, R. Liu, J. Zhao, G. Gao, and H. Li. 2023. Exploiting Modality-Invariant Feature for Robust Multimodal Emotion Recognition with Missing Modalities. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2023). Rhodes Island, Greece, 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10095836"}],"event":{"name":"ICMI '23: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Paris France","acronym":"ICMI '23","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3577190.3614110","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3577190.3614110","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:51:11Z","timestamp":1750182671000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3577190.3614110"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,9]]},"references-count":68,"alternative-id":["10.1145\/3577190.3614110","10.1145\/3577190"],"URL":"https:\/\/doi.org\/10.1145\/3577190.3614110","relation":{},"subject":[],"published":{"date-parts":[[2023,10,9]]},"assertion":[{"value":"2023-10-09","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}