{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T15:59:54Z","timestamp":1779206394121,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,14]],"date-time":"2019-10-14T00:00:00Z","timestamp":1571011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,14]]},"DOI":"10.1145\/3340555.3355713","type":"proceedings-article","created":{"date-parts":[[2019,10,17]],"date-time":"2019-10-17T12:49:48Z","timestamp":1571316588000},"page":"562-566","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":65,"title":["Exploring Emotion Features and Fusion Strategies for Audio-Video Emotion Recognition"],"prefix":"10.1145","author":[{"given":"Hengshun","family":"Zhou","sequence":"first","affiliation":[{"name":"National Engineering Laboratory for Speech and Language Information Processing (NEL-SLIP), University of Science and Technology of China, P.R.China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Debin","family":"Meng","sequence":"additional","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, SIAT-SenseTime Joint Lab, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuanyuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"National Engineering Laboratory for Speech and Language Information Processing (NEL-SLIP), University of Science and Technology of China, P.R.China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaojiang","family":"Peng","sequence":"additional","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, SIAT-SenseTime Joint Lab, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jun","family":"Du","sequence":"additional","affiliation":[{"name":"National Engineering Laboratory for Speech and Language Information Processing (NEL-SLIP), University of Science and Technology of China, P.R.China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Wang","sequence":"additional","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, SIAT-SenseTime Joint Lab, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[{"name":"ShenZhen Key Lab of Computer Vision and Pattern Recognition, SIAT-SenseTime Joint Lab, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2019,10,14]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Sarah\u00a0Adel Bargal Emad Barsoum Cristian\u00a0Canton Ferrer and Cha Zhang. 2016. Emotion recognition in the wild from videos using images. In ACM ICMI.  Sarah\u00a0Adel Bargal Emad Barsoum Cristian\u00a0Canton Ferrer and Cha Zhang. 2016. Emotion recognition in the wild from videos using images. In ACM ICMI.","DOI":"10.1145\/2993148.2997627"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Emad Barsoum Cha Zhang Cristian Canton\u00a0Ferrer and Zhengyou Zhang. 2016. Training Deep Networks for Facial Expression Recognition with Crowd-Sourced Label Distribution. In ACM ICMI.  Emad Barsoum Cha Zhang Cristian Canton\u00a0Ferrer and Zhengyou Zhang. 2016. Training Deep Networks for Facial Expression Recognition with Crowd-Sourced Label Distribution. In ACM ICMI.","DOI":"10.1145\/2993148.2993165"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2018.2860246"},{"key":"e_1_3_2_1_4_1","volume-title":"Arcface: Additive angular margin loss for deep face recognition. arXiv preprint arXiv:1801.07698(2018).","author":"Deng Jiankang","year":"2018","unstructured":"Jiankang Deng , Jia Guo , and Stefanos Zafeiriou . 2018 . Arcface: Additive angular margin loss for deep face recognition. arXiv preprint arXiv:1801.07698(2018). Jiankang Deng, Jia Guo, and Stefanos Zafeiriou. 2018. Arcface: Additive angular margin loss for deep face recognition. arXiv preprint arXiv:1801.07698(2018)."},{"key":"e_1_3_2_1_5_1","volume-title":"Engagement and Cohesion PredictionTasks. In ACM International Conference on Mutimodal Interaction.","author":"Dhall Abhinav","year":"2019","unstructured":"Abhinav Dhall , Roland Goecke , Shreya Ghosh , and Tom Gedeon . 2019 . EmotiW 2019: Automatic Emotion , Engagement and Cohesion PredictionTasks. In ACM International Conference on Mutimodal Interaction. Abhinav Dhall, Roland Goecke, Shreya Ghosh, and Tom Gedeon. 2019. EmotiW 2019: Automatic Emotion, Engagement and Cohesion PredictionTasks. In ACM International Conference on Mutimodal Interaction."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.26"},{"key":"e_1_3_2_1_7_1","volume-title":"Encyclopedia of database systems","author":"Dix Alan","unstructured":"Alan Dix . 2009. Human-computer interaction . In Encyclopedia of database systems . Springer , 1327\u20131331. Alan Dix. 2009. Human-computer interaction. In Encyclopedia of database systems. Springer, 1327\u20131331."},{"key":"e_1_3_2_1_8_1","unstructured":"Yingruo Fan Jacqueline\u00a0CK Lam and Victor\u00a0OK Li. 2018. Video-based Emotion Recognition Using Deeply-Supervised Neural Networks. In ACM ICMI.  Yingruo Fan Jacqueline\u00a0CK Lam and Victor\u00a0OK Li. 2018. Video-based Emotion Recognition Using Deeply-Supervised Neural Networks. In ACM ICMI."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Yin Fan Xiangju Lu Dian Li and Yuanliu Liu. 2016. Video-based emotion recognition using CNN-RNN and C3D hybrid networks. In ACM ICMI.  Yin Fan Xiangju Lu Dian Li and Yuanliu Liu. 2016. Video-based emotion recognition using CNN-RNN and C3D hybrid networks. In ACM ICMI.","DOI":"10.1145\/2993148.2997632"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Felix\u00a0A Gers J\u00fcrgen Schmidhuber and Fred Cummins. 1999. Learning to forget: Continual prediction with LSTM. (1999).  Felix\u00a0A Gers J\u00fcrgen Schmidhuber and Fred Cummins. 1999. Learning to forget: Continual prediction with LSTM. (1999).","DOI":"10.1049\/cp:19991218"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"e_1_3_2_1_12_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_1_13_1","unstructured":"Ping Hu Dongqi Cai Shandong Wang Anbang Yao and Yurong Chen. 2017. Learning supervised scoring ensemble for emotion recognition in the wild. In ACM ICMI.  Ping Hu Dongqi Cai Shandong Wang Anbang Yao and Yurong Chen. 2017. Learning supervised scoring ensemble for emotion recognition in the wild. In ACM ICMI."},{"key":"e_1_3_2_1_14_1","volume-title":"Mental models in cognitive science. Cognitive science 4, 1","author":"Johnson-Laird Philip\u00a0Nicholas","year":"1980","unstructured":"Philip\u00a0Nicholas Johnson-Laird . 1980. Mental models in cognitive science. Cognitive science 4, 1 ( 1980 ), 71\u2013115. Philip\u00a0Nicholas Johnson-Laird. 1980. Mental models in cognitive science. Cognitive science 4, 1 (1980), 71\u2013115."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00109"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2868382"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3264989"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3264992"},{"key":"e_1_3_2_1_19_1","volume-title":"Clinical diagnosis of depression in primary care: a meta-analysis. The Lancet 374, 9690","author":"Mitchell J","year":"2009","unstructured":"Alex\u00a0 J Mitchell , Amol Vaze , and Sanjay Rao . 2009. Clinical diagnosis of depression in primary care: a meta-analysis. The Lancet 374, 9690 ( 2009 ), 609\u2013619. Alex\u00a0J Mitchell, Amol Vaze, and Sanjay Rao. 2009. Clinical diagnosis of depression in primary care: a meta-analysis. The Lancet 374, 9690 (2009), 609\u2013619."},{"key":"e_1_3_2_1_20_1","first-page":"1","article-title":"AffectNet: A Database for Facial Expression, Valence, and Arousal Computing in the Wild","volume":"99","author":"Mollahosseini Ali","year":"1949","unstructured":"Ali Mollahosseini , Behzad Hasani , and Mohammad\u00a0 H. Mahoor . 1949 . AffectNet: A Database for Facial Expression, Valence, and Arousal Computing in the Wild . IEEE Transactions on Affective Computing PP , 99 (1949), 1 \u2013 1 . Ali Mollahosseini, Behzad Hasani, and Mohammad\u00a0H. Mahoor. 1949. AffectNet: A Database for Facial Expression, Valence, and Arousal Computing in the Wild. IEEE Transactions on Affective Computing PP, 99 (1949), 1\u20131.","journal-title":"IEEE Transactions on Affective Computing PP"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Omkar\u00a0M Parkhi Andrea Vedaldi Andrew Zisserman 2015. Deep face recognition.. In BMVC Vol.\u00a01. 6.  Omkar\u00a0M Parkhi Andrea Vedaldi Andrew Zisserman 2015. Deep face recognition.. In BMVC Vol.\u00a01. 6.","DOI":"10.5244\/C.29.41"},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on International Conference on Machine Learning.","author":"Shang Wenling","year":"2016","unstructured":"Wenling Shang , Diogo Almeida , Diogo Almeida , and Honglak Lee . 2016 . Understanding and improving convolutional neural networks via concatenated rectified linear units . In International Conference on International Conference on Machine Learning. Wenling Shang, Diogo Almeida, Diogo Almeida, and Honglak Lee. 2016. Understanding and improving convolutional neural networks via concatenated rectified linear units. In International Conference on International Conference on Machine Learning."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3143008"},{"key":"e_1_3_2_1_24_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. (2017).  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. (2017)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Valentin Vielzeuf St\u00e9phane Pateux and Fr\u00e9d\u00e9ric Jurie. 2017. Temporal multimodal fusion for video emotion classification in the wild. In ACM ICMI.  Valentin Vielzeuf St\u00e9phane Pateux and Fr\u00e9d\u00e9ric Jurie. 2017. Temporal multimodal fusion for video emotion classification in the wild. In ACM ICMI.","DOI":"10.1145\/3136755.3143011"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3264991"},{"key":"e_1_3_2_1_27_1","unstructured":"Kai Wang Xiaojiang Peng Jianfei Yang Debin Meng and Yu Qiao. 2019. Region Attention Networks for Pose and Occlusion Robust Facial Expression Recognition. arXiv preprint arXiv:1905.04075(2019).  Kai Wang Xiaojiang Peng Jianfei Yang Debin Meng and Yu Qiao. 2019. Region Attention Networks for Pose and Occlusion Robust Facial Expression Recognition. arXiv preprint arXiv:1905.04075(2019)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_31"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3264981"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1174"},{"key":"e_1_3_2_1_31_1","unstructured":"Anbang Yao Dongqi Cai Ping Hu Shandong Wang Liang Sha and Yurong Chen. 2016. HoloNet: towards robust emotion recognition in the wild. In ACM ICMI.  Anbang Yao Dongqi Cai Ping Hu Shandong Wang Liang Sha and Yurong Chen. 2016. HoloNet: towards robust emotion recognition in the wild. In ACM ICMI."},{"key":"e_1_3_2_1_32_1","volume-title":"Deep Fusion: An Attention Guided Factorized Bilinear Pooling for Audio-video Emotion Recognition. arXiv preprint arXiv:1901.04889(2019).","author":"Zhang Yuanyuan","year":"2019","unstructured":"Yuanyuan Zhang , Zi-Rui Wang , and Jun Du . 2019 . Deep Fusion: An Attention Guided Factorized Bilinear Pooling for Audio-video Emotion Recognition. arXiv preprint arXiv:1901.04889(2019). Yuanyuan Zhang, Zi-Rui Wang, and Jun Du. 2019. Deep Fusion: An Attention Guided Factorized Bilinear Pooling for Audio-video Emotion Recognition. arXiv preprint arXiv:1901.04889(2019)."}],"event":{"name":"ICMI '19: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Suzhou China","acronym":"ICMI '19"},"container-title":["2019 International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3340555.3355713","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3340555.3355713","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:28Z","timestamp":1750202008000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3340555.3355713"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,14]]},"references-count":32,"alternative-id":["10.1145\/3340555.3355713","10.1145\/3340555"],"URL":"https:\/\/doi.org\/10.1145\/3340555.3355713","relation":{},"subject":[],"published":{"date-parts":[[2019,10,14]]},"assertion":[{"value":"2019-10-14","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}