{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T02:09:02Z","timestamp":1777342142814,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,14]],"date-time":"2019-10-14T00:00:00Z","timestamp":1571011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,14]]},"DOI":"10.1145\/3340555.3355720","type":"proceedings-article","created":{"date-parts":[[2019,10,17]],"date-time":"2019-10-17T12:49:48Z","timestamp":1571316588000},"page":"595-601","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":31,"title":["Multi-Attention Fusion Network for Video-based Emotion Recognition"],"prefix":"10.1145","author":[{"given":"Yanan","family":"Wang","sequence":"first","affiliation":[{"name":"KDDI Research, Inc."}]},{"given":"Jianming","family":"Wu","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc."}]},{"given":"Keiichiro","family":"Hoashi","sequence":"additional","affiliation":[{"name":"KDDI Research, Inc."}]}],"member":"320","published-online":{"date-parts":[[2019,10,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 2019 on International Conference on Multimodal Interaction. ACM.","author":"Abhinav Dhall","year":"2019","unstructured":"Dhall Abhinav , Goecke Roland , Ghosh Shreya , and Gedeon Tom . 2019 . EmotiW 2019: Automatic Emotion, Engagement and Cohesion PredictionTasks . In Proceedings of the 2019 on International Conference on Multimodal Interaction. ACM. Dhall Abhinav, Goecke Roland, Ghosh Shreya, and Gedeon Tom. 2019. EmotiW 2019: Automatic Emotion, Engagement and Cohesion PredictionTasks. In Proceedings of the 2019 on International Conference on Multimodal Interaction. ACM."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2010.09.020"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2993148.2993165"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Ken Chatfield Karen Simonyan Andrea Vedaldi and Andrew Zisserman. 2014. Return of the Devil in the Details:Delving Deep into Convolutional Nets. arXiv preprint arXiv:1405.3531(2014).  Ken Chatfield Karen Simonyan Andrea Vedaldi and Andrew Zisserman. 2014. Return of the Devil in the Details:Delving Deep into Convolutional Nets. arXiv preprint arXiv:1405.3531(2014).","DOI":"10.5244\/C.28.6"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.26"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2010.09.020"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-42051-1_16"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Kun Han Dong Yu and Ivan Tashev. 2014. Speech emotion recognition using deep neural network and extreme learning machine. In Fifteenth annual conference of the international speech communication association.  Kun Han Dong Yu and Ivan Tashev. 2014. Speech emotion recognition using deep neural network and extreme learning machine. In Fifteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2014-57"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Kun Han Dong Yu and Ivan Tashev. 2014. Speech emotion recognition using deep neural network and extreme learning machine. In Fifteenth annual conference of the international speech communication association.  Kun Han Dong Yu and Ivan Tashev. 2014. Speech emotion recognition using deep neural network and extreme learning machine. In Fifteenth annual conference of the international speech communication association.","DOI":"10.21437\/Interspeech.2014-57"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Che-Wei Huang Shrikanth Narayanan 2017. Characterizing types of convolution in deep convolutional recurrent neural networks for robust speech emotion recognition. arXiv preprint arXiv:1706.02901(2017).  Che-Wei Huang Shrikanth Narayanan 2017. Characterizing types of convolution in deep convolutional recurrent neural networks for robust speech emotion recognition. arXiv preprint arXiv:1706.02901(2017).","DOI":"10.1109\/ICME.2017.8019296"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.dss.2018.09.002"},{"key":"e_1_3_2_1_14_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey\u00a0E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097\u20131105.  Alex Krizhevsky Ilya Sutskever and Geoffrey\u00a0E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097\u20131105."},{"key":"e_1_3_2_1_15_1","unstructured":"PaulPu Liang Ruslan Salakhutdinov and Louis-Philippe Morency. 2018. Computational Modeling of Human Multimodal Language: The MOSEI Dataset and Interpretable Dynamic Fusion.  PaulPu Liang Ruslan Salakhutdinov and Louis-Philippe Morency. 2018. Computational Modeling of Human Multimodal Language: The MOSEI Dataset and Interpretable Dynamic Fusion."},{"key":"e_1_3_2_1_16_1","volume-title":"Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio.","author":"Lin Zhouhan","year":"2017","unstructured":"Zhouhan Lin , Minwei Feng , Cicero Nogueira\u00a0dos Santos , Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017 . A Structured Self-attentive Sentence Embedding . arXiv preprint arXiv:1703.03130(2017). Zhouhan Lin, Minwei Feng, Cicero Nogueira\u00a0dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A Structured Self-attentive Sentence Embedding. arXiv preprint arXiv:1703.03130(2017)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3264989"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3242969.3264992"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Minh-Thang Luong Hieu Pham and Christopher\u00a0D Manning. 2015. Effective approaches to attention-based neural machine translation. arXiv preprint arXiv:1508.04025(2015).  Minh-Thang Luong Hieu Pham and Christopher\u00a0D Manning. 2015. Effective approaches to attention-based neural machine translation. arXiv preprint arXiv:1508.04025(2015).","DOI":"10.18653\/v1\/D15-1166"},{"key":"e_1_3_2_1_20_1","volume-title":"Silent messages : implicit communication of emotions and attitudes","author":"Mehrabian Albert","unstructured":"Albert Mehrabian . 1981. Silent messages : implicit communication of emotions and attitudes . Belmont, Calif . : Wadsworth Pub. Co . Albert Mehrabian. 1981. Silent messages : implicit communication of emotions and attitudes. Belmont, Calif. : Wadsworth Pub. Co."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_22_1","unstructured":"Christopher Pramerdorfer and Martin Kampel. 2016. Facial expression recognition using convolutional neural networks: state of the art. arXiv preprint arXiv:1612.02903(2016).  Christopher Pramerdorfer and Martin Kampel. 2016. Facial expression recognition using convolutional neural networks: state of the art. arXiv preprint arXiv:1612.02903(2016)."},{"key":"e_1_3_2_1_23_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556(2014).  Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556(2014)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298188"},{"key":"e_1_3_2_1_25_1","unstructured":"Yichuan Tang. 2013. Deep learning using linear support vector machines. In arXiv preprint arXiv:1306.0239.  Yichuan Tang. 2013. Deep learning using linear support vector machines. In arXiv preprint arXiv:1306.0239."},{"key":"e_1_3_2_1_26_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008.  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008."},{"key":"e_1_3_2_1_27_1","volume-title":"Lightweight Deep Convolutional Neural Networks for Facial Expression Recognition. In 2019 IEEE 21th International Workshop on Multimedia Signal Processing (MMSP).","author":"Wang Yanan","year":"2019","unstructured":"Yanan Wang , Jianming Wu , and Keiichiro Hoashi . 2019 . Lightweight Deep Convolutional Neural Networks for Facial Expression Recognition. In 2019 IEEE 21th International Workshop on Multimedia Signal Processing (MMSP). Yanan Wang, Jianming Wu, and Keiichiro Hoashi. 2019. Lightweight Deep Convolutional Neural Networks for Facial Expression Recognition. In 2019 IEEE 21th International Workshop on Multimedia Signal Processing (MMSP)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"e_1_3_2_1_32_1","volume-title":"Modality Attention for End-to-end Audio-visual Speech Recognition. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6565\u20136569","author":"Zhou Pan","year":"2019","unstructured":"Pan Zhou , Wenwen Yang , Wei Chen , Yanfeng Wang , and Jia Jia . 2019 . Modality Attention for End-to-end Audio-visual Speech Recognition. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6565\u20136569 . Pan Zhou, Wenwen Yang, Wei Chen, Yanfeng Wang, and Jia Jia. 2019. Modality Attention for End-to-end Audio-visual Speech Recognition. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 6565\u20136569."}],"event":{"name":"ICMI '19: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Suzhou China","acronym":"ICMI '19"},"container-title":["2019 International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3340555.3355720","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3340555.3355720","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:29Z","timestamp":1750202009000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3340555.3355720"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,14]]},"references-count":32,"alternative-id":["10.1145\/3340555.3355720","10.1145\/3340555"],"URL":"https:\/\/doi.org\/10.1145\/3340555.3355720","relation":{},"subject":[],"published":{"date-parts":[[2019,10,14]]},"assertion":[{"value":"2019-10-14","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}