{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:46:34Z","timestamp":1776887194720,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,2]],"date-time":"2018-10-02T00:00:00Z","timestamp":1538438400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Basic Research Program of China","award":["2015CB351704"],"award-info":[{"award-number":["2015CB351704"]}]},{"name":"Jiangsu Provincial Key Research and Development Program","award":["BE2016616"],"award-info":[{"award-number":["BE2016616"]}]},{"DOI":"10.13039\/501100011002","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572009"],"award-info":[{"award-number":["61572009"]}],"id":[{"id":"10.13039\/501100011002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,2]]},"DOI":"10.1145\/3242969.3264992","type":"proceedings-article","created":{"date-parts":[[2018,10,2]],"date-time":"2018-10-02T12:09:29Z","timestamp":1538482169000},"page":"646-652","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":56,"title":["Multiple Spatio-temporal Feature Learning for Video-based Emotion Recognition in the Wild"],"prefix":"10.1145","author":[{"given":"Cheng","family":"Lu","sequence":"first","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"given":"Wenming","family":"Zheng","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"given":"Chaolong","family":"Li","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"given":"Chuangao","family":"Tang","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"given":"Suyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"given":"Simeng","family":"Yan","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]},{"given":"Yuan","family":"Zong","sequence":"additional","affiliation":[{"name":"Southeast University, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2018,10,2]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Amanjot Kaur and Tom Gedeon","author":"Abhinav Dhall Roland Goecke","year":"2018","unstructured":"Roland Goecke Abhinav Dhall , Amanjot Kaur and Tom Gedeon . 2018 . EmotiW 2018: Audio-Video, Student Engagement and Group-Level Affect Prediction ( in press). (2018). Roland Goecke Abhinav Dhall, Amanjot Kaur and Tom Gedeon. 2018. EmotiW 2018: Audio-Video, Student Engagement and Group-Level Affect Prediction (in press). (2018)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/79.911197"},{"key":"e_1_3_2_1_3_1","volume-title":"Imagenet: A large-scale hierarchical image database IEEE Conference on Computer Vision and Pattern Recognition","author":"Deng Jia","year":"2009","unstructured":"Jia Deng , Wei Dong , Richard Socher , Li-Jia Li , Kai Li , and Li Fei-Fei . 2009 . Imagenet: A large-scale hierarchical image database IEEE Conference on Computer Vision and Pattern Recognition . IEEE , 248--255. Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 248--255."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2522848.2531749"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.26"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818346.2830596"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2010.09.020"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874246"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2993148.2997632"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3143009"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"e_1_3_2_1_15_1","volume-title":"Convolutional neural networks pretrained on large face recognition datasets for emotion classification from video. arXiv preprint arXiv:1711.04598","author":"Knyazev Boris","year":"2017","unstructured":"Boris Knyazev , Roman Shvetsov , Natalia Efremova , and Artem Kuharenko . 2017. Convolutional neural networks pretrained on large face recognition datasets for emotion classification from video. arXiv preprint arXiv:1711.04598 ( 2017 ). Boris Knyazev, Roman Shvetsov, Natalia Efremova, and Artem Kuharenko. 2017. Convolutional neural networks pretrained on large face recognition datasets for emotion classification from video. arXiv preprint arXiv:1711.04598 (2017)."},{"key":"e_1_3_2_1_16_1","volume-title":"Hinton","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky , Ilya Sutskever , and Geoffrey E . Hinton . 2012 . Imagenet classification with deep convolutional neural networks Advances in neural information processing systems. 1097--1105. Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. Imagenet classification with deep convolutional neural networks Advances in neural information processing systems. 1097--1105."},{"key":"e_1_3_2_1_17_1","volume-title":"Eleventh Annual Conference of the International Speech Communication Association, InterSpeech.","author":"Mikolov Tom\u00e1\u0161","year":"2010","unstructured":"Tom\u00e1\u0161 Mikolov , Martin Karafi\u00e1t , Luk\u00e1\u0161 Burget , Jan \u010cernock\u1ef3 , and Sanjeev Khudanpur . 2010 . Recurrent neural network based language model . In Eleventh Annual Conference of the International Speech Communication Association, InterSpeech. Tom\u00e1\u0161 Mikolov, Martin Karafi\u00e1t, Luk\u00e1\u0161 Burget, Jan \u010cernock\u1ef3, and Sanjeev Khudanpur. 2010. Recurrent neural network based language model. In Eleventh Annual Conference of the International Speech Communication Association, InterSpeech."},{"key":"e_1_3_2_1_18_1","first-page":"6","article-title":"Deep face recognition","volume":"1","author":"Parkhi Omkar M.","year":"2015","unstructured":"Omkar M. Parkhi , Andrea Vedaldi , Andrew Zisserman , 2015 . Deep face recognition . In BMVC , Vol. 1. 6 . Omkar M. Parkhi, Andrea Vedaldi, Andrew Zisserman, et al.. 2015. Deep face recognition. In BMVC, Vol. 1. 6.","journal-title":"BMVC"},{"key":"e_1_3_2_1_19_1","volume-title":"Speech recognition with primarily temporal cues. Science","author":"Shannon Robert V.","year":"1995","unstructured":"Robert V. Shannon , Fan-Gang Zeng , Vivek Kamath , John Wygonski , and Michael Ekelid . 1995. Speech recognition with primarily temporal cues. Science Vol. 270 , 5234 ( 1995 ), 303--304. Robert V. Shannon, Fan-Gang Zeng, Vivek Kamath, John Wygonski, and Michael Ekelid. 1995. Speech recognition with primarily temporal cues. Science Vol. 270, 5234 (1995), 303--304."},{"key":"e_1_3_2_1_20_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman . 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 ( 2014 ). Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3143011"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.09.072"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/2993148.2997630"},{"key":"e_1_3_2_1_25_1","unstructured":"Jason Yosinski Jeff Clune Yoshua Bengio and Hod Lipson. 2014. How transferable are features in deep neural networks? Advances in neural information processing systems. 3320--3328.   Jason Yosinski Jeff Clune Yoshua Bengio and Hod Lipson. 2014. How transferable are features in deep neural networks? Advances in neural information processing systems. 3320--3328."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"}],"event":{"name":"ICMI '18: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"Boulder CO USA","acronym":"ICMI '18","sponsor":["SIGCHI Specialist Interest Group in Computer-Human Interaction of the ACM"]},"container-title":["Proceedings of the 20th ACM International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3242969.3264992","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3242969.3264992","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:06:58Z","timestamp":1750212418000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3242969.3264992"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,2]]},"references-count":26,"alternative-id":["10.1145\/3242969.3264992","10.1145\/3242969"],"URL":"https:\/\/doi.org\/10.1145\/3242969.3264992","relation":{},"subject":[],"published":{"date-parts":[[2018,10,2]]},"assertion":[{"value":"2018-10-02","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}