{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,12]],"date-time":"2025-06-12T04:14:17Z","timestamp":1749701657389,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":37,"publisher":"Springer Singapore","isbn-type":[{"type":"print","value":"9789811030048"},{"type":"electronic","value":"9789811030055"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-981-10-3005-5_52","type":"book-chapter","created":{"date-parts":[[2016,10,21]],"date-time":"2016-10-21T11:48:07Z","timestamp":1477050487000},"page":"632-644","source":"Crossref","is-referenced-by-count":0,"title":["Emotion Recognition in Videos via Fusing Multimodal Features"],"prefix":"10.1007","author":[{"given":"Shizhe","family":"Chen","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yujie","family":"Dian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinrui","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaozhu","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haibo","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2016,10,22]]},"reference":[{"key":"52_CR1","doi-asserted-by":"crossref","first-page":"1517","DOI":"10.21437\/Interspeech.2005-446","volume":"5","author":"F Burkhardt","year":"2005","unstructured":"Burkhardt, F., Paeschke, A., Rolfes, M., Sendlmeier, W.F., Weiss, B.: A database of German emotional speech. Interspeech 5, 1517\u20131520 (2005)","journal-title":"Interspeech"},{"key":"52_CR2","doi-asserted-by":"crossref","unstructured":"Lucey, P., Cohn, J.F., Kanade, T., Saragih, J., Ambadar, Z., Matthews, I.: The extended cohn-kanade dataset (ck+): a complete dataset for action unit and emotion-specified expression. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition-Workshops, pp. 94\u2013101. IEEE (2010)","DOI":"10.1109\/CVPRW.2010.5543262"},{"key":"52_CR3","doi-asserted-by":"crossref","unstructured":"Ringeval, F., Schuller, B., Valstar, M., Jaiswal, S., Marchi, E., Lalanne, D., Cowie, R., Pantic, M.: AV+ EC 2015: the first affect recognition challenge bridging across audio, video, and physiological data. In: Proceedings of the 5th International Workshop on Audio\/Visual Emotion Challenge, pp. 3\u20138. ACM (2015)","DOI":"10.1145\/2808196.2811642"},{"key":"52_CR4","doi-asserted-by":"crossref","unstructured":"Abhinav Dhall, O.V., Murthy, R., Goecke, R., Joshi, J., Gedeon, T.: Video and image based emotion recognition challenges in the wild: Emotiw 2015. In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, pp. 423\u2013426. ACM (2015)","DOI":"10.1145\/2818346.2829994"},{"key":"52_CR5","doi-asserted-by":"crossref","unstructured":"Li, Y., Tao, J., Schuller, B., Shan, S., Jiang, D., Jia, J.: MEC 2016: the multimodal emotion recognition challenge of CCPR 2016. In: Chinese Conference on Pattern Recognition (CCPR), Chengdu, China (2016)","DOI":"10.1007\/978-981-10-3005-5_55"},{"key":"52_CR6","doi-asserted-by":"crossref","unstructured":"Chen, S., Jin, Q., Li, X., Yang, G., Jieping, X.: Speech emotion classification using acoustic features. In: 2014 9th International Symposium on Chinese Spoken Language Processing (ISCSLP), pp. 579\u2013583. IEEE (2014)","DOI":"10.1109\/ISCSLP.2014.6936664"},{"key":"52_CR7","doi-asserted-by":"crossref","unstructured":"Xia, R., Deng, J., Schuller, B., Liu, Y.: Modeling gender information for emotion recognition using denoising autoencoder. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 990\u2013994. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6853745"},{"key":"52_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Xia, R., Zhang, Z., Liu, Y., Schuller, B.: Introducing shared-hidden-layer autoencoders for transfer learning and their application in acoustic emotion recognition. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4818\u20134822. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6854517"},{"key":"52_CR9","doi-asserted-by":"crossref","unstructured":"Huang, Z., Dong, M., Mao, Q., Zhan, Y.: Speech emotion recognition using CNN. In: Proceedings of the 22nd ACM International Conference on Multimedia, pp. 801\u2013804. ACM (2014)","DOI":"10.1145\/2647868.2654984"},{"key":"52_CR10","doi-asserted-by":"crossref","unstructured":"Jianlong, W., Lin, Z., Zha, H.: Multiple models fusion for emotion recognition in the wild. In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, Seattle, WA, USA, 09\u201313 November 2015, pp. 475\u2013481 (2015)","DOI":"10.1145\/2818346.2830582"},{"key":"52_CR11","doi-asserted-by":"crossref","unstructured":"Sun, B., Li, L., Zhou, G., Xuewen, W., He, J., Lejun, Y., Li, D., Wei, Q.: Combining multimodal features within a fusion network for emotion recognition in the wild. In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, Seattle, WA, USA, 09\u201313 November 2015, pp. 497\u2013502 (2015)","DOI":"10.1145\/2818346.2830586"},{"key":"52_CR12","doi-asserted-by":"crossref","unstructured":"Yao, A., Shao, J., Ma, N., Chen, Y.: Capturing AU-aware facial features and their latent relations for emotion recognition in the wild. In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, pp. 451\u2013458. ACM (2015)","DOI":"10.1145\/2818346.2830585"},{"key":"52_CR13","doi-asserted-by":"crossref","unstructured":"Kim, Y., Provost, E.M.: Say cheese vs. smile: reducing speech-related variability for facial emotion recognition. In: Proceedings of the 22nd ACM International Conference on Multimedia, pp. 27\u201336. ACM (2014)","DOI":"10.1145\/2647868.2654934"},{"key":"52_CR14","doi-asserted-by":"crossref","unstructured":"Jung, H., Lee, S., Yim, J., Park, S., Kim, J.: Joint fine-tuning in deep neural networks for facial expression recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2983\u20132991 (2015)","DOI":"10.1109\/ICCV.2015.341"},{"key":"52_CR15","doi-asserted-by":"crossref","unstructured":"Xue, B., Chen, F., Shaobin, Z.: A study on sentiment computing and classification of sina weibo with word2vec. In: 2014 IEEE International Congress on Big Data, pp. 358\u2013363. IEEE (2014)","DOI":"10.1109\/BigData.Congress.2014.59"},{"key":"52_CR16","doi-asserted-by":"crossref","first-page":"e12","DOI":"10.1017\/ATSIP.2014.11","volume":"3","author":"W Chung-Hsien","year":"2014","unstructured":"Chung-Hsien, W., Lin, J.-C., Wei, W.-L.: Survey on audiovisual emotion recognition: databases, features, and data fusion strategies. APSIPA Trans. Signal Inf. Process. 3, e12 (2014)","journal-title":"APSIPA Trans. Signal Inf. Process."},{"key":"52_CR17","doi-asserted-by":"crossref","unstructured":"Rozgic, V., Ananthakrishnan, S., Saleem, S., Kumar, R., Vembu, A.N., Prasad, R.: Emotion recognition using acoustic and lexical features. In: 13th Annual Conference of the International Speech Communication Association, INTERSPEECH 2012, Portland, Oregon, USA, 9\u201313 September 2012, pp. 366\u2013369 (2012)","DOI":"10.21437\/Interspeech.2012-118"},{"key":"52_CR18","doi-asserted-by":"crossref","unstructured":"Huang, Z., Dang, T., Cummins, N., Stasak, B., Le, P., Sethu, V., Epps, J.: An investigation of annotation delay compensation and output-associative fusion for multimodal continuous emotion prediction. In: The International Workshop on Audio\/Visual Emotion Challenge (2015)","DOI":"10.1145\/2808196.2811640"},{"key":"52_CR19","doi-asserted-by":"crossref","unstructured":"Zuxuan, W., Jiang, Y.-G., Wang, J., Jian, P., Xue, X.: Exploring inter-feature and inter-class relationships with deep neural networks for video classification. In: Proceedings of the ACM International Conference on Multimedia, MM 2014, Orlando, FL, USA, 03\u201307 November 2014, pp. 167\u2013176 (2014)","DOI":"10.1145\/2647868.2654931"},{"key":"52_CR20","doi-asserted-by":"crossref","unstructured":"Chen, J., Chen, Z., Chi, Z., Hong, F.: Emotion recognition in the wild with feature fusion and multiple kernel learning. In: Proceedings of the 16th International Conference on Multimodal Interaction, ICMI 2014, Istanbul, Turkey, 12\u201316 November 2014, pp. 508\u2013513 (2014)","DOI":"10.1145\/2663204.2666277"},{"key":"52_CR21","doi-asserted-by":"crossref","unstructured":"Eyben, F., Llmer, M., Schuller, B.: Opensmile: the Munich versatile and fast open-source audio feature extractor. In: ACM International Conference on Multimedia, MM, pp. 1459\u20131462 (2010)","DOI":"10.1145\/1873951.1874246"},{"issue":"9\u201310","key":"52_CR22","doi-asserted-by":"crossref","first-page":"1062","DOI":"10.1016\/j.specom.2011.01.011","volume":"53","author":"B Schuller","year":"2011","unstructured":"Schuller, B., Batliner, A., Steidl, S., Seppi, D.: Recognising realistic emotions and affect in speech: state of the art and lessons learnt from the first challenge. Speech Commun. 53(9\u201310), 1062\u20131087 (2011)","journal-title":"Speech Commun."},{"key":"52_CR23","doi-asserted-by":"crossref","unstructured":"Schuller, B., Steidl, S., Batliner, A., Vinciarelli, A., Scherer, K., Ringeval, F., Chetouani, M., Weninger, F., Eyben, F., Marchi, E.: The interspeech 2013 computational paralinguistics challenge: social signals, conflict, emotion, autism. In: INTERSPEECH 2013, Conference of the International Speech Communication Association, pp. 148\u2013152 (2013)","DOI":"10.21437\/Interspeech.2013-56"},{"issue":"4","key":"52_CR24","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1016\/B978-0-08-051584-7.50010-3","volume":"28","author":"SB Davis","year":"1990","unstructured":"Davis, S.B.: Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences. Read. Speech Recogn. 28(4), 65\u201374 (1990)","journal-title":"Read. Speech Recogn."},{"key":"52_CR25","doi-asserted-by":"crossref","unstructured":"Pancoast, S., Akbacak, M.: Softening quantization in bag-of-audio-words. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASS 2014, pp. 1370\u20131374 (2014)","DOI":"10.1109\/ICASSP.2014.6853821"},{"key":"52_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"720","DOI":"10.1007\/978-3-319-10593-2_47","volume-title":"Computer Vision \u2013 ECCV 2014","author":"M Mathias","year":"2014","unstructured":"Mathias, M., Benenson, R., Pedersoli, M., Gool, L.: Face detection without bells and whistles. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8692, pp. 720\u2013735. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10593-2_47"},{"key":"52_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/978-3-319-10599-4_7","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Z Zhang","year":"2014","unstructured":"Zhang, Z., Luo, P., Loy, C.C., Tang, X.: Facial landmark detection by deep multi-task learning. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8694, pp. 94\u2013108. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10599-4_7"},{"issue":"5","key":"52_CR28","doi-asserted-by":"crossref","first-page":"918","DOI":"10.1109\/TPAMI.2015.2469286","volume":"38","author":"Z Zhang","year":"2016","unstructured":"Zhang, Z., Luo, P., Loy, C.C., Tang, X.: Learning deep representation for face alignment with auxiliary attributes. IEEE Trans. Pattern Anal. Mach. Intell. 38(5), 918\u2013930 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2","key":"52_CR29","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D.G.: Distinctive image features from scale-invariant keypoints. Int. J. Comput. Vision 60(2), 91\u2013110 (2004)","journal-title":"Int. J. Comput. Vision"},{"key":"52_CR30","unstructured":"Csurka, G., Dance, C., Fan, L., Willamowski, J., Bray, C.: Visual categorization with bags of keypoints. In: Workshop on Statistical Learning in Computer Vision, ECCV, Prague, vol. 1, pp. 1\u20132 (2004)"},{"key":"52_CR31","doi-asserted-by":"crossref","unstructured":"Jianlong, W., Lin, Z., Zha, H.: Multiple models fusion for emotion recognition in the wild. In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, pp. 475\u2013481. ACM (2015)","DOI":"10.1145\/2818346.2830582"},{"key":"52_CR32","unstructured":"LeCun, Y., Bengio, Y.: Convolutional networks for images, speech, and time series. In: The Handbook of Brain Theory and Neural Networks, vol. 3361, no. 10 (1995)"},{"key":"52_CR33","unstructured":"Tang, Y.: Deep learning using support vector machines. CoRR, abs\/1306.0239, 2 (2013)"},{"key":"52_CR34","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)"},{"key":"52_CR35","unstructured":"Rehurek, R., Sojka, P.: Software framework for topic modelling with large corpora. In: Proceedings of the LREC 2010 Workshop on New Challenges for NLP Frameworks. Citeseer (2010)"},{"issue":"2","key":"52_CR36","doi-asserted-by":"crossref","first-page":"415","DOI":"10.1109\/72.991427","volume":"13","author":"C-W Hsu","year":"2002","unstructured":"Hsu, C.-W., Lin, C.-J.: A comparison of methods for multiclass support vector machines. IEEE Trans. Neural Netw. 13(2), 415\u2013425 (2002)","journal-title":"IEEE Trans. Neural Netw."},{"issue":"3","key":"52_CR37","first-page":"18","volume":"2","author":"A Liaw","year":"2002","unstructured":"Liaw, A., Wiener, M.: Classification and regression by randomforest. R News 2(3), 18\u201322 (2002)","journal-title":"R News"}],"container-title":["Communications in Computer and Information Science","Pattern Recognition"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-10-3005-5_52","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T19:07:37Z","timestamp":1749668857000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-981-10-3005-5_52"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9789811030048","9789811030055"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-981-10-3005-5_52","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2016]]}}}