{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T12:41:59Z","timestamp":1771677719602,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2022,11,19]],"date-time":"2022-11-19T00:00:00Z","timestamp":1668816000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,11,19]],"date-time":"2022-11-19T00:00:00Z","timestamp":1668816000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R & D Program of China","doi-asserted-by":"crossref","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s00371-022-02721-w","type":"journal-article","created":{"date-parts":[[2022,11,19]],"date-time":"2022-11-19T19:02:50Z","timestamp":1668884570000},"page":"6205-6220","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["STAN: spatiotemporal attention network for video-based facial expression recognition"],"prefix":"10.1007","volume":"39","author":[{"given":"Yufan","family":"Yi","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7764-0941","authenticated-orcid":false,"given":"Yiping","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Ziyi","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Linhui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xinli","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Tian","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,19]]},"reference":[{"key":"2721_CR1","doi-asserted-by":"crossref","unstructured":"Cao, Z., Chu, Z., Liu, D., et\u00a0al.: A vector-based representation to enhance head pose estimation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1188\u20131197 (2021)","DOI":"10.1109\/WACV48630.2021.00123"},{"key":"2721_CR2","doi-asserted-by":"crossref","unstructured":"Cao, Z., Liu, D., Wang, Q., et\u00a0al.: Towards unbiased label distribution learning for facial pose estimation using anisotropic spherical gaussian. arXiv preprint arXiv:2208.09122 (2022)","DOI":"10.1007\/978-3-031-19775-8_43"},{"key":"2721_CR3","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/j.ins.2017.10.044","volume":"428","author":"L Chen","year":"2018","unstructured":"Chen, L., Zhou, M., Su, W., et al.: Softmax regression based deep sparse autoencoder network for facial emotion recognition in human-robot interaction. Inf. Sci. 428, 49\u201361 (2018)","journal-title":"Inf. Sci."},{"issue":"37","key":"2721_CR4","doi-asserted-by":"publisher","first-page":"28,169","DOI":"10.1007\/s11042-020-09412-5","volume":"79","author":"DY Choi","year":"2020","unstructured":"Choi, D.Y., Song, B.C.: Semi-supervised learning for facial expression-based emotion recognition in the continuous domain. Multimed. Tools Appl. 
79(37), 28169\u201328187 (2020)","journal-title":"Multimed. Tools Appl."},{"key":"2721_CR5","doi-asserted-by":"crossref","unstructured":"Cui, Y., Yan, L., Cao, Z., et\u00a0al.: Tf-blender: Temporal feature blender for video object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8138\u20138147 (2021)","DOI":"10.1109\/ICCV48922.2021.00803"},{"issue":"03","key":"2721_CR6","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1109\/MMUL.2012.26","volume":"19","author":"A Dhall","year":"2012","unstructured":"Dhall, A., Goecke, R., Lucey, S., et al.: Collecting large, richly annotated facial-expression databases from movies. IEEE Multimed. 19(03), 34\u201341 (2012)","journal-title":"IEEE Multimed."},{"key":"2721_CR7","doi-asserted-by":"crossref","unstructured":"Ding, H., Zhou, S.K., Chellappa, R.: Facenet2expnet: Regularizing a deep face recognition net for expression recognition. In: 2017 12th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2017), IEEE, pp. 118\u2013126 (2017)","DOI":"10.1109\/FG.2017.23"},{"key":"2721_CR8","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"2721_CR9","doi-asserted-by":"crossref","unstructured":"Fu, J., Zheng, H., Mei, T.: Look closer to see better: Recurrent attention convolutional neural network for fine-grained image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4438\u20134446 (2017)","DOI":"10.1109\/CVPR.2017.476"},{"key":"2721_CR10","doi-asserted-by":"crossref","unstructured":"Gao, J., Zhao, Y.: Tfe: A transformer architecture for occlusion aware facial expression recognition. Frontiers in Neurorobotics 15 (2021)","DOI":"10.3389\/fnbot.2021.763100"},{"issue":"1","key":"2721_CR11","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/s00371-018-1585-8","volume":"36","author":"I Gogi\u0107","year":"2020","unstructured":"Gogi\u0107, I., Manhart, M., Pand\u017ei\u0107, I.S., et al.: Fast facial expression recognition using local binary features and shallow neural networks. Vis. Comput. 36(1), 97\u2013112 (2020). https:\/\/doi.org\/10.1007\/s00371-018-1585-8","journal-title":"Vis. Comput."},{"key":"2721_CR12","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., Satoh, Y.: Learning spatio-temporal features with 3d residual networks for action recognition. In: Proceedings of the IEEE international conference on computer vision workshops, pp. 3154\u20133160 (2017)","DOI":"10.1109\/ICCVW.2017.373"},{"key":"2721_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et\u00a0al.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"3","key":"2721_CR14","doi-asserted-by":"publisher","first-page":"1089","DOI":"10.1109\/TCSVT.2021.3074259","volume":"32","author":"J Hu","year":"2021","unstructured":"Hu, J., Liao, X., Wang, W., et al.: Detecting compressed deepfake videos in social networks using frame-temporality two-stream convolutional network. IEEE Trans. Circ. Syst. Video Technol. 32(3), 1089\u20131102 (2021)","journal-title":"IEEE Trans. Circ. Syst. 
Video Technol."},{"key":"2721_CR15","doi-asserted-by":"publisher","first-page":"176","DOI":"10.1016\/j.jvcir.2018.12.039","volume":"59","author":"M Hu","year":"2019","unstructured":"Hu, M., Wang, H., Wang, X., et al.: Video facial emotion recognition based on local enhanced motion history image and cnn-ctslstm networks. J. Vis. Commun. Image Represent. 59, 176\u2013185 (2019)","journal-title":"J. Vis. Commun. Image Represent."},{"issue":"8","key":"2721_CR16","doi-asserted-by":"publisher","first-page":"2617","DOI":"10.1007\/s00371-021-02136-z","volume":"38","author":"M Hu","year":"2022","unstructured":"Hu, M., Ge, P., Wang, X., et al.: A spatio-temporal integrated model based on local and global features for video expression recognition. Vis. Comput. 38(8), 2617\u20132634 (2022). https:\/\/doi.org\/10.1007\/s00371-021-02136-z","journal-title":"Vis. Comput."},{"key":"2721_CR17","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1016\/j.ins.2021.08.043","volume":"580","author":"Q Huang","year":"2021","unstructured":"Huang, Q., Huang, C., Wang, X., et al.: Facial expression recognition with grid-wise attention and visual transformer. Inf. Sci. 580, 35\u201354 (2021)","journal-title":"Inf. Sci."},{"issue":"1","key":"2721_CR18","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2012","unstructured":"Ji, S., Xu, W., Yang, M., et al.: 3d convolutional neural networks for human action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35(1), 221\u2013231 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2721_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, X., Zong, Y., Zheng, W., et\u00a0al.: Dfew: A large-scale database for recognizing dynamic facial expressions in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2881\u20132889 (2020)","DOI":"10.1145\/3394171.3413620"},{"key":"2721_CR20","doi-asserted-by":"crossref","unstructured":"Jung, H., Lee, S., Yim, J., et\u00a0al.: Joint fine-tuning in deep neural networks for facial expression recognition. In: Proceedings of the IEEE international conference on computer vision, pp. 2983\u20132991 (2015)","DOI":"10.1109\/ICCV.2015.341"},{"key":"2721_CR21","doi-asserted-by":"crossref","unstructured":"Kim, D.H., Lee, M.K., Choi, D.Y., et\u00a0al.: Multi-modal emotion recognition using semi-supervised learning and multiple neural networks in-the-wild. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 529\u2013535 (2017)","DOI":"10.1145\/3136755.3143005"},{"key":"2721_CR22","doi-asserted-by":"crossref","unstructured":"Kumar, V., Rao, S., Yu, L.: Noisy student training using body language dataset improves facial expression recognition. In: European Conference on Computer Vision, Springer, pp. 756\u2013773 (2020)","DOI":"10.1007\/978-3-030-66415-2_53"},{"issue":"2","key":"2721_CR23","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/s00371-019-01627-4","volume":"36","author":"K Li","year":"2020","unstructured":"Li, K., Jin, Y., Akram, M.W., et al.: Facial expression recognition with convolutional neural networks via a new face cropping and rotation strategy. Vis. Comput. 36(2), 391\u2013404 (2020). https:\/\/doi.org\/10.1007\/s00371-019-01627-4","journal-title":"Vis. 
Comput."},{"issue":"5","key":"2721_CR24","doi-asserted-by":"publisher","first-page":"2439","DOI":"10.1109\/TIP.2018.2886767","volume":"28","author":"Y Li","year":"2018","unstructured":"Li, Y., Zeng, J., Shan, S., et al.: Occlusion aware facial expression recognition using cnn with attention mechanism. IEEE Trans. Image Process. 28(5), 2439\u20132450 (2018)","journal-title":"IEEE Trans. Image Process."},{"issue":"3","key":"2721_CR25","doi-asserted-by":"publisher","first-page":"499","DOI":"10.1007\/s00371-019-01636-3","volume":"36","author":"D Liang","year":"2020","unstructured":"Liang, D., Liang, H., Yu, Z., et al.: Deep convolutional bilstm fusion network for facial expression recognition. Vis. Comput. 36(3), 499\u2013508 (2020). https:\/\/doi.org\/10.1007\/s00371-019-01636-3","journal-title":"Vis. Comput."},{"key":"2721_CR26","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-022-02413-5","author":"X Liang","year":"2022","unstructured":"Liang, X., Xu, L., Zhang, W., et al.: A convolution-transformer dual branch network for head-pose and occlusion facial expression recognition. Vis. Comput. (2022). https:\/\/doi.org\/10.1007\/s00371-022-02413-5","journal-title":"Vis. Comput."},{"issue":"5","key":"2721_CR27","doi-asserted-by":"publisher","first-page":"955","DOI":"10.1109\/JSTSP.2020.3002391","volume":"14","author":"X Liao","year":"2020","unstructured":"Liao, X., Li, K., Zhu, X., et al.: Robust detection of image operator chain with two-stream convolutional neural network. IEEE J. Sel. Top. Signal Process. 14(5), 955\u2013968 (2020)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"2721_CR28","doi-asserted-by":"crossref","unstructured":"Liu, D., Cui, Y., Tan, W., et\u00a0al.: (2021a) Sg-net: Spatial granularity network for one-stage video instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9816\u20139825","DOI":"10.1109\/CVPR46437.2021.00969"},{"key":"2721_CR29","doi-asserted-by":"crossref","unstructured":"Liu, D., Cui, Y., Yan, L., et\u00a0al.: Densernet: Weakly supervised visual localization using multi-scale feature aggregation. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 6101\u20136109 (2021b)","DOI":"10.1609\/aaai.v35i7.16760"},{"key":"2721_CR30","doi-asserted-by":"crossref","unstructured":"Mavani, V., Raman, S., Miyapuram, K.P.: Facial expression recognition using visual saliency and deep learning. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 2783\u20132788 (2017)","DOI":"10.1109\/ICCVW.2017.327"},{"key":"2721_CR31","doi-asserted-by":"crossref","unstructured":"Meng, D., Peng, X., Wang, K., et\u00a0al.: Frame attention networks for facial expression recognition in videos. In: 2019 IEEE International Conference on Image Processing (ICIP), IEEE, pp. 3866\u20133870 (2019)","DOI":"10.1109\/ICIP.2019.8803603"},{"key":"2721_CR32","unstructured":"M\u00fcller, R., Kornblith, S., Hinton, G.E.: When does label smoothing help? Adv. Neural Inform. process. syst. 32 (2019)"},{"key":"2721_CR33","doi-asserted-by":"crossref","unstructured":"Ouyang, X., Kawaai, S., Goh, E.G.H., et\u00a0al.: Audio-visual emotion recognition using deep transfer learning and multiple temporal models. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 
577\u2013582 (2017)","DOI":"10.1145\/3136755.3143012"},{"issue":"3","key":"2721_CR34","doi-asserted-by":"publisher","first-page":"1487","DOI":"10.1109\/TIP.2017.2774041","volume":"27","author":"Y Peng","year":"2017","unstructured":"Peng, Y., He, X., Zhao, J.: Object-part attention model for fine-grained image classification. IEEE Trans. Image Process. 27(3), 1487\u20131500 (2017)","journal-title":"IEEE Trans. Image Process."},{"key":"2721_CR35","first-page":"322","volume-title":"International Work-Conference on Artificial Neural Networks","author":"P Salgado","year":"2021","unstructured":"Salgado, P., Banos, O., Villalonga, C.: Facial expression interpretation in asd using deep learning. In: International Work-Conference on Artificial Neural Networks, pp. 322\u2013333. Springer (2021)"},{"issue":"11","key":"2721_CR36","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"2721_CR37","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1016\/j.patrec.2017.10.022","volume":"119","author":"N Sun","year":"2019","unstructured":"Sun, N., Li, Q., Huan, R., et al.: Deep spatial-temporal feature fusion for facial expression recognition in static images. Pattern Recogn. Lett. 119, 49\u201361 (2019)","journal-title":"Pattern Recogn. Lett."},{"issue":"2","key":"2721_CR38","doi-asserted-by":"publisher","first-page":"888","DOI":"10.1109\/TNSE.2021.3139671","volume":"9","author":"J Tan","year":"2021","unstructured":"Tan, J., Liao, X., Liu, J., et al.: Channel attention image steganography with generative adversarial networks. IEEE Trans. Network Sci. Eng. 9(2), 888\u2013903 (2021)","journal-title":"IEEE Trans. Network Sci. Eng."},{"key":"2721_CR39","doi-asserted-by":"crossref","unstructured":"Vielzeuf, V., Pateux, S., Jurie, F.: Temporal multimodal fusion for video emotion classification in-the-wild. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 569\u2013576 (2017)","DOI":"10.1145\/3136755.3143011"},{"key":"2721_CR40","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1007\/978-3-319-97909-0_12","volume-title":"Chinese Conference on Biometric Recognition","author":"H Wang","year":"2018","unstructured":"Wang, H., Zhou, G., Hu, M., et al.: Video emotion recognition using local enhanced motion history image and cnn-rnn networks. In: Chinese Conference on Biometric Recognition, pp. 109\u2013119. Springer (2018)"},{"key":"2721_CR41","doi-asserted-by":"publisher","first-page":"4057","DOI":"10.1109\/TIP.2019.2956143","volume":"29","author":"K Wang","year":"2020","unstructured":"Wang, K., Peng, X., Yang, J., et al.: Region attention networks for pose and occlusion robust facial expression recognition. IEEE Trans. Image Process. 29, 4057\u20134069 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"2721_CR42","unstructured":"Wen, Z., Lin, W., Wang, T., et\u00a0al.: Distract your attention: multi-head cross attention network for facial expression recognition. arXiv preprint arXiv:2109.07270 (2021)"},{"key":"2721_CR43","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J.Y., et\u00a0al.: Cbam: Convolutional block attention module. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 
3\u201319 (2018)","DOI":"10.1007\/978-3-030-01234-2_1"},{"issue":"10","key":"2721_CR44","doi-asserted-by":"publisher","first-page":"6642","DOI":"10.1109\/TCSVT.2022.3177320","volume":"32","author":"L Yan","year":"2022","unstructured":"Yan, L., Ma, S., Wang, Q., et al.: Video captioning using global-local representation. IEEE Trans. Circ. Syst. Video Technol. 32(10), 6642\u20136656 (2022). https:\/\/doi.org\/10.1109\/TCSVT.2022.3177320","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"2721_CR45","doi-asserted-by":"crossref","unstructured":"Yan, L., Wang, Q., Cui, Y., et\u00a0al.: Gl-rg: Global-local representation granularity for video captioning. arXiv preprint arXiv:2205.10706 (2022b)","DOI":"10.24963\/ijcai.2022\/384"},{"key":"2721_CR46","doi-asserted-by":"publisher","first-page":"5984","DOI":"10.1109\/TIP.2021.3089942","volume":"30","author":"CB Zhang","year":"2021","unstructured":"Zhang, C.B., Jiang, P.T., Hou, Q., et al.: Delving deep into label smoothing. IEEE Trans. Image Process. 30, 5984\u20135996 (2021)","journal-title":"IEEE Trans. Image Process."},{"issue":"9","key":"2721_CR47","doi-asserted-by":"publisher","first-page":"4193","DOI":"10.1109\/TIP.2017.2689999","volume":"26","author":"K Zhang","year":"2017","unstructured":"Zhang, K., Huang, Y., Du, Y., et al.: Facial expression recognition based on deep evolutional spatial-temporal networks. IEEE Trans. Image Process. 26(9), 4193\u20134203 (2017)","journal-title":"IEEE Trans. Image Process."},{"key":"2721_CR48","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., et\u00a0al.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-022-02721-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-022-02721-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-022-02721-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,9]],"date-time":"2023-11-09T12:05:42Z","timestamp":1699531542000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-022-02721-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11,19]]},"references-count":48,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["2721"],"URL":"https:\/\/doi.org\/10.1007\/s00371-022-02721-w","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,11,19]]},"assertion":[{"value":"28 October 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 November 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Conflict of interest The authors declare that they have no conflict of 
interest. Data cannot be made available.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}