{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T18:58:56Z","timestamp":1757617136410,"version":"3.44.0"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"25","license":[{"start":{"date-parts":[[2024,10,12]],"date-time":"2024-10-12T00:00:00Z","timestamp":1728691200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,12]],"date-time":"2024-10-12T00:00:00Z","timestamp":1728691200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61876217","62176175"],"award-info":[{"award-number":["61876217","62176175"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-20350-4","type":"journal-article","created":{"date-parts":[[2024,10,12]],"date-time":"2024-10-12T05:02:03Z","timestamp":1728709323000},"page":"29371-29390","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Class-attention video transformer for engagement prediction"],"prefix":"10.1007","volume":"84","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5629-9134","authenticated-orcid":false,"given":"Xusheng","family":"Ai","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Victor","family":"Sheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunhua","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Han","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiming","family":"Cui","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,12]]},"reference":[{"issue":"1","key":"20350_CR1","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1080\/17470218.2014.925481","volume":"68","author":"ME Nicholls","year":"2015","unstructured":"Nicholls ME, Loveless KM, Thomas NA, Loetscher T, Churches O (2015) Some participants may be better than others: Sustained attention and motivation are higher early in semester. Q J Exp Psychol. 68(1):10\u201318","journal-title":"Q J Exp Psychol."},{"key":"20350_CR2","doi-asserted-by":"crossref","unstructured":"Thong\u00a0Huynh V, Kim S-H, Lee G-S, Yang H-J (2019) Engagement intensity prediction with facial behavior features. In: Proc. ICMI, pp 567\u2013571 . ACM","DOI":"10.1145\/3340555.3355714"},{"key":"20350_CR3","doi-asserted-by":"crossref","unstructured":"Zhu B, Lan X, Guo X, Barner KE, Boncelet C (2020) Multi-rate attention based GRU model for engagement prediction. In: Proc. ICMI, pp 841\u2013848 . ACM","DOI":"10.1145\/3382507.3417965"},{"key":"20350_CR4","unstructured":"Gupta A, D\u2019Cunha A, Awasthi K, Balasubramanian V (2016) DAiSEE: Towards user engagement recognition in the wild. arXiv:1609.01885"},{"key":"20350_CR5","doi-asserted-by":"crossref","unstructured":"Huang T, Mei Y, Zhang H, Liu S, Yang H (2019) Fine-grained engagement recognition in online learning environment. In: Proc. ICEIEC, pp 338\u2013341 . IEEE","DOI":"10.1109\/ICEIEC.2019.8784559"},{"key":"20350_CR6","doi-asserted-by":"crossref","unstructured":"Wang Y, Kotha A, Hong P-h, Qiu M (2020) Automated student engagement monitoring and evaluation during learning in the wild. In: Proc. CSCloud, pp 270\u2013275 . IEEE","DOI":"10.1109\/CSCloud-EdgeCom49738.2020.00054"},{"key":"20350_CR7","doi-asserted-by":"crossref","unstructured":"Abedi A, Khan SS (2021) Improving state-of-the-art in detecting student engagement with ResNet and TCN hybrid network. In: Proc. CRV, pp 151\u2013157 . IEEE","DOI":"10.1109\/CRV52889.2021.00028"},{"key":"20350_CR8","doi-asserted-by":"crossref","unstructured":"Mohamad\u00a0Nezami O, Dras M, Hamey L, Richards D, Wan S, Paris C (2019) Automatic recognition of student engagement using deep learning and facial expression. In: Proc. ECMLPKDD, pp 273\u2013289 . Springer","DOI":"10.1007\/978-3-030-46133-1_17"},{"key":"20350_CR9","doi-asserted-by":"crossref","unstructured":"Grafsgaard JF, Wiggins JB, Boyer KE, Wiebe EN, Lester JC (2013) Automatically recognizing facial indicators of frustration: A learning-centric analysis. In: Proc. ACII, pp 159\u2013165 . IEEE","DOI":"10.1109\/ACII.2013.33"},{"key":"20350_CR10","doi-asserted-by":"crossref","unstructured":"Littlewort G, Whitehill J, Wu T, Fasel I, Frank M, Movellan J, Bartlett M (2011) The computer expression recognition toolbox (CERT). In: Proc. FG, pp 298\u2013305 . IEEE","DOI":"10.1109\/FG.2011.5771414"},{"key":"20350_CR11","doi-asserted-by":"crossref","unstructured":"Bosch N, D\u2019Mello S, Baker R, Ocumpaugh J, Shute V, Ventura M, Wang L, Zhao W (2015) Automatic detection of learning-centered affective states in the wild. In: Proc. AUI, pp 379\u2013388 . ACM","DOI":"10.1145\/2678025.2701397"},{"key":"20350_CR12","doi-asserted-by":"crossref","unstructured":"Saneiro M, Santos OC, Salmeron-Majadas S, Boticario JG (2014) Towards emotion detection in educational scenarios from facial expressions and body movements through multimodal approaches. The Sci World J","DOI":"10.1155\/2014\/484873"},{"issue":"1","key":"20350_CR13","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1109\/TAFFC.2014.2316163","volume":"5","author":"J Whitehill","year":"2014","unstructured":"Whitehill J, Serpell Z, Lin Y-C, Foster A, Movellan JR (2014) The faces of engagement: Automatic recognition of student engagementfrom facial expressions. IEEE Trans Affect Comput 5(1):86\u201398","journal-title":"IEEE Trans Affect Comput"},{"key":"20350_CR14","doi-asserted-by":"crossref","unstructured":"Kamath A, Biswas A, Balasubramanian V (2016) A crowdsourced approach to student engagement recognition in e-learning environments. In: Proc. WACV, pp 1\u20139 . IEEE","DOI":"10.1109\/WACV.2016.7477618"},{"issue":"1","key":"20350_CR15","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1109\/TAFFC.2016.2515084","volume":"8","author":"H Monkaresi","year":"2016","unstructured":"Monkaresi H, Bosch N, Calvo RA, D\u2019Mello SK (2016) Automated detection of engagement using video-based estimation of facial expressions and heart rate. IEEE Trans Affect Comput 8(1):15\u201328","journal-title":"IEEE Trans Affect Comput"},{"key":"20350_CR16","doi-asserted-by":"crossref","unstructured":"Kaur A, Mustafa A, Mehta L, Dhall A (2018) Prediction and localization of student engagement in the wild. In: Proc. DICTA, pp 1\u20138 . IEEE","DOI":"10.1109\/DICTA.2018.8615851"},{"key":"20350_CR17","doi-asserted-by":"crossref","unstructured":"Yang J, Wang K, Peng X, Qiao Y (2018) Deep recurrent multi-instance learning with spatio-temporal features for engagement intensity prediction. In: Proc. ICMI, pp 594\u2013598 . ACM","DOI":"10.1145\/3242969.3264981"},{"key":"20350_CR18","doi-asserted-by":"crossref","unstructured":"Wu J, Yang B, Wang Y, Hattori G (2020) Advanced multi-instance learning method with multi-features engineering and conservative optimization for engagement intensity prediction. In: Proc. ICMI, pp 777\u2013783 . ACM","DOI":"10.1145\/3382507.3417959"},{"key":"20350_CR19","doi-asserted-by":"crossref","unstructured":"Baltrusaitis T, Zadeh A, Lim YC, Morency L-P (2018) Openface 2.0: Facial behavior analysis toolkit. In: Proc. FG, pp 59\u201366 . IEEE","DOI":"10.1109\/FG.2018.00019"},{"key":"20350_CR20","doi-asserted-by":"crossref","unstructured":"Geng L, Xu M, Wei Z, Zhou X (2019) Learning deep spatiotemporal feature for engagement recognition of online courses. In: Proc. SSCI, pp 442\u2013447 . IEEE","DOI":"10.1109\/SSCI44817.2019.9002713"},{"key":"20350_CR21","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Goyal P, Girshick R, He K, Doll\u00e1r P (2017) Focal loss for dense object detection. In: Proc. ICCV, pp 2980\u20132988 . IEEE","DOI":"10.1109\/ICCV.2017.324"},{"key":"20350_CR22","doi-asserted-by":"crossref","unstructured":"Zhang H, Xiao X, Huang T, Liu S, Xia Y, Li J (2019) An novel end-to-end network for automatic student engagement recognition. In: Proc. ICEIEC, pp 342\u2013345 . IEEE","DOI":"10.1109\/ICEIEC.2019.8784507"},{"key":"20350_CR23","unstructured":"He J, Pedroza I, Liu Q (2019) MetaNet: A boosting-inspired deep learning image classification ensemble technique. In: Proceedings. IPCV, World Academy of Science, Engineering and Technology pp 51\u201354"},{"issue":"10","key":"20350_CR24","doi-asserted-by":"publisher","first-page":"6609","DOI":"10.1007\/s10489-020-02139-8","volume":"51","author":"J Liao","year":"2021","unstructured":"Liao J, Liang Y, Pan J (2021) Deep facial spatiotemporal network for engagement prediction in online learning. Appl Intell 51(10):6609\u20136621","journal-title":"Appl Intell"},{"key":"20350_CR25","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"20350_CR26","doi-asserted-by":"crossref","unstructured":"Xie S, Sun C, Huang J, Tu Z, Murphy K (2018) Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In: Proc. ECCV, pp 305\u2013321 . Springer","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"20350_CR27","doi-asserted-by":"crossref","unstructured":"Wu H, Xiao B, Codella N, Liu M, Dai X, Yuan L, Zhang L (2021) Cvt: Introducing convolutions to vision transformers. In: Proc. ICCV, pp 22\u201331","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"20350_CR28","unstructured":"Lee SH, Lee S, Song BC (2021) Vision transformer for small-size datasets. arXiv:2112.13492"},{"key":"20350_CR29","doi-asserted-by":"crossref","unstructured":"Liu Z, Ning J, Cao Y, Wei Y, Zhang Z, Lin S, Hu H (2022) Video swin transformer. In: Proc. CVPR, pp 3202\u20133211 . IEEE","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"20350_CR30","doi-asserted-by":"crossref","unstructured":"Touvron H, Cord M, Sablayrolles A, Synnaeve G, J\u00e9gou H (2021) Going deeper with image transformers. In: Proc. ICCV, pp 32\u201342 . IEEE","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"20350_CR31","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv:2010.11929"},{"key":"20350_CR32","unstructured":"Kingma DP, Ba J (2014) Adam: A method for stochastic optimization. arXiv:1412.6980"},{"key":"20350_CR33","doi-asserted-by":"crossref","unstructured":"Huang G, Sun Y, Liu Z, Sedra D, Weinberger KQ (2016) Deep networks with stochastic depth. In: Proc. ECCV, pp 646\u2013661 . Springer","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"20350_CR34","unstructured":"Gu A, Dao T (2023) Mamba: Linear-time sequence modeling with selective state spaces. arXiv:2312.00752"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20350-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-20350-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20350-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T23:18:30Z","timestamp":1757114310000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-20350-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,12]]},"references-count":34,"journal-issue":{"issue":"25","published-online":{"date-parts":[[2025,7]]}},"alternative-id":["20350"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-20350-4","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2024,10,12]]},"assertion":[{"value":"25 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 September 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 October 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 October 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}]}}