{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,4]],"date-time":"2026-07-04T11:22:44Z","timestamp":1783164164155,"version":"3.54.6"},"reference-count":60,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62303379"],"award-info":[{"award-number":["62303379"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Engineering Applications of Artificial Intelligence"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.engappai.2026.115077","type":"journal-article","created":{"date-parts":[[2026,5,17]],"date-time":"2026-05-17T17:45:39Z","timestamp":1779039939000},"page":"115077","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"P2","title":["A unified multimodal framework for human behavior understanding via motion and language alignment"],"prefix":"10.1016","volume":"178","author":[{"given":"Sen","family":"Jia","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wen","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.engappai.2026.115077_b1","series-title":"MotionLLM: Understanding human behaviors from human motions and videos","author":"Chen","year":"2024"},{"key":"10.1016\/j.engappai.2026.115077_b2","series-title":"Dense point clouds matter: Dust-GS for scene reconstruction from sparse viewpoints","author":"Chen","year":"2024"},{"key":"10.1016\/j.engappai.2026.115077_b3","doi-asserted-by":"crossref","unstructured":"Chi, H.-g., Ha, M.H., Chi, S., Lee, S.W., Huang, Q., Ramani, K., 2022. Infogcn: Representation learning for human skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 20186\u201320196.","DOI":"10.1109\/CVPR52688.2022.01955"},{"key":"10.1016\/j.engappai.2026.115077_b4","unstructured":"Endo, M., Hsu, J., Li, J., Wu, J., 2023a. Motion Question Answering via Modular Motion Programs. In: Proceedings of the International Conference on Machine Learning. ICML."},{"key":"10.1016\/j.engappai.2026.115077_b5","series-title":"International Conference on Machine Learning","first-page":"9312","article-title":"Motion question answering via modular motion programs","author":"Endo","year":"2023"},{"issue":"1","key":"10.1016\/j.engappai.2026.115077_b6","doi-asserted-by":"crossref","first-page":"97","DOI":"10.5566\/ias.3087","article-title":"Advancing fall detection utilizing skeletal joint image representation and deformable layers","volume":"43","author":"Erg\u00fcder","year":"2024","journal-title":"Image Anal. Ster."},{"key":"10.1016\/j.engappai.2026.115077_b7","doi-asserted-by":"crossref","unstructured":"Goel, S., Pavlakos, G., Rajasegaran, J., Kanazawa, A., Malik, J., 2023. Humans in 4d: Reconstructing and tracking humans with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 14783\u201314794.","DOI":"10.1109\/ICCV51070.2023.01358"},{"key":"10.1016\/j.engappai.2026.115077_b8","series-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"26424","article-title":"MTSA: Multi-turn safety alignment for LLMs through multi-round red-teaming","author":"Guo","year":"2025"},{"key":"10.1016\/j.engappai.2026.115077_b9","doi-asserted-by":"crossref","unstructured":"Guo, C., Zou, S., Zuo, X., Wang, S., Ji, W., Li, X., Cheng, L., 2022. Generating diverse and natural 3d human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 5152\u20135161.","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"10.1016\/j.engappai.2026.115077_b10","doi-asserted-by":"crossref","unstructured":"Guo, C., Zuo, X., Wang, S., Zou, S., Sun, Q., Deng, A., Gong, M., Cheng, L., 2020. Action2motion: Conditioned generation of 3d human motions. In: Proceedings of the 28th ACM International Conference on Multimedia. pp. 2021\u20132029.","DOI":"10.1145\/3394171.3413635"},{"key":"10.1016\/j.engappai.2026.115077_b11","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C., 2015. ActivityNet: A Large-Scale Video Benchmark for Human Activity Understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"10.1016\/j.engappai.2026.115077_b12","doi-asserted-by":"crossref","unstructured":"Hong, F., Pan, L., Cai, Z., Liu, Z., 2022. Versatile Multi-Modal Pre-Training for Human-Centric Perception. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 16156\u201316166.","DOI":"10.1109\/CVPR52688.2022.01568"},{"key":"10.1016\/j.engappai.2026.115077_b13","doi-asserted-by":"crossref","unstructured":"Hu, H., Dong, S., Zhao, Y., Lian, D., Li, Z., Gao, S., 2022. Transrac: Encoding multi-scale temporal correlation with transformers for repetitive action counting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19013\u201319022.","DOI":"10.1109\/CVPR52688.2022.01843"},{"issue":"1","key":"10.1016\/j.engappai.2026.115077_b14","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1186\/1475-925X-12-66","article-title":"Challenges, issues and trends in fall detection systems","volume":"12","author":"Igual","year":"2013","journal-title":"Biomed. Eng. Online"},{"key":"10.1016\/j.engappai.2026.115077_b15","series-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"14389","article-title":"Knowledge unlearning for mitigating privacy risks in language models","author":"Jang","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b16","series-title":"ICML","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.engappai.2026.115077_b17","doi-asserted-by":"crossref","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T., 2023. MotionGPT: Human Motion as a Foreign Language. In: Advances in Neural Information Processing Systems (NeurIPS).","DOI":"10.52202\/075280-0880"},{"key":"10.1016\/j.engappai.2026.115077_b18","doi-asserted-by":"crossref","unstructured":"Jin, P., Takanobu, R., Zhang, W., Cao, X., Yuan, L., 2024. Chat-univi: Unified visual representation empowers large language models with image and video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 13700\u201313710.","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"10.1016\/j.engappai.2026.115077_b19","first-page":"8255","article-title":"Flame: Free-form language-based motion synthesis & editing","volume":"vol. 37","author":"Kim","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b20","doi-asserted-by":"crossref","unstructured":"Lee, J., Lee, M., Lee, D., Lee, S., 2023. Hierarchically decomposed graph convolutional networks for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 10444\u201310453.","DOI":"10.1109\/ICCV51070.2023.00958"},{"key":"10.1016\/j.engappai.2026.115077_b21","series-title":"ChatMotion: A multimodal multi-agent for human motion analysis","author":"Lei","year":"2025"},{"key":"10.1016\/j.engappai.2026.115077_b22","doi-asserted-by":"crossref","unstructured":"Li, L., 2024. CPSeg: Finer-grained Image Semantic Segmentation via Chain-of-Thought Language Prompting. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. pp. 513\u2013522.","DOI":"10.1109\/WACV57701.2024.00057"},{"key":"10.1016\/j.engappai.2026.115077_b23","series-title":"2025 International Conference on 3D Vision (3DV)","first-page":"240","article-title":"Unimotion: Unifying 3d human motion synthesis and understanding","author":"Li","year":"2025"},{"key":"10.1016\/j.engappai.2026.115077_b24","series-title":"Videochat: Chat-centric video understanding","author":"Li","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b25","doi-asserted-by":"crossref","unstructured":"Li, L., Jia, S., Wang, J., Jiang, Z., Zhou, F., Dai, J., Zhang, T., Wu, Z., Hwang, J.-N., 2025. Human motion instruction tuning. In: Proceedings of the Computer Vision and Pattern Recognition Conference. pp. 17582\u201317591.","DOI":"10.1109\/CVPR52734.2025.01638"},{"key":"10.1016\/j.engappai.2026.115077_b26","series-title":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence, IJCAI-25","first-page":"8168","article-title":"Multimodal knowledge retrieval-augmented iterative alignment for satellite commonsense conversation","author":"Li","year":"2025"},{"key":"10.1016\/j.engappai.2026.115077_b27","series-title":"Video-llava: Learning united visual representation by alignment before projection","author":"Lin","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b28","series-title":"Motion-x: A large-scale 3D expressive whole-body human motion dataset","author":"Lin","year":"2024"},{"key":"10.1016\/j.engappai.2026.115077_b29","series-title":"Findings of the Association for Computational Linguistics: ACL 2024","first-page":"1817","article-title":"Towards safer large language models through machine unlearning","author":"Liu","year":"2024"},{"key":"10.1016\/j.engappai.2026.115077_b30","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J., 2024. LLaVA: Visual Instruction Tuning. In: Proceedings of the Conference on Neural Information Processing Systems (NeurIPS).","DOI":"10.52202\/075280-1516"},{"issue":"8","key":"10.1016\/j.engappai.2026.115077_b31","doi-asserted-by":"crossref","first-page":"4137","DOI":"10.1109\/TCSVT.2023.3240472","article-title":"TranSkeleton: Hierarchical spatial\u2013temporal transformer for skeleton-based action recognition","volume":"33","author":"Liu","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"10","key":"10.1016\/j.engappai.2026.115077_b32","doi-asserted-by":"crossref","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","article-title":"Ntu rgb+ d 120: A large-scale benchmark for 3d human activity understanding","volume":"42","author":"Liu","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2026.115077_b33","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J., 2019. AMASS: Archive of Motion Capture as Surface Shapes. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV.2019.00554"},{"issue":"9","key":"10.1016\/j.engappai.2026.115077_b34","doi-asserted-by":"crossref","DOI":"10.3390\/s19091988","article-title":"UP-fall detection dataset: A multimodal approach","volume":"19","author":"Mart\u00ednez-Villase\u00f1or","year":"2019","journal-title":"Sensors"},{"key":"10.1016\/j.engappai.2026.115077_b35","series-title":"GPT-3.5: Generative pre-trained transformer 3.5","author":"OpenAI","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b36","series-title":"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing","first-page":"3419","article-title":"Red teaming language models with language models","author":"Perez","year":"2022"},{"key":"10.1016\/j.engappai.2026.115077_b37","series-title":"European Conference on Computer Vision","first-page":"480","article-title":"Temos: Generating diverse human motions from textual descriptions","author":"Petrovich","year":"2022"},{"key":"10.1016\/j.engappai.2026.115077_b38","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G., 2023. Tmr: Text-to-motion retrieval using contrastive 3d human motion synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 9488\u20139497.","DOI":"10.1109\/ICCV51070.2023.00870"},{"issue":"4","key":"10.1016\/j.engappai.2026.115077_b39","doi-asserted-by":"crossref","first-page":"236","DOI":"10.1089\/big.2016.0028","article-title":"The kit motion-language dataset","volume":"4","author":"Plappert","year":"2016","journal-title":"Big Data"},{"key":"10.1016\/j.engappai.2026.115077_b40","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1016\/j.robot.2018.07.006","article-title":"Learning a bidirectional mapping between human whole-body motion and natural language using deep recurrent neural networks","volume":"109","author":"Plappert","year":"2018","journal-title":"Robot. Auton. Syst. (RAS)"},{"key":"10.1016\/j.engappai.2026.115077_b41","doi-asserted-by":"crossref","unstructured":"Qu, H., Cai, Y., Liu, J., 2024. Llms are good action recognizers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 18395\u201318406.","DOI":"10.1109\/CVPR52733.2024.01741"},{"key":"10.1016\/j.engappai.2026.115077_b42","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I., 2021. Learning Transferable Visual Models From Natural Language Supervision. In: Proceedings of the 38th International Conference on Machine Learning. ICML, pp. 8748\u20138763."},{"key":"10.1016\/j.engappai.2026.115077_b43","series-title":"Advances in Neural Information Processing Systems","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b44","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.-T., Wang, G., 2016. NTU RGB+D: A Large Scale Dataset for 3D Human Activity Analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR.2016.115"},{"key":"10.1016\/j.engappai.2026.115077_b45","article-title":"Learning video-text aligned representations for video captioning","author":"Shi","year":"2023","journal-title":"IEEE Trans. Multimed. (TMM)"},{"key":"10.1016\/j.engappai.2026.115077_b46","series-title":"Every shot counts: Using exemplars for repetition counting in videos","author":"Sinha","year":"2024"},{"key":"10.1016\/j.engappai.2026.115077_b47","series-title":"MotionCLIP: Exposing human motion generation to CLIP space","author":"Tevet","year":"2022"},{"key":"10.1016\/j.engappai.2026.115077_b48","doi-asserted-by":"crossref","first-page":"28","DOI":"10.3389\/frobt.2015.00028","article-title":"A review of human activity recognition methods","volume":"2","author":"Vrigkas","year":"2015","journal-title":"Front. Robot. AI"},{"key":"10.1016\/j.engappai.2026.115077_b49","doi-asserted-by":"crossref","unstructured":"Wandt, B., Rosenhahn, B., 2019. Repnet: Weakly supervised training of an adversarial reprojection network for 3d human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 7782\u20137791.","DOI":"10.1109\/CVPR.2019.00797"},{"key":"10.1016\/j.engappai.2026.115077_b50","series-title":"2025 IEEE 19th International Conference on Automatic Face and Gesture Recognition","first-page":"1","article-title":"Freqmixformerv2: Lightweight frequency-aware mixed transformer for human skeleton action recognition","author":"Wu","year":"2025"},{"key":"10.1016\/j.engappai.2026.115077_b51","doi-asserted-by":"crossref","unstructured":"Wu, W., Zheng, C., Yang, Z., Chen, C., Das, S., Lu, A., 2024. Frequency guidance matters: Skeletal action recognition by frequency-aware mixed transformer. In: Proceedings of the 32nd ACM International Conference on Multimedia. pp. 4660\u20134669.","DOI":"10.1145\/3664647.3681009"},{"key":"10.1016\/j.engappai.2026.115077_b52","doi-asserted-by":"crossref","unstructured":"Xin, W., Miao, Q., Liu, Y., Liu, R., Pun, C.-M., Shi, C., 2023. Skeleton mixformer: Multivariate topology representation for skeleton-based action recognition. In: Proceedings of the 31st ACM International Conference on Multimedia. pp. 2211\u20132220.","DOI":"10.1145\/3581783.3611900"},{"key":"10.1016\/j.engappai.2026.115077_b53","series-title":"Understanding human behaviors from skeletal data: A review of datasets and methods","author":"Yang","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b54","doi-asserted-by":"crossref","unstructured":"Yao, Z., Cheng, X., Huang, Z., Li, L., 2025. Countllm: Towards generalizable repetitive action counting via large language model. In: Proceedings of the Computer Vision and Pattern Recognition Conference. pp. 19143\u201319153.","DOI":"10.1109\/CVPR52734.2025.01783"},{"key":"10.1016\/j.engappai.2026.115077_b55","series-title":"Poserac: Pose saliency transformer for repetitive action counting","author":"Yao","year":"2023"},{"key":"10.1016\/j.engappai.2026.115077_b56","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Black, M.J., Tang, S., 2021. We Are More Than Our Joints: Predicting How 3D Bodies Move. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR46437.2021.00338"},{"key":"10.1016\/j.engappai.2026.115077_b57","series-title":"Findings of the Association for Computational Linguistics: ACL 2024","first-page":"891","article-title":"Knowledgeable preference alignment for LLMs in domain-specific question answering","author":"Zhang","year":"2024"},{"key":"10.1016\/j.engappai.2026.115077_b58","series-title":"Videollama 3: Frontier multimodal foundation models for image and video understanding","author":"Zhang","year":"2025"},{"key":"10.1016\/j.engappai.2026.115077_b59","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Q., Wang, Y., 2023. Learning discriminative representations for skeleton based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10608\u201310617.","DOI":"10.1109\/CVPR52729.2023.01022"},{"key":"10.1016\/j.engappai.2026.115077_b60","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Yan, X., Cheng, Z.-Q., Yan, Y., Dai, Q., Hua, X.-S., 2024. Blockgcn: Redefine topology awareness for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2049\u20132058.","DOI":"10.1109\/CVPR52733.2024.00200"}],"container-title":["Engineering Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626013606?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197626013606?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,7,4]],"date-time":"2026-07-04T11:08:21Z","timestamp":1783163301000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0952197626013606"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":60,"alternative-id":["S0952197626013606"],"URL":"https:\/\/doi.org\/10.1016\/j.engappai.2026.115077","relation":{},"ISSN":["0952-1976"],"issn-type":[{"value":"0952-1976","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A unified multimodal framework for human behavior understanding via motion and language alignment","name":"articletitle","label":"Article Title"},{"value":"Engineering Applications of Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.engappai.2026.115077","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115077"}}