{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T14:16:40Z","timestamp":1773325000534,"version":"3.50.1"},"reference-count":57,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2025,10,17]],"date-time":"2025-10-17T00:00:00Z","timestamp":1760659200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100002701","name":"Ministry of Education","doi-asserted-by":"publisher","award":["2020R1A6A1A03038540"],"award-info":[{"award-number":["2020R1A6A1A03038540"]}],"id":[{"id":"10.13039\/501100002701","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["RS-2019-II190231"],"award-info":[{"award-number":["RS-2019-II190231"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100019403","name":"Information Technology Research Centre","doi-asserted-by":"publisher","award":["IITP-2025-RS-2022-00156354"],"award-info":[{"award-number":["IITP-2025-RS-2022-00156354"]}],"id":[{"id":"10.13039\/501100019403","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100019403","name":"Information Technology Research Centre","doi-asserted-by":"publisher","award":["IITP-2025-RS-2023-00254529"],"award-info":[{"award-number":["IITP-2025-RS-2023-00254529"]}],"id":[{"id":"10.13039\/501100019403","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100019403","name":"Information Technology Research Centre","doi-asserted-by":"publisher","award":["2024-0-00037"],"award-info":[{"award-number":["2024-0-00037"]}],"id":[{"id":"10.13039\/501100019403","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100010418","name":"Institute for Information and Communications Technology Promotion","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100002546","name":"Sejong University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100002546","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Engineering Applications of Artificial Intelligence"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1016\/j.engappai.2025.112739","type":"journal-article","created":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T11:41:58Z","timestamp":1761133318000},"page":"112739","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"P1","title":["Temporal diffuser: Timing scale-aware modulation for sign language 
production"],"prefix":"10.1016","volume":"163","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2535-9112","authenticated-orcid":false,"given":"Kim-Thuy","family":"Kha","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7565-5736","authenticated-orcid":false,"given":"Anh H.","family":"Vo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2870-4240","authenticated-orcid":false,"given":"Van-Vang","family":"Le","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7142-5976","authenticated-orcid":false,"given":"Oh-Young","family":"Song","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4645-1395","authenticated-orcid":false,"given":"Yong-Guk","family":"Kim","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.engappai.2025.112739_b1","doi-asserted-by":"crossref","first-page":"57","DOI":"10.1016\/j.patrec.2023.08.001","article-title":"The effectiveness of T5, GPT-2, and BERT on text-to-image generation task","volume":"173","author":"Bahani","year":"2023","journal-title":"Pattern Recognit. Lett."},{"key":"10.1016\/j.engappai.2025.112739_b2","doi-asserted-by":"crossref","unstructured":"Baltatzis, V., Potamias, R.A., Ververas, E., Sun, G., Deng, J., Zafeiriou, S., 2024. Neural Sign Actors: A diffusion model for 3D sign language production from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1985\u20131995.","DOI":"10.1109\/CVPR52733.2024.00194"},{"key":"10.1016\/j.engappai.2025.112739_b3","series-title":"Lumiere: A space-time diffusion model for video generation","author":"Bar-Tal","year":"2024"},{"key":"10.1016\/j.engappai.2025.112739_b4","unstructured":"Camgoz, N.C., Koller, O., Hadfield, S., Bowden, R., 2020. Sign language transformers: Joint end-to-end sign language recognition and translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 10023\u201310033."},{"issue":"1","key":"10.1016\/j.engappai.2025.112739_b5","doi-asserted-by":"crossref","first-page":"172","DOI":"10.1109\/TPAMI.2019.2929257","article-title":"Openpose: Realtime multi-person 2d pose estimation using part affinity fields","volume":"43","author":"Cao","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.engappai.2025.112739_b6","series-title":"On the difference of bert-style and clip-style text encoders","author":"Chen","year":"2023"},{"key":"10.1016\/j.engappai.2025.112739_b7","doi-asserted-by":"crossref","first-page":"8502","DOI":"10.1109\/TASE.2024.3486203","article-title":"EEGCiD: EEG condensation into diffusion model","volume":"22","author":"Chen","year":"2025","journal-title":"IEEE Trans. Autom. Sci. Eng."},{"key":"10.1016\/j.engappai.2025.112739_b8","series-title":"Findings of the Association for Computational Linguistics: ACL","article-title":"On the Difference of BERT-style and CLIP-style Text Encoders","author":"Chen, Zhihong","year":"2023"},{"key":"10.1016\/j.engappai.2025.112739_b9","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K., 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). pp. 
4171\u20134186.","DOI":"10.18653\/v1\/N19-1423"},{"key":"10.1016\/j.engappai.2025.112739_b10","doi-asserted-by":"crossref","unstructured":"Dey, R., Salem, F.M., 2017. Gate-variants of Gated Recurrent Unit (GRU) neural networks. In: IEEE 60th International Midwest Symposium on Circuits and Systems. MWSCAS, pp. 1597\u20131600.","DOI":"10.1109\/MWSCAS.2017.8053243"},{"key":"10.1016\/j.engappai.2025.112739_b11","series-title":"Word-conditioned 3D American sign language motion generation","author":"Dong","year":"2024"},{"key":"10.1016\/j.engappai.2025.112739_b12","doi-asserted-by":"crossref","unstructured":"Duarte, A., Palaskar, S., Ventura, L., Ghadiyaram, D., DeHaan, K., Metze, F., Torres, J., Giro-i Nieto, X., 2021. How2sign: a large-scale multimodal dataset for continuous american sign language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2735\u20132744.","DOI":"10.1109\/CVPR46437.2021.00276"},{"key":"10.1016\/j.engappai.2025.112739_b13","doi-asserted-by":"crossref","first-page":"17351","DOI":"10.1007\/s00521-020-04867-x","article-title":"A CNN\u2013LSTM model for gold price time-series forecasting","volume":"32","author":"E.","year":"2020","journal-title":"Neural Comput. Appl."},{"key":"10.1016\/j.engappai.2025.112739_b14","unstructured":"Ekkasit, P., Pu, W., Minwoo, L., Chen, C., 2024. MMM: Generative Masked Motion Model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR."},{"key":"10.1016\/j.engappai.2025.112739_b15","unstructured":"Fang, S., Sui, C., Zhang, X., Tian, Y., 2023. SignDiff: Learning Diffusion Models for American Sign Language Production. arXiv preprint arXiv:2308.16082."},{"key":"10.1016\/j.engappai.2025.112739_b16","first-page":"3785","article-title":"RWTH-PHOENIX-weather: A large vocabulary sign language recognition and translation corpus","volume":"vol. 9","author":"Forster","year":"2012"},{"key":"10.1016\/j.engappai.2025.112739_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.measurement.2022.111759","article-title":"Deep learning CNN-LSTM-MLP hybrid fusion model for feature optimizations and daily solar radiation prediction","volume":"202","author":"Ghimire","year":"2022","journal-title":"Measurement"},{"key":"10.1016\/j.engappai.2025.112739_b18","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.engappai.2025.112739_b19","series-title":"Classifier-free diffusion guidance","author":"Ho","year":"2022"},{"key":"10.1016\/j.engappai.2025.112739_b20","doi-asserted-by":"crossref","unstructured":"Huang, W., Pan, W., Zhao, Z., Tian, Q., 2021. Towards fast and high-quality sign language production. In: Proceedings of the 29th ACM International Conference on Multimedia. pp. 3172\u20133181.","DOI":"10.1145\/3474085.3475463"},{"key":"10.1016\/j.engappai.2025.112739_b21","first-page":"3","article-title":"Non-autoregressive sign language production with Gaussian space","volume":"vol. 1","author":"Hwang","year":"2021"},{"key":"10.1016\/j.engappai.2025.112739_b22","series-title":"IEEE International Symposium on INnovations in Intelligent SysTems and Applications","first-page":"1","article-title":"A hybrid translation system from turkish spoken language to turkish sign language","author":"Kayahan","year":"2019"},{"key":"10.1016\/j.engappai.2025.112739_b23","doi-asserted-by":"crossref","unstructured":"Kezar, L., Thomason, J., Sehyr, Z.S., 2023. 
Improving Sign Recognition with Phonology. In: The 17th Conference of the European Chapter of the Association for Computational Linguistics. EACL.","DOI":"10.18653\/v1\/2023.eacl-main.200"},{"key":"10.1016\/j.engappai.2025.112739_b24","unstructured":"Kouremenos, D., Ntalianis, K.S., Siolas, G., Stafylopatis, A., 2018. Statistical Machine Translation for Greek to Greek Sign Language Using Parallel Corpora Produced via Rule-Based Machine Translation. In: CIMA@ ICTAI. pp. 28\u201342."},{"key":"10.1016\/j.engappai.2025.112739_b25","doi-asserted-by":"crossref","unstructured":"Krishna, S., Ukey, J., 2021. Gan based indian sign language synthesis. In: Proceedings of the Twelfth Indian Conference on Computer Vision, Graphics and Image Processing. pp. 1\u20138.","DOI":"10.1145\/3490035.3490301"},{"issue":"4","key":"10.1016\/j.engappai.2025.112739_b26","doi-asserted-by":"crossref","first-page":"261","DOI":"10.1007\/s10489-024-06042-4","article-title":"HSTforU: anomaly detection in aerial and ground-based videos with hierarchical spatio-temporal transformer for U-net","volume":"55","author":"Le","year":"2025","journal-title":"Appl. Intell."},{"key":"10.1016\/j.engappai.2025.112739_b27","doi-asserted-by":"crossref","unstructured":"Lin, W., Wu, Z., Chen, J., Huang, J., Jin, L., 2023. Scale-aware modulation meet transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 6015\u20136026.","DOI":"10.1109\/ICCV51070.2023.00553"},{"key":"10.1016\/j.engappai.2025.112739_b28","doi-asserted-by":"crossref","unstructured":"Ma, J., Gao, W., Wu, J., Wang, C., 2000. A continuous Chinese sign language recognition system. In: Proceedings Fourth IEEE International Conference on Automatic Face and Gesture Recognition (Cat. No. PR00580). pp. 428\u2013433.","DOI":"10.1109\/AFGR.2000.840670"},{"key":"10.1016\/j.engappai.2025.112739_b29","series-title":"Ms2sl: multimodal spoken data-driven continuous sign language production","author":"Ma","year":"2024"},{"key":"10.1016\/j.engappai.2025.112739_b30","doi-asserted-by":"crossref","unstructured":"Narasimhaswamy, S., Bhattacharya, U., Chen, X., Dasgupta, I., Mitra, S., Hoai, M., 2024. HanDiffuser: Text-to-Image Generation With Realistic Hand Appearances. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 2468\u20132479.","DOI":"10.1109\/CVPR52733.2024.00239"},{"issue":"23","key":"10.1016\/j.engappai.2025.112739_b31","doi-asserted-by":"crossref","first-page":"13153","DOI":"10.1007\/s00500-022-07014-x","article-title":"Dynamic GAN for high-quality sign language video generation from skeletal poses using generative adversarial networks","volume":"26","author":"Natarajan","year":"2022","journal-title":"Soft Comput."},{"key":"10.1016\/j.engappai.2025.112739_b32","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., Choutas, V., Ghorbani, N., Bolkart, T., Osman, A.A., Tzionas, D., Black, M.J., 2019. Expressive body capture: 3d hands, face, and body from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 
10975\u201310985.","DOI":"10.1109\/CVPR.2019.01123"},{"key":"10.1016\/j.engappai.2025.112739_b33","series-title":"Single motion diffusion","author":"Raab","year":"2023"},{"key":"10.1016\/j.engappai.2025.112739_b34","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.engappai.2025.112739_b35","series-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing","article-title":"Sentence-BERT: Sentence embeddings using siamese BERT-networks","author":"Reimers","year":"2019"},{"key":"10.1016\/j.engappai.2025.112739_b36","series-title":"Adversarial training for multi-channel sign language production","author":"Saunders","year":"2020"},{"key":"10.1016\/j.engappai.2025.112739_b37","series-title":"ECCV","first-page":"687","article-title":"Progressive transformers for end-to-end sign language production","author":"Saunders","year":"2020"},{"key":"10.1016\/j.engappai.2025.112739_b38","series-title":"16th IEEE International Conference on Automatic Face and Gesture Recognition","first-page":"1","article-title":"Anonysign: Novel human appearance synthesis for sign language video anonymisation","author":"Saunders","year":"2021"},{"issue":"7","key":"10.1016\/j.engappai.2025.112739_b39","doi-asserted-by":"crossref","first-page":"2113","DOI":"10.1007\/s11263-021-01457-9","article-title":"Continuous 3d multi-channel sign language production via progressive transformers and mixture density networks","volume":"129","author":"Saunders","year":"2021","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.engappai.2025.112739_b40","doi-asserted-by":"crossref","unstructured":"Saunders, B., Camgoz, N.C., Bowden, R., 2021c. Mixed signals: Sign language production via a mixture of motion primitives. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 1919\u20131929.","DOI":"10.1109\/ICCV48922.2021.00193"},{"key":"10.1016\/j.engappai.2025.112739_b41","series-title":"Denoising diffusion implicit models","author":"Song","year":"2020"},{"issue":"4","key":"10.1016\/j.engappai.2025.112739_b42","doi-asserted-by":"crossref","first-page":"891","DOI":"10.1007\/s11263-019-01281-2","article-title":"Text2Sign: towards sign language production using neural machine translation and generative adversarial networks","volume":"128","author":"Stoll","year":"2020","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.engappai.2025.112739_b43","series-title":"European Conference on Computer Vision","first-page":"353","article-title":"SignSynth: Data-driven sign language video generation","author":"Stoll","year":"2020"},{"issue":"1","key":"10.1016\/j.engappai.2025.112739_b44","first-page":"22","article-title":"Artificial intelligence for sign language translation\u2013A design science research study","volume":"53","author":"Strobel","year":"2023","journal-title":"Commun. Assoc. Inf. 
Syst."},{"key":"10.1016\/j.engappai.2025.112739_b45","series-title":"ECCV","first-page":"358","article-title":"Motionclip: Exposing human motion generation to clip space","author":"Tevet","year":"2022"},{"key":"10.1016\/j.engappai.2025.112739_b46","series-title":"3rd International Conference on Intelligent Sustainable Systems","first-page":"1250","article-title":"Generation of indian sign language by sentence processing and generative adversarial networks","author":"Vasani","year":"2020"},{"key":"10.1016\/j.engappai.2025.112739_b47","doi-asserted-by":"crossref","first-page":"6140","DOI":"10.1109\/TMM.2025.3565929","article-title":"Instruction-Driven 3D facial expression generation and transition","volume":"27","author":"Vo","year":"2025","journal-title":"IEEE Trans. Multimed."},{"issue":"4","key":"10.1016\/j.engappai.2025.112739_b48","doi-asserted-by":"crossref","first-page":"440","DOI":"10.18178\/ijmlc.2019.9.4.823","article-title":"Deep learning for vietnamese sign language recognition in video sequence","volume":"9","author":"Vo","year":"2019","journal-title":"Int. J. Mach. Learn. Comput."},{"key":"10.1016\/j.engappai.2025.112739_b49","doi-asserted-by":"crossref","DOI":"10.1016\/j.cmpb.2023.107602","article-title":"Brain image segmentation of the corpus callosum by combining Bi-directional convolutional LSTM and U-net using multi-slice CT and MRI","volume":"238","author":"Wong","year":"2023","journal-title":"Comput. Methods Programs Biomed."},{"key":"10.1016\/j.engappai.2025.112739_b50","first-page":"6234","article-title":"G2P-DDM: Generating sign pose sequence from gloss sequence with discrete diffusion model","volume":"vol. 38","author":"Xie","year":"2024"},{"key":"10.1016\/j.engappai.2025.112739_b51","series-title":"T2S-GPT: Dynamic vector quantization for autoregressive sign language production from text","author":"Yin","year":"2024"},{"key":"10.1016\/j.engappai.2025.112739_b52","doi-asserted-by":"crossref","unstructured":"Yin, A., Zhong, T., Tang, L., Jin, W., Jin, T., Zhao, Z., 2023. Gloss attention for gloss-free sign language translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 2551\u20132562.","DOI":"10.1109\/CVPR52729.2023.00251"},{"key":"10.1016\/j.engappai.2025.112739_b53","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Wei, G., Zheng, J., Zou, J., Wei, Y., Zhang, Y., Li, H., 2024. Make Pixels Dance: High-Dynamic Video Generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 8850\u20138860.","DOI":"10.1109\/CVPR52733.2024.00845"},{"key":"10.1016\/j.engappai.2025.112739_b54","series-title":"SLTUNET: A simple unified model for sign language translation","author":"Zhang","year":"2023"},{"key":"10.1016\/j.engappai.2025.112739_b55","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2025.111840","article-title":"Video saliency prediction via single feature enhancement and temporal recurrence","volume":"160","author":"Zhang","year":"2025","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.engappai.2025.112739_b56","article-title":"Spatiotemporal dual-branch feature-guided fusion network for driver attention prediction","author":"Zhang","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.engappai.2025.112739_b57","doi-asserted-by":"crossref","unstructured":"Zhou, B., Chen, Z., Clap\u00e9s, A., Wan, J., Liang, Y., Escalera, S., Lei, Z., Zhang, D., 2023. Gloss-free sign language translation: Improving from visual-language pretraining. 
In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 20871\u201320881.","DOI":"10.1109\/ICCV51070.2023.01908"}],"container-title":["Engineering Applications of Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197625027708?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0952197625027708?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T23:57:52Z","timestamp":1773273472000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0952197625027708"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":57,"alternative-id":["S0952197625027708"],"URL":"https:\/\/doi.org\/10.1016\/j.engappai.2025.112739","relation":{},"ISSN":["0952-1976"],"issn-type":[{"value":"0952-1976","type":"print"}],"subject":[],"published":{"date-parts":[[2026,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Temporal diffuser: Timing scale-aware modulation for sign language production","name":"articletitle","label":"Article Title"},{"value":"Engineering Applications of Artificial Intelligence","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.engappai.2025.112739","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 The Authors. Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"112739"}}