{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:06:51Z","timestamp":1780931211590,"version":"3.54.1"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.113983","type":"journal-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T15:34:23Z","timestamp":1779464063000},"page":"113983","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Text-to-motion retrieval by text-to-motion generation"],"prefix":"10.1016","volume":"180","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3319-692X","authenticated-orcid":false,"given":"Honghu","family":"Pan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qianqian","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8153-2636","authenticated-orcid":false,"given":"Guoqing","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yongyong","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113983_b1","doi-asserted-by":"crossref","unstructured":"N. Messina, J. Sedmidubsky, F. Falchi, T. Rebok, Text-to-motion retrieval: Towards joint understanding of human motion data and natural language, in: Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval, 2023, pp. 2420\u20132425.","DOI":"10.1145\/3539618.3592069"},{"key":"10.1016\/j.patcog.2026.113983_b2","doi-asserted-by":"crossref","unstructured":"M. Petrovich, M.J. Black, G. Varol, TMR: Text-to-motion retrieval using contrastive 3D human motion synthesis, in: IEEE International Conference on Computer Vision, 2023, pp. 9488\u20139497.","DOI":"10.1109\/ICCV51070.2023.00870"},{"key":"10.1016\/j.patcog.2026.113983_b3","doi-asserted-by":"crossref","unstructured":"A. Miech, J.-B. Alayrac, I. Laptev, J. Sivic, A. Zisserman, Thinking fast and slow: Efficient text-to-visual retrieval with transformers, in: IEEE Conference on Computer Vision and Pattern Recognition, 2021, pp. 9826\u20139836.","DOI":"10.1109\/CVPR46437.2021.00970"},{"key":"10.1016\/j.patcog.2026.113983_b4","doi-asserted-by":"crossref","unstructured":"S.K. Gorti, N. Vouitsis, J. Ma, K. Golestan, M. Volkovs, A. Garg, G. Yu, X-pool: Cross-modal language-video attention for text-video retrieval, in: IEEE Conference on Computer Vision and Pattern Recognition, 2022, pp. 5006\u20135015.","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"10.1016\/j.patcog.2026.113983_b5","doi-asserted-by":"crossref","DOI":"10.1109\/TIFS.2024.3426335","article-title":"Unified conditional image generation for visible-infrared person re-identification","author":"Pan","year":"2024","journal-title":"IEEE Trans. Inf. Forensics Secur."},{"key":"10.1016\/j.patcog.2026.113983_b6","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113983_b7","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113983_b8","doi-asserted-by":"crossref","unstructured":"Y. Yang, L. Cao, H. Shi, H. Zhang, Multi-Instance Multi-Label Learning for Text-motion Retrieval, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 5829\u20135837.","DOI":"10.1145\/3664647.3681444"},{"key":"10.1016\/j.patcog.2026.113983_b9","doi-asserted-by":"crossref","unstructured":"H. Shi, H. Zhang, Modal-Enhanced Semantic Modeling for Fine-Grained 3D Human Motion Retrieval, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 10114\u201310123.","DOI":"10.1145\/3664647.3681625"},{"key":"10.1016\/j.patcog.2026.113983_b10","doi-asserted-by":"crossref","unstructured":"Y. Yang, H. Shi, H. Zhang, Hierarchical Semantics Alignment for 3D Human Motion Retrieval, in: Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval, 2024, pp. 1083\u20131092.","DOI":"10.1145\/3626772.3657804"},{"key":"10.1016\/j.patcog.2026.113983_b11","article-title":"Motiondiffuse: Text-driven human motion generation with diffusion model","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113983_b12","doi-asserted-by":"crossref","unstructured":"R. Dabral, M.H. Mughal, V. Golyanik, C. Theobalt, Mofusion: A framework for denoising-diffusion-based motion synthesis, in: IEEE Conference on Computer Vision and Pattern Recognition, 2023, pp. 9760\u20139770.","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"10.1016\/j.patcog.2026.113983_b13","unstructured":"J. Song, C. Meng, S. Ermon, Denoising diffusion implicit models, in: International Conference on Learning Representations, 2020."},{"key":"10.1016\/j.patcog.2026.113983_b14","unstructured":"J. Ho, A. Jain, P. Abbeel, Denoising diffusion probabilistic models, in: Conference on Neural Information Processing Systems, vol. 33, 2020, pp. 6840\u20136851."},{"key":"10.1016\/j.patcog.2026.113983_b15","series-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"10.1016\/j.patcog.2026.113983_b16","doi-asserted-by":"crossref","unstructured":"C. Guo, S. Zou, X. Zuo, S. Wang, W. Ji, X. Li, L. Cheng, Generating diverse and natural 3d human motions from text, in: IEEE Conference on Computer Vision and Pattern Recognition, 2022, pp. 5152\u20135161.","DOI":"10.1109\/CVPR52688.2022.00509"},{"issue":"4","key":"10.1016\/j.patcog.2026.113983_b17","doi-asserted-by":"crossref","first-page":"236","DOI":"10.1089\/big.2016.0028","article-title":"The kit motion-language dataset","volume":"4","author":"Plappert","year":"2016","journal-title":"Big Data"},{"key":"10.1016\/j.patcog.2026.113983_b18","series-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"10.1016\/j.patcog.2026.113983_b19","doi-asserted-by":"crossref","first-page":"6289","DOI":"10.1109\/TIP.2023.3331309","article-title":"Orientation cues-aware facial relationship representation for head pose estimation via transformer","volume":"32","author":"Liu","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.113983_b20","doi-asserted-by":"crossref","first-page":"1677","DOI":"10.1109\/TMM.2023.3238548","article-title":"TransIFC: Invariant cues-aware feature concentration learning for efficient fine-grained bird image classification","volume":"27","author":"Liu","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113983_b21","article-title":"MMATrans: Muscle movement aware representation learning for facial expression recognition via transformers","author":"Liu","year":"2024","journal-title":"IEEE Trans. Ind. Inform."},{"key":"10.1016\/j.patcog.2026.113983_b22","doi-asserted-by":"crossref","unstructured":"K. Yin, S. Zou, Y. Ge, Z. Tian, Tri-modal motion retrieval by learning a joint embedding space, in: IEEE Conference on Computer Vision and Pattern Recognition, 2024, pp. 1596\u20131605.","DOI":"10.1109\/CVPR52733.2024.00158"},{"key":"10.1016\/j.patcog.2026.113983_b23","unstructured":"Z. Li, W. Yuan, L. Qiu, S. Zhu, X. Gu, W. Shen, Y. Dong, Z. Dong, L.T. Yang, et al., LaMP: Language-Motion Pretraining for Motion Generation, Retrieval, and Captioning, in: The Thirteenth International Conference on Learning Representations."},{"key":"10.1016\/j.patcog.2026.113983_b24","doi-asserted-by":"crossref","unstructured":"R. Wang, C. Ma, G. Li, H. Xu, Y. Li, Z. Wang, You Think, You ACT: The New Task of Arbitrary Text to Motion Generation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 12012\u201312022.","DOI":"10.1109\/ICCV51701.2025.01117"},{"key":"10.1016\/j.patcog.2026.113983_b25","doi-asserted-by":"crossref","unstructured":"M. Petrovich, M.J. Black, G. Varol, Action-conditioned 3d human motion synthesis with transformer vae, in: IEEE International Conference on Computer Vision, 2021, pp. 10985\u201310995.","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"10.1016\/j.patcog.2026.113983_b26","series-title":"In Proceedings of the European Conference on Computer Vision","first-page":"480","article-title":"TEMOS: Generating diverse human motions from textual descriptions","author":"Petrovich","year":"2022"},{"key":"10.1016\/j.patcog.2026.113983_b27","doi-asserted-by":"crossref","unstructured":"J. Zhang, Y. Zhang, X. Cun, Y. Zhang, H. Zhao, H. Lu, X. Shen, Y. Shan, Generating human motion from textual descriptions with discrete representations, in: IEEE Conference on Computer Vision and Pattern Recognition, 2023, pp. 14730\u201314740.","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"10.1016\/j.patcog.2026.113983_b28","doi-asserted-by":"crossref","unstructured":"K. Karunratanakul, K. Preechakul, S. Suwajanakorn, S. Tang, Guided motion diffusion for controllable human motion synthesis, in: IEEE International Conference on Computer Vision, 2023, pp. 2151\u20132162.","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"10.1016\/j.patcog.2026.113983_b29","unstructured":"Y. Xie, V. Jampani, L. Zhong, D. Sun, H. Jiang, OmniControl: Control Any Joint at Any Time for Human Motion Generation, in: The Twelfth International Conference on Learning Representations."},{"issue":"6","key":"10.1016\/j.patcog.2026.113983_b30","doi-asserted-by":"crossref","first-page":"8068","DOI":"10.1109\/TII.2023.3266366","article-title":"LDCNet: Limb direction cues-aware network for flexible HPE in industrial behavioral biometrics systems","volume":"20","author":"Liu","year":"2023","journal-title":"IEEE Trans. Ind. Inform."},{"key":"10.1016\/j.patcog.2026.113983_b31","doi-asserted-by":"crossref","first-page":"8464","DOI":"10.1109\/TMM.2022.3197364","article-title":"EHPE: Skeleton cues-based gaussian coordinate encoding for efficient human pose estimation","volume":"26","author":"Liu","year":"2022","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113983_b32","doi-asserted-by":"crossref","unstructured":"M. Alikhani, F. Han, H. Ravi, M. Kapadia, V. Pavlovic, M. Stone, Cross-modal coherence for text-to-image retrieval, in: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, 2022, pp. 10427\u201310435, 10.","DOI":"10.1609\/aaai.v36i10.21285"},{"key":"10.1016\/j.patcog.2026.113983_b33","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111096","article-title":"Cross-modal independent matching network for image-text retrieval","volume":"159","author":"Ke","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113983_b34","doi-asserted-by":"crossref","unstructured":"P. Jin, H. Li, Z. Cheng, K. Li, X. Ji, C. Liu, L. Yuan, J. Chen, Diffusionret: Generative text-video retrieval with diffusion model, in: IEEE International Conference on Computer Vision, 2023, pp. 2470\u20132481.","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"10.1016\/j.patcog.2026.113983_b35","doi-asserted-by":"crossref","unstructured":"S. Zhao, C. Gao, Y. Shao, W.-S. Zheng, N. Sang, Weakly supervised text-based person re-identification, in: IEEE International Conference on Computer Vision, 2021, pp. 11395\u201311404.","DOI":"10.1109\/ICCV48922.2021.01120"},{"key":"10.1016\/j.patcog.2026.113983_b36","doi-asserted-by":"crossref","DOI":"10.1109\/TIP.2023.3327924","article-title":"Clip-driven fine-grained text-image person re-identification","author":"Yan","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.patcog.2026.113983_b37","doi-asserted-by":"crossref","unstructured":"S. Yan, N. Dong, J. Liu, L. Zhang, J. Tang, Learning comprehensive representations with richer self for text-to-image person re-identification, in: Proceedings of the 31st ACM International Conference on Multimedia, 2023, pp. 6202\u20136211.","DOI":"10.1145\/3581783.3611832"},{"key":"10.1016\/j.patcog.2026.113983_b38","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.patcog.2026.113983_b39","doi-asserted-by":"crossref","unstructured":"P. Guan, R. Pei, B. Shao, J. Liu, W. Li, J. Gu, H. Xu, S. Xu, Y. Yan, E.Y. Lam, Pidro: Parallel isomeric attention with dynamic routing for text-video retrieval, in: IEEE International Conference on Computer Vision, 2023, pp. 11164\u201311173.","DOI":"10.1109\/ICCV51070.2023.01025"},{"key":"10.1016\/j.patcog.2026.113983_b40","doi-asserted-by":"crossref","unstructured":"X. Wang, L. Zhu, Y. Yang, T2vlad: global-local sequence alignment for text-video retrieval, in: IEEE Conference on Computer Vision and Pattern Recognition, 2021, pp. 5079\u20135088.","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"10.1016\/j.patcog.2026.113983_b41","doi-asserted-by":"crossref","unstructured":"B. Fang, W. Wu, C. Liu, Y. Zhou, Y. Song, W. Wang, X. Shu, X. Ji, J. Wang, Uatvr: Uncertainty-adaptive text-video retrieval, in: IEEE International Conference on Computer Vision, 2023, pp. 13723\u201313733.","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"10.1016\/j.patcog.2026.113983_b42","series-title":"In Proceedings of the European Conference on Computer Vision","first-page":"390","article-title":"Uncertainty-aware sign language video retrieval with probability distribution modeling","author":"Wu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113983_b43","doi-asserted-by":"crossref","unstructured":"R. Rombach, A. Blattmann, D. Lorenz, P. Esser, B. Ommer, High-resolution image synthesis with latent diffusion models, in: IEEE Conference on Computer Vision and Pattern Recognition, 2022, pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10.1016\/j.patcog.2026.113983_b44","doi-asserted-by":"crossref","unstructured":"W. Weng, R. Feng, Y. Wang, Q. Dai, C. Wang, D. Yin, Z. Zhao, K. Qiu, J. Bao, Y. Yuan, et al., Art-v: Auto-regressive text-to-video generation with diffusion models, in: IEEE Conference on Computer Vision and Pattern Recognition, 2024, pp. 7395\u20137405.","DOI":"10.1109\/CVPRW63382.2024.00735"},{"key":"10.1016\/j.patcog.2026.113983_b45","doi-asserted-by":"crossref","unstructured":"X. Yang, X. Wang, Diffusion model as representation learner, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 18938\u201318949.","DOI":"10.1109\/ICCV51070.2023.01736"},{"key":"10.1016\/j.patcog.2026.113983_b46","doi-asserted-by":"crossref","unstructured":"W. Zhao, Y. Rao, Z. Liu, B. Liu, J. Zhou, J. Lu, Unleashing text-to-image diffusion models for visual perception, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 5729\u20135739.","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"10.1016\/j.patcog.2026.113983_b47","doi-asserted-by":"crossref","unstructured":"J. Xu, S. Liu, A. Vahdat, W. Byeon, X. Wang, S. De Mello, Open-vocabulary panoptic segmentation with text-to-image diffusion models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 2955\u20132966.","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"10.1016\/j.patcog.2026.113983_b48","unstructured":"Y. Zhu, Y. Wu, K. Olszewski, J. Ren, S. Tulyakov, Y. Yan, Discrete Contrastive Diffusion for Cross-Modal Music and Image Generation, in: International Conference on Learning Representations."},{"key":"10.1016\/j.patcog.2026.113983_b49","series-title":"Distilling the knowledge in a neural network","author":"Hinton","year":"2015"},{"key":"10.1016\/j.patcog.2026.113983_b50","doi-asserted-by":"crossref","unstructured":"C. Guo, X. Zuo, S. Wang, S. Zou, Q. Sun, A. Deng, M. Gong, L. Cheng, Action2motion: Conditioned generation of 3d human motions, in: Proceedings of the 28th ACM International Conference on Multimedia, 2020, pp. 2021\u20132029.","DOI":"10.1145\/3394171.3413635"},{"key":"10.1016\/j.patcog.2026.113983_b51","doi-asserted-by":"crossref","unstructured":"N. Mahmood, N. Ghorbani, N.F. Troje, G. Pons-Moll, M.J. Black, AMASS: Archive of motion capture as surface shapes, in: IEEE International Conference on Computer Vision, 2019, pp. 5442\u20135451.","DOI":"10.1109\/ICCV.2019.00554"},{"key":"10.1016\/j.patcog.2026.113983_b52","series-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"10.1016\/j.patcog.2026.113983_b53","doi-asserted-by":"crossref","unstructured":"D. Jiang, M. Ye, Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval, in: IEEE Conference on Computer Vision and Pattern Recognition, 2023, pp. 2787\u20132797.","DOI":"10.1109\/CVPR52729.2023.00273"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009489?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009489?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T14:53:49Z","timestamp":1780930429000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326009489"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":53,"alternative-id":["S0031320326009489"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113983","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Text-to-motion retrieval by text-to-motion generation","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113983","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113983"}}