{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:09:29Z","timestamp":1765343369209,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755168","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"9852-9861","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ANT: Adaptive Neural Temporal-Aware Text-to-Motion Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1966-6059","authenticated-orcid":false,"given":"Wenshuo","family":"Chen","sequence":"first","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3134-4208","authenticated-orcid":false,"given":"Kuimou","family":"Yu","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1676-1547","authenticated-orcid":false,"given":"Jia","family":"Haozhe","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2353-2436","authenticated-orcid":false,"given":"Kaishen","family":"Yuan","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6251-9694","authenticated-orcid":false,"given":"Zexu","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Electrical and Data Engineering (SEDE), University of Technology Sydney, Sydney, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1305-3097","authenticated-orcid":false,"given":"Bowen","family":"Tian","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-3132-9414","authenticated-orcid":false,"given":"Songning","family":"Lai","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6715-357X","authenticated-orcid":false,"given":"Hongru","family":"Xiao","sequence":"additional","affiliation":[{"name":"College of Civil Engineering, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4172-4935","authenticated-orcid":false,"given":"Erhang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chongxin, Shandong University, Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8600-7099","authenticated-orcid":false,"given":"Lei","family":"Wang","sequence":"additional","affiliation":[{"name":"Griffith University, Brisbane, Queensland, Australia and Data61\/CSIRO, Canberra, ACT, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4532-0924","authenticated-orcid":false,"given":"Yutao","family":"Yue","sequence":"additional","affiliation":[{"name":"Thrust of Artificial Intelligence and Thrust of Intelligent Transportation, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China and JITRI, Institute of Deep Perception Technology, Wuxi, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katie Millican Malcolm Reynolds Roman Ring Eliza Rutherford Serkan Cabi Tengda Han Zhitao Gong Sina Samangooei Marianne Monteiro Jacob Menick Sebastian Borgeaud Andrew Brock Aida Nematzadeh Sahand Sharifzadeh Mikolaj Binkowski Ricardo Barreira Oriol Vinyals Andrew Zisserman and Karen Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. arXiv:2204.14198 [cs.CV] https:\/\/arxiv. org\/abs\/2204.14198"},{"key":"e_1_3_2_2_2_1","volume-title":"TEACH: Temporal Action Composition for 3D Humans. arXiv:2209.04066 [cs.CV] https:\/\/arxiv.org\/abs\/2209.04066","author":"Athanasiou Nikos","year":"2022","unstructured":"Nikos Athanasiou, Mathis Petrovich, Michael J. Black, and G\u00fcl Varol. 2022. TEACH: Temporal Action Composition for 3D Humans. arXiv:2209.04066 [cs.CV] https:\/\/arxiv.org\/abs\/2209.04066"},{"key":"e_1_3_2_2_3_1","unstructured":"Wenshuo Chen Haozhe Jia Songning Lai Keming Wu Hongru Xiao Lijie Hu and Yutao Yue. 2025. Free-T2M: Frequency Enhanced Text-to-Motion Diffusion Model With Consistency Loss. arXiv:2501.18232 [cs.CV] https:\/\/arxiv.org\/abs\/ 2501.18232"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681034"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Xin Chen Biao Jiang Wen Liu Zilong Huang Bin Fu Tao Chen Jingyi Yu and Gang Yu. 2023. Executing your Commands via Motion Diffusion in Latent Space. arXiv:2212.04048 [cs.CV] https:\/\/arxiv.org\/abs\/2212.04048","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Wenxun Dai Ling-Hao Chen Jingbo Wang Jinpeng Liu Bo Dai and Yansong Tang. 2024. MotionLCM: Real-time Controllable Motion Generation via Latent Consistency Model. arXiv:2404.19759 [cs.CV] https:\/\/arxiv.org\/abs\/2404.19759","DOI":"10.1007\/978-3-031-72640-8_22"},{"key":"e_1_3_2_2_7_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_2_8_1","volume-title":"Sen Wang, and Li Cheng.","author":"Guo Chuan","year":"2023","unstructured":"Chuan Guo, Yuxuan Mu, Muhammad Gohar Javed, Sen Wang, and Li Cheng. 2023. MoMask: Generative Masked Modeling of 3D Human Motions. (2023). arXiv:2312.00063 [cs.CV]"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Chuan Guo Xinxin Zuo Sen Wang and Li Cheng. 2022. TM2T: Stochastic and Tokenized Modeling for the Reciprocal Generation of 3D Human Motions and Texts. arXiv:2207.01696 [cs.CV] https:\/\/arxiv.org\/abs\/2207.01696","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295408"},{"key":"e_1_3_2_2_15_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the Knowledge in a Neural Network. arXiv:1503.02531 [stat.ML] https:\/\/arxiv.org\/abs\/1503.02531"},{"key":"e_1_3_2_2_16_1","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-Free Diffusion Guidance. arXiv:2207.12598 [cs.LG] https:\/\/arxiv.org\/abs\/2207.12598"},{"key":"e_1_3_2_2_17_1","volume-title":"SALAD: Skeleton-aware Latent Diffusion for Text-driven Motion Generation and Editing. arXiv:2503.13836 [cs.CV] https:\/\/arxiv.org\/abs\/ 2503.13836","author":"Hong Seokhyeon","year":"2025","unstructured":"Seokhyeon Hong, Chaelin Kim, Serin Yoon, Junghyun Nam, Sihun Cha, and Junyong Noh. 2025. SALAD: Skeleton-aware Latent Diffusion for Text-driven Motion Generation and Editing. arXiv:2503.13836 [cs.CV] https:\/\/arxiv.org\/abs\/ 2503.13836"},{"key":"e_1_3_2_2_18_1","volume-title":"ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment. arXiv:2403.05135 [cs.CV]","author":"Hu Xiwei","year":"2024","unstructured":"Xiwei Hu, Rui Wang, Yixiao Fang, Bin Fu, Pei Cheng, and Gang Yu. 2024. ELLA: Equip Diffusion Models with LLM for Enhanced Semantic Alignment. arXiv:2403.05135 [cs.CV]"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Xun Huang and Serge Belongie. 2017. Arbitrary Style Transfer in Real-time with Adaptive Instance Normalization. arXiv:1703.06868 [cs.CV] https:\/\/arxiv.org\/ abs\/1703.06868","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Yiheng Huang Hui Yang Chuanchen Luo Yuxi Wang Shibiao Xu Zhaoxiang Zhang Man Zhang and Junran Peng. 2024. StableMoFusion: Towards Robust and Efficient Diffusion-based Motion Generation Framework. arXiv:2405.05691 [cs.CV] https:\/\/arxiv.org\/abs\/2405.05691","DOI":"10.1145\/3664647.3681657"},{"key":"e_1_3_2_2_21_1","unstructured":"Zhihan Huang Yuting Wei and Yuxin Chen. 2024. Denoising diffusion probabilistic models are optimally adaptive to unknown low dimensionality. arXiv:2410.18784 [cs.LG] https:\/\/arxiv.org\/abs\/2410.18784"},{"key":"e_1_3_2_2_22_1","unstructured":"Biao Jiang Xin Chen Wen Liu Jingyi Yu Gang Yu and Tao Chen. 2023. MotionGPT: Human Motion as a Foreign Language. arXiv:2306.14795 [cs.CV] https:\/\/arxiv.org\/abs\/2306.14795"},{"key":"e_1_3_2_2_23_1","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. DecoupledWeight Decay Regularization. arXiv:1711.05101 [cs.LG] https:\/\/arxiv.org\/abs\/1711.05101"},{"key":"e_1_3_2_2_24_1","unstructured":"Cheng Lu Yuhao Zhou Fan Bao Jianfei Chen Chongxuan Li and Jun Zhu. 2022. DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps. arXiv:2206.00927 [cs.LG] https:\/\/arxiv.org\/abs\/2206.00927"},{"key":"e_1_3_2_2_25_1","unstructured":"Cheng Lu Yuhao Zhou Fan Bao Jianfei Chen Chongxuan Li and Jun Zhu. 2023. DPM-Solver++: Fast Solver for Guided Sampling of Diffusion Probabilistic Models. arXiv:2211.01095 [cs.LG] https:\/\/arxiv.org\/abs\/2211.01095"},{"key":"e_1_3_2_2_26_1","volume":"201","author":"Mahmood Naureen","unstructured":"Naureen Mahmood, Nima Ghorbani, Nikolaus F. Troje, Gerard Pons-Moll, and Michael J. Black. 2019. AMASS: Archive of Motion Capture as Surface Shapes. In International Conference on Computer Vision. 5442--5451.","journal-title":"Michael J. Black."},{"key":"e_1_3_2_2_27_1","unstructured":"Zichong Meng Yiming Xie Xiaogang Peng Zeyu Han and Huaizu Jiang. 2024. Rethinking Diffusion for Text-Driven Human Motion Generation. arXiv:2411.16575 [cs.CV] https:\/\/arxiv.org\/abs\/2411.16575"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Mathis Petrovich Michael J. Black and G\u00fcl Varol. 2021. Action-Conditioned 3D Human Motion Synthesis with Transformer VAE. arXiv:2104.05670 [cs.CV] https:\/\/arxiv.org\/abs\/2104.05670","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"e_1_3_2_2_29_1","volume-title":"TEMOS: Generating diverse human motions from textual descriptions. arXiv:2204.14109 [cs.CV] https:\/\/arxiv.org\/abs\/2204.14109","author":"Petrovich Mathis","year":"2022","unstructured":"Mathis Petrovich, Michael J. Black, and G\u00fcl Varol. 2022. TEMOS: Generating diverse human motions from textual descriptions. arXiv:2204.14109 [cs.CV] https:\/\/arxiv.org\/abs\/2204.14109"},{"key":"e_1_3_2_2_30_1","volume-title":"Korrawe Karunratanakul, Pu Wang, Hongfei Xue, Chen Chen, Chuan Guo, Junli Cao, Jian Ren, and Sergey Tulyakov.","author":"Pinyoanuntapong Ekkasit","year":"2024","unstructured":"Ekkasit Pinyoanuntapong, Muhammad Usama Saleem, Korrawe Karunratanakul, Pu Wang, Hongfei Xue, Chen Chen, Chuan Guo, Junli Cao, Jian Ren, and Sergey Tulyakov. 2024. ControlMM: Controllable Masked Motion Generation. arXiv:2410.10780 [cs.CV] https:\/\/arxiv.org\/abs\/2410.10780"},{"key":"e_1_3_2_2_31_1","volume-title":"Pu Wang, Minwoo Lee, Srijan Das, and Chen Chen.","author":"Pinyoanuntapong Ekkasit","year":"2024","unstructured":"Ekkasit Pinyoanuntapong, Muhammad Usama Saleem, Pu Wang, Minwoo Lee, Srijan Das, and Chen Chen. 2024. BAMM: Bidirectional Autoregressive Motion Model. arXiv:2403.19435 [cs.CV] https:\/\/arxiv.org\/abs\/2403.19435"},{"key":"e_1_3_2_2_32_1","volume-title":"MMM: Generative Masked Motion Model. arXiv:2312.03596 [cs.CV] https:\/\/arxiv.org\/ abs\/2312.03596","author":"Pinyoanuntapong Ekkasit","year":"2024","unstructured":"Ekkasit Pinyoanuntapong, Pu Wang, Minwoo Lee, and Chen Chen. 2024. MMM: Generative Masked Motion Model. arXiv:2312.03596 [cs.CV] https:\/\/arxiv.org\/ abs\/2312.03596"},{"key":"e_1_3_2_2_33_1","unstructured":"Matthias Plappert Christian Mandery and Tamim Asfour. [n. d.]. The KIT Motion-Language Dataset. Big Data ([n. d.])."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Yurui Qian Qi Cai Yingwei Pan Yehao Li Ting Yao Qibin Sun and Tao Mei. 2024. Boosting Diffusion Models with Moving Average Sampling in Frequency Domain. arXiv:2403.17870 [cs.CV] https:\/\/arxiv.org\/abs\/2403.17870","DOI":"10.1109\/CVPR52733.2024.00851"},{"key":"e_1_3_2_2_35_1","volume-title":"International conference on machine learning. PmLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748--8763."},{"key":"e_1_3_2_2_36_1","volume":"202","author":"Raffel Colin","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 [cs.LG] https:\/\/arxiv.org\/abs\/1910.10683","journal-title":"Peter J. Liu."},{"key":"e_1_3_2_2_37_1","unstructured":"Jiaming Song Chenlin Meng and Stefano Ermon. 2022. Denoising Diffusion Implicit Models. arXiv:2010.02502 [cs.LG] https:\/\/arxiv.org\/abs\/2010.02502"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Guy Tevet Brian Gordon Amir Hertz Amit H. Bermano and Daniel Cohen- Or. 2022. MotionCLIP: Exposing Human Motion Generation to CLIP Space. arXiv:2203.08063 [cs.CV] https:\/\/arxiv.org\/abs\/2203.08063","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"e_1_3_2_2_39_1","volume-title":"Daniel Cohen- Or, and Amit H. Bermano","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yonatan Shafir, Daniel Cohen- Or, and Amit H. Bermano. 2022. Human Motion Diffusion Model. arXiv:2209.14916 [cs.CV] https:\/\/arxiv.org\/abs\/2209.14916"},{"key":"e_1_3_2_2_40_1","volume-title":"Human motion diffusion model. arXiv preprint arXiv:2209.14916","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yonatan Shafir, Daniel Cohen-Or, and Amit H Bermano. 2022. Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)."},{"key":"e_1_3_2_2_41_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi Chenliang Li Yuanhong Xu Hehong Chen Junfeng Tian Qi Qian Ji Zhang Fei Huang and Jingren Zhou. 2024. mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality. arXiv:2304.14178 [cs.CL] https:\/\/arxiv.org\/abs\/2304.14178"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Ye Yuan Jiaming Song Umar Iqbal Arash Vahdat and Jan Kautz. 2023. PhysDiff: Physics-Guided Human Motion Diffusion Model. arXiv:2212.02500 [cs.CV] https: \/\/arxiv.org\/abs\/2212.02500","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Beichen Zhang Pan Zhang Xiaoyi Dong Yuhang Zang and Jiaqi Wang. 2024. Long-CLIP: Unlocking the Long-Text Capability of CLIP. arXiv:2403.15378 [cs.CV] https:\/\/arxiv.org\/abs\/2403.15378","DOI":"10.1007\/978-3-031-72983-6_18"},{"key":"e_1_3_2_2_45_1","unstructured":"Jianrong Zhang Yangsong Zhang Xiaodong Cun Shaoli Huang Yong Zhang Hongwei Zhao Hongtao Lu and Xi Shen. 2023. T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations. arXiv:2301.06052 [cs.CV] https:\/\/arxiv.org\/abs\/2301.06052"},{"key":"e_1_3_2_2_46_1","unstructured":"Mingyuan Zhang Zhongang Cai Liang Pan Fangzhou Hong Xinying Guo Lei Yang and Ziwei Liu. 2022. MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model. arXiv:2208.15001 [cs.CV] https:\/\/arxiv.org\/ abs\/2208.15001"},{"key":"e_1_3_2_2_47_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model","author":"Zhang Mingyuan","year":"2024","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2024. Motiondiffuse: Text-driven human motion generation with diffusion model. IEEE transactions on pattern analysis and machine intelligence 46, 6 (2024), 4115--4128."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00040"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755168","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:05:03Z","timestamp":1765343103000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755168"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":48,"alternative-id":["10.1145\/3746027.3755168","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755168","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}