{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,19]],"date-time":"2025-11-19T07:09:28Z","timestamp":1763536168920,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,11,15]],"date-time":"2023-11-15T00:00:00Z","timestamp":1700006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,11,15]]},"DOI":"10.1145\/3623264.3624443","type":"proceedings-article","created":{"date-parts":[[2023,10,29]],"date-time":"2023-10-29T21:02:31Z","timestamp":1698613351000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Objective Evaluation Metric for Motion Generative Models: Validating Fr\u00e9chet Motion Distance on Foot Skating and Over-smoothing Artifacts."],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5729-700X","authenticated-orcid":false,"given":"Antoine","family":"Maiorca","sequence":"first","affiliation":[{"name":"ISIA Lab, University of Mons, Belgium"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2276-8494","authenticated-orcid":false,"given":"Hugo","family":"Bohy","sequence":"additional","affiliation":[{"name":"ISIA Lab, University of Mons, Belgium"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4286-3421","authenticated-orcid":false,"given":"Youngwoo","family":"Yoon","sequence":"additional","affiliation":[{"name":"Electronics and Telecommunications Research Institute (ETRI), Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7024-2150","authenticated-orcid":false,"given":"Thierry","family":"Dutoit","sequence":"additional","affiliation":[{"name":"ISIA Lab, University of Mons, Belgium"}]}],"member":"320","published-online":{"date-parts":[[2023,11,15]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392462"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392469"},{"key":"e_1_3_2_1_3_1","volume-title":"Better fine-tuning by reducing representational collapse. arXiv preprint arXiv:2008.03156","author":"Aghajanyan Armen","year":"2020","unstructured":"Armen Aghajanyan, Akshat Shrivastava, Anchit Gupta, Naman Goyal, Luke Zettlemoyer, and Sonal Gupta. 2020. Better fine-tuning by reducing representational collapse. arXiv preprint arXiv:2008.03156 (2020)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3066111"},{"key":"e_1_3_2_1_5_1","volume-title":"please: A spatio-temporal transformer for 3d human motion prediction. arXiv preprint arXiv:2004.08692 2, 3","author":"Aksan Emre","year":"2020","unstructured":"Emre Aksan, Peng Cao, Manuel Kaufmann, and Otmar Hilliges. 2020. Attention, please: A spatio-temporal transformer for 3d human motion prediction. arXiv preprint arXiv:2004.08692 2, 3 (2020), 5."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547797"},{"key":"e_1_3_2_1_7_1","volume-title":"Mae-ast: Masked autoencoding audio spectrogram transformer. arXiv preprint arXiv:2203.16691","author":"Baade Alan","year":"2022","unstructured":"Alan Baade, Puyuan Peng, and David Harwath. 2022. Mae-ast: Masked autoencoding audio spectrogram transformer. arXiv preprint arXiv:2203.16691 (2022)."},{"key":"e_1_3_2_1_8_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020), 12449\u201312460."},{"key":"e_1_3_2_1_9_1","volume-title":"Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254","author":"Bao Hangbo","year":"2021","unstructured":"Hangbo Bao, Li Dong, Songhao Piao, and Furu Wei. 2021. Beit: Bert pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)."},{"key":"e_1_3_2_1_10_1","volume-title":"Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150","author":"Beltagy Iz","year":"2020","unstructured":"Iz Beltagy, Matthew\u00a0E Peters, and Arman Cohan. 2020. Longformer: The long-document transformer. arXiv preprint arXiv:2004.05150 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"Began: Boundary equilibrium generative adversarial networks. arXiv preprint arXiv:1703.10717","author":"Berthelot David","year":"2017","unstructured":"David Berthelot, Thomas Schumm, and Luke Metz. 2017. Began: Boundary equilibrium generative adversarial networks. arXiv preprint arXiv:1703.10717 (2017)."},{"key":"e_1_3_2_1_12_1","volume-title":"Demystifying MMD GANs. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=r1lUOzWCW","author":"Bi\u0144kowski Miko\u0142aj","year":"2018","unstructured":"Miko\u0142aj Bi\u0144kowski, Dougal\u00a0J. Sutherland, Michael Arbel, and Arthur Gretton. 2018. Demystifying MMD GANs. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=r1lUOzWCW"},{"key":"e_1_3_2_1_13_1","volume-title":"Pros and Cons of GAN Evaluation Measures. CoRR abs\/1802.03446","author":"Borji Ali","year":"2018","unstructured":"Ali Borji. 2018. Pros and Cons of GAN Evaluation Measures. CoRR abs\/1802.03446 (2018). arXiv:1802.03446http:\/\/arxiv.org\/abs\/1802.03446"},{"key":"e_1_3_2_1_14_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems 33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020), 1877\u20131901."},{"volume-title":"Proceedings of the 2023 International Conference on Computer Graphics Theory and Applications.","author":"Chang Ziyi","key":"e_1_3_2_1_15_1","unstructured":"Ziyi Chang, Edmund J.\u00a0C. Findlay, Haozheng Zhang, and Hubert P.\u00a0H. Shum. 2022. Unifying Human Motion Synthesis and Style Transfer with Denoising Diffusion Probabilistic Models. In Proceedings of the 2023 International Conference on Computer Graphics Theory and Applications."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413669"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00611"},{"key":"e_1_3_2_1_18_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. CoRR abs\/1810.04805 (2018). arXiv:1810.04805http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_1_19_1","volume-title":"Jukebox: A generative model for music. arXiv preprint arXiv:2005.00341","author":"Dhariwal Prafulla","year":"2020","unstructured":"Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong\u00a0Wook Kim, Alec Radford, and Ilya Sutskever. 2020. Jukebox: A generative model for music. arXiv preprint arXiv:2005.00341 (2020)."},{"key":"e_1_3_2_1_20_1","volume-title":"Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. Curran Associates","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. Curran Associates, Inc., 8780\u20138794. https:\/\/proceedings.neurips.cc\/paper\/2021\/file\/49ad23d1ec9fa4bd8d77d02681df5cfa-Paper.pdf"},{"key":"e_1_3_2_1_21_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/0047-259X(82)90077-X"},{"key":"e_1_3_2_1_23_1","volume-title":"Recurrent Network Models for Human Dynamics. 2015 IEEE International Conference on Computer Vision (ICCV)","author":"Fragkiadaki Katerina","year":"2015","unstructured":"Katerina Fragkiadaki, Sergey Levine, Panna Felsen, and Jitendra Malik. 2015. Recurrent Network Models for Human Dynamics. 2015 IEEE International Conference on Computer Vision (ICCV) (2015), 4346\u20134354."},{"key":"e_1_3_2_1_24_1","unstructured":"Shanghua Gao Pan Zhou Ming-Ming Cheng and Shuicheng Yan. 2023. Masked Diffusion Transformer is a Strong Image Synthesizer. arxiv:2303.14389\u00a0[cs.CV]"},{"key":"e_1_3_2_1_25_1","volume-title":"Contrastive audio-visual masked autoencoder. arXiv preprint arXiv:2210.07839","author":"Gong Yuan","year":"2022","unstructured":"Yuan Gong, Andrew Rouditchenko, Alexander\u00a0H Liu, David Harwath, Leonid Karlinsky, Hilde Kuehne, and James Glass. 2022. Contrastive audio-visual masked autoencoder. arXiv preprint arXiv:2210.07839 (2022)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"e_1_3_2_1_27_1","volume-title":"Long Text Generation via Adversarial Training with Leaked Information. arXiv preprint arXiv:1709.08624","author":"Guo Jiaxian","year":"2017","unstructured":"Jiaxian Guo, Sidi Lu, Han Cai, Weinan Zhang, Yong Yu, and Jun Wang. 2017. Long Text Generation via Adversarial Training with Leaked Information. arXiv preprint arXiv:1709.08624 (2017)."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 12633\u201312646","author":"Hatamizadeh Ali","year":"2023","unstructured":"Ali Hatamizadeh, Hongxu Yin, Greg Heinrich, Jan Kautz, and Pavlo Molchanov. 2023. Global context vision transformers. In International Conference on Machine Learning. PMLR, 12633\u201312646."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417836"},{"key":"e_1_3_2_1_30_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073663"},{"key":"e_1_3_2_1_32_1","volume-title":"A Two-part Transformer Network for Controllable Motion Synthesis. arXiv preprint arXiv:2304.12571","author":"Hou Shuaiying","year":"2023","unstructured":"Shuaiying Hou, Hongyu Tao, Hujun Bao, and Weiwei Xu. 2023. A Two-part Transformer Network for Controllable Motion Synthesis. arXiv preprint arXiv:2304.12571 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"MAViL: Masked Audio-Video Learners. arXiv preprint arXiv:2212.08071","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Vasu Sharma, Hu Xu, Chaitanya Ryali, Haoqi Fan, Yanghao Li, Shang-Wen Li, Gargi Ghosh, Jitendra Malik, and Christoph Feichtenhofer. 2022a. MAViL: Masked Audio-Video Learners. arXiv preprint arXiv:2212.08071 (2022)."},{"key":"e_1_3_2_1_34_1","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wojciech Galuba, Florian Metze, and Christoph Feichtenhofer. 2022b. Masked autoencoders that listen. Advances in Neural Information Processing Systems 35 (2022), 28708\u201328720.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_35_1","volume-title":"Dance revolution: Long-term dance generation with music via curriculum learning. arXiv preprint arXiv:2006.06119","author":"Huang Ruozi","year":"2020","unstructured":"Ruozi Huang, Huang Hu, Wei Wu, Kei Sawada, Mi Zhang, and Daxin Jiang. 2020. Dance revolution: Long-term dance generation with music via curriculum learning. arXiv preprint arXiv:2006.06119 (2020)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"e_1_3_2_1_37_1","volume-title":"In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Jain A","year":"2016","unstructured":"A Jain, AR Zamir, S Savarese, and A Saxena. 2016. Deep learning on spatio-temporal graphs. In In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, NV, USA. 27\u201330."},{"key":"e_1_3_2_1_38_1","unstructured":"Maurice\u00a0George Kendall. 1948. Rank correlation methods. (1948)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Kevin Kilgour Mauricio Zuluaga Dominik Roblek and Matthew Sharifi. 2019. Fr\u00e9chet Audio Distance: A Reference-Free Metric for Evaluating Music Enhancement Algorithms.. In INTERSPEECH. 2350\u20132354.","DOI":"10.21437\/Interspeech.2019-2219"},{"key":"e_1_3_2_1_40_1","volume-title":"Refining Generative Process with Discriminator Guidance in Score-based Diffusion Models. arXiv preprint arXiv:2211.17091","author":"Kim Dongjun","year":"2022","unstructured":"Dongjun Kim, Yeongmin Kim, Wanmo Kang, and Il-Chul Moon. 2022. Refining Generative Process with Discriminator Guidance in Score-based Diffusion Models. arXiv preprint arXiv:2211.17091 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Evaluating gesture-generation in a large-scale open challenge: The GENEA Challenge","author":"Kucherenko Taras","year":"2022","unstructured":"Taras Kucherenko, Pieter Wolfert, Youngwoo Yoon, Carla Viegas, Teodor Nikolov, Mihail Tsakov, and Gustav\u00a0Eje Henter. 2023. Evaluating gesture-generation in a large-scale open challenge: The GENEA Challenge 2022. arXiv preprint arXiv:2303.08737 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-72113-8_20"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763\u2013772","author":"Lee Gilwoo","year":"2019","unstructured":"Gilwoo Lee, Zhiwei Deng, Shugao Ma, Takaaki Shiratori, Siddhartha\u00a0S Srinivasa, and Yaser Sheikh. 2019. Talking with hands 16.2 m: A large-scale dataset of synchronized body-finger motion and audio for conversational motion analysis and synthesis. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 763\u2013772."},{"key":"e_1_3_2_1_44_1","volume-title":"Priorgrad: Improving conditional denoising diffusion models with data-driven adaptive prior. arXiv preprint arXiv:2106.06406","author":"Kim Heeseung","year":"2021","unstructured":"Sang-gil Lee, Heeseung Kim, Chaehun Shin, Xu Tan, Chang Liu, Qi Meng, Tao Qin, Wei Chen, Sungroh Yoon, and Tie-Yan Liu. 2021. Priorgrad: Improving conditional denoising diffusion models with data-driven adaptive prior. arXiv preprint arXiv:2106.06406 (2021)."},{"key":"e_1_3_2_1_45_1","first-page":"23689","article-title":"Binauralgrad: A two-stage conditional diffusion probabilistic model for binaural audio synthesis","volume":"35","author":"Leng Yichong","year":"2022","unstructured":"Yichong Leng, Zehua Chen, Junliang Guo, Haohe Liu, Jiawei Chen, Xu Tan, Danilo Mandic, Lei He, Xiangyang Li, Tao Qin, 2022. Binauralgrad: A two-stage conditional diffusion probabilistic model for binaural audio synthesis. Advances in Neural Information Processing Systems 35 (2022), 23689\u201323700.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","volume-title":"Learning to generate diverse dance motions with transformer. arXiv preprint arXiv:2008.08171","author":"Li Jiaman","year":"2020","unstructured":"Jiaman Li, Yihang Yin, Hang Chu, Yi Zhou, Tingwu Wang, Sanja Fidler, and Hao Li. 2020. Learning to generate diverse dance motions with transformer. arXiv preprint arXiv:2008.08171 (2020)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00101"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"e_1_3_2_1_49_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74\u201381. https:\/\/aclanthology.org\/W04-1013"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392422"},{"key":"e_1_3_2_1_51_1","volume-title":"An improved evaluation framework for generative adversarial networks. arXiv preprint arXiv:1803.07474","author":"Liu Shaohui","year":"2018","unstructured":"Shaohui Liu, Yi Wei, Jiwen Lu, and Jie Zhou. 2018. An improved evaluation framework for generative adversarial networks. arXiv preprint arXiv:1803.07474 (2018)."},{"key":"e_1_3_2_1_52_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00036"},{"key":"e_1_3_2_1_54_1","unstructured":"Liqian Ma Xu Jia Qianru Sun Bernt Schiele Tinne Tuytelaars and Luc Van\u00a0Gool. 2017. Pose guided person image generation. In Advances in Neural Information Processing Systems. 405\u2013415."},{"key":"e_1_3_2_1_55_1","volume-title":"Towards Lightweight Neural Animation: Exploration of Neural Network Pruning in Mixture of Experts-based Animation Models. arXiv preprint arXiv:2201.04042","author":"Maiorca Antoine","year":"2022","unstructured":"Antoine Maiorca, Nathan Hubens, Sohaib Laraba, and Thierry Dutoit. 2022a. Towards Lightweight Neural Animation: Exploration of Neural Network Pruning in Mixture of Experts-based Animation Models. arXiv preprint arXiv:2201.04042 (2022)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3532719.3543228"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00958"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","unstructured":"Julieta Martinez Michael Black and Javier Romero. 2017. On Human Motion Prediction Using Recurrent Neural Networks. 4674\u20134683. https:\/\/doi.org\/10.1109\/CVPR.2017.497","DOI":"10.1109\/CVPR.2017.497"},{"key":"e_1_3_2_1_59_1","volume-title":"International Conference on Learning Representations.","author":"Morozov Stanislav","year":"2020","unstructured":"Stanislav Morozov, Andrey Voynov, and Artem Babenko. 2020. On self-supervised image representations for GAN evaluation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"e_1_3_2_1_62_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_63_1","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv:2112.10752\u00a0[cs.CV]"},{"key":"e_1_3_2_1_64_1","volume-title":"Improved techniques for training gans. Advances in neural information processing systems 29","author":"Salimans Tim","year":"2016","unstructured":"Tim Salimans, Ian Goodfellow, Wojciech Zaremba, Vicki Cheung, Alec Radford, and Xi Chen. 2016. Improved techniques for training gans. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_65_1","volume-title":"Towards Device Efficient Conditional Image Generation. In 33rd British Machine Vision Conference 2022, BMVC 2022","author":"Shah A","year":"2022","unstructured":"Nisarg\u00a0A Shah and Gaurav Bharaj. 2022. Towards Device Efficient Conditional Image Generation. In 33rd British Machine Vision Conference 2022, BMVC 2022, London, UK, November 21-24, 2022. BMVA Press. https:\/\/bmvc2022.mpi-inf.mpg.de\/0689.pdf"},{"key":"e_1_3_2_1_66_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_67_1","volume-title":"Super-convergence: Very fast training of neural networks using large learning rates. In Artificial intelligence and machine learning for multi-domain operations applications, Vol.\u00a011006","author":"Smith N","year":"2019","unstructured":"Leslie\u00a0N Smith and Nicholay Topin. 2019. Super-convergence: Very fast training of neural networks using large learning rates. In Artificial intelligence and machine learning for multi-domain operations applications, Vol.\u00a011006. SPIE, 369\u2013386."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530178"},{"key":"e_1_3_2_1_69_1","volume-title":"Rethinking the Inception Architecture for Computer Vision. CoRR abs\/1512.00567","author":"Szegedy Christian","year":"2015","unstructured":"Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. 2015. Rethinking the Inception Architecture for Computer Vision. CoRR abs\/1512.00567 (2015). arXiv:1512.00567http:\/\/arxiv.org\/abs\/1512.00567"},{"key":"e_1_3_2_1_70_1","volume-title":"Human Motion Diffusion Model. arXiv preprint arXiv:2209.14916","author":"Tevet Guy","year":"2022","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yonatan Shafir, Amit\u00a0H Bermano, and Daniel Cohen-Or. 2022. Human Motion Diffusion Model. arXiv preprint arXiv:2209.14916 (2022)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1186\/1687-6180-2012-72"},{"key":"e_1_3_2_1_72_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01981"},{"key":"e_1_3_2_1_74_1","volume-title":"Scene-aware Generative Network for Human Motion Synthesis. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Wang Jingbo","year":"2021","unstructured":"Jingbo Wang, Sijie Yan, Bo Dai, and Dahua Lin. 2021. Scene-aware Generative Network for Human Motion Synthesis. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 12201\u201312210."},{"key":"e_1_3_2_1_75_1","volume-title":"NEURAL MARIONETTE: A Transformer-based Multi-action Human Motion Synthesis System. arXiv preprint arXiv:2209.13204","author":"Wang Weiqiang","year":"2022","unstructured":"Weiqiang Wang, Xuefei Zhe, Huan Chen, Di Kang, Tingguang Li, Ruizhi Chen, and Linchao Bao. 2022b. NEURAL MARIONETTE: A Transformer-based Multi-action Human Motion Synthesis System. arXiv preprint arXiv:2209.13204 (2022)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.3390\/a13120319"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.226"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00449"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3558058"},{"key":"e_1_3_2_1_83_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3272127.3275090"},{"key":"e_1_3_2_1_85_1","volume-title":"Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001","author":"Zhang Mingyuan","year":"2022","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2022. Motiondiffuse: Text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485664"}],"event":{"name":"MIG '23: The 16th ACM SIGGRAPH Conference on Motion, Interaction and Games","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"],"location":"Rennes France","acronym":"MIG '23"},"container-title":["ACM SIGGRAPH Conference on Motion Interaction and Games"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623264.3624443","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3623264.3624443","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T16:22:44Z","timestamp":1756484564000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623264.3624443"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,15]]},"references-count":86,"alternative-id":["10.1145\/3623264.3624443","10.1145\/3623264"],"URL":"https:\/\/doi.org\/10.1145\/3623264.3624443","relation":{},"subject":[],"published":{"date-parts":[[2023,11,15]]},"assertion":[{"value":"2023-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}