{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T07:12:26Z","timestamp":1773040346758,"version":"3.50.1"},"reference-count":97,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T00:00:00Z","timestamp":1633046400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,10]]},"DOI":"10.1109\/iccv48922.2021.01315","type":"proceedings-article","created":{"date-parts":[[2022,2,28]],"date-time":"2022-02-28T22:08:02Z","timestamp":1646086082000},"page":"13381-13392","source":"Crossref","is-referenced-by-count":436,"title":["AI Choreographer: Music Conditioned 3D Dance Generation with AIST++"],"prefix":"10.1109","author":[{"given":"Ruilong","family":"Li","sequence":"first","affiliation":[{"name":"University of Southern California"}]},{"given":"Shan","family":"Yang","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"David A.","family":"Ross","sequence":"additional","affiliation":[{"name":"Google Research"}]},{"given":"Angjoo","family":"Kanazawa","sequence":"additional","affiliation":[{"name":"Google Research"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.248"},{"key":"ref38","first-page":"958","article-title":"Multi-modal dense video captioning","author":"iashin","year":"2020","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073663"},{"key":"ref32","first-page":"6626","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","author":"heusel","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00723"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/1230100.1230123"},{"key":"ref37","doi-asserted-by":"crossref","first-page":"185:1","DOI":"10.1145\/3272127.3275108","article-title":"Deep inertial poser learning to reconstruct human pose from sparseinertial measurements in real time","volume":"37","author":"huang","year":"2018","journal-title":"ACM Trans on Graphics (Proc of SIGGRAPH)"},{"key":"ref36","article-title":"Dance revolution: Long-term dance generation with music via curriculum learning","author":"huang","year":"2021","journal-title":"International Conference on Learning Representations"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2820903.2820918"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925975"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00059"},{"key":"ref29","article-title":"fairmotion - tools to load, process and visualize motion capture data","author":"gopinath","year":"2020"},{"key":"ref20","first-page":"501","article-title":"Example-based automatic music-driven conventional dance motion synthesis","volume":"18","author":"fan","year":"2011","journal-title":"IEEE Transactions on Visualization and Computer Graphics"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3267851.3267898"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"11","DOI":"10.1016\/j.cag.2020.09.009","article-title":"Learning to dance: A graph convolutional adversarial network to generate realistic dance motions from audio","volume":"94","author":"ferreira","year":"0","journal-title":"Computers & Graphics"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.494"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2020.04.007"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_44"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1006\/cviu.2000.0894"},{"key":"ref50","article-title":"The dancing species: how moving together in time helps make us human","author":"lamothe","year":"2019","journal-title":"Aeon"},{"key":"ref51","article-title":"Dancing to music","author":"lee","year":"2019"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00554"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00754"},{"key":"ref57","doi-asserted-by":"crossref","DOI":"10.1145\/2816795.2818013","article-title":"SMPL: A skinned multi-person linear model","author":"loper","year":"2015","journal-title":"SIGGRAPH Asia"},{"key":"ref56","article-title":"Auto-conditioned recurrent networks for extended complex human motion synthesis","author":"li","year":"2018","journal-title":"ICLRE"},{"key":"ref55","article-title":"Learning to generate diverse dance motions with transformer","author":"li","year":"2020"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00902"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311539"},{"key":"ref52","article-title":"Listen to dance: Music-driven choreography generation using autoregressive encoder-decoder network","author":"lee","year":"2018"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.573"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00724"},{"key":"ref3","article-title":"Attention, please: A spatio-temporal transformer for 3d human motion prediction","author":"aksan","year":"2020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.471"},{"key":"ref5","first-page":"26","article-title":"Groovenet: Real-time music-driven dance movement generation using artificial neural networks","volume":"8","author":"alemi","year":"2017","journal-title":"Networks"},{"key":"ref8","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"bengio","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093627"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/566654.566606"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/VR50410.2021.00037"},{"key":"ref46","article-title":"Vibe: Video inference for human body pose and shape estimation","author":"kocabas","year":"2019"},{"key":"ref45","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/1401132.1401202"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413848"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683343"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003750"},{"key":"ref73","doi-asserted-by":"crossref","first-page":"209","DOI":"10.1145\/3355089.3356505","article-title":"Neural state machine for character-scene interactions","volume":"38","author":"starke","year":"2019","journal-title":"ACM Trans Graph"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00790"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2006.00964.x"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.48"},{"key":"ref76","article-title":"Learning video representations using contrastive bidirectional transformer","author":"sun","year":"2019"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392450"},{"key":"ref75","year":"2020"},{"key":"ref78","article-title":"Deepdance: Music-to-dance motion choreography with adversarial learning","author":"sun","year":"2020","journal-title":"IEEE Transactions on Multimedia"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240526"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/1186822.1073247"},{"key":"ref61","first-page":"649","article-title":"Dance music, movement and tempo preferences","author":"moelants","year":"2003","journal-title":"Proceedings of the 5th Triennial ESCOM Conference"},{"key":"ref63","first-page":"83","article-title":"Fmdistance: A fast and effective distance function for motion capture data","author":"onuma","year":"2008","journal-title":"In Eurographics (Short papers)"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CA.2000.889031"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00444"},{"key":"ref66","article-title":"Improving language understanding by generative pre-training","author":"radford","year":"2018"},{"key":"ref67","article-title":"Music-oriented dance video synthesis with pose perceptual loss","author":"ren","year":"2019"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413932"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2977333"},{"key":"ref69","first-page":"3171","article-title":"Fastspeech: Fast, robust and controllable text to speech","author":"ren","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref1","year":"0"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201366"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.496"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414005"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8851872"},{"key":"ref96","article-title":"Music2dance: Music-driven dance generation using wavenet","author":"zhuang","year":"2020"},{"key":"ref97","article-title":"Towards 3d dance motion synthesis and control","author":"zhuang","year":"2020"},{"key":"ref10","article-title":"Learning statistical models of human motion","volume":"2000","author":"bowden","year":"2000","journal-title":"IEEE Workshop on Human Modeling Analysis and Synthesis CVPR"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/344779.344865"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.173"},{"key":"ref13","article-title":"Baby talk: Understanding and generating image descriptions","year":"0"},{"key":"ref14","article-title":"OpenPose: realtime multi-person 2D pose estimation using Part Affinity Fields","author":"cao","year":"2018"},{"key":"ref15","first-page":"1","article-title":"Choreomaster: choreography-oriented music-driven dance synthesis","volume":"40","author":"chen","year":"2021","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00156"},{"key":"ref82","first-page":"501","article-title":"Aist dance video database: Multi-genre, multi-dancer, and multi-camera database for dance information processing","author":"tsuchida","year":"2019","journal-title":"Proceedings of the 20th International Society for Music Information Retrieval Conference ISMIR 2019"},{"key":"ref17","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018"},{"key":"ref81","article-title":"Feel the music: Automatically generating a dance for an input song","author":"tendulkar","year":"2020"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2895266"},{"key":"ref84","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref19","first-page":"3059","article-title":"Weakly supervised dense event captioning in videos","author":"duan","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682804"},{"key":"ref80","doi-asserted-by":"crossref","first-page":"1025","DOI":"10.1145\/1553374.1553505","article-title":"Factored conditional restricted boltzmann machines for modeling motion style","author":"taylor","year":"2009","journal-title":"Proceedings of the 26th Annual International Conference on Machine Learning"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413581"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref86","article-title":"Translating videos to natural language using deep recurrent neural networks","author":"venugopalan","year":"2014"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00901"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00722"}],"event":{"name":"2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Montreal, QC, Canada","start":{"date-parts":[[2021,10,10]]},"end":{"date-parts":[[2021,10,17]]}},"container-title":["2021 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9709627\/9709628\/09710065.pdf?arnumber=9710065","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,28]],"date-time":"2023-01-28T03:09:25Z","timestamp":1674875365000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9710065\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10]]},"references-count":97,"URL":"https:\/\/doi.org\/10.1109\/iccv48922.2021.01315","relation":{},"subject":[],"published":{"date-parts":[[2021,10]]}}}