{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,3]],"date-time":"2026-07-03T19:34:53Z","timestamp":1783107293834,"version":"3.54.6"},"reference-count":108,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"publisher","award":["2022ZD0160201"],"award-info":[{"award-number":["2022ZD0160201"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72192821,62472282"],"award-info":[{"award-number":["72192821,62472282"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["YG2023QNA35"],"award-info":[{"award-number":["YG2023QNA35"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01239","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"13336-13348","source":"Crossref","is-referenced-by-count":7,"title":["Go to Zero: Towards Zero-Shot Motion Generation with Million-Scale Data"],"prefix":"10.1109","author":[{"given":"Ke","family":"Fan","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shunlin","family":"Lu","sequence":"additional","affiliation":[{"name":"CUHK,Shenzhen"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minyue","family":"Dai","sequence":"additional","affiliation":[{"name":"Fudan University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Runyi","family":"Yu","sequence":"additional","affiliation":[{"name":"HKUST"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lixing","family":"Xiao","sequence":"additional","affiliation":[{"name":"Zhejiang University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhiyang","family":"Dou","sequence":"additional","affiliation":[{"name":"HKU"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junting","family":"Dong","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lizhuang","family":"Ma","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jingbo","family":"Wang","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460608"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2019.00084"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00051"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00051"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/VR50410.2021.00037"},{"key":"ref8","volume-title":"Video generation models as world simulators","author":"Brooks","year":"2024"},{"key":"ref9","article-title":"Language models are few-shot learners","volume":"abs\/2005.14165","author":"Brown","year":"2020","journal-title":"ArXiv"},{"key":"ref10","article-title":"Dancetogether! identity-preserving multi-person interactive video generation","author":"Chen","year":"2025","journal-title":"arXiv preprint"},{"key":"ref11","article-title":"Motionclr: Motion generation and trainingfree editing via understanding attention mechanisms","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"Pay attention and move better: Harnessing attention for interactive motion generation and training-free editing","author":"Chen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0365-1"},{"key":"ref15","article-title":"Laserhuman: Language-guided sceneaware human motion generation in free environment","author":"Cong","year":"2024","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"ref17","article-title":"Towards synthesized and editable motion in-betweening through part-wise phase representation","author":"Dai","year":"2025","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72640-8_22"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72640-8_22"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618205"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_6"},{"key":"ref22","article-title":"Motionwavelet: Human motion prediction via wavelet manifold learning","author":"Feng*","year":"2024","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16223"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00186"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27973"},{"key":"ref28","first-page":"6840","article-title":"Denoising diffusion probabilistic models","author":"Ho","year":"2020","journal-title":"NeurIPS"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530094"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681657"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01152"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3259183"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02113"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2025.3542631"},{"key":"ref35","article-title":"Motiongpt: Human motion as a foreign language","author":"Jiang","year":"2024","journal-title":"NeurIPS"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00838"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20014"},{"key":"ref40","article-title":"Dispose: Disentangling pose guidance for controllable human image animation","author":"Li","year":"2024","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00939"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00053"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02042-6"},{"key":"ref44","article-title":"Motion-x: A largescale 3d expressive whole-body human motion dataset","author":"Lin","year":"2024","journal-title":"NeurIPS"},{"key":"ref45","article-title":"Human motion modeling using dvgans","author":"Lin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2008.17"},{"key":"ref47","article-title":"Plan, posture and go: Towards open-world text-to-motion generation","author":"Liu","year":"2024","journal-title":"ECCV"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596800"},{"key":"ref50","article-title":"Humantomato: Text-aligned whole-body motion generation","author":"Lu","year":"2024","journal-title":"ICML"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02595"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/3DV62453.2024.00149"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00506"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00197"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1089\/big.2016.0028"},{"key":"ref57","first-page":"13","article-title":"Learning a bidirectional mapping between human wholebody motion and natural language using deep recurrent neural networks","volume":"109","author":"Plappert","year":"2018","journal-title":"RAS"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00078"},{"key":"ref59","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref60","article-title":"Sam 2: Segment anything in images and videos","author":"Ravi","year":"2024","journal-title":"arXiv preprint"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref62","article-title":"Human motion diffusion as a generative prior","author":"Shafir","year":"2024","journal-title":"ICLR"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687565"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3658140"},{"key":"ref65","first-page":"22562265","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","author":"Sohl-Dickstein","year":"2015","journal-title":"ICML"},{"key":"ref66","article-title":"Denoising diffusion implicit models","author":"Song","year":"2021","journal-title":"ICLR"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356505"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530178"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"ref70","article-title":"Human motion diffusion model","author":"Tevet","year":"2022","journal-title":"ICLR"},{"key":"ref71","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref72","article-title":"Aist dance video database: Multigenre, multi-dancer, and multi-camera database for dance information processing","volume-title":"Proceedings of the 20th International Society for Music Information Retrieval Conference, ISMIR 2019","author":"Tsuchida"},{"key":"ref73","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72913-3_3"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72913-3_3"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01924"},{"key":"ref77","article-title":"Sims: Simulating human-scene interactions with real world script planning","author":"Wang","year":"2025","journal-title":"ICCV 2025"},{"key":"ref78","article-title":"Quo vadis, motion generation? from large language models to large motion models","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01634"},{"key":"ref80","first-page":"14959","article-title":"Humanise: Language-conditioned human motion generation in 3d scenes","author":"Wang","year":"2022","journal-title":"NeurIPS"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00049"},{"key":"ref82","article-title":"Motionllm: Multimodal motion-language learning with large language models","author":"Wu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00940"},{"key":"ref84","article-title":"Unified human-scene interaction via prompted chain-of-contacts","author":"Xiao","year":"2024","journal-title":"ICLR"},{"key":"ref85","article-title":"Omnicontrol: Control any joint at any time for human motion generation","author":"Xie","year":"2024","journal-title":"ICLR"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28443"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00212"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00212"},{"key":"ref89","article-title":"Motionbank: A large-scale video motion benchmark with disentangled rule-based annotations","author":"Xu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02101"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00173"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0321-5"},{"key":"ref93","article-title":"Egochoir: Capturing 3d human-object interaction regions from egocentric views","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00487"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01632"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00956"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1145\/3721238.3730640"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680864"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3355414"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72624-8_23"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28567"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0292-6"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_2"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_2"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00135"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11443317.pdf?arnumber=11443317","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T19:45:10Z","timestamp":1777578310000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11443317\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":108,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01239","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}