{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:30:14Z","timestamp":1781587814911,"version":"3.54.5"},"reference-count":55,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Nature Science Foundation of China","doi-asserted-by":"publisher","award":["62425114,62121002,U23B2028,62232006,62472395"],"award-info":[{"award-number":["62425114,62121002,U23B2028,62232006,62472395"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01172","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"12615-12625","source":"Crossref","is-referenced-by-count":1,"title":["GestureHYDRA: Semantic Co-Speech Gesture Synthesis via Hybrid Modality Diffusion Transformer and Cascaded-Synchronized Retrieval-Augmented Generation"],"prefix":"10.1109","author":[{"given":"Quanwei","family":"Yang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Luying","family":"Huang","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kaisiyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiazhi","family":"Guan","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shengyi","family":"He","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fengguo","family":"Li","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lingyun","family":"Yu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yingying","family":"Li","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haocheng","family":"Feng","sequence":"additional","affiliation":[{"name":"Baidu Inc."}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hongtao","family":"Xie","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.13946"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/S0885-2014(99)80017-3"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"ref4","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"NeurIPS"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383315"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3685523"},{"key":"ref8","article-title":"Region-aware text-to-image generation via hard binding and soft refinement","author":"Chen","year":"2024","journal-title":"CoRR"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687677"},{"key":"ref10","article-title":"The llama 3 herd of models","author":"Dubey","year":"2024","journal-title":"CoRR"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687571"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00998"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_20"},{"key":"ref14","first-page":"79","article-title":"Evaluation of speech-togesture generation using bi-directional LSTM network","author":"Hasegawa","year":"2018","journal-title":"IVA"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00779"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1186\/s41235-016-0004-9"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1002\/cav.6"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418815"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/1778765.1778861"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.01315"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01022"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01021"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00155"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680684"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/2485895.2485900"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref33","article-title":"Denoising diffusion implicit models","author":"Song","year":"2021","journal-title":"ICLR"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00231"},{"key":"ref35","author":"Tevet","year":"2022","journal-title":"Human motion diffusion model"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"ref37","first-page":"6306","article-title":"Neural discrete representation learning","author":"van den Oord","year":"2017","journal-title":"NIPS"},{"key":"ref38","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2013.09.008"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3166627"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25354"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591509"},{"key":"ref43","article-title":"Mambatalk: Efficient holistic gesture synthesis with selective state space models","volume-title":"NeurIPS","author":"Xu","year":"2024"},{"key":"ref44","article-title":"Qwen2.5 technical report","author":"Yang","year":"2024","journal-title":"CoRR"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547896"},{"key":"ref46","article-title":"Showmaker: Creating high-fidelity 2d human video via fine-grained diffusion modeling","author":"Yang","year":"2024","journal-title":"NeurIPS"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/650"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474789"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"ref52","article-title":"$\\text{mr}^{2}$ ag: Multimodal retrieval-reflection-augmented generation for knowledge-based VQA","author":"Zhang","year":"2024","journal-title":"CoRR"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3658134"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01959"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444338.pdf?arnumber=11444338","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:23:19Z","timestamp":1777530199000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444338\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01172","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}