{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T13:27:06Z","timestamp":1762954026466,"version":"3.41.0"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T00:00:00Z","timestamp":1728172800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T00:00:00Z","timestamp":1728172800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,10,6]]},"DOI":"10.1109\/smc54092.2024.10832087","type":"proceedings-article","created":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T18:39:20Z","timestamp":1737398360000},"page":"2999-3006","source":"Crossref","is-referenced-by-count":1,"title":["SAM-Wav2lip++: Enhancing Behavioral Realism in Synthetic Agents Through Audio-Driven Speech and Action Refinement"],"prefix":"10.1109","author":[{"given":"Bihui","family":"Yu","sequence":"first","affiliation":[{"name":"Shenyang Institute of Computing Technology, Chinese Academy of Sciences,Shenyang,China,110168"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dawei","family":"Liu","sequence":"additional","affiliation":[{"name":"Shenyang Institute of Computing Technology, Chinese Academy of Sciences,Shenyang,China,110168"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huiyang","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences,Beijing,China,100049"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"GuiYong","family":"Chang","sequence":"additional","affiliation":[{"name":"Shenyang Institute of Computing Technology, Chinese Academy of Sciences,Shenyang,China,110168"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingxuan","family":"Wei","sequence":"additional","affiliation":[{"name":"Shenyang Institute of Computing Technology, Chinese Academy of Sciences,Shenyang,China,110168"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Linzhuang","family":"Sun","sequence":"additional","affiliation":[{"name":"Shenyang Institute of Computing Technology, Chinese Academy of Sciences,Shenyang,China,110168"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Songtao","family":"Tian","sequence":"additional","affiliation":[{"name":"Tsinghua University,Department of Mathematical Sciences,Beijing,China,100084"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liping","family":"Bu","sequence":"additional","affiliation":[{"name":"Shenyang Institute of Computing Technology, Chinese Academy of Sciences,Shenyang,China,110168"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/620"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"ref5","article-title":"Geneface: Generalized and high-fidelity audio-driven 3d talking face synthesis","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Geneface++: Generalized and stable real-time audio-driven 3d talking face generation","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"I2vgen-xl: High-quality image-to-video synthe-sis via cascaded diffusion models","author":"Zhang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"volume-title":"The grid audio-visual speech corpus","year":"2006","author":"Cooke","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2014.2336244"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2016.2515617"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54184-6_6"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2889052"},{"key":"ref16","article-title":"Lrs3-ted: a large-scale dataset for visual speech recognition","author":"Afouras","year":"2018","journal-title":"arXiv preprint"},{"issue":"4","key":"ref17","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3072959.3073640","article-title":"Syn-thesizing obama: learning lip sync from audio","volume":"36","author":"Suwajanakorn","year":"2017","journal-title":"ACM Transactions on Graphics (ToG)"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-950"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756582"},{"key":"ref22","first-page":"1","article-title":"Faccforensics++: Learning to detect manipulated facial images","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"Rossler"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"ref25","article-title":"Obamanet: Photo-realistic lip-sync from text","author":"Kumar","year":"2017","journal-title":"arXiv preprint"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555399"},{"key":"ref28","first-page":"1428","article-title":"Towards automatic face-to-face translation","volume-title":"Proceedings of the 27th ACM international conference on multimedia","author":"KR"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/152"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20154"},{"key":"ref34","article-title":"First order motion model for image animation","volume":"32","author":"Siarohin","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00449"},{"key":"ref37","article-title":"Perpetual motion: Generating unbounded human motion","author":"Zhang","year":"2020","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.4135\/9781446251409.n4"},{"key":"ref39","article-title":"Learning to generate diverse dance motions with transformer","author":"Li","year":"2020","journal-title":"arXiv preprint"},{"key":"ref40","article-title":"Dance revolution: Long-term dance generation with music via curriculum learning","author":"Huang","year":"2020","journal-title":"arXiv preprint"},{"issue":"2018","key":"ref41","first-page":"1","article-title":"Generating animated videos of human activities from natural language descriptions","volume":"1","author":"Lin","year":"2018","journal-title":"Learning"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460608"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2019.00084"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3355414"},{"key":"ref46","article-title":"Human motion diffusion model","author":"Tevet","year":"2022","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00212"}],"event":{"name":"2024 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","start":{"date-parts":[[2024,10,6]]},"location":"Kuching, Malaysia","end":{"date-parts":[[2024,10,10]]}},"container-title":["2024 IEEE International Conference on Systems, Man, and Cybernetics (SMC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830919\/10830920\/10832087.pdf?arnumber=10832087","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T17:42:36Z","timestamp":1748626956000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832087\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,6]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/smc54092.2024.10832087","relation":{},"subject":[],"published":{"date-parts":[[2024,10,6]]}}}