{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,4]],"date-time":"2025-10-04T08:09:58Z","timestamp":1759565398418,"version":"3.41.0"},"publisher-location":"Cham","reference-count":44,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031915772","type":"print"},{"value":"9783031915789","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91578-9_8","type":"book-chapter","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T09:23:39Z","timestamp":1749201819000},"page":"137-147","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Disentangling Planning, Driving and Rendering for Photorealistic Avatar Agents"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-9507-6741","authenticated-orcid":false,"given":"Duomin","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0621-3544","authenticated-orcid":false,"given":"Bin","family":"Dai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7241-8519","authenticated-orcid":false,"given":"Yu","family":"Deng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8268-7517","authenticated-orcid":false,"given":"Baoyuan","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"8_CR1","doi-asserted-by":"crossref","unstructured":"Can computer personalities be human personalities? Int. J. Hum.-Comput. Stud. 43(2), 223\u2013239 (1995)","DOI":"10.1006\/ijhc.1995.1042"},{"key":"8_CR2","unstructured":"Autogpt: The heart of open-source agent ecosystem (2023). https:\/\/github.com\/Significant-Gravitas\/AutoGPT#-autogpt-the-heart-of-the-open-source-agent-ecosystem"},{"key":"8_CR3","doi-asserted-by":"publisher","unstructured":"Badler, N.: Virtual humans for animation, ergonomics, and simulation. In: Proceedings IEEE Nonrigid and Articulated Motion Workshop, pp. 28\u201336 (1997). https:\/\/doi.org\/10.1109\/NAMW.1997.609848","DOI":"10.1109\/NAMW.1997.609848"},{"key":"8_CR4","unstructured":"Besta, M., et\u00a0al.: Graph of thoughts: solving elaborate problems with large language models. arXiv preprint arXiv:2308.09687 (2023)"},{"key":"8_CR5","unstructured":"Brohan, A., et\u00a0al.: Rt-2: vision-language-action models transfer web knowledge to robotic control. arXiv preprint arXiv:2307.15818 (2023)"},{"key":"8_CR6","unstructured":"Brohan, A., et\u00a0al.: Rt-1: robotics transformer for real-world control at scale. arXiv preprint arXiv:2212.06817 (2022)"},{"key":"8_CR7","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. CoRR abs\/2005.14165 (2020). https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"8_CR8","doi-asserted-by":"crossref","unstructured":"Deng, Y., Wang, D., Ren, X., Chen, X., Wang, B.: Learning one-shot 4d head avatar synthesis using synthetic data. arXiv preprint arXiv:2311.18729 (2023)","DOI":"10.1109\/CVPR52733.2024.00680"},{"key":"8_CR9","doi-asserted-by":"crossref","unstructured":"Deng, Y., Yang, J., Xu, S., Chen, D., Jia, Y., Tong, X.: Accurate 3d face reconstruction with weakly-supervised learning: from single image to image set. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"8_CR10","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"8_CR11","doi-asserted-by":"crossref","unstructured":"Hyde, J., Carter, E.J., Kiesler, S., Hodgins, J.K.: Using an interactive avatar\u2019s facial expressiveness to increase persuasiveness and socialness. In: Proceedings of the 33rd Annual ACM Conference on Human Factors in Computing Systems, pp. 1719\u20131728. CHI \u201915, Association for Computing Machinery, New York, NY, USA (2015)","DOI":"10.1145\/2702123.2702465"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"I\u015f\u0131k, M., et al.: Humanrf: high-fidelity neural radiance fields for humans in motion. ACM Trans. Graph. (TOG) 42(4), 1\u201312 (2023)","DOI":"10.1145\/3592415"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Jennings, N.R., Sycara, K., Wooldridge, M.: A roadmap of agent research and development. Auton. Agents Multi-Agent Syst. 1(1), 7\u201338 (1998)","DOI":"10.1023\/A:1010090405266"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of stylegan. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8110\u20138119 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"8_CR15","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"8_CR16","first-page":"22199","volume":"35","author":"T Kojima","year":"2022","unstructured":"Kojima, T., Gu, S.S., Reid, M., Matsuo, Y., Iwasawa, Y.: Large language models are zero-shot reasoners. Adv. Neural Inf. Process. Syst. 35, 22199\u201322213 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"8_CR17","doi-asserted-by":"crossref","unstructured":"Kyrlitsias, C., Michael-Grigoriou, D.: Social interaction with agents and avatars in immersive virtual environments: a survey. Front. Virtual Real. 2 (2022)","DOI":"10.3389\/frvir.2021.786665"},{"key":"8_CR18","doi-asserted-by":"crossref","unstructured":"Luo, C., Song, S., Xie, W., Shen, L., Gunes, H.: Learning multi-dimensional edge feature-based au relation graph for facial action unit recognition. arXiv preprint arXiv:2205.01782 (2022)","DOI":"10.24963\/ijcai.2022\/173"},{"issue":"2","key":"8_CR19","doi-asserted-by":"publisher","first-page":"151","DOI":"10.1109\/T-AFFC.2013.4","volume":"4","author":"SM Mavadati","year":"2013","unstructured":"Mavadati, S.M., Mahoor, M.H., Bartlett, K., Trinh, P., Cohn, J.F.: Disfa: a spontaneous facial action intensity database. IEEE Trans. Affect. Comput. 4(2), 151\u2013160 (2013)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"8_CR20","doi-asserted-by":"crossref","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: Nerf: representing scenes as neural radiance fields for view synthesis. In: European Conference on Computer Vision, pp. 405\u2013421. Springer (2020)","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Ng, E., Subramanian, S., Klein, D., Kanazawa, A., Darrell, T., Ginosar, S.: Can language models learn to listen? In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10083\u201310093 (October 2023)","DOI":"10.1109\/ICCV51070.2023.00925"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Oh, C.S., Bailenson, J.N., Welch, G.F.: A systematic review of social presence: definition, antecedents, and implications. Front. Robot. AI 5 (2018)","DOI":"10.3389\/frobt.2018.00114"},{"key":"8_CR23","unstructured":"van\u00a0den Oord, A., Vinyals, O., Kavukcuoglu, K.: Neural discrete representation learning (2018)"},{"key":"8_CR24","doi-asserted-by":"crossref","unstructured":"Oorloff, T., Yaoob, Y.: One-shot face re-enactment using hybrid latent spaces of stylegan2 (2023)","DOI":"10.1109\/ICCV51070.2023.01915"},{"key":"8_CR25","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback (2022)"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Park, J.S., O\u2019Brien, J.C., Cai, C.J., Morris, M.R., Liang, P., Bernstein, M.S.: Generative agents: interactive simulacra of human behavior (2023)","DOI":"10.1145\/3586183.3606763"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Park, K., et al.: Nerfies: deformable neural radiance fields. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5865\u20135874 (2021)","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"8_CR28","doi-asserted-by":"publisher","first-page":"39976","DOI":"10.1109\/ACCESS.2018.2855970","volume":"6","author":"VM Petrovi\u0107","year":"2018","unstructured":"Petrovi\u0107, V.M.: Artificial intelligence and virtual worlds - toward human-level ai agents. IEEE Access 6, 39976\u201339988 (2018)","journal-title":"IEEE Access"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Thaler, M., Schl\u00f6gl, S., Groth, A.: Agent vs. avatar: comparing embodied conversational agents concerning characteristics of the uncanny valley. In: 2020 IEEE International Conference on Human-Machine Systems (ICHMS), pp.\u00a01\u20136 (2020)","DOI":"10.1109\/ICHMS49158.2020.9209539"},{"key":"8_CR31","unstructured":"Touvron, H., et\u00a0al.: LLAMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Wang, D., Deng, Y., Yin, Z., Shum, H.Y., Wang, B.: Progressive disentangled representation learning for fine-grained controllable talking head synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 17979\u201317989 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01724"},{"key":"8_CR33","doi-asserted-by":"crossref","unstructured":"Wang, H., Gaddy, V., Beveridge, J.R., Ortega, F.R.: Building an emotionally responsive avatar with dynamic facial expressions in human-computer interactions. Multimodal Technol. Interact. 5(3) (2021)","DOI":"10.3390\/mti5030013"},{"key":"8_CR34","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: A survey on large language model based autonomous agents (2023)","DOI":"10.1007\/s11704-024-40231-1"},{"key":"8_CR35","unstructured":"Wang, X., et al.: Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2203.11171 (2022)"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Wang, Z., Chiu, Y.Y., Chiu, Y.C.: Humanoid agents: platform for simulating human-like generative agents. arXiv preprint arXiv:2310.05418 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.15"},{"key":"8_CR37","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"8_CR38","doi-asserted-by":"crossref","unstructured":"Xiang, J., Yang, J., Deng, Y., Tong, X.: Gram-hd: 3d-consistent image generation at high resolution with generative radiance manifolds. arXiv preprint arXiv:2206.07255 (2022)","DOI":"10.1109\/ICCV51070.2023.00209"},{"key":"8_CR39","unstructured":"Yao, S., et al.: Tree of thoughts: deliberate problem solving with large language models. arXiv preprint arXiv:2305.10601 (2023)"},{"key":"8_CR40","unstructured":"Yao, S., et al.: React: synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629 (2022)"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Yu, J., Zhu, H., Jiang, L., Loy, C.C., Cai, W., Wu, W.: Celebv-text: a large-scale facial text-video dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14805\u201314814 (2023)","DOI":"10.1109\/CVPR52729.2023.01422"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yin, Z., Zhou, D., Wang, D., Wong, F., Wang, B.: Talking head generation with probabilistic audio-to-visual diffusion priors. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7645\u20137655 (October 2023)","DOI":"10.1109\/ICCV51070.2023.00703"},{"issue":"10","key":"8_CR43","doi-asserted-by":"publisher","first-page":"692","DOI":"10.1016\/j.imavis.2014.06.002","volume":"32","author":"X Zhang","year":"2014","unstructured":"Zhang, X., et al.: Bp4d-spontaneous: a high-resolution spontaneous 3d dynamic facial expression database. Image Vis. Comput. 32(10), 692\u2013706 (2014)","journal-title":"Image Vis. Comput."},{"key":"8_CR44","doi-asserted-by":"crossref","unstructured":"Zhou, M., Bai, Y., Zhang, W., Yao, T., Zhao, T., Mei, T.: Responsive listening head generation: a benchmark dataset and baseline. In: Computer Vision \u2013 ECCV 2022, pp. 124\u2013142. Springer Nature Switzerland, Cham (2022)","DOI":"10.1007\/978-3-031-19839-7_8"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91578-9_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T09:23:54Z","timestamp":1749201834000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91578-9_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031915772","9783031915789"],"references-count":44,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91578-9_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The proposed method is for entertainment and animation generation. Deceptive misuse of this method is strictly prohibited.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics consideration"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}