{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T15:46:10Z","timestamp":1780587970088,"version":"3.54.1"},"publisher-location":"Cham","reference-count":85,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031727634","type":"print"},{"value":"9783031727641","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T00:00:00Z","timestamp":1729814400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72764-1_24","type":"book-chapter","created":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T14:03:10Z","timestamp":1729778590000},"page":"418-437","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":31,"title":["REMOS: 3D Motion-Conditioned Reaction Synthesis for\u00a0Two-Person Interactions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5361-8806","authenticated-orcid":false,"given":"Anindita","family":"Ghosh","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1245-4146","authenticated-orcid":false,"given":"Rishabh","family":"Dabral","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1630-2006","authenticated-orcid":false,"given":"Vladislav","family":"Golyanik","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6104-6625","authenticated-orcid":false,"given":"Christian","family":"Theobalt","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2189-2429","authenticated-orcid":false,"given":"Philipp","family":"Slusallek","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,25]]},"reference":[{"key":"24_CR1","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Ma, S., Morency, L.P., Sheikh, Y.: To react or not to react: end-to-end visual pose forecasting for personalized avatar during dyadic conversations. In: 2019 International Conference on Multimodal Interaction (2019)","DOI":"10.1145\/3340555.3353725"},{"key":"24_CR2","doi-asserted-by":"crossref","unstructured":"Ao, T., Zhang, Z., Liu, L.: GestureDiffuCLIP: gesture diffusion model with clip latents. In: SIGGRAPH (2023)","DOI":"10.1145\/3592097"},{"key":"24_CR3","doi-asserted-by":"crossref","unstructured":"Aristidou, A., Yiannakidis, A., Aberman, K., Cohen-Or, D., Shamir, A., Chrysanthou, Y.: Rhythm is a dancer: music-driven motion synthesis with global structure. IEEE Trans. Visualiz. Comput. Graph. (2022)","DOI":"10.1109\/TVCG.2022.3163676"},{"key":"24_CR4","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: Teach: temporal action composition for 3D humans. In: 2022 International Conference on 3D Vision (3DV) (2022)","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"24_CR5","doi-asserted-by":"crossref","unstructured":"Bhattacharya, U., Childs, E., Rewkowski, N., Manocha, D.: Speech2affectivegestures: synthesizing co-speech gestures with generative adversarial affective expression learning. In: Proceedings of the 29th ACM International Conference on Multimedia (2021)","DOI":"10.1145\/3474085.3475223"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Bhattacharya, U., Rewkowski, N., Banerjee, A., Guhan, P., Bera, A., Manocha, D.: Text2gestures: a transformer-based network for generating emotive body gestures for virtual agents. In: IEEE Conference on Virtual Reality and 3D User Interfaces (IEEE VR) (2021)","DOI":"10.1109\/VR50410.2021.00037"},{"key":"24_CR7","unstructured":"Bjorck, N., Gomes, C.P., Selman, B., Weinberger, K.Q.: Understanding batch normalization. In: Advances in Neural Information Processing Systems (2018)"},{"key":"24_CR8","unstructured":"https:\/\/captury.com (2023)"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Chan, J.C., Leung, H., Tang, J.K., Komura, T.: A virtual reality dance training system using motion capture technology. IEEE Trans. Learn. Technol. 4(2), 187\u2013195 (2010)","DOI":"10.1109\/TLT.2010.27"},{"key":"24_CR10","doi-asserted-by":"crossref","unstructured":"Chopin, B., Tang, H., Daoudi, M.: Bipartite graph diffusion model for human interaction generation. In: Winter Conference on Applications of Computer Vision (WACV) (2024)","DOI":"10.1109\/WACV57701.2024.00525"},{"key":"24_CR11","doi-asserted-by":"crossref","unstructured":"Chopin, B., Tang, H., Otberdout, N., Daoudi, M., Sebe, N.: Interaction transformer for human reaction generation. IEEE Trans. Multim. (2023)","DOI":"10.1109\/TMM.2023.3242152"},{"key":"24_CR12","unstructured":"Cummins, A.: In Search of the Ninja: The Historical Truth of Ninjutsu. The History Press (2012)"},{"key":"24_CR13","doi-asserted-by":"crossref","unstructured":"Dabral, R., Mughal, M.H., Golyanik, V., Theobalt, C.: Mofusion: A framework for denoising-diffusion-based motion synthesis. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"Egges, A., Papagiannakis, G., Magnenat-Thalmann, N.: Presence and interaction in mixed reality environments. The Visual Computer (2007)","DOI":"10.1007\/s00371-007-0113-z"},{"key":"24_CR15","doi-asserted-by":"crossref","unstructured":"Elfwing, S., Uchibe, E., Doya, K.: Sigmoid-weighted linear units for neural network function approximation in reinforcement learning. Neural Networks (2018)","DOI":"10.1016\/j.neunet.2017.12.012"},{"key":"24_CR16","doi-asserted-by":"crossref","unstructured":"Fieraru, M., Zanfir, M., Oneata, E., Popa, A.I., Olaru, V., Sminchisescu, C.: Three-dimensional reconstruction of human interactions. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00724"},{"key":"24_CR17","unstructured":"Gandikota, R., Brown, N.: Pro-ddpm: progressive growing of variable denoising diffusion probabilistic models for faster convergence. In: 33rd British Machine Vision Conference 2022, BMVC (2022)"},{"key":"24_CR18","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Synthesis of compositional animations from textual descriptions. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"24_CR19","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Dabral, R., Golyanik, V., Theobalt, C., Slusallek, P.: IMoS: intent-driven full-body motion synthesis for human-object interactions. In: Computer Graphics Forum, vol.\u00a042. Wiley Online Library (2023)","DOI":"10.1111\/cgf.14739"},{"key":"24_CR20","doi-asserted-by":"crossref","unstructured":"Goel, A., Men, Q., Ho, E.S.L.: Interaction mix and match: synthesizing close interaction using conditional hierarchical GAN with multi-hot class embedding. Comput. Graph. Forum (2022)","DOI":"10.1111\/cgf.14647"},{"key":"24_CR21","doi-asserted-by":"crossref","unstructured":"Gu, D., Shim, J., Jang, J., Kang, C., Joo, K.: ContactGen: contact-guided interactive 3D human generation for partners. Proc. AAAI Conf. Artif. Intell. 38(3), 1923\u20131931 (2024)","DOI":"10.1609\/aaai.v38i3.27962"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"24_CR23","doi-asserted-by":"publisher","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: TM2T: stochastic and\u00a0tokenized modeling for\u00a0the\u00a0reciprocal generation of\u00a03D human motions and\u00a0texts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXV, pp. 580\u2013597. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_34","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"24_CR24","doi-asserted-by":"crossref","unstructured":"Guo, W., Bie, X., Alameda-Pineda, X., Moreno-Noguer, F.: Multi-person extreme motion prediction. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01271"},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Habibie, I., et al.: A motion matching-based framework for controllable gesture synthesis from speech. In: ACM SIGGRAPH Conference Proceedings (2022)","DOI":"10.1145\/3528233.3530750"},{"key":"24_CR26","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"144","DOI":"10.1007\/978-3-642-17080-5_17","volume-title":"Artificial Intelligence and Cognitive Science","author":"E Hanser","year":"2010","unstructured":"Hanser, E., Mc Kevitt, P., Lunney, T., Condell, J.: SceneMaker: intelligent multimodal visualisation of natural language scripts. In: Coyle, L., Freyne, J. (eds.) AICS 2009. LNCS (LNAI), vol. 6206, pp. 144\u2013153. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-17080-5_17"},{"key":"24_CR27","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANS trained by a two time-scale update rule converge to a local nash equilibrium. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"24_CR28","doi-asserted-by":"crossref","unstructured":"Ho, E.S., Komura, T.: Planning tangling motions for humanoids. In: IEEE-RAS International Conference on Humanoid Robots (2007)","DOI":"10.1109\/ICHR.2007.4813918"},{"key":"24_CR29","doi-asserted-by":"crossref","unstructured":"Ho, E.S.L., Komura, T.: Character motion synthesis by topology coordinates. Comput. Graph. Forum 28(2), 299\u2013308 (2009)","DOI":"10.1111\/j.1467-8659.2009.01369.x"},{"key":"24_CR30","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural Inf. Process. Syst. 33 (2020)"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. J. Mach. Learn. Res. 23(1) (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"24_CR32","unstructured":"Hu, T., Zhu, X., Guo, W.: Two-person interaction recognition based on key poses. J. Comput. Inf. Syst. (2014)"},{"key":"24_CR33","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: Diffusion-based generation, optimization, and planning in 3d scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Huang, Y., et al.: Genre-conditioned long-term 3d dance generation driven by music. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747838"},{"key":"24_CR35","doi-asserted-by":"crossref","unstructured":"Karunratanakul, K., Preechakul, K., Suwajanakorn, S., Tang, S.: Guided motion diffusion for controllable human motion synthesis. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"24_CR36","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"24_CR37","doi-asserted-by":"crossref","unstructured":"Komura, T., Ho, E.S.L., Lau, R.W.H.: Animating reactive motion using momentum-based inverse kinematics. Comput. Animat. Virt. Worlds 16(3\u20134), 213\u2013223 (2005)","DOI":"10.1002\/cav.101"},{"key":"24_CR38","doi-asserted-by":"crossref","unstructured":"Kulkarni, N., et al.: Nifty: neural object interaction fields for guided human motion synthesis. arXiv preprint arXiv:2307.07511 (2023)","DOI":"10.1109\/CVPR52733.2024.00096"},{"key":"24_CR39","doi-asserted-by":"crossref","unstructured":"Kundu, J.N., Buckchash, H., Mandikal, P., Jamkhandi, A., Radhakrishnan, V.B., et\u00a0al.: Cross-conditioned recurrent networks for long-term synthesis of inter-person human motion interactions. In: Winter Conference on Applications of Computer Vision (WACV) (2020)","DOI":"10.1109\/WACV45572.2020.9093627"},{"key":"24_CR40","doi-asserted-by":"crossref","unstructured":"Li, J., Clegg, A., Mottaghi, R., Wu, J., Puig, X., Liu, C.K.: Controllable human-object interaction synthesis. arXiv preprint arXiv:2312.03913 (2023)","DOI":"10.1007\/978-3-031-72940-9_4"},{"key":"24_CR41","doi-asserted-by":"crossref","unstructured":"Li, J., Wu, J., Liu, C.K.: Object motion guided human motion synthesis. ACM Trans. Graph. (2023)","DOI":"10.1145\/3618333"},{"key":"24_CR42","doi-asserted-by":"crossref","unstructured":"Liang, H., Zhang, W., Li, W., Yu, J., Xu, L.: InterGen: diffusion-based multi-human motion generation under complex interactions. Int. J. Comput. Vision (2024)","DOI":"10.1007\/s11263-024-02042-6"},{"key":"24_CR43","doi-asserted-by":"crossref","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L.Y., Kot, A.C.: NTU RGB+D 120: a large-scale benchmark for 3d human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. (2019)","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"24_CR44","unstructured":"Liu, X., Yi, L.: Geneoh diffusion: towards generalizable hand-object interaction denoising via denoising diffusion. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"24_CR45","doi-asserted-by":"crossref","unstructured":"Men, Q., Shum, H.P., Ho, E.S., Leung, H.: Gan-based reactive motion synthesis with class-aware discriminators for human\u2013human interaction. Comput. Graph. (2022)","DOI":"10.1016\/j.cag.2021.09.014"},{"key":"24_CR46","doi-asserted-by":"crossref","unstructured":"Mousas, C.: Performance-driven dance motion control of a virtual partner character. In: IEEE Conference on Virtual Reality and 3D User Interfaces (VR) (2018)","DOI":"10.1109\/VR.2018.8446498"},{"key":"24_CR47","doi-asserted-by":"crossref","unstructured":"Mughal, M.H., Dabral, R., Habibie, I., Donatelli, L., Habermann, M., Theobalt, C.: Convofusion: multi-modal conversational diffusion for co-speech gesture synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00138"},{"key":"24_CR48","doi-asserted-by":"crossref","unstructured":"Ng, E., et al.: Learning to listen: Modeling non-deterministic dyadic facial motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01975"},{"key":"24_CR49","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"24_CR50","doi-asserted-by":"crossref","unstructured":"Po, R., et al.: State of the art on diffusion models for visual computing. arXiv preprints (2023)","DOI":"10.1111\/cgf.15063"},{"key":"24_CR51","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. In: International Conference on Virtual Reality (2022)"},{"key":"24_CR52","doi-asserted-by":"crossref","unstructured":"Rempe, D., et al.: Trace and pace: Controllable pedestrian animation via guided trajectory diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.01322"},{"key":"24_CR53","doi-asserted-by":"crossref","unstructured":"Senecal, S., Nijdam, N.A., Aristidou, A., Magnenat-Thalmann, N.: Salsa dance learning evaluation and motion analysis in gamified virtual reality environment. Multim. Tools Appl. (2020)","DOI":"10.1007\/s11042-020-09192-y"},{"key":"24_CR54","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"24_CR55","doi-asserted-by":"crossref","unstructured":"Shen, Y., Yang, L., Ho, E.S.L., Shum, H.P.H.: Interaction-based human activity comparison. IEEE Trans. Visualiz. Comput. Graph. (2020)","DOI":"10.1109\/TVCG.2019.2893247"},{"key":"24_CR56","doi-asserted-by":"crossref","unstructured":"Shimada, S., Golyanik, V., Xu, W., P\u00e9rez, P., Theobalt, C.: Neural monocular 3d human motion capture with physical awareness. ACM Trans. Graph. (2021)","DOI":"10.1145\/3476576.3476640"},{"key":"24_CR57","doi-asserted-by":"crossref","unstructured":"Shimada, S., Golyanik, V., Xu, W., Theobalt, C.: Physcap: physically plausible monocular 3d motion capture in real time. ACM Trans. Graph. (2020)","DOI":"10.1145\/3414685.3417877"},{"key":"24_CR58","doi-asserted-by":"crossref","unstructured":"Shum, H.P., Komura, T., Shiraishi, M., Yamazaki, S.: Interaction patches for multi-character animation. ACM Trans. Graph. 27(5) (2008)","DOI":"10.1145\/1409060.1409067"},{"key":"24_CR59","doi-asserted-by":"crossref","unstructured":"Shum, H.P., Komura, T., Yamazaki, S.: Simulating competitive interactions using singly captured motions. In: Proceedings of ACM Symposium on Virtual Reality Software and Technology (2007)","DOI":"10.1145\/1315184.1315194"},{"key":"24_CR60","unstructured":"Siyao, L., et al.: Duolando: follower GPT with off-policy reinforcement learning for dance accompaniment. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"24_CR61","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning (ICML) (2015)"},{"key":"24_CR62","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"24_CR63","doi-asserted-by":"crossref","unstructured":"Spring, H.: Swing and the lindy hop: dance, venue, media, and tradition. Am. Music 15(2), 183 (1997)","DOI":"10.2307\/3052731"},{"key":"24_CR64","doi-asserted-by":"crossref","unstructured":"Starke, S., Zhao, Y., Komura, T., Zaman, K.: Local motion phases for learning multi-contact character movements. ACM Trans. Graph. (2020)","DOI":"10.1145\/3386569.3392450"},{"key":"24_CR65","doi-asserted-by":"crossref","unstructured":"Tanaka, M., Fujiwara, K.: Role-aware interaction generation from textual description. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01466"},{"key":"24_CR66","doi-asserted-by":"crossref","unstructured":"Tanke, J., et al.: Social diffusion: long-term multiple human motion anticipation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00880"},{"key":"24_CR67","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Bermano, A.H., Cohen-Or, D.: Human motion diffusion model. arXiv preprint arXiv:2209.14916 (2022)"},{"key":"24_CR68","doi-asserted-by":"crossref","unstructured":"Tseng, J., Castellon, R., Liu, K.: Edge: editable dance generation from music. In: Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00051"},{"key":"24_CR69","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. (2017)"},{"key":"24_CR70","unstructured":"Wang, J., Xu, H., Narasimhan, M., Wang, X.: Multi-person 3d motion prediction with multi-range transformers. Adv. Neural Inf. Process. Syst. (2021)"},{"key":"24_CR71","doi-asserted-by":"crossref","unstructured":"Wang, Z., et al.: Move as you say, interact as you can: language-guided human motion generation with scene affordance. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00049"},{"key":"24_CR72","unstructured":"Xie, Y., Jampani, V., Zhong, L., Sun, D., Jiang, H.: OmniControl: control any joint at any time for human motion generation. In: International Conference on Learning Representations (ICLR) (2024)"},{"key":"24_CR73","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: Codetalker: speech-driven 3d facial animation with discrete motion prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"24_CR74","doi-asserted-by":"crossref","unstructured":"Xu, S., Li, Z., Wang, Y.X., Gui, L.Y.: InterDiff: generating 3d human-object interactions with physics-informed diffusion. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01371"},{"key":"24_CR75","doi-asserted-by":"crossref","unstructured":"Ye, Y., et al.: Affordance diffusion: Synthesizing hand-object interactions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.02153"},{"key":"24_CR76","doi-asserted-by":"crossref","unstructured":"Yoon, Y., Ko, W.R., Jang, M., Lee, J., Kim, J., Lee, G.: Robots learn social skills: end-to-end learning of co-speech gesture generation for humanoid robots. In: International Conference on Robotics and Automation (ICRA). IEEE (2019)","DOI":"10.1109\/ICRA.2019.8793720"},{"key":"24_CR77","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: Physdiff: physics-guided human motion diffusion model. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"24_CR78","doi-asserted-by":"crossref","unstructured":"Yun, K., Honorio, J., Chattopadhyay, D., Berg, T.L., Samaras, D.: Two-person interaction detection using body-pose features and multiple instance learning. In: IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops (2012)","DOI":"10.1109\/CVPRW.2012.6239234"},{"key":"24_CR79","doi-asserted-by":"crossref","unstructured":"Zamfirescu-Pereira, J., Wong, R.Y., Hartmann, B., Yang, Q.: Why Johnny can\u2019t prompt: how non-AI experts try (and fail) to design LLM prompts. In: Proceedings of Conference on Human Factors in Computing Systems (CHI) (2023)","DOI":"10.1145\/3544548.3581388"},{"key":"24_CR80","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: T2m-gpt: generating human motion from textual descriptions with discrete representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"24_CR81","unstructured":"Zhang, M., et al.: Motiondiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"24_CR82","doi-asserted-by":"crossref","unstructured":"Zhang, W., Dabral, R., Leimk\u00fchler, T., Golyanik, V., Habermann, M., Theobalt, C.: ROAM: robust and object-aware motion generation using neural pose descriptors. In: International Conference on 3D Vision (3DV) (2024)","DOI":"10.1109\/3DV62453.2024.00130"},{"key":"24_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, X., Bhatnagar, B.L., Starke, S., Guzov, V., Pons-Moll, G.: Couch: towards controllable human-chair interactions. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-20065-6_30"},{"key":"24_CR84","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Wang, B.: UDE: a unified driving engine for human motion generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00545"},{"key":"24_CR85","doi-asserted-by":"crossref","unstructured":"Zhu, L., Liu, X., Liu, X., Qian, R., Liu, Z., Yu, L.: Taming diffusion models for audio-driven co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01016"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72764-1_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T06:29:40Z","timestamp":1732948180000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72764-1_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,25]]},"ISBN":["9783031727634","9783031727641"],"references-count":85,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72764-1_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,25]]},"assertion":[{"value":"25 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}