{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T10:04:03Z","timestamp":1780913043732,"version":"3.54.1"},"publisher-location":"Cham","reference-count":117,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726903","type":"print"},{"value":"9783031726910","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72691-0_25","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:07:43Z","timestamp":1730570863000},"page":"445-465","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":26,"title":["Nymeria: A Massive Collection of\u00a0Multimodal Egocentric Daily Motion in\u00a0the\u00a0Wild"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-3698-0787","authenticated-orcid":false,"given":"Lingni","family":"Ma","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2643-7457","authenticated-orcid":false,"given":"Yuting","family":"Ye","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2412-1141","authenticated-orcid":false,"given":"Fangzhou","family":"Hong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1304-5577","authenticated-orcid":false,"given":"Vladimir","family":"Guzov","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2063-8903","authenticated-orcid":false,"given":"Yifeng","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5766-641X","authenticated-orcid":false,"given":"Rowan","family":"Postyeni","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9625-9186","authenticated-orcid":false,"given":"Luis","family":"Pesqueira","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4599-4357","authenticated-orcid":false,"given":"Alexander","family":"Gamino","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8443-6711","authenticated-orcid":false,"given":"Vijay","family":"Baiyya","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3646-6373","authenticated-orcid":false,"given":"Hyo Jin","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0276-4665","authenticated-orcid":false,"given":"Kevin","family":"Bailey","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8120-7412","authenticated-orcid":false,"given":"David S.","family":"Fosas","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5926-0905","authenticated-orcid":false,"given":"C. Karen","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4220-5958","authenticated-orcid":false,"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2799-5808","authenticated-orcid":false,"given":"Jakob","family":"Engel","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1833-8378","authenticated-orcid":false,"given":"Renzo De","family":"Nardi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9091-8989","authenticated-orcid":false,"given":"Richard","family":"Newcombe","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"25_CR1","unstructured":"Apple Vision Pro. https:\/\/www.apple.com\/apple-vision-pro\/"},{"key":"25_CR2","unstructured":"HTC VIVE. vive.com"},{"key":"25_CR3","unstructured":"Magic Leap 2. https:\/\/www.magicleap.com\/magic-leap-2"},{"key":"25_CR4","unstructured":"Meta momentum library. https:\/\/github.com\/facebookincubator\/momentum\/"},{"key":"25_CR5","unstructured":"Meta Quest. https:\/\/www.meta.com\/quest\/"},{"key":"25_CR6","unstructured":"Microsoft HoloLens. https:\/\/learn.microsoft.com\/en-us\/hololens\/"},{"key":"25_CR7","unstructured":"Movella XSens MVN Link motion capture. https:\/\/www.movella.com\/products\/motion-capture\/xsens-mvn-link"},{"key":"25_CR8","unstructured":"Project Aria Machine Perception Services. https:\/\/facebookresearch.github.io\/projectaria_tools\/docs\/ARK\/mps"},{"key":"25_CR9","unstructured":"Ray-Ban Meta smart glasses. https:\/\/www.meta.com\/smart-glasses\/"},{"key":"25_CR10","unstructured":"Rokoko. https:\/\/www.rokoko.com\/"},{"key":"25_CR11","unstructured":"Vuzix smart glasses. https:\/\/www.vuzix.com\/pages\/smart-glasses"},{"key":"25_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-20068-7_1","volume-title":"Computer Vision \u2013 ECCV 2022","author":"H Akada","year":"2022","unstructured":"Akada, H., et al.: UnrealEgo: a new dataset for robust egocentric 3D human motion capture. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 1\u201317. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_1"},{"key":"25_CR13","doi-asserted-by":"crossref","unstructured":"Araujo, J.P., et al.: CIRCLE: capture in rich contextual environments. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02032"},{"key":"25_CR14","unstructured":"Banerjee, P., et al.: Introducing HOT3D: an egocentric dataset for 3D hand and object tracking (2024)"},{"key":"25_CR15","doi-asserted-by":"crossref","unstructured":"Black, M.J., Patel, P., Tesch, J., Yang, J.: BEDLAM: a synthetic dataset of bodies exhibiting detailed lifelike animated motion. In: Proceedings IEEE\/CVF Conference\u00a0on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00843"},{"key":"25_CR16","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901 (2020)"},{"key":"25_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"557","DOI":"10.1007\/978-3-031-20071-7_33","volume-title":"Computer Vision \u2013 ECCV 2022","author":"Z Cai","year":"2022","unstructured":"Cai, Z., et al.: HuMMan: multi-modal 4D human dataset for versatile sensing and modeling. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13667, pp. 557\u2013577. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_33"},{"key":"25_CR18","unstructured":"Cai, Z., et al.: SMPLer-X: scaling up expressive human pose and shape estimation (2023)"},{"key":"25_CR19","unstructured":"Cai, Z., et al.: Playing for 3D human recovery. arXiv preprint arXiv:2110.07588 (2021)"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Castillo, A., et al.: BoDiffusion: diffusing sparse observations for full-body human motion synthesis. In: ICCV (2023)","DOI":"10.1109\/ICCVW60793.2023.00456"},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"25_CR22","unstructured":"Cong, P., et al.: LaserHuman: language-guided scene-aware human motion generation in free environment (2024)"},{"key":"25_CR23","doi-asserted-by":"crossref","unstructured":"Dabral, R., Mughal, M.H., Golyanik, V., Theobalt, C.: MoFusion: a framework for denoising-diffusion-based motion synthesis. In: Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Damen, D., et al.: Scaling egocentric vision: the epic-kitchens dataset. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_44"},{"issue":"11","key":"25_CR25","doi-asserted-by":"publisher","first-page":"4125","DOI":"10.1109\/TPAMI.2020.2991965","volume":"43","author":"D Damen","year":"2021","unstructured":"Damen, D., et al.: The epic-kitchens dataset: collection, challenges and baselines. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) 43(11), 4125\u20134141 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI)"},{"key":"25_CR26","doi-asserted-by":"publisher","unstructured":"Delmas, G., Weinzaepfel, P., Lucas, T., Moreno-Noguer, F., Rogez, G.: PoseScript: 3D human poses from natural language. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 346\u2013362. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_20","DOI":"10.1007\/978-3-031-20068-7_20"},{"key":"25_CR27","unstructured":"Dhariwal, P., Jun, H., Payne, C., Kim, J.W., Radford, A., Sutskever, I.: Jukebox: a generative model for music. arXiv preprint arXiv:2005.00341 (2020)"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Du, Y., Kips, R., Pumarola, A., Starke, S., Thabet, A., Sanakoyeu, A.: Avatars grow legs: generating smooth human motion from sparse tracking inputs with diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 481\u2013490 (2023)","DOI":"10.1109\/CVPR52729.2023.00054"},{"key":"25_CR29","unstructured":"Engel, J., Koltun, V., Cremers, D.: Direct sparse odometry (2016)"},{"key":"25_CR30","unstructured":"Engel, J., et al.: Project aria: a new tool for egocentric multi-modal AI research (2023)"},{"key":"25_CR31","doi-asserted-by":"crossref","unstructured":"Feng, Y., Lin, J., Dwivedi, S.K., Sun, Y., Patel, P., Black, M.J.: ChatPose: chatting about 3D human pose. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00204"},{"issue":"6","key":"25_CR32","doi-asserted-by":"publisher","first-page":"e0253157","DOI":"10.1371\/journal.pone.0253157","volume":"16","author":"S Ghorbani","year":"2021","unstructured":"Ghorbani, S., et al.: MoVi: a large multi-purpose human motion and video dataset. PLoS ONE 16(6), e0253157 (2021)","journal-title":"PLoS ONE"},{"key":"25_CR33","doi-asserted-by":"crossref","unstructured":"Goel, S., Pavlakos, G., Rajasegaran, J., Kanazawa, A., Malik, J.: Humans in 4D: reconstructing and tracking humans with transformers. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01358"},{"key":"25_CR34","unstructured":"Grauman, K., et al.: Ego4D: around the world in 3,000 hours of egocentric video. In: CVPR, pp. 18995\u201319012 (2022)"},{"key":"25_CR35","unstructured":"Grauman, K., et al.: Ego-Exo4D: understanding skilled human activity from first- and third-person perspectives. In: CVPR (2024)"},{"key":"25_CR36","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5152\u20135161 (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"25_CR37","doi-asserted-by":"publisher","unstructured":"Guo, C., Zuo, X., Wang, S., Cheng, L.: TM2T: stochastic and tokenized modeling for the reciprocal generation of 3D human motions and texts. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 580\u2013597. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_34","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"25_CR38","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2Motion: conditioned generation of 3D human motions. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2021\u20132029 (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"25_CR39","doi-asserted-by":"crossref","unstructured":"Guzov, V., Mir, A., Sattler, T., Pons-Moll, G.: Human poseitioning system (HPS): 3D human pose estimation and self-localization in large scenes from body-mounted sensors. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00430"},{"issue":"4","key":"25_CR40","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1145\/3386569.3392480","volume":"39","author":"FG Harvey","year":"2020","unstructured":"Harvey, F.G., Yurick, M., Nowrouzezahrai, D., Pal, C.: Robust motion in-betweening. ACM Trans. Graph. (TOG) 39(4), 60\u20131 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"6","key":"25_CR41","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3272127.3275108","volume":"37","author":"Y Huang","year":"2018","unstructured":"Huang, Y., Kaufmann, M., Aksan, E., Black, M.J., Hilliges, O., Pons-Moll, G.: Deep inertial poser: learning to reconstruct human pose from sparse inertial measurements in real time. ACM Trans. Graph. (TOG) 37(6), 1\u201315 (2018)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"7","key":"25_CR42","doi-asserted-by":"publisher","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","volume":"36","author":"C Ionescu","year":"2013","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3.6M: large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE Trans. Pattern Anal. Mach. Intell. 36(7), 1325\u20131339 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"25_CR43","unstructured":"Jiang, B., Chen, X., Liu, W., Yu, J., Yu, G., Chen, T.: MotionGPT: human motion as a foreign language. In: Advances in Neural Information Processing Systems (2024)"},{"key":"25_CR44","unstructured":"Jiang, J., Streli, P., Meier, M., Fender, A., Holz, C.: EgoPoser: robust real-time ego-body pose estimation in large scenes. arXiv preprint arXiv:2308.06493 (2023)"},{"key":"25_CR45","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1007\/978-3-031-20065-6_26","volume-title":"Computer Vision - ECCV 2022","author":"J Jiang","year":"2022","unstructured":"Jiang, J., et al.: AvatarPoser: articulated full-body pose tracking from sparse motion sensing. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13665, pp. 443\u2013460. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20065-6_26"},{"key":"25_CR46","doi-asserted-by":"crossref","unstructured":"Jiang, N., et al.: Scaling up dynamic human-scene interaction modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"25_CR47","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Ye, Y., Gopinath, D., Won, J., Winkler, A.W., Liu, C.K.: Transformer inertial poser: real-time human motion reconstruction from sparse IMUs with simultaneous terrain generation. In: SIGGRAPH Asia 2022 Conference Papers, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3550469.3555428"},{"key":"25_CR48","doi-asserted-by":"crossref","unstructured":"Joo, H., et al.: Panoptic studio: a massively multiview system for social motion capture. In: The IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.381"},{"key":"25_CR49","unstructured":"Joo, H., et al.: Panoptic studio: a massively multiview system for social interaction capture. IEEE Trans. Pattern Anal. Mach. Intell. (2017)"},{"key":"25_CR50","doi-asserted-by":"crossref","unstructured":"Kanazawa, A., Zhang, J.Y., Felsen, P., Malik, J.: Learning 3D human dynamics from video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5614\u20135623 (2019)","DOI":"10.1109\/CVPR.2019.00576"},{"key":"25_CR51","doi-asserted-by":"crossref","unstructured":"Karunratanakul, K., Preechakul, K., Suwajanakorn, S., Tang, S.: Guided motion diffusion for controllable human motion synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2151\u20132162 (2023)","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"25_CR52","doi-asserted-by":"crossref","unstructured":"Kaufmann, M., et al.: EMDB: the electromagnetic database of global 3D human pose and shape in the wild. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01345"},{"key":"25_CR53","doi-asserted-by":"crossref","unstructured":"Kaufmann, M., et al.: EM-pose: 3D human pose estimation from sparse electromagnetic trackers. In: The IEEE International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01131"},{"key":"25_CR54","doi-asserted-by":"crossref","unstructured":"Kerbl, B., Kopanas, G., Leimk\u00fchler, T., Drettakis, G.: 3D Gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42(4) (2023). https:\/\/repo-sam.inria.fr\/fungraph\/3d-gaussian-splatting\/","DOI":"10.1145\/3592433"},{"key":"25_CR55","doi-asserted-by":"crossref","unstructured":"Khirodkar, R., Bansal, A., Ma, L., Newcombe, R., Vo, M., Kitani, K.: EgoHumans: an egocentric 3D multi-human benchmark. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01814"},{"key":"25_CR56","unstructured":"Kim, J., Kim, J., Na, J., Joo, H.: ParaHome: parameterizing everyday home activities towards 3D generative modeling of human-object interactions (2024)"},{"key":"25_CR57","doi-asserted-by":"crossref","unstructured":"Kwon, T., Tekin, B., St\u00fchmer, J., Bogo, F., Pollefeys, M.: H2o: two hands manipulating objects for first person interaction recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10138\u201310148 (2021)","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"25_CR58","doi-asserted-by":"crossref","unstructured":"Lee, J., Joo, H.: Mocap everyone everywhere: lightweight motion capture with smartwatches and a head-mounted camera. arXiv preprint arXiv:2401.00847 (2024)","DOI":"10.1109\/CVPR52733.2024.00110"},{"key":"25_CR59","doi-asserted-by":"crossref","unstructured":"Li, G., Zhao, et al.: EgoGen: an egocentric synthetic data generator (2024)","DOI":"10.1109\/CVPR52733.2024.01374"},{"key":"25_CR60","doi-asserted-by":"crossref","unstructured":"Li, J., Liu, K., Wu, J.: Ego-body pose estimation via ego-head pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17142\u201317151 (2023)","DOI":"10.1109\/CVPR52729.2023.01644"},{"key":"25_CR61","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"25_CR62","unstructured":"Lin, J., et al.: Motion-X: a large-scale 3D expressive whole-body human motion dataset. In: Advances in Neural Information Processing Systems (2023)"},{"key":"25_CR63","doi-asserted-by":"crossref","unstructured":"Ling, H.Y., Zinno, F., Cheng, G., van\u00a0de Panne, M.: Character controllers using motion VAEs. ACM Trans. Graph. 39(4), 1\u201340 (2020)","DOI":"10.1145\/3386569.3392422"},{"key":"25_CR64","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. ACM Trans. Graphics (Proc. SIGGRAPH Asia) 34(6), 248:1\u2013248:16 (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"25_CR65","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1007\/978-3-031-20068-7_24","volume-title":"Computer Vision - ECCV 2022","author":"T Lucas","year":"2022","unstructured":"Lucas, T., Baradel, F., Weinzaepfel, P., Rogez, G.: PoseGPT: quantization-based 3D human motion generation and forecasting. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 417\u2013435. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_24"},{"key":"25_CR66","unstructured":"Luo, Z., Hachiuma, R., Yuan, Y., Kitani, K.: Dynamics-regulated kinematic policy for egocentric pose estimation. In: Neural Information Processing Systems (2021)"},{"issue":"2","key":"25_CR67","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1111\/cgf.14768","volume":"42","author":"D Luvizon","year":"2023","unstructured":"Luvizon, D., Habermann, M., Golyanik, V., Kortylewski, A., Theobalt, C.: Scene-aware 3D multi-human motion capture from a single camera. Comput. Graph. Forum 42(2), 371\u2013383 (2023)","journal-title":"Comput. Graph. Forum"},{"key":"25_CR68","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: AMASS: archive of motion capture as surface shapes. In: International Conference on Computer Vision, pp. 5442\u20135451 (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"25_CR69","doi-asserted-by":"crossref","unstructured":"von Marcard, T., Henschel, R., Black, M.J., Rosenhahn, B., Pons-Moll, G.: Recovering accurate 3D human pose in the wild using IMUs and a moving camera. In: Proceedings of the European Conference on Computer Vision (ECCV) (2018)","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"25_CR70","doi-asserted-by":"crossref","unstructured":"Marcard, T., Rosenhahn, B., Black, M., Pons-Moll, G.: Sparse inertial poser: automatic 3D human pose estimation from sparse IMUs. Comput. Graph. Forum 36(2) (2017). Proceedings of the 38th Annual Conference of the European Association for Computer Graphics (Eurographics) (2017)","DOI":"10.1111\/cgf.13131"},{"key":"25_CR71","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1007\/978-3-030-58452-8_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"B Mildenhall","year":"2020","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 405\u2013421. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_24"},{"key":"25_CR72","doi-asserted-by":"crossref","unstructured":"Mollyn, V., Arakawa, R., Goel, M., Harrison, C., Ahuja, K.: IMUPoser: full-body pose estimation using IMUs in phones, watches, and earbuds. In: Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems, CHI 2023. Association for Computing Machinery, New York (2023)","DOI":"10.1145\/3544548.3581392"},{"key":"25_CR73","doi-asserted-by":"crossref","unstructured":"Mourikis, A.I., Roumeliotis, S.I.: A multi-state constraint Kalman filter for vision-aided inertial navigation. In: Proceedings 2007 IEEE International Conference on Robotics and Automation, pp. 3565\u20133572. IEEE (2007)","DOI":"10.1109\/ROBOT.2007.364024"},{"key":"25_CR74","unstructured":"Movella: MVN user manual. https:\/\/www.movella.com\/hubfs\/MVN_User_Manual.pdf"},{"key":"25_CR75","doi-asserted-by":"crossref","unstructured":"Mur-Artal, Ra\u00fal, M.J.M.M., Tard\u00f3s, J.D.: ORB-SLAM: a versatile and accurate monocular SLAM system. IEEE Trans. Robot. 31(5), 1147\u20131163 (2015)","DOI":"10.1109\/TRO.2015.2463671"},{"key":"25_CR76","unstructured":"van\u00a0den Oord, A., Vinyals, O., Kavukcuoglu, K.: Neural discrete representation learning. In: Proceedings of the 31st International Conference on Neural Information Processing Systems (2017)"},{"key":"25_CR77","unstructured":"OpenAI.: Achiam, J., et al.: GPT-4 technical report (2023)"},{"key":"25_CR78","doi-asserted-by":"crossref","unstructured":"Pan, X., et al.: Aria digital twin: a new benchmark dataset for egocentric 3D machine perception (2023)","DOI":"10.1109\/ICCV51070.2023.01842"},{"key":"25_CR79","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"25_CR80","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: Proceedings IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10975\u201310985 (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"25_CR81","doi-asserted-by":"publisher","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682, pp. 480\u2013497. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_28","DOI":"10.1007\/978-3-031-20047-2_28"},{"issue":"4","key":"25_CR82","doi-asserted-by":"publisher","first-page":"236","DOI":"10.1089\/big.2016.0028","volume":"4","author":"M Plappert","year":"2016","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The KIT motion-language dataset. Big Data 4(4), 236\u2013252 (2016)","journal-title":"Big Data"},{"key":"25_CR83","doi-asserted-by":"crossref","unstructured":"Punnakkal, A.R., Chandrasekaran, A., Athanasiou, N., Quiros-Ramirez, A., Black, M.J.: BABEL: bodies, action and behavior with English labels. In: Proceedings IEEE\/CVF Conference\u00a0on Computer Vision and Pattern Recognition (CVPR), pp. 722\u2013731 (2021)","DOI":"10.1109\/CVPR46437.2021.00078"},{"issue":"1","key":"25_CR84","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"25_CR85","unstructured":"Raina, N., et al.: EgoBlur: responsible innovation in aria. ArXiv abs\/2308.13093 (2023)"},{"key":"25_CR86","doi-asserted-by":"crossref","unstructured":"Rempe, D., Birdal, T., Hertzmann, A., Yang, J., Sridhar, S., Guibas, L.J.: Humor: 3D human motion model for robust pose estimation. In: International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.01129"},{"key":"25_CR87","doi-asserted-by":"crossref","unstructured":"Alp\u00a0Gueler, R., Natalia\u00a0Neverova, I.K.: Densepose: dense human pose estimation in the wild. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00762"},{"key":"25_CR88","unstructured":"Roetenberg, D., Luinge, H., Slycke, P.: Xsens MVN: full 6DOF human motion tracking using miniature inertial sensors. Xsens Motion Technol. BV Technical report 3 (2009)"},{"key":"25_CR89","doi-asserted-by":"crossref","unstructured":"Rong, Y., Shiratori, T., Joo, H.: FrankMocap: a monocular 3D whole-body pose estimation system via regression and integration. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1749\u20131759 (2021)","DOI":"10.1109\/ICCVW54120.2021.00201"},{"key":"25_CR90","doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"25_CR91","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: Human motion diffusion as a generative prior. In: ICLR (2023)"},{"key":"25_CR92","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+D: a large scale dataset for 3D human activity analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1010\u20131019 (2016)","DOI":"10.1109\/CVPR.2016.115"},{"issue":"6","key":"25_CR93","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417877","volume":"39","author":"S Shimada","year":"2020","unstructured":"Shimada, S., Golyanik, V., Xu, W., Theobalt, C.: PhysCap: physically plausible monocular 3d motion capture in real time. ACM Trans. Graph. (ToG) 39(6), 1\u201316 (2020)","journal-title":"ACM Trans. Graph. (ToG)"},{"issue":"1","key":"25_CR94","first-page":"1","volume":"1","author":"O Sorkine-Hornung","year":"2017","unstructured":"Sorkine-Hornung, O., Rabinovich, M.: Least-squares rigid motion using SVD. Computing 1(1), 1\u20135 (2017)","journal-title":"Computing"},{"key":"25_CR95","doi-asserted-by":"publisher","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to CLIP Space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXII. LNCS, vol. 13682, pp. 358\u2013374. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"25_CR96","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., Bermano, A.H.: Human motion diffusion model. In: ICLR (2023)"},{"key":"25_CR97","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"572","DOI":"10.1007\/978-3-031-20065-6_33","volume-title":"Computer Vision \u2013 ECCV 2022","author":"G Tiwari","year":"2022","unstructured":"Tiwari, G., Antic, D., Lenssen, J.E., Sarafianos, N., Tung, T., Pons-Moll, G.: Pose-NDF: modeling human pose manifolds with neural distance fields. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13665, pp. 572\u2013589. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20065-6_33"},{"issue":"6","key":"25_CR98","doi-asserted-by":"publisher","first-page":"6794","DOI":"10.1109\/TPAMI.2020.3029700","volume":"45","author":"D Tome","year":"2020","unstructured":"Tome, D., et al.: SelfPose: 3D egocentric pose estimation from a headset mounted camera. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 6794\u20136806 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"25_CR99","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models (2023)"},{"key":"25_CR100","doi-asserted-by":"crossref","unstructured":"Trumble, M., Gilbert, A., Malleson, C., Hilton, A., Collomosse, J.: Total capture: 3D human pose estimation fusing video and inertial sensors. In: Proceedings of 28th British Machine Vision Conference, pp. 1\u201313 (2017)","DOI":"10.5244\/C.31.14"},{"key":"25_CR101","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CiDER: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"25_CR102","doi-asserted-by":"crossref","unstructured":"Wang, J., Liu, L., Xu, W., Sarkar, K., Luvizon, D., Theobalt, C.: Estimating egocentric 3D human pose in the wild with external weak supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13157\u201313166 (2022)","DOI":"10.1109\/CVPR52688.2022.01281"},{"key":"25_CR103","doi-asserted-by":"crossref","unstructured":"Wang, J., Luvizon, D., Xu, W., Liu, L., Sarkar, K., Theobalt, C.: Scene-aware egocentric 3D human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13031\u201313040 (2023)","DOI":"10.1109\/CVPR52729.2023.01252"},{"key":"25_CR104","unstructured":"Wouwe, T., Lee, S., Falisse, A., Delp, S., Liu, C.: Diffusion inertial poser: human motion reconstruction from arbitrary sparse IMU configurations. In: CVPR (2024)"},{"key":"25_CR105","doi-asserted-by":"crossref","unstructured":"Yang, D., Kang, J., Ma, L., Greer, J., Ye, Y., Lee, S.H.: DivaTrack: diverse bodies and motions from acceleration-enhanced three-point trackers. In: EuroGraphics (2024)","DOI":"10.1111\/cgf.15057"},{"key":"25_CR106","doi-asserted-by":"crossref","unstructured":"Yang, D., Kim, D., Lee, S.H.: LoBSTr: real-time lower-body pose prediction from sparse upper-body tracking signals. In: Computer Graphics Forum, vol.\u00a040, pp. 265\u2013275. Wiley Online Library (2021)","DOI":"10.1111\/cgf.142631"},{"key":"25_CR107","doi-asserted-by":"crossref","unstructured":"Ye, V., Pavlakos, G., Malik, J., Kanazawa, A.: Decoupling human and camera motion from videos in the wild. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.02033"},{"key":"25_CR108","doi-asserted-by":"crossref","unstructured":"Yi, H., Huang, C.H.P., Tripathi, S., Hering, L., Thies, J., Black, M.J.: MIME: human-aware 3D scene generation. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12965\u201312976, June 2023","DOI":"10.1109\/CVPR52729.2023.01246"},{"issue":"4","key":"25_CR109","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592099","volume":"42","author":"X Yi","year":"2023","unstructured":"Yi, X., et al.: EgoLocate: real-time motion capture, localization, and mapping with sparse body-mounted sensors. ACM Trans. Graph. (TOG) 42(4), 1\u201317 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"25_CR110","doi-asserted-by":"crossref","unstructured":"Yi, X., et al.: Physical inertial poser (PIP): physics-aware real-time human motion tracking from sparse inertial sensors. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13167\u201313178 (2022)","DOI":"10.1109\/CVPR52688.2022.01282"},{"issue":"4","key":"25_CR111","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3450626.3459786","volume":"40","author":"X Yi","year":"2021","unstructured":"Yi, X., Zhou, Y., Xu, F.: Transpose: real-time 3D human translation and pose estimation with six inertial sensors. ACM Trans. Graph. (TOG) 40(4), 1\u201313 (2021)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"25_CR112","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"180","DOI":"10.1007\/978-3-031-20068-7_11","volume-title":"Computer Vision \u2013 ECCV 2022","author":"S Zhang","year":"2022","unstructured":"Zhang, S., et al.: EgoBody: human body shape and motion of interacting people from head-mounted devices. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 180\u2013200. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_11"},{"key":"25_CR113","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: BERTScore: evaluating text generation with bert. arXiv preprint arXiv:1904.09675 (2019)"},{"key":"25_CR114","unstructured":"Zhang, Y., et al.: MotionGPT: finetuned LLMs are general-purpose motion generators. arXiv preprint arXiv:2306.10900 (2023)"},{"key":"25_CR115","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Liu, R., Aberman, K., Hanocka, R.: TEDi: temporally-entangled diffusion for long-term motion synthesis (2023)","DOI":"10.1145\/3641519.3657515"},{"key":"25_CR116","doi-asserted-by":"publisher","unstructured":"Zheng, Y., et al.: GIMO: gaze-informed human motion prediction in context. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13673, pp. 676\u2013694. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19778-9_39","DOI":"10.1007\/978-3-031-19778-9_39"},{"key":"25_CR117","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"389","DOI":"10.1007\/978-3-030-01240-3_24","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Z Zheng","year":"2018","unstructured":"Zheng, Z., et al.: HybridFusion: real-time performance capture using a single depth sensor and sparse IMUs. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11213, pp. 389\u2013406. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_24"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72691-0_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T18:10:33Z","timestamp":1730571033000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72691-0_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726903","9783031726910"],"references-count":117,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72691-0_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}