{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T01:39:50Z","timestamp":1774402790713,"version":"3.50.1"},"publisher-location":"Cham","reference-count":94,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729126","type":"print"},{"value":"9783031729133","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72913-3_6","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:45:16Z","timestamp":1733089516000},"page":"91-110","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["F-HOI: Toward Fine-Grained Semantic-Aligned 3D Human-Object Interactions"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5891-2911","authenticated-orcid":false,"given":"Jie","family":"Yang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7737-4287","authenticated-orcid":false,"given":"Xuesong","family":"Niu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5726-7672","authenticated-orcid":false,"given":"Nan","family":"Jiang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9511-7532","authenticated-orcid":false,"given":"Ruimao","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1524-7148","authenticated-orcid":false,"given":"Siyuan","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"6_CR1","doi-asserted-by":"crossref","unstructured":"Belkhale, S., et al.: RT-H: action hierarchies using language. arXiv preprint arXiv:2403.01823 (2024)","DOI":"10.15607\/RSS.2024.XX.049"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Bhatnagar, B.L., Xie, X., Petrov, I.A., Sminchisescu, C., Theobalt, C., Pons-Moll, G.: Behave: dataset and method for tracking human object interactions. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01547"},{"key":"6_CR3","doi-asserted-by":"crossref","unstructured":"Bogo, F., Kanazawa, A., Lassner, C., Gehler, P., Romero, J., Black, M.J.: Keep it SMPL: automatic estimation of 3D human pose and shape from a single image. In: ECCV (2016)","DOI":"10.1007\/978-3-319-46454-1_34"},{"key":"6_CR4","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Wang, Z., He, Y., Wang, J., Deng, J.: HICO: a benchmark for recognizing human-object interactions in images. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.122"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Sharegpt4v: improving large multi-modal models with better captions. arXiv preprint arXiv:2311.12793 (2023)","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Chen, Y., Huang, S., Yuan, T., Qi, S., Zhu, Y., Zhu, S.C.: Holistic++ scene understanding: single-view 3D holistic scene parsing and human pose estimation with human-object interaction and physical commonsense. In: CVPR (2019)","DOI":"10.1109\/ICCV.2019.00874"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wang, F., Wang, Y., Liu, H.: Text-to-3D using gaussian splatting. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02022"},{"key":"6_CR8","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Corsetti, J., Boscaini, D., Oh, C., Cavallaro, A., Poiesi, F.: Open-vocabulary object 6D pose estimation. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01711"},{"key":"6_CR10","doi-asserted-by":"crossref","unstructured":"Delmas, G., Weinzaepfel, P., Lucas, T., Moreno-Noguer, F., Rogez, G.: Posescript: 3D human poses from natural language. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20068-7_20"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Delmas, G., Weinzaepfel, P., Moreno-Noguer, F., Rogez, G.: Posefix: correcting 3D human poses with natural language. In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.01379"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Diller, C., Dai, A.: CG-HOI: contact-guided 3D human-object interaction generation. arXiv preprint arXiv:2311.16097 (2023)","DOI":"10.1109\/CVPR52733.2024.01880"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"Feng, Y., Lin, J., Dwivedi, S.K., Sun, Y., Patel, P., Black, M.J.: Posegpt: chatting about 3D human pose. arXiv preprint arXiv:2311.18836 (2023)","DOI":"10.1109\/CVPR52733.2024.00204"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Feng, Y., Lin, J., Dwivedi, S.K., Sun, Y., Patel, P., Black, M.J.: Chatpose: chatting about 3D human pose. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00204"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Geijtenbeek, T., Pronost, N.: Interactive character animation using simulated physics: a state-of-the-art review. In: Computer Graphics Forum, vol.\u00a031, pp. 2492\u20132515. Wiley Online Library (2012)","DOI":"10.1111\/j.1467-8659.2012.03189.x"},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Girshick, R., Doll\u00e1r, P., He, K.: Detecting and recognizing human-object interactions. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00872"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Hassan, M., Choutas, V., Tzionas, D., Black, M.J.: Resolving 3D human pose ambiguities with 3D scene constraints. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00237"},{"key":"6_CR18","doi-asserted-by":"crossref","unstructured":"Hassan, M., Ghosh, P., Tesch, J., Tzionas, D., Black, M.J.: Populating 3D scenes by learning human-scene interaction. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01447"},{"issue":"4","key":"6_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073663","volume":"36","author":"D Holden","year":"2017","unstructured":"Holden, D., Komura, T., Saito, J.: Phase-functioned neural networks for character control. ACM TOG 36(4), 1\u201313 (2017)","journal-title":"ACM TOG"},{"key":"6_CR20","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: ICLR (2022)"},{"key":"6_CR21","unstructured":"Huang, R., et al.: AudioGPT: understanding and generating speech, music, sound, and talking head. arXiv preprint arXiv:2304.12995 (2023)"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: Diffusion-based generation, optimization, and planning in 3D scenes. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Huang, Y., et\u00a0al.: Smartedit: exploring complex instruction-based image editing with multimodal large language models. arXiv preprint arXiv:2312.06739 (2023)","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Jiang, N., et al.: Full-body articulated human-object interaction. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00859"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Jiang, N., et al.: Scaling up dynamic human-scene interaction modeling. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"6_CR26","doi-asserted-by":"crossref","unstructured":"Kudo, T., Richardson, J.: Sentencepiece: a simple and language independent subword tokenizer and detokenizer for neural text processing. arXiv preprint arXiv:1808.06226 (2018)","DOI":"10.18653\/v1\/D18-2012"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: LISA: reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Li, J., Clegg, A., Mottaghi, R., Wu, J., Puig, X., Liu, C.K.: Controllable human-object interaction synthesis. arXiv preprint arXiv:2312.03913 (2023)","DOI":"10.1007\/978-3-031-72940-9_4"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Li, J., Wu, J., Liu, C.K.: Object motion guided human motion synthesis. ACM Trans. Graph. (TOG) (2023)","DOI":"10.1145\/3618333"},{"key":"6_CR30","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv preprint arXiv:2205.06355 (2023)"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Liao, Y., Zhang, A., Lu, M., Wang, Y., Li, X., Liu, S.: GEN-VLKT: simplify association and enhance interaction understanding for hoi detection. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"6_CR32","doi-asserted-by":"crossref","unstructured":"Lin, C.H., et al.: Magic3d: high-resolution text-to-3D content creation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"6_CR33","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"6_CR34","doi-asserted-by":"crossref","unstructured":"Lin, J., Liu, L., Lu, D., Jia, K.: SAM-6D: segment anything model meets zero-shot 6D object pose estimation. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02636"},{"key":"6_CR35","doi-asserted-by":"crossref","unstructured":"Lin, J., Zeng, A., Wang, H., Zhang, L., Li, Y.: One-stage 3D whole-body mesh recovery with component aware transformer. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02027"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"6_CR37","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"6_CR38","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: NeurIPS (2023)"},{"key":"6_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: HOI4D: a 4D egocentric dataset for category-level human-object interaction. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.02034"},{"key":"6_CR40","unstructured":"Liu, Z., et\u00a0al.: InternGPT: solving vision-centric tasks by interacting with chatbots beyond language. arXiv preprint arXiv:2305.05662 (2023)"},{"key":"6_CR41","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. In: ACM TOG (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"6_CR42","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"6_CR43","doi-asserted-by":"crossref","unstructured":"Mourot, L., Hoyet, L., Le\u00a0Clerc, F., Schnitzler, F., Hellier, P.: A survey on deep learning for skeleton-based human animation. In: Computer Graphics Forum, vol.\u00a041, pp. 122\u2013157. Wiley Online Library (2022)","DOI":"10.1111\/cgf.14426"},{"key":"6_CR44","unstructured":"OpenAI: Introducing chatgpt (2022)"},{"key":"6_CR45","unstructured":"OpenAI: GPT-4 technical report (2023)"},{"key":"6_CR46","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"6_CR47","unstructured":"Peng, X., Xie, Y., Wu, Z., Jampani, V., Sun, D., Jiang, H.: Hoi-diff: text-driven synthesis of 3D human-object interactions using diffusion models. arXiv preprint arXiv:2312.06553 (2023)"},{"key":"6_CR48","doi-asserted-by":"crossref","unstructured":"Pi, R., et al.: DetGPT: detect what you need via reasoning. arXiv:2305.14167 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.876"},{"key":"6_CR49","doi-asserted-by":"crossref","unstructured":"Pi, R., Yao, L., Gao, J., Zhang, J., Zhang, T.: Perceptiongpt: effectively fusing visual perception into LLM. arXiv preprint arXiv:2311.06612 (2023)","DOI":"10.1109\/CVPR52733.2024.02561"},{"key":"6_CR50","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: Dreamfusion: text-to-3D using 2D diffusion (2022)"},{"key":"6_CR51","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020 (2021)"},{"issue":"4","key":"6_CR52","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2897824.2925867","volume":"35","author":"M Savva","year":"2016","unstructured":"Savva, M., Chang, A.X., Hanrahan, P., Fisher, M., Nie\u00dfner, M.: Pigraphs: learning interaction snapshots from observations. ACM TOG 35(4), 1\u201312 (2016)","journal-title":"ACM TOG"},{"key":"6_CR53","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: HuggingGPT: solving AI tasks with chatgpt and its friends in huggingface. arXiv preprint arXiv:2303.17580 (2023)"},{"issue":"1","key":"6_CR54","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1177\/1357034X11431845","volume":"18","author":"J Stacey","year":"2012","unstructured":"Stacey, J., Suchman, L.: Animation and automation-the liveliness and labours of bodies and machines. Body Soc. 18(1), 1\u201346 (2012)","journal-title":"Body Soc."},{"key":"6_CR55","doi-asserted-by":"crossref","unstructured":"Taheri, O., Choutas, V., Black, M.J., Tzionas, D.: Goal: generating 4D whole-body motion for hand-object grasping. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01291"},{"key":"6_CR56","doi-asserted-by":"crossref","unstructured":"Taheri, O., Ghorbani, N., Black, M.J., Tzionas, D.: Grab: a dataset of whole-body human grasping of objects. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58548-8_34"},{"key":"6_CR57","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"6_CR58","series-title":"LNCS","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_21","volume-title":"ECCV 2022","author":"G Tevet","year":"2022","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: Motionclip: exposing human motion generation to clip space. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13682. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20047-2_21"},{"key":"6_CR59","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"6_CR60","doi-asserted-by":"crossref","unstructured":"Wang, J., Xu, H., Xu, J., Liu, S., Wang, X.: Synthesizing long-term 3D human motion and interaction in 3D scenes. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00928"},{"key":"6_CR61","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. arXiv:2305.11175 (2023)"},{"key":"6_CR62","doi-asserted-by":"crossref","unstructured":"Wang, X., Li, G., Kuo, Y.L., Kocabas, M., Aksan, E., Hilliges, O.: Reconstructing action-conditioned human-object interactions using commonsense knowledge priors. In: 2022 International Conference on 3D Vision (3DV), pp. 353\u2013362. IEEE (2022)","DOI":"10.1109\/3DV57658.2022.00047"},{"key":"6_CR63","unstructured":"Wang, Y., Lin, J., Zeng, A., Luo, Z., Zhang, J., Zhang, L.: Physhoi: physics-based imitation of dynamic human-object interaction. arXiv preprint arXiv:2312.04393 (2023)"},{"key":"6_CR64","unstructured":"Wang, Z., Chen, Y., Liu, T., Zhu, Y., Liang, W., Huang, S.: Humanise: language-conditioned human motion generation in 3D scenes. In: NeurIPS (2022)"},{"key":"6_CR65","doi-asserted-by":"crossref","unstructured":"Wen, B., Yang, W., Kautz, J., Birchfield, S.: Foundationpose: unified 6D pose estimation and tracking of novel objects. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01692"},{"key":"6_CR66","doi-asserted-by":"crossref","unstructured":"Weng, Z., Yeung, S.: Holistic 3D human and scene mesh estimation from single view images. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00040"},{"key":"6_CR67","unstructured":"Xiao, Z., et al.: Unified human-scene interaction via prompted chain-of-contacts. arXiv preprint arXiv:2309.07918 (2023)"},{"key":"6_CR68","doi-asserted-by":"crossref","unstructured":"Xie, K., Wang, T., Iqbal, U., Guo, Y., Fidler, S., Shkurti, F.: Physics-based human motion estimation and synthesis from videos. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01133"},{"key":"6_CR69","doi-asserted-by":"crossref","unstructured":"Xie, X., Bhatnagar, B.L., Pons-Moll, G.: Chore: contact, human and object reconstruction from a single RGB image. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20086-1_8"},{"key":"6_CR70","unstructured":"Xu, J., et al.: Pixel aligned language models. arXiv preprint arXiv:2312.09237 (2023)"},{"key":"6_CR71","doi-asserted-by":"crossref","unstructured":"Xu, J., Xu, H., Ni, B., Yang, X., Wang, X., Darrell, T.: Hierarchical style-based networks for motion synthesis. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58621-8_11"},{"key":"6_CR72","doi-asserted-by":"crossref","unstructured":"Xu, S., Li, Z., Wang, Y.X., Gui, L.Y.: Interdiff: generating 3D human-object interactions with physics-informed diffusion. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01371"},{"key":"6_CR73","unstructured":"Xu, X., Joo, H., Mori, G., Savva, M.: D3D-HOI: dynamic 3D human-object interactions from videos. arXiv preprint arXiv:2108.08420 (2021)"},{"key":"6_CR74","unstructured":"Yang, J., Li, B., Yang, F., Zeng, A., Zhang, L., Zhang, R.: Boosting human-object interaction detection with text-to-image diffusion model. arXiv preprint arXiv:2305.12252 (2023)"},{"key":"6_CR75","doi-asserted-by":"crossref","unstructured":"Yang, J., Li, B., Zeng, A., Zhang, L., Zhang, R.: Open-world human-object interaction detection via multi-modal prompts. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.01604"},{"key":"6_CR76","doi-asserted-by":"crossref","unstructured":"Yang, J., Wang, C., Li, Z., Wang, J., Zhang, R.: Semantic human parsing via scalable semantic transfer over multiple label domains. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01861"},{"key":"6_CR77","doi-asserted-by":"crossref","unstructured":"Yang, J., Zeng, A., Li, F., Liu, S., Zhang, R., Zhang, L.: Neural interactive keypoint detection. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01388"},{"key":"6_CR78","unstructured":"Yang, J., Zeng, A., Zhang, R., Zhang, L.: Unipose: detecting any keypoints. arXiv preprint arXiv:2310.08530 (2023)"},{"key":"6_CR79","unstructured":"Yang, R., et al.: GPT4Tools: teaching large language model to use tools via self-instruction. arXiv preprint arXiv:2305.18752 (2023)"},{"key":"6_CR80","unstructured":"Yang, Z., et al.: MM-ReAct: prompting chatgpt for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"6_CR81","doi-asserted-by":"crossref","unstructured":"Yuan, H., Wang, M., Ni, D., Xu, L.: Detecting human-object interactions with object-guided cross-modal calibrated semantics. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i3.20229"},{"key":"6_CR82","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Song, J., Iqbal, U., Vahdat, A., Kautz, J.: Physdiff: physics-guided human motion diffusion model. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01467"},{"key":"6_CR83","doi-asserted-by":"crossref","unstructured":"Zanfir, A., Marinoiu, E., Sminchisescu, C.: Monocular 3D pose and shape estimation of multiple people in natural scenes: the importance of multiple scene constraints. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00229"},{"key":"6_CR84","doi-asserted-by":"crossref","unstructured":"Zhang, D., et al.: SpeechGPT: empowering large language models with intrinsic cross-modal conversational abilities. arXiv preprint arXiv:2305.11000 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"6_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"6_CR86","doi-asserted-by":"crossref","unstructured":"Zhang, H., et\u00a0al.: Llava-grounding: grounded visual chat with large multimodal models. arXiv preprint arXiv:2312.02949 (2023)","DOI":"10.1007\/978-3-031-72775-7_2"},{"key":"6_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, J.Y., Pepose, S., Joo, H., Ramanan, D., Malik, J., Kanazawa, A.: Perceiving 3D human-object spatial arrangements from a single image in the wild. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58610-2_3"},{"key":"6_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, S., Zhang, Y., Bogo, F., Pollefeys, M., Tang, S.: Learning motion priors for 4D human body capture in 3D scenes. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01115"},{"key":"6_CR89","doi-asserted-by":"crossref","unstructured":"Zhang, X., Bhatnagar, B.L., Starke, S., Guzov, V., Pons-Moll, G.: Couch: towards controllable human-chair interactions. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20065-6_30"},{"key":"6_CR90","unstructured":"Zhang, X., et al.: Force: dataset and method for intuitive physics guided human-object interaction. arXiv preprint arXiv:2403.11237 (2024)"},{"key":"6_CR91","unstructured":"Zheng, K., He, X., Wang, X.E.: Minigpt-5: interleaved vision-and-language generation via generative vokens. arXiv preprint arXiv:2310.02239 (2023)"},{"key":"6_CR92","unstructured":"Zheng, L., et al.: Judging LLM-as-a-judge with MT-bench and chatbot arena. arXiv preprint arXiv:2306.05685 (2023)"},{"key":"6_CR93","unstructured":"Zhou, J., Wang, J., Ma, B., Liu, Y.S., Huang, T., Wang, X.: Uni3d: exploring unified 3D representation at scale. arXiv preprint arXiv:2310.06773 (2023)"},{"key":"6_CR94","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72913-3_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:22:23Z","timestamp":1733095343000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72913-3_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031729126","9783031729133"],"references-count":94,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72913-3_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}