{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:45:06Z","timestamp":1777657506029,"version":"3.51.4"},"reference-count":125,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128777","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"8203-8210","source":"Crossref","is-referenced-by-count":1,"title":["COLLAGE: Collaborative Human-Agent Interaction Generation Using Hierarchical Latent Diffusion and Language Models"],"prefix":"10.1109","author":[{"given":"Divyanshu","family":"Daiya","sequence":"first","affiliation":[{"name":"Purdue University,IDEAS Lab,Department of Computer Science"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Damon","family":"Conover","sequence":"additional","affiliation":[{"name":"DEVCOM Army Research Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aniket","family":"Bera","sequence":"additional","affiliation":[{"name":"Purdue University,IDEAS Lab,Department of Computer Science"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Cg-hoi: Contact-guided 3d human-object interaction generation","volume-title":"arXiv","author":"Diller","year":"2023"},{"key":"ref2","article-title":"Controllable human-object interaction synthesis","volume-title":"arXiv","author":"Li","year":"2023"},{"key":"ref3","article-title":"Hoi-diff: Text-driven synthesis of 3d human-object interactions using diffusion models","volume-title":"arXiv","author":"Peng","year":"2023"},{"key":"ref4","article-title":"THOR: Text to human-object interaction diffusion via relation intervention","volume-title":"arXiv","author":"Wu","year":"2024"},{"key":"ref5","article-title":"Interdrearner: Zero-shot text to 3d dynamic human-object interaction","volume-title":"ar Xiv","author":"Xu","year":"2024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00056"},{"key":"ref7","volume-title":"ChatGPT","year":"2023"},{"key":"ref8","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv"},{"key":"ref9","article-title":"Large language models are zero-shot reasoners","author":"Kojima","year":"2022","journal-title":"NeurIPS"},{"key":"ref10","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","author":"Huang","year":"2022","journal-title":"ICML"},{"key":"ref11","article-title":"Inner monologue: Embodied reasoning through planning with language models","author":"Huang","year":"2023","journal-title":"CoRL"},{"key":"ref12","article-title":"Prompt, plan, perform:LIm-based humanoid control via quantized imitation learning","volume-title":"ICRA","author":"Sun","year":"2024"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01360"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/3DV53792.2021.00086"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"ref17","article-title":"Core4d: A 4d human-object-human interaction dataset for collaborative object rearrangement","volume-title":"arXiv","author":"Zhang","year":"2024"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02042-6"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25206"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_20"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00220"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00875"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_15"},{"key":"ref27","article-title":"Stochastic multi-person 3d motion forecasting","author":"Xu","year":"2023","journal-title":"ICLR"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/3DV50981.2020.00102"},{"key":"ref29","article-title":"GMD: Controllable human motion synthesis via guided diffusion models","volume-title":"ICCV","author":"Karunratanakul","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01322"},{"key":"ref31","article-title":"OmniControl: Control any joint at any time for human motion generation","author":"Xie","year":"2023","journal-title":"arXiv"},{"key":"ref32","article-title":"Tlcontrol: Trajectory and language control for human motion syn-thesis","volume-title":"ar Xiv","author":"Wan","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_23"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01447"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00928"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01203"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01981"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_18"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01354"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02029"},{"key":"ref42","article-title":"ROAM: Robust and object-aware motion generation using neural pose descriptors","author":"Zhang","year":"2023","journal-title":"a rXiv"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00870"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"ref47","article-title":"MotionDiffuse: Text-driven human motion generation with diffusion model","author":"Zhang","year":"2022","journal-title":"ar Xiv"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28567"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2019.00084"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i7.25996"},{"key":"ref54","article-title":"HumanTOMATO: Text-aligned whole-body motion generation","author":"Lu","year":"2023","journal-title":"ar Xiv"},{"key":"ref55","article-title":"Single motion diffusion","author":"Raab","year":"2023","journal-title":"arXiv"},{"key":"ref56","article-title":"Human motion diffusion as a generative prior","author":"Shafir","year":"2023","journal-title":"a rXiv"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"ref58","article-title":"Under-standing text-driven motion synthesis with keyframe collaboration via diffusion models","volume-title":"arXiv","author":"Wei","year":"2023"},{"key":"ref59","article-title":"TEDi: Temporally-entangled diffusion for long-term motion synthesis","volume-title":"arXiv","author":"Zhang","year":"2023"},{"key":"ref60","article-title":"MotionScript: Natural language descriptions for expressive 3d human motions","author":"Yazdian","year":"2023","journal-title":"arXiv"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00051"},{"key":"ref62","article-title":"EMDM: Efficient motion diffusion model for fast, high-quality motion generation","author":"Zhou","year":"2023","journal-title":"arXiv"},{"key":"ref63","article-title":"Contact-aware human motion generation from textual descriptions","author":"Ma","year":"2024","journal-title":"ar Xiv"},{"key":"ref64","article-title":"Interactive humanoid: Online full-body motion reaction synthesis with social affordance canonicalization and forecasting","volume-title":"ar Xiv","author":"Liu","year":"2023"},{"key":"ref65","article-title":"InterControl: Generate human motion interactions by controlling every joint","author":"Wang","year":"2023","journal-title":"arXiv"},{"key":"ref66","article-title":"ReMoS: Reactive 3d motion synthesis for two-person interactions","volume-title":"ar Xiv","author":"Ghosh","year":"2023"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"ref68","article-title":"LaserHuman: Language-guided scene-aware human motion generation in free environment","author":"Cong","year":"2024","journal-title":"arXiv"},{"key":"ref69","article-title":"Task-oriented human-object interactions generation with implicit neural representations","author":"Li","year":"2023","journal-title":"arXiv"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02153"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00064"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20062-5_1"},{"key":"ref73","article-title":"ArtiGrasp: Physically plausible synthesis of bi-manual dexterous grasping and articulation","volume-title":"ar Xiv","author":"Zhang","year":"2023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20086-1_8"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_3"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/3DV57658.2022.00047"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00458"},{"key":"ref78","article-title":"Compositional 3d human-object neural animation","author":"Hou","year":"2023","journal-title":"arXiv"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01361"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01934"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01541"},{"key":"ref82","article-title":"Zero-shot learning for the primitives of 3d affordance in general objects","author":"Kim","year":"2024","journal-title":"arXiv"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356505"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01291"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_15"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_30"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00886"},{"key":"ref88","article-title":"D3D-HOI: Dynamic 3d human-object interactions from videos","author":"Xu","year":"2021","journal-title":"a rXiv"},{"key":"ref89","article-title":"IMoS: Intent-driven full-body motion synthesis for human-object interactions","volume-title":"ar Xiv","author":"Ghosh","year":"2022"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3151614"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25308"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/HUMANOIDS47582.2021.9555788"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01371"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/3618333"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i7.16736"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392474"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591525"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591487"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530057"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530735"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1145\/3606931"},{"key":"ref102","article-title":"Synthesizing physically plausible human motions in 3d scenes","author":"Pan","year":"2023","journal-title":"ar Xiv"},{"key":"ref103","article-title":"Physically plausible full-body hand-object interaction synthesis","volume-title":"ar Xiv","author":"Braun","year":"2023"},{"key":"ref104","article-title":"PhysHOI: Physics-based imitation of dynamic human-object interaction","volume-title":"arXiv","author":"Wang","year":"2023"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00087"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00916"},{"key":"ref107","article-title":"MoConVQ: Unified physics-based motion control via scalable discrete representations","author":"Yao","year":"2023","journal-title":"ar Xiv"},{"key":"ref108","article-title":"MotionGPT: Human motion as a foreign language","author":"Jiang","year":"2023","journal-title":"NeurIPS"},{"key":"ref109","article-title":"Unified human-scene interaction via prompted chain-of-contacts","volume-title":"arXiv","author":"Xiao","year":"2023"},{"key":"ref110","article-title":"Denoising diffusion probabilistic models","author":"Ho","year":"2020","journal-title":"NeurIPS"},{"key":"ref111","article-title":"Autoregressive denoising diffusion models for multivariate probabilistic time series forecasting","volume-title":"ICML","author":"Rasul","year":"2021"},{"key":"ref112","article-title":"An empirical evaluation of generic convolutional and recurrent networks for sequence modeling","author":"Bai","year":"2018","journal-title":"arXiv"},{"key":"ref113","article-title":"Graph attention networks","author":"Velickovic","year":"2017","journal-title":"arXiv"},{"key":"ref114","article-title":"Csdi: Conditional score-based diffusion models for probabilistic time series imputation","author":"Tashiro","year":"2021","journal-title":"NeurIPS"},{"key":"ref115","article-title":"Denoising diffusion implicit models","author":"Song","year":"2020","journal-title":"arXiv"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"ref117","article-title":"Human motion diffusion model","author":"Tevet","year":"2022","journal-title":"ar Xiv preprint"},{"key":"ref118","article-title":"Human motion diffusion as a generative prior","author":"Shafir","year":"2023","journal-title":"ar Xiv"},{"key":"ref119","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"key":"ref120","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"arXiv"},{"key":"ref121","article-title":"Denoising diffusion implicit models","author":"Song","year":"2020","journal-title":"arXiv"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1089\/big.2016.0028"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"ref125","article-title":"Isolating sources of disentanglement in variational autoencoders","volume":"31","author":"Chen","year":"2018","journal-title":"NeurIPS"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128777.pdf?arnumber=11128777","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:08:59Z","timestamp":1756879739000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128777\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":125,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128777","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}