{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T02:01:50Z","timestamp":1780020110942,"version":"3.53.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116124","type":"journal-article","created":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T15:31:52Z","timestamp":1778859112000},"page":"116124","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["ObjectDiff: An object-centric diffusion policy with modality-specific conditioning for robot manipulation"],"prefix":"10.1016","volume":"346","author":[{"given":"Yong","family":"Xu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhiyu","family":"Wei","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5277-9859","authenticated-orcid":false,"given":"Ruotao","family":"Xu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9375-1905","authenticated-orcid":false,"given":"Zihan","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Si","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116124_b1","unstructured":"M. Shridhar, L. Manuelli, D. Fox, Perceiver-actor: A multi-task transformer for robotic manipulation, in: Conference on Robot Learning, 2023, pp. 785\u2013799."},{"key":"10.1016\/j.knosys.2026.116124_b2","series-title":"Dnact: Diffusion guided multi-task 3d policy learning","author":"Yan","year":"2024"},{"key":"10.1016\/j.knosys.2026.116124_b3","unstructured":"A. Ajay, Y. Du, A. Gupta, J.B. Tenenbaum, T.S. Jaakkola, P. Agrawal, Is Conditional Generative Modeling All You Need for Decision Making?, in: Proc. IEEE Int. Conf. Learn. Representation, 2022."},{"key":"10.1016\/j.knosys.2026.116124_b4","article-title":"Diffusion policy: Visuomotor policy learning via action diffusion","author":"Chi","year":"2023","journal-title":"Int. J. Robot. Res."},{"key":"10.1016\/j.knosys.2026.116124_b5","unstructured":"Z. Liang, Y. Mu, M. Ding, F. Ni, M. Tomizuka, P. Luo, AdaptDiffuser: Diffusion Models as Adaptive Self-evolving Planners, in: Proc. Int. Conf. Mach. Learn., (ISSN: 2640-3498) 2023, pp. 20725\u201320745."},{"key":"10.1016\/j.knosys.2026.116124_b6","unstructured":"H. Li, Q. Feng, Z. Zheng, J. Feng, A. Knoll, Generalizable Robotic Manipulation: Object-Centric Diffusion Policy with Language Guidance, in: Workshop on Embodiment-Aware Robot Learning."},{"key":"10.1016\/j.knosys.2026.116124_b7","doi-asserted-by":"crossref","unstructured":"X. Huang, S. Belongie, Arbitrary style transfer in real-time with adaptive instance normalization, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 1501\u20131510.","DOI":"10.1109\/ICCV.2017.167"},{"key":"10.1016\/j.knosys.2026.116124_b8","unstructured":"T.-W. Ke, N. Gkanatsios, K. Fragkiadaki, 3D Diffuser Actor: Policy Diffusion with 3D Scene Representations, in: First Workshop on Vision-Language Models for Navigation and Manipulation At ICRA 2024."},{"key":"10.1016\/j.knosys.2026.116124_b9","series-title":"No training, no problem: Rethinking classifier-free guidance for diffusion models","author":"Sadat","year":"2024"},{"key":"10.1016\/j.knosys.2026.116124_b10","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112190","article-title":"DiffSkill: Improving reinforcement learning through diffusion-based skill denoiser for robotic manipulation","volume":"300","author":"Liu","year":"2024","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116124_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113738","article-title":"Memory-gated diffusion policy: advancing robotic behaviour learning with memory-oriented architectures","author":"Huang","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116124_b12","first-page":"95456","article-title":"Diffusion-reward adversarial imitation learning","volume":"37","author":"Lai","year":"2025","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b13","first-page":"31124","article-title":"Learning an actionable discrete diffusion policy via large-scale actionless video pre-training","volume":"37","author":"He","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b14","series-title":"Conference on Robot Learning\/Proceedings of Machine Learning Research","article-title":"Chaineddiffuser: Unifying trajectory diffusion and keypose prediction for robotic manipulation","author":"Xian","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b15","doi-asserted-by":"crossref","unstructured":"X. Ma, S. Patidar, I. Haughton, S. James, Hierarchical diffusion policy for kinematics-aware multi-task robotic manipulation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 18081\u201318090.","DOI":"10.1109\/CVPR52733.2024.01712"},{"key":"10.1016\/j.knosys.2026.116124_b16","first-page":"112386","article-title":"Prediction with action: Visual policy learning via joint denoising process","volume":"37","author":"Guo","year":"2025","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b17","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114152","article-title":"A segmented motion synthesis method for robotic task-oriented locomotion imitation system","author":"Shi","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116124_b18","article-title":"Diffusion model-based path follower for a salamander-like robot","author":"Liu","year":"2025","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b19","series-title":"Conference on Robot Learning","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","author":"Shridhar","year":"2022"},{"key":"10.1016\/j.knosys.2026.116124_b20","series-title":"Conference on Robot Learning","first-page":"175","article-title":"Instruction-driven history-aware policies for robotic manipulations","author":"Guhur","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b21","unstructured":"E. Jang, A. Irpan, M. Khansari, D. Kappler, F. Ebert, C. Lynch, S. Levine, C. Finn, Bc-z: Zero-shot task generalization with robotic imitation learning, in: Conference on Robot Learning, 2022, pp. 991\u20131002."},{"key":"10.1016\/j.knosys.2026.116124_b22","series-title":"International Conference on Machine Learning","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","author":"Huang","year":"2022"},{"key":"10.1016\/j.knosys.2026.116124_b23","series-title":"Instruction tuning for large language models: A survey","author":"Zhang","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b24","first-page":"11525","article-title":"Object-centric learning with slot attention","volume":"33","author":"Locatello","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b25","doi-asserted-by":"crossref","unstructured":"C. Devin, P. Abbeel, T. Darrell, S. Levine, Deep object-centric representations for generalizable robot learning, in: 2018 IEEE International Conference on Robotics and Automation, 2018, pp. 7111\u20137118.","DOI":"10.1109\/ICRA.2018.8461196"},{"key":"10.1016\/j.knosys.2026.116124_b26","series-title":"Conference on Robot Learning","first-page":"1199","article-title":"Viola: Imitation learning for vision-based manipulation with object proposal priors","author":"Zhu","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b27","series-title":"2024 IEEE International Conference on Robotics and Automation","first-page":"15424","article-title":"Composing pre-trained object-centric representations for robotics from\u201d what\u201d and\u201d where\u201d foundation models","author":"Shi","year":"2024"},{"key":"10.1016\/j.knosys.2026.116124_b28","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1016\/j.knosys.2015.05.032","article-title":"Exploiting semantic knowledge for robot object recognition","volume":"86","author":"Ruiz-Sarmiento","year":"2015","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116124_b29","article-title":"A semantic knowledge-based method for home service robot to grasp an object","volume":"297","author":"Li","year":"2024","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116124_b30","first-page":"8085","article-title":"Genesis-v2: Inferring unordered object representations without iterative refinement","volume":"34","author":"Engelcke","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b31","series-title":"Conference on Robot Learning","first-page":"979","article-title":"Graph-structured visual imitation","author":"Sieb","year":"2020"},{"key":"10.1016\/j.knosys.2026.116124_b32","article-title":"Unsupervised learning of object keypoints for perception and control","volume":"32","author":"Kulkarni","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b33","article-title":"Unsupervised learning of object structure and dynamics from videos","volume":"32","author":"Minderer","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b34","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.110491","article-title":"UPG: 3D vision-based prediction framework for robotic grasping in multi-object scenes","volume":"270","author":"Li","year":"2023","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116124_b35","doi-asserted-by":"crossref","unstructured":"S. Qian, K. Mo, V. Blukis, D.F. Fouhey, D. Fox, A. Goyal, 3D-MVP: 3D Multiview Pretraining for Manipulation, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 22530\u201322539.","DOI":"10.1109\/CVPR52734.2025.02098"},{"key":"10.1016\/j.knosys.2026.116124_b36","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116124_b37","series-title":"European Conference on Computer Vision","first-page":"38","article-title":"Grounding dino: Marrying dino with grounded pre-training for open-set object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.knosys.2026.116124_b38","unstructured":"A. Radford, J.W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, et al., Learning transferable visual models from natural language supervision, in: International Conference on Machine Learning, 2021, pp. 8748\u20138763."},{"key":"10.1016\/j.knosys.2026.116124_b39","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.127063","article-title":"Roformer: Enhanced transformer with rotary position embedding","volume":"568","author":"Su","year":"2024","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.116124_b40","series-title":"Denoising diffusion implicit models","author":"Song","year":"2020"},{"issue":"2","key":"10.1016\/j.knosys.2026.116124_b41","doi-asserted-by":"crossref","first-page":"3019","DOI":"10.1109\/LRA.2020.2974707","article-title":"Rlbench: The robot learning benchmark & learning environment","volume":"5","author":"James","year":"2020","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.knosys.2026.116124_b42","doi-asserted-by":"crossref","unstructured":"S. James, K. Wada, T. Laidlow, A.J. Davison, Coarse-to-fine q-attention: Efficient learning for visual robotic manipulation via discretisation, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2022, pp. 13739\u201313748.","DOI":"10.1109\/CVPR52688.2022.01337"},{"key":"10.1016\/j.knosys.2026.116124_b43","unstructured":"S. Chen, R. Garcia, C. Schmid, I. Laptev, PolarNet: 3D Point Clouds for Language-Guided Robotic Manipulation, in: Conference on Robot Learning, 2023."},{"key":"10.1016\/j.knosys.2026.116124_b44","series-title":"Conference on Robot Learning","first-page":"694","article-title":"Rvt: Robotic view transformer for 3d object manipulation","author":"Goyal","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b45","series-title":"Conference on Robot Learning","first-page":"3949","article-title":"Act3D: 3D feature field transformers for multi-task robotic manipulation","author":"Gervet","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b46","series-title":"Conference on Robot Learning","first-page":"284","article-title":"Gnfactor: Multi-task real robot learning with generalizable neural feature fields","author":"Ze","year":"2023"},{"key":"10.1016\/j.knosys.2026.116124_b47","series-title":"European Conference on Computer Vision","first-page":"349","article-title":"Manigaussian: Dynamic gaussian splatting for multi-task robotic manipulation","author":"Lu","year":"2024"},{"key":"10.1016\/j.knosys.2026.116124_b48","series-title":"THE COLOSSEUM: A benchmark for evaluating generalization for robotic manipulation","author":"Pumacay","year":"2024"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008506?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008506?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T01:05:48Z","timestamp":1780016748000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126008506"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":48,"alternative-id":["S0950705126008506"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116124","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"ObjectDiff: An object-centric diffusion policy with modality-specific conditioning for robot manipulation","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116124","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116124"}}