{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:52:07Z","timestamp":1777654327416,"version":"3.51.4"},"reference-count":89,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1109\/icra57147.2024.10610175","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T17:51:05Z","timestamp":1723139465000},"page":"16841-16849","source":"Crossref","is-referenced-by-count":12,"title":["Crossway Diffusion: Improving Diffusion-based Visuomotor Policy via Self-supervised Learning"],"prefix":"10.1109","author":[{"given":"Xiang","family":"Li","sequence":"first","affiliation":[{"name":"Stony Brook University,Department of Computer Science,New York,11790"}]},{"given":"Varun","family":"Belagali","sequence":"additional","affiliation":[{"name":"Stony Brook University,Department of Computer Science,New York,11790"}]},{"given":"Jinghuan","family":"Shang","sequence":"additional","affiliation":[{"name":"Stony Brook University,Department of Computer Science,New York,11790"}]},{"given":"Michael S.","family":"Ryoo","sequence":"additional","affiliation":[{"name":"Stony Brook University,Department of Computer Science,New York,11790"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Alvinn: An autonomous land vehicle in a neural network","author":"Pomerleau","year":"1988","journal-title":"Advances in Neural Information Processing Systems 
(NeurIPS)"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"ref3","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref4","first-page":"15 084","article-title":"Decision transformer: Reinforcement learning via sequence modeling","volume":"34","author":"Chen","year":"2021","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref5","article-title":"Offline reinforcement learning as one big sequence modeling problem","author":"Janner","year":"2021","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3204708"},{"key":"ref7","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref9","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref10","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref11","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"International Conference on Machine Learning","author":"Sohl-Dickstein"},{"key":"ref12","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref13","article-title":"Denoising diffusion implicit models","volume-title":"Proceedings of the International Conference on Learning Representations 
(ICLR)","author":"Song"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657500"},{"key":"ref15","article-title":"Planning with diffusion for flexible behavior synthesis","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Janner"},{"key":"ref16","article-title":"Diffusion policies as an expressive policy class for offline reinforcement learning","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Wang"},{"key":"ref17","article-title":"Imitating human behaviour with diffusion models","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Pearce"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"key":"ref19","article-title":"What matters in learning from offline demonstrations for robot manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Mandlekar"},{"key":"ref20","article-title":"Hierarchical text-conditional image generation with clip latents","author":"Ramesh","year":"2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.4324\/9780203901854-3"},{"key":"ref24","article-title":"Diffusion model-augmented behavioral cloning","author":"Wang","year":"2023"},{"key":"ref25","article-title":"Synthetic experience replay","author":"Lu","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.027"},{"key":"ref27","article-title":"Learning universal policies via text-guided video generation","author":"Dai","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19842-7_27"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref30","article-title":"Spatial broadcast decoder: A 
simple architecture for learning disentangled representations in vaes","author":"Watters","year":"2019"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref33","first-page":"158","article-title":"Implicit behavioral cloning","volume-title":"Conference on Robot Learning (CoRL)","author":"Florence"},{"key":"ref34","first-page":"5639","article-title":"Curl: Contrastive unsupervised representations for reinforcement learning","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Laskin"},{"key":"ref35","first-page":"30 865","article-title":"Does self-supervised learning really improve reinforcement learning from pixels?","volume":"35","author":"Li","year":"2022","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/4168.001.0001"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1163\/016918611X558261"},{"key":"ref38","first-page":"12","article-title":"Robot learning from demonstration","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","volume":"97","author":"Atkeson"},{"key":"ref39","first-page":"2","article-title":"Algorithms for inverse reinforcement learning","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","volume":"1","author":"Ng"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1561\/9781680834116"},{"key":"ref41","article-title":"Generative adversarial imitation learning","volume":"29","author":"Ho","year":"2016","journal-title":"Advances in Neural Information Processing Systems 
(NeurIPS)"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459670"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2019.2956365"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8461249"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8461076"},{"key":"ref46","first-page":"20 132","article-title":"A minimalist approach to offline reinforcement learning","volume":"34","author":"Fujimoto","year":"2021","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201311"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2018.XIV.049"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/1015330.1015430"},{"key":"ref50","article-title":"Graph decision transformer","author":"Hu","year":"2023"},{"key":"ref51","article-title":"Online decision transformer","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Zheng"},{"key":"ref52","article-title":"Distributional decision transformer for hindsight information matching","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Furuta"},{"key":"ref53","article-title":"Bootstrapped transformer for offline reinforcement learning","author":"Wang","year":"2022","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref54","article-title":"Diffusion models for reinforcement learning: A survey","author":"Zhu","year":"2023"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390294"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref57","first-page":"10 078","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong","year":"2022","journal-title":"Advances in Neural 
Information Processing Systems (NeurIPS)"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"ref59","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref60","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Chen"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI53787.2023.10230477"},{"key":"ref62","article-title":"Neural neural textures make sim2real consistent","volume-title":"Conference on Robot Learning (CoRL)","author":"Burgert"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1093\/bioinformatics\/btad191"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00673"},{"key":"ref65","article-title":"Recurrent world models facilitate policy evolution","volume":"31","author":"Ha","year":"2018","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"ref67","article-title":"A framework for efficient robotic manipulation","author":"Zhan","year":"2021","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref68","article-title":"Rrl: Resnet as representation for reinforcement learning","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Shah"},{"key":"ref69","first-page":"9870","article-title":"Decoupling representation learning from reinforcement learning","volume-title":"Proceedings of the International Conference on Machine Learning 
(ICML)","author":"Stooke"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636363"},{"key":"ref71","article-title":"Vrl3: A data-driven framework for visual deep reinforcement learning","author":"Wang","year":"2022","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref72","article-title":"Masked visual pre-training for motor control","author":"Xiao","year":"2022"},{"key":"ref73","article-title":"Representation learning with contrastive predictive coding","author":"Oord","year":"2018"},{"key":"ref74","first-page":"2117","article-title":"Deep variational reinforcement learning for pomdps","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Igl"},{"key":"ref75","first-page":"2555","article-title":"Learning latent dynamics for planning from pixels","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Hafner"},{"key":"ref76","article-title":"Learning representations in reinforcement learning: An information bottleneck approach","author":"Yingjun","year":"2019"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17276"},{"key":"ref78","first-page":"741","article-title":"Stochastic latent actor-critic: Deep reinforcement learning with a latent variable model","volume":"33","author":"Lee","year":"2020","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3176413"},{"key":"ref80","first-page":"3686","article-title":"Deep reinforcement and infomax learning","volume":"33","author":"Mazoure","year":"2020","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref81","first-page":"11 890","article-title":"Predictive information accelerates learning in rl","volume":"33","author":"Lee","year":"2020","journal-title":"Advances in Neural Information Processing Systems 
(NeurIPS)"},{"key":"ref82","first-page":"3875","article-title":"Bootstrap latent-predictive representations for multitask reinforcement learning","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Guo"},{"key":"ref83","article-title":"Data-efficient reinforcement learning with momentum predictive representations","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Schwarzer"},{"key":"ref84","article-title":"Learning invariant representations for reinforcement learning without reconstruction","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Zhang"},{"key":"ref85","article-title":"Playvirtual: Augmenting cycle-consistent virtual trajectories for reinforcement learning","volume":"34","author":"Yu","year":"2021","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref86","article-title":"Unsupervised learning of object key-points for perception and control","volume":"32","author":"Kulkarni","year":"2019","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00661"},{"key":"ref88","article-title":"Progressive distillation for fast sampling of diffusion models","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Salimans"},{"key":"ref89","article-title":"Consistency models","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Song"}],"event":{"name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","location":"Yokohama, Japan","start":{"date-parts":[[2024,5,13]]},"end":{"date-parts":[[2024,5,17]]}},"container-title":["2024 IEEE International Conference on Robotics and Automation 
(ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10609961\/10609862\/10610175.pdf?arnumber=10610175","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,10]],"date-time":"2024-08-10T05:24:53Z","timestamp":1723267493000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10610175\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":89,"URL":"https:\/\/doi.org\/10.1109\/icra57147.2024.10610175","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]}}}