{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T14:51:32Z","timestamp":1779202292827,"version":"3.51.4"},"reference-count":98,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127489","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"15493-15500","source":"Crossref","is-referenced-by-count":3,"title":["Blox-Net: Generative Design-for-Robot-Assembly Using VLM Supervision, Physics Simulation, and a Robot with Reset"],"prefix":"10.1109","author":[{"given":"Andrew","family":"Goldberg","sequence":"first","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kavish","family":"Kondap","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianshuang","family":"Qiu","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zehan","family":"Ma","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Letian","family":"Fu","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Justin","family":"Kerr","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huang","family":"Huang","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaiyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kuan","family":"Fang","sequence":"additional","affiliation":[{"name":"Cornell University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ken","family":"Goldberg","sequence":"additional","affiliation":[{"name":"The AUTOLab at UC Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Give Me Liberty! An American History: Seagull Fourth Edition","volume":"1","author":"Foner","year":"2013","journal-title":"WW Norton & Company"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1201\/9781420027358"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.2307\/1266770"},{"key":"ref4","article-title":"Computer-aided manufacturing","author":"Chang","year":"1991","journal-title":"Prentice-Hall, Inc."},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.2118\/6057-PA"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/70.782031"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1002\/9781119667889"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1002\/9780470172506"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/BF01891840"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.autcon.2021.103569"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555525"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636328"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160763"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197293"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811777"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1201\/9781315136370"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25555-7_14"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460887"},{"key":"ref19","first-page":"88218831","article-title":"Zero-shot text-to-image generation","volume-title":"International Conference on Machine Learning, PMLR","author":"Ramesh","year":"2021"},{"key":"ref20","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref21","first-page":"27 730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref22","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv preprint"},{"key":"ref23","first-page":"31769","article-title":"Generating long videos of dynamic scenes","volume":"35","author":"Brooks","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00770"},{"key":"ref25","article-title":"Dreamfusion: Text-to-3d using 2d diffusion","volume-title":"International Conference on Learning Representations (ICLR)","author":"Poole","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"ref28","article-title":"Chipnemo: Domain-adapted llms for chip design","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","volume-title":"Design for Assembly - A Designer\u2019s Handbook","author":"Boothroyd","year":"1983"},{"key":"ref30","article-title":"The hitachi assemblability evaluation mrthod (aem)","volume-title":"Proceedings of 1st Int. Conf. on Product Design for Assembly","author":"Miyakawa","year":"1986"},{"key":"ref31","volume-title":"Manufacturing Producibility Handbook","year":"1960"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-6393-8_3"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/BF02601481"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1299\/jsmec.45.567"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1201\/9781420089288"},{"key":"ref36","volume-title":"Integrated and Simultaneous Design for Robotic Assembly: Product Development, Planning\u2026","author":"Rampersad","year":"1994"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611595"},{"key":"ref38","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610554"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.3390\/buildings13071772"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1115\/1.4067333"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3638529.3654138"},{"key":"ref43","article-title":"Query2cad: Generating cad models using natural language queries","author":"Badagabettu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref44","article-title":"Cad-11m: Large language model for cad generation","volume-title":"Proceedings of the neural information processing systems conference. neurIPS","author":"Wu","year":"2023"},{"key":"ref45","author":"Feng","year":"2025","journal-title":"Reflective planning: Vision-language models for multi-stage long-horizon robotic manipulation"},{"key":"ref46","author":"Lessin","year":"2025","journal-title":"Larry page has a new ai startup - the information"},{"key":"ref47","volume-title":"Adam: AI Powered CAD","year":"2025"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/1531326.1531379"},{"key":"ref49","article-title":"Learning a probabilistic latent space of object shapes via 3d generative-adversarial modeling","volume-title":"29th Conference on Neural Information Processing Systems (NIPS 2016)","author":"Wu","year":"2016"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_4"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.264"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00025"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.260"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00459"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00465"},{"key":"ref56","article-title":"Adaptive procedural task generation for hard-exploration problems","volume-title":"International Conference on Representation Learning (ICLR)","author":"Fang","year":"2021"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2021.xvii.010"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10341727"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00553"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161431"},{"key":"ref61","article-title":"Gensim: Generating robotic simulation tasks via large language models","volume-title":"International Conference on Learning Representations (ICLR)","author":"Wang","year":"2024"},{"key":"ref62","first-page":"3766","article-title":"Scaling up and distilling down: Language-guided robot skill acquisition","volume-title":"Proceedings of The 7th Conference on Robot Learning","volume":"229","author":"Ha","year":"2023"},{"key":"ref63","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning, PMLR","author":"Radford","year":"2021"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20893-6_7"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01808"},{"key":"ref67","volume-title":"Gpt-4o system card","year":"2024"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/vl\/N19-142"},{"key":"ref69","author":"Radford","year":"2018","journal-title":"Improving language understanding by generative pre-training"},{"issue":"8","key":"ref70","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"issue":"240","key":"ref71","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref72","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref73","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International Conference on Machine Learning","author":"Li","year":"2023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00063"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref76","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023","journal-title":"arXiv preprint"},{"key":"ref77","article-title":"Vima: General robot manipulation with multimodal prompts","volume-title":"Fortieth International Conference on Machine Learning","author":"Jiang","year":"2023"},{"key":"ref78","volume-title":"Octo: An open-source generalist robot policy","year":"2023"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.062"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11128272"},{"key":"ref81","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv preprint"},{"key":"ref82","article-title":"Inner monologue: Embodied reasoning through planning with language models","author":"Huang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref83","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","volume-title":"International Conference on Machine Learning, PMLR","author":"Huang","year":"2022"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref87","article-title":"Voyager: An open-ended embodied agent with large language models","author":"Wang","year":"2024","journal-title":"Neural Information Processing Systems (NIPS)"},{"key":"ref88","article-title":"Large language models as general pattern machines","volume-title":"Conference on Robot Learning (CoRL)","author":"Mirchandani","year":"2023"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11128156"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10131-7"},{"issue":"3","key":"ref91","first-page":"8","article-title":"Improving image generation with better captions","volume":"2","author":"Betker","year":"2023","journal-title":"Computer Science"},{"key":"ref92","volume-title":"Pybullet, a python module for physics simulation for games, robotics and machine learning","author":"Coumans","year":"2016-2021"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/358669.358692"},{"key":"ref95","article-title":"Visual instruction tuning","author":"Liu","year":"2023","journal-title":"NeurIPS"},{"key":"ref96","author":"Chiang","year":"2023","journal-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality"},{"key":"ref97","article-title":"A touch, vision, and language dataset for multimodal alignment","volume-title":"Forty-first International Conference on Machine Learning","author":"Fu","year":"2024"},{"key":"ref98","volume-title":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning","year":"2025"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127489.pdf?arnumber=11127489","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T18:55:25Z","timestamp":1764010525000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127489\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":98,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127489","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}