{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:13:26Z","timestamp":1777655606897,"version":"3.51.4"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730238","type":"print"},{"value":"9783031730245","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T00:00:00Z","timestamp":1732406400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73024-5_18","type":"book-chapter","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T16:38:50Z","timestamp":1732552730000},"page":"297-314","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["BlenderAlchemy: Editing 3D Graphics with\u00a0Vision-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8426-7543","authenticated-orcid":false,"given":"Ian","family":"Huang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2992-5803","authenticated-orcid":false,"given":"Guandao","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8315-4886","authenticated-orcid":false,"given":"Leonidas","family":"Guibas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,24]]},"reference":[{"key":"18_CR1","unstructured":"Blendergpt. https:\/\/github.com\/gd3kr\/BlenderGPT"},{"key":"18_CR2","unstructured":"How long does it take to create a 3D model? https:\/\/3d-ace.com\/blog\/how-long-does-it-take-to-create-a-3d-model\/"},{"key":"18_CR3","unstructured":"How long does it take to make a 3D model? https:\/\/pixune.com\/blog\/how-long-does-it-take-to-create-a-3d-model\/"},{"key":"18_CR4","unstructured":"Ahn, M., et al.: Do as i can, not as i say: grounding language in robotic affordances. arXiv preprint arXiv:2204.01691 (2022)"},{"key":"18_CR5","unstructured":"Austin, J., et al.: Program synthesis with large language models. arXiv preprint arXiv:2108.07732 (2021)"},{"key":"18_CR6","unstructured":"Baumli, K., et al.: Vision-language models as a source of rewards. arXiv preprint arXiv:2312.09187 (2023)"},{"key":"18_CR7","unstructured":"Betker, J., et al.: Improving image generation with better captions. Comput. Sci. 2(3), 8 (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf"},{"key":"18_CR8","doi-asserted-by":"crossref","unstructured":"Chen, D.Z., Siddiqui, Y., Lee, H.Y., Tulyakov, S., Nie\u00dfner, M.: Text2tex: text-driven texture synthesis via diffusion models. arXiv preprint arXiv:2303.11396 (2023)","DOI":"10.1109\/ICCV51070.2023.01701"},{"key":"18_CR9","unstructured":"Chen, M., et al.: Evaluating large language models trained on code (2021)"},{"key":"18_CR10","first-page":"30923","volume":"35","author":"Y Chen","year":"2022","unstructured":"Chen, Y., Chen, R., Lei, J., Zhang, Y., Jia, K.: Tango: text-driven photorealistic and robust 3D stylization via lighting decomposition. Adv. Neural. Inf. Process. Syst. 35, 30923\u201330936 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR11","doi-asserted-by":"crossref","unstructured":"De\u00a0La\u00a0Torre, F., Fang, C.M., Huang, H., Banburski-Fahey, A., Fernandez, J.A., Lanier, J.: LLMR: real-time prompting of interactive worlds using large language models. arXiv preprint arXiv:2309.12276 (2023)","DOI":"10.1145\/3613904.3642579"},{"key":"18_CR12","unstructured":"Firoozi, R., et al.: Foundation models in robotics: applications, challenges, and the future. arXiv preprint arXiv:2312.07843 (2023)"},{"key":"18_CR13","unstructured":"Fu, C., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"18_CR14","unstructured":"Fu, C., et al.: A challenger to GPT-4V? Early explorations of gemini in visual expertise. arXiv preprint arXiv:2312.12436 (2023)"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Goel, P., Wang, K.C., Liu, C.K., Fatahalian, K.: Iterative motion editing with natural language. arXiv preprint arXiv:2312.11538 (2023)","DOI":"10.1145\/3641519.3657447"},{"key":"18_CR16","doi-asserted-by":"crossref","unstructured":"Guerrero, P., Ha\u0161an, M., Sunkavalli, K., M\u011bch, R., Boubekeur, T., Mitra, N.J.: Matformer: a generative model for procedural materials. arXiv preprint arXiv:2207.01044 (2022)","DOI":"10.1145\/3528223.3530173"},{"key":"18_CR17","doi-asserted-by":"crossref","unstructured":"Henzler, P., Deschaintre, V., Mitra, N.J., Ritschel, T.: Generative modelling of BRDF textures from flash images. arXiv preprint arXiv:2102.11861 (2021)","DOI":"10.1145\/3478513.3480507"},{"key":"18_CR18","unstructured":"Hu, Y., et al.: Toward general-purpose robots via foundation models: a survey and meta-analysis. arXiv preprint arXiv:2312.08782 (2023)"},{"key":"18_CR19","doi-asserted-by":"crossref","unstructured":"Hu, Y., Guerrero, P., Hasan, M., Rushmeier, H., Deschaintre, V.: Node graph optimization using differentiable proxies. In: ACM SIGGRAPH 2022 Conference Proceedings, pp.\u00a01\u20139 (2022)","DOI":"10.1145\/3528233.3530733"},{"issue":"2","key":"18_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3502431","volume":"41","author":"Y Hu","year":"2022","unstructured":"Hu, Y., He, C., Deschaintre, V., Dorsey, J., Rushmeier, H.: An inverse procedural modeling pipeline for SVBRDF maps. ACM Trans. Graph. (TOG) 41(2), 1\u201317 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"18_CR21","unstructured":"Huang, I., Krishna, V., Atekha, O., Guibas, L.: Aladdin: zero-shot hallucination of stylized 3D assets from abstract scene descriptions. arXiv preprint arXiv:2306.06212 (2023)"},{"key":"18_CR22","unstructured":"Jiang, A.Q., et al.: Mistral 7B. arXiv preprint arXiv:2310.06825 (2023)"},{"key":"18_CR23","unstructured":"Li, C., et al.: LLaVA-MED: training a large language-and-vision assistant for biomedicine in one day. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"18_CR24","doi-asserted-by":"crossref","unstructured":"Liang, J., et al.: Code as policies: language model programs for embodied control. In: 2023 IEEE International Conference on Robotics and Automation (ICRA), pp. 9493\u20139500. IEEE (2023)","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"18_CR25","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.neucom.2018.02.061","volume":"291","author":"J Liu","year":"2018","unstructured":"Liu, J., et al.: Perception-driven procedural texture generation from examples. Neurocomputing 291, 21\u201334 (2018)","journal-title":"Neurocomputing"},{"key":"18_CR26","unstructured":"Olausson, T.X., Inala, J.P., Wang, C., Gao, J., Solar-Lezama, A.: Is self-repair a silver bullet for code generation? In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"18_CR27","unstructured":"OpenAI: GPT-4 system card. OpenAI (2023). https:\/\/cdn.openai.com\/papers\/gpt-4-system-card.pdf"},{"key":"18_CR28","unstructured":"OpenAI: GPT-4v(ision) system card. OpenAI (2023). https:\/\/api.semanticscholar.org\/CorpusID:263218031"},{"key":"18_CR29","doi-asserted-by":"crossref","unstructured":"Park, J.S., O\u2019Brien, J., Cai, C.J., Morris, M.R., Liang, P., Bernstein, M.S.: Generative agents: interactive simulacra of human behavior. In: Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, pp. 1\u201322 (2023)","DOI":"10.1145\/3586183.3606763"},{"key":"18_CR30","unstructured":"Patil, S.G., Zhang, T., Wang, X., Gonzalez, J.E.: Gorilla: large language model connected with massive APIs. arXiv preprint arXiv:2305.15334 (2023)"},{"key":"18_CR31","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"18_CR32","doi-asserted-by":"crossref","unstructured":"Raistrick, A., et al.: Infinite photorealistic worlds using procedural generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12630\u201312641 (2023)","DOI":"10.1109\/CVPR52729.2023.01215"},{"key":"18_CR33","doi-asserted-by":"crossref","unstructured":"Richardson, E., Metzer, G., Alaluf, Y., Giryes, R., Cohen-Or, D.: Texture: text-guided texturing of 3D shapes. arXiv preprint arXiv:2302.01721 (2023)","DOI":"10.1145\/3588432.3591503"},{"key":"18_CR34","doi-asserted-by":"crossref","unstructured":"Ritchie, D., et al.: Neurosymbolic models for computer graphics. In: Computer Graphics Forum, vol.\u00a042, pp. 545\u2013568. Wiley Online Library (2023)","DOI":"10.1111\/cgf.14775"},{"issue":"7995","key":"18_CR35","doi-asserted-by":"publisher","first-page":"468","DOI":"10.1038\/s41586-023-06924-6","volume":"625","author":"B Romera-Paredes","year":"2024","unstructured":"Romera-Paredes, B., et al.: Mathematical discoveries from program search with large language models. Nature 625(7995), 468\u2013475 (2024)","journal-title":"Nature"},{"key":"18_CR36","unstructured":"Schick, T., et al.: Toolformer: language models can teach themselves to use tools. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"18_CR37","unstructured":"Sharma, P., et al.: Alchemist: parametric control of material properties with diffusion models. arXiv preprint arXiv:2312.02970 (2023)"},{"issue":"6","key":"18_CR38","first-page":"1","volume":"39","author":"L Shi","year":"2020","unstructured":"Shi, L., et al.: Match: differentiable material graphs for procedural material capture. ACM Trans. Graph. (TOG) 39(6), 1\u201315 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"18_CR39","doi-asserted-by":"crossref","unstructured":"Shimizu, E., Fisher, M., Paris, S., McCann, J., Fatahalian, K.: Design adjectives: a framework for interactive model-guided exploration of parameterized design spaces. In: Proceedings of the 33rd Annual ACM Symposium on User Interface Software and Technology, pp. 261\u2013278 (2020)","DOI":"10.1145\/3379337.3415866"},{"key":"18_CR40","unstructured":"Shinn, N., Cassano, F., Labash, B., Gopinath, A., Narasimhan, K., Yao, S.: Reflexion: language agents with verbal reinforcement learning (2023). arXiv preprint cs.AI\/2303.11366 (2023)"},{"key":"18_CR41","doi-asserted-by":"crossref","unstructured":"Singh, I., et al.: Progprompt: generating situated robot task plans using large language models. In: 2023 IEEE International Conference on Robotics and Automation (ICRA), pp. 11523\u201311530. IEEE (2023)","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"18_CR42","unstructured":"Sun, C., Han, J., Deng, W., Wang, X., Qin, Z., Gould, S.: 3D-GPT: procedural 3D modeling with large language models. arXiv preprint arXiv:2310.12945 (2023)"},{"key":"18_CR43","doi-asserted-by":"crossref","unstructured":"Tchapmi, L.P., Ray, T., Tchapmi, M., Shen, B., Martin-Martin, R., Savarese, S.: Generating procedural 3D materials from images using neural networks. In: 2022 4th International Conference on Image, Video and Signal Processing, pp. 32\u201340 (2022)","DOI":"10.1145\/3531232.3531237"},{"key":"18_CR44","unstructured":"Team, G., et al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"18_CR45","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"18_CR46","unstructured":"Vecchio, G., et al.: Controlmat: a controlled generative approach to material capture. arXiv preprint arXiv:2309.01700 (2023)"},{"key":"18_CR47","doi-asserted-by":"crossref","unstructured":"Vecchio, G., Sortino, R., Palazzo, S., Spampinato, C.: Matfuse: controllable material generation with diffusion models. arXiv preprint arXiv:2308.11408 (2023)","DOI":"10.1109\/CVPR52733.2024.00424"},{"key":"18_CR48","unstructured":"Wang, G., et al.: Voyager: an open-ended embodied agent with large language models. arXiv preprint arXiv:2305.16291 (2023)"},{"key":"18_CR49","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"18_CR50","unstructured":"Wen, Z., Liu, Z., Sridhar, S., Fu, R.: Anyhome: open-vocabulary generation of structured and textured 3D homes. arXiv preprint arXiv:2312.06644 (2023)"},{"key":"18_CR51","doi-asserted-by":"crossref","unstructured":"Wu, T., et al.: GPT-4V (ision) is a human-aligned evaluator for text-to-3D generation. arXiv preprint arXiv:2401.04092 (2024)","DOI":"10.1109\/CVPR52733.2024.02098"},{"key":"18_CR52","unstructured":"Xiao, X., et al.: Robot learning in the era of foundation models: a survey. arXiv preprint arXiv:2311.14379 (2023)"},{"key":"18_CR53","unstructured":"Yamada, Y., Chandu, K., Lin, Y., Hessel, J., Yildirim, I., Choi, Y.: L3go: language agents with chain-of-3D-thoughts for generating unconventional objects. arXiv preprint arXiv:2402.09052 (2024)"},{"key":"18_CR54","doi-asserted-by":"crossref","unstructured":"Yang, H., Chen, Y., Pan, Y., Yao, T., Chen, Z., Mei, T.: 3dstyle-diffusion: pursuing fine-grained text-driven 3D stylization with 2D diffusion models. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 6860\u20136868 (2023)","DOI":"10.1145\/3581783.3612363"},{"key":"18_CR55","doi-asserted-by":"crossref","unstructured":"Yang, Y., et al.: Holodeck: language guided generation of 3D embodied AI environments. In: The IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR 2024), vol.\u00a030, pp. 20\u201325. IEEE\/CVF (2024)","DOI":"10.1109\/CVPR52733.2024.01536"},{"key":"18_CR56","unstructured":"Yang, Z., et al.: The dawn of LMMs: preliminary explorations with GPT-4V (ision). arXiv preprint arXiv:2309.17421, vol. 9, no. 1, p. 1 (2023)"},{"key":"18_CR57","unstructured":"Yao, S., et al.: Tree of thoughts: deliberate problem solving with large language models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"18_CR58","unstructured":"Yin, S., et al.: A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)"},{"key":"18_CR59","unstructured":"Yin, S., et al.: Woodpecker: hallucination correction for multimodal large language models. arXiv preprint arXiv:2310.16045 (2023)"},{"key":"18_CR60","doi-asserted-by":"crossref","unstructured":"Zeng, X.: Paint3d: paint anything 3D with lighting-less texture diffusion models. arXiv preprint arXiv:2312.13913 (2023)","DOI":"10.1109\/CVPR52733.2024.00407"},{"key":"18_CR61","unstructured":"Zhou, H., et al.: Language-conditioned learning for robotic manipulation: a survey. arXiv preprint arXiv:2312.10807 (2023)"},{"key":"18_CR62","doi-asserted-by":"crossref","unstructured":"Zsolnai-Feh\u00e9r, K., Wonka, P., Wimmer, M.: Gaussian material synthesis. arXiv preprint arXiv:1804.08369 (2018)","DOI":"10.1145\/3197517.3201307"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73024-5_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T17:09:24Z","timestamp":1732554564000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73024-5_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,24]]},"ISBN":["9783031730238","9783031730245"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73024-5_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,24]]},"assertion":[{"value":"24 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}