{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:05:55Z","timestamp":1765310755090,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":89,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62441222"],"award-info":[{"award-number":["62441222"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Innovation Yongjiang 2035 Key R&D Programme","award":["2025Z062"],"award-info":[{"award-number":["2025Z062"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755132","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"9783-9792","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HiScene: Creating Hierarchical 3D Scenes with Isometric View Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2373-2647","authenticated-orcid":false,"given":"Wenqi","family":"Dong","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7604-5553","authenticated-orcid":false,"given":"Bangbang","family":"Yang","sequence":"additional","affiliation":[{"name":"ByteDance, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8112-7960","authenticated-orcid":false,"given":"Zesong","family":"Yang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4267-7297","authenticated-orcid":false,"given":"Yuan","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6978-6994","authenticated-orcid":false,"given":"Tao","family":"Hu","sequence":"additional","affiliation":[{"name":"ByteDance, San Jose, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2662-0334","authenticated-orcid":false,"given":"Hujun","family":"Bao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-7734-7053","authenticated-orcid":false,"given":"Yuewen","family":"Ma","sequence":"additional","affiliation":[{"name":"ByteDance, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7130-439X","authenticated-orcid":false,"given":"Zhaopeng","family":"Cui","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Blended latent diffusion. ACM transactions on graphics (TOG)","author":"Avrahami Omri","year":"2023","unstructured":"Omri Avrahami, Ohad Fried, and Dani Lischinski. 2023. Blended latent diffusion. ACM transactions on graphics (TOG), Vol. 42, 4 (2023), 1-11."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/344779.344972"},{"key":"e_1_3_2_1_4_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts et al. 2023. Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"SceneFactor: Factored Latent 3D Diffusion for Controllable 3D Scene Generation. arXiv preprint arXiv:2412.01801","author":"Bokhovkin Alexey","year":"2024","unstructured":"Alexey Bokhovkin, Quan Meng, Shubham Tulsiani, and Angela Dai. 2024. SceneFactor: Factored Latent 3D Diffusion for Controllable 3D Scene Generation. arXiv preprint arXiv:2412.01801 (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02033"},{"key":"e_1_3_2_1_7_1","volume-title":"Region-aware text-to-image generation via hard binding and soft refinement. arXiv preprint arXiv:2411.06558","author":"Chen Zhennan","year":"2024","unstructured":"Zhennan Chen, Yajie Li, Haofan Wang, Zhibo Chen, Zhengkai Jiang, Jun Li, Qian Wang, Jian Yang, and Ying Tai. 2024a. Region-aware text-to-image generation via hard binding and soft refinement. arXiv preprint arXiv:2411.06558 (2024)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Zhaoxi Chen Jiaxiang Tang Yuhao Dong Ziang Cao Fangzhou Hong Yushi Lan Tengfei Wang Haozhe Xie Tong Wu Shunsuke Saito et al. 2024b. 3dtopia-xl: Scaling high-quality 3d asset generation via primitive diffusion. arXiv preprint arXiv:2409.12957 (2024).","DOI":"10.1109\/CVPR52734.2025.02475"},{"key":"e_1_3_2_1_9_1","volume-title":"Luciddreamer: Domain-free generation of 3d gaussian splatting scenes. arXiv preprint arXiv:2311.13384","author":"Chung Jaeyoung","year":"2023","unstructured":"Jaeyoung Chung, Suyoung Lee, Hyeongjin Nam, Jaerin Lee, and Kyoung Mu Lee. 2023. Luciddreamer: Domain-free generation of 3d gaussian splatting scenes. arXiv preprint arXiv:2311.13384 (2023)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2004.833105"},{"key":"e_1_3_2_1_11_1","volume-title":"TexPro: Text-guided PBR Texturing with Procedural Material Modeling. arXiv preprint arXiv:2410.15891","author":"Dang Ziqiang","year":"2024","unstructured":"Ziqiang Dang, Wenqi Dong, Zesong Yang, Bangbang Yang, Liang Li, Yuewen Ma, and Zhaopeng Cui. 2024. TexPro: Text-guided PBR Texturing with Procedural Material Modeling. arXiv preprint arXiv:2410.15891 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657425"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00727"},{"key":"e_1_3_2_1_15_1","volume-title":"Lrm: Large reconstruction model for single image to 3d. arXiv preprint arXiv:2311.04400","author":"Hong Yicong","year":"2023","unstructured":"Yicong Hong, Kai Zhang, Jiuxiang Gu, Sai Bi, Yang Zhou, Difan Liu, Feng Liu, Kalyan Sunkavalli, Trung Bui, and Hao Tan. 2023. Lrm: Large reconstruction model for single image to 3d. arXiv preprint arXiv:2311.04400 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Tracking any object amodally. CoRR","author":"Hsieh Cheng-Yen","year":"2023","unstructured":"Cheng-Yen Hsieh, Tarasha Khurana, Achal Dave, and Deva Ramanan. 2023. Tracking any object amodally. CoRR (2023)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02202"},{"key":"e_1_3_2_1_18_1","volume-title":"SmartEraser: Remove Anything from Images using Masked-Region Guidance. arXiv preprint arXiv:2501.08279","author":"Jiang Longtao","year":"2025","unstructured":"Longtao Jiang, Zhendong Wang, Jianmin Bao, Wengang Zhou, Dongdong Chen, Lei Shi, Dong Chen, and Houqiang Li. 2025. SmartEraser: Remove Anything from Images using Masked-Region Guidance. arXiv preprint arXiv:2501.08279 (2025)."},{"key":"e_1_3_2_1_19_1","volume-title":"European Conference on Computer Vision. Springer, 150-168","author":"Ju Xuan","year":"2024","unstructured":"Xuan Ju, Xian Liu, Xintao Wang, Yuxuan Bian, Ying Shan, and Qiang Xu. 2024. Brushnet: A plug-and-play image inpainting model with decomposed dual-branch diffusion. In European Conference on Computer Vision. Springer, 150-168."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.23"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00401"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_25_1","volume-title":"European Conference on Computer Vision. Springer, 214-230","author":"Li Haoran","year":"2024","unstructured":"Haoran Li, Haolin Shi, Wenli Zhang, Wenjun Wu, Yong Liao, Lin Wang, Lik-hang Lee, and Peng Yuan Zhou. 2024d. Dreamscene: 3d gaussian-based text-to-3d scene generation via formation pattern sampling. In European Conference on Computer Vision. Springer, 214-230."},{"key":"e_1_3_2_1_26_1","unstructured":"Renjie Li Panwang Pan Bangbang Yang Dejia Xu Shijie Zhou Xuanyang Zhang Zeming Li Achuta Kadambi Zhangyang Wang Zhengzhong Tu et al. 2024c. 4k4dgen: Panoramic 4d generation at 4k resolution. arXiv preprint arXiv:2406.13527 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Craftsman: High-fidelity mesh generation with 3d native generation and interactive geometry refiner. arXiv preprint arXiv:2405.14979","author":"Li Weiyu","year":"2024","unstructured":"Weiyu Li, Jiarui Liu, Rui Chen, Yixun Liang, Xuelin Chen, Ping Tan, and Xiaoxiao Long. 2024b. Craftsman: High-fidelity mesh generation with 3d native generation and interactive geometry refiner. arXiv preprint arXiv:2405.14979 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"DIScene: Object Decoupling and Interaction Modeling for Complex Scene Generation. In SIGGRAPH Asia 2024 Conference Papers. 1-12","author":"Li Xiao-Lei","year":"2024","unstructured":"Xiao-Lei Li, Haodong Li, Hao-Xiang Chen, Tai-Jiang Mu, and Shi-Min Hu. 2024a. DIScene: Object Decoupling and Interaction Modeling for Complex Scene Generation. In SIGGRAPH Asia 2024 Conference Papers. 1-12."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00623"},{"key":"e_1_3_2_1_30_1","volume-title":"Instructscene: Instruction-driven 3d indoor scene synthesis with semantic graph prior. arXiv preprint arXiv:2402.04717","author":"Lin Chenguo","year":"2024","unstructured":"Chenguo Lin and Yadong Mu. 2024. Instructscene: Instruction-driven 3d indoor scene synthesis with semantic graph prior. arXiv preprint arXiv:2402.04717 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Diffsplat: Repurposing image diffusion models for scalable gaussian splat generation. arXiv preprint arXiv:2501.16764","author":"Lin Chenguo","year":"2025","unstructured":"Chenguo Lin, Panwang Pan, Bangbang Yang, Zeming Li, and Yadong Mu. 2025. Diffsplat: Repurposing image diffusion models for scalable gaussian splat generation. arXiv preprint arXiv:2501.16764 (2025)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"e_1_3_2_1_33_1","first-page":"16246","article-title":"Variational amodal object completion","volume":"33","author":"Ling Huan","year":"2020","unstructured":"Huan Ling, David Acuna, Karsten Kreis, Seung Wook Kim, and Sanja Fidler. 2020. Variational amodal object completion. Advances in Neural Information Processing Systems, Vol. 33 (2020), 16246-16257.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_34_1","volume-title":"Image inpainting via tractable steering of diffusion models. arXiv preprint arXiv:2401.03349","author":"Liu Anji","year":"2023","unstructured":"Anji Liu, Mathias Niepert, and Guy Van den Broeck. 2023b. Image inpainting via tractable steering of diffusion models. arXiv preprint arXiv:2401.03349 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00925"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681129"},{"key":"e_1_3_2_1_37_1","unstructured":"Jia-Hong Liu Shao-Kui Zhang Tianqi Zhang Jia-Tong Zhang and Song-Hai Zhang. 2025. SceneFunctioner: Tailoring Large Language Model for Function-Oriented Interactive Scene Synthesis. https:\/\/openreview.net\/forum?id=IXFCPqFHMQ"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00960"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_2_1_40_1","volume-title":"Syncdreamer: Generating multiview-consistent images from a single-view image. arXiv preprint arXiv:2309.03453","author":"Liu Yuan","year":"2023","unstructured":"Yuan Liu, Cheng Lin, Zijiao Zeng, Xiaoxiao Long, Lingjie Liu, Taku Komura, and Wenping Wang. 2023a. Syncdreamer: Generating multiview-consistent images from a single-view image. arXiv preprint arXiv:2309.03453 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00951"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2001.937655"},{"key":"e_1_3_2_1_43_1","volume-title":"Lt3sd: Latent trees for 3d scene diffusion. arXiv preprint arXiv:2409.08215","author":"Meng Quan","year":"2024","unstructured":"Quan Meng, Lei Li, Matthias Nie\u00dfner, and Angela Dai. 2024. Lt3sd: Latent trees for 3d scene diffusion. arXiv preprint arXiv:2409.08215 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00377"},{"key":"e_1_3_2_1_46_1","first-page":"12013","article-title":"Atiss: Autoregressive transformers for indoor scene synthesis","volume":"34","author":"Paschalidou Despoina","year":"2021","unstructured":"Despoina Paschalidou, Amlan Kar, Maria Shugrina, Karsten Kreis, Andreas Geiger, and Sanja Fidler. 2021. Atiss: Autoregressive transformers for indoor scene synthesis. Advances in Neural Information Processing Systems, Vol. 34 (2021), 12013-12026.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01063"},{"key":"e_1_3_2_1_48_1","volume-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)."},{"key":"e_1_3_2_1_49_1","volume-title":"Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:2209.14988","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T Barron, and Ben Mildenhall. 2022. Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:2209.14988 (2022)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00313"},{"key":"e_1_3_2_1_51_1","volume-title":"High-quality entity segmentation. arXiv preprint arXiv:2211.05776","author":"Qi Lu","year":"2022","unstructured":"Lu Qi, Jason Kuen, Weidong Guo, Tiancheng Shen, Jiuxiang Gu, Jiaya Jia, Zhe Lin, and Ming-Hsuan Yang. 2022. High-quality entity segmentation. arXiv preprint arXiv:2211.05776 (2022)."},{"key":"e_1_3_2_1_52_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00914"},{"key":"e_1_3_2_1_54_1","volume-title":"Zero123: a single image to consistent multi-view diffusion base model. arXiv preprint arXiv:2310.15110","author":"Shi Ruoxi","year":"2023","unstructured":"Ruoxi Shi, Hansheng Chen, Zhuoyang Zhang, Minghua Liu, Chao Xu, Xinyue Wei, Linghao Chen, Chong Zeng, and Hao Su. 2023a. Zero123: a single image to consistent multi-view diffusion base model. arXiv preprint arXiv:2310.15110 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"Mvdream: Multi-view diffusion for 3d generation. arXiv preprint arXiv:2308.16512","author":"Shi Yichun","year":"2023","unstructured":"Yichun Shi, Peng Wang, Jianglong Ye, Mai Long, Kejie Li, and Xiao Yang. 2023b. Mvdream: Multi-view diffusion for 3d generation. arXiv preprint arXiv:2308.16512 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"European Conference on Computer Vision. Springer, 1-18","author":"Tang Jiaxiang","year":"2024","unstructured":"Jiaxiang Tang, Zhaoxi Chen, Xiaokang Chen, Tengfei Wang, Gang Zeng, and Ziwei Liu. 2024a. Lgm: Large multi-view gaussian model for high-resolution 3d content creation. In European Conference on Computer Vision. Springer, 1-18."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01938"},{"key":"e_1_3_2_1_58_1","volume-title":"Dreamgaussian: Generative gaussian splatting for efficient 3d content creation. arXiv preprint arXiv:2309.16653","author":"Tang Jiaxiang","year":"2023","unstructured":"Jiaxiang Tang, Jiawei Ren, Hang Zhou, Ziwei Liu, and Gang Zeng. 2023b. Dreamgaussian: Generative gaussian splatting for efficient 3d content creation. arXiv preprint arXiv:2309.16653 (2023)."},{"key":"e_1_3_2_1_59_1","first-page":"1363","article-title":"Emergent correspondence from image diffusion","volume":"36","author":"Tang Luming","year":"2023","unstructured":"Luming Tang, Menglin Jia, Qianqian Wang, Cheng Perng Phoo, and Bharath Hariharan. 2023a. Emergent correspondence from image diffusion. Advances in Neural Information Processing Systems, Vol. 36 (2023), 1363-1389.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_60_1","first-page":"67575","article-title":"Architect: Generating Vivid and Interactive 3D Scenes with Hierarchical 2D Inpainting","volume":"37","author":"Wang Yian","year":"2024","unstructured":"Yian Wang, Xiaowen Qiu, Jiageng Liu, Zhehuan Chen, Jiting Cai, Yufei Wang, Tsun-Hsuan Johnson Wang, Zhou Xian, and Chuang Gan. 2024. Architect: Generating Vivid and Interactive 3D Scenes with Hierarchical 2D Inpainting. Advances in Neural Information Processing Systems, Vol. 37 (2024), 67575-67603.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_61_1","volume-title":"Q-align: Teaching lmms for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090","author":"Wu Haoning","year":"2023","unstructured":"Haoning Wu, Zicheng Zhang, Weixia Zhang, Chaofeng Chen, Liang Liao, Chunyi Li, Yixuan Gao, Annan Wang, Erli Zhang, Wenxiu Sun, et al., 2023. Q-align: Teaching lmms for visual scoring via discrete text-defined levels. arXiv preprint arXiv:2312.17090 (2023)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658188"},{"key":"e_1_3_2_1_63_1","volume-title":"Structured 3d latents for scalable and versatile 3d generation. arXiv preprint arXiv:2412.01506","author":"Xiang Jianfeng","year":"2024","unstructured":"Jianfeng Xiang, Zelong Lv, Sicheng Xu, Yu Deng, Ruicheng Wang, Bowen Zhang, Dong Chen, Xin Tong, and Jiaolong Yang. 2024. Structured 3d latents for scalable and versatile 3d generation. arXiv preprint arXiv:2412.01506 (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"Instantmesh: Efficient 3d mesh generation from a single image with sparse-view large reconstruction models. arXiv preprint arXiv:2404.07191","author":"Xu Jiale","year":"2024","unstructured":"Jiale Xu, Weihao Cheng, Yiming Gao, Xintao Wang, Shenghua Gao, and Ying Shan. 2024a. Instantmesh: Efficient 3d mesh generation from a single image with sparse-view large reconstruction models. arXiv preprint arXiv:2404.07191 (2024)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00869"},{"key":"e_1_3_2_1_66_1","volume-title":"Sketch2Scene: Automatic Generation of Interactive 3D Game Scenes from User's Casual Sketches. arXiv preprint arXiv:2408.04567","author":"Xu Yongzhi","year":"2024","unstructured":"Yongzhi Xu, Yonhon Ng, Yifu Wang, Inkyu Sa, Yunfei Duan, Yang Li, Pan Ji, and Hongdong Li. 2024b. Sketch2Scene: Automatic Generation of Interactive 3D Game Scenes from User's Casual Sketches. arXiv preprint arXiv:2408.04567 (2024)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687672"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR58804.2024.00085"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00558"},{"key":"e_1_3_2_1_70_1","volume-title":"InstaScene: Towards Complete 3D Instance Decomposition and Reconstruction from Cluttered Scenes. arXiv preprint arXiv:2507.08416","author":"Yang Zesong","year":"2025","unstructured":"Zesong Yang, Bangbang Yang, Wenqi Dong, Chenxuan Cao, Liyuan Cui, Yuewen Ma, Zhaopeng Cui, and Hujun Bao. 2025. InstaScene: Towards Complete 3D Instance Decomposition and Reconstruction from Cluttered Scenes. arXiv preprint arXiv:2507.08416 (2025)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3730841"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01948"},{"key":"e_1_3_2_1_73_1","volume-title":"Wonderworld: Interactive 3d scene generation from a single image. arXiv preprint arXiv:2406.09394","author":"Yu Hong-Xing","year":"2024","unstructured":"Hong-Xing Yu, Haoyi Duan, Charles Herrmann, William T Freeman, and Jiajun Wu. 2024a. Wonderworld: Interactive 3d scene generation from a single image. arXiv preprint arXiv:2406.09394 (2024)."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00636"},{"key":"e_1_3_2_1_75_1","volume-title":"ImmerseGen: Agent-Guided Immersive World Generation with Alpha-Textured Proxies. arXiv preprint arXiv:2506.14315","author":"Yuan Jinyan","year":"2025","unstructured":"Jinyan Yuan, Bangbang Yang, Keke Wang, Panwang Pan, Lin Ma, Xuehai Zhang, Xiao Liu, Zhaopeng Cui, and Yuewen Ma. 2025. ImmerseGen: Agent-Guided Immersive World Generation with Alpha-Textured Proxies. arXiv preprint arXiv:2506.14315 (2025)."},{"key":"e_1_3_2_1_76_1","volume-title":"European Conference on Computer Vision. Springer, 167-184","author":"Zhai Guangyao","year":"2024","unstructured":"Guangyao Zhai, Evin Pinar \u00d6rnek, Dave Zhenyu Chen, Ruotong Liao, Yan Di, Nassir Navab, Federico Tombari, and Benjamin Busam. 2024. Echoscene: Indoor scene generation via information echo over scene graph diffusion. In European Conference on Computer Vision. Springer, 167-184."},{"key":"e_1_3_2_1_77_1","first-page":"30026","article-title":"Commonscenes: Generating commonsense 3d indoor scenes with scene graph diffusion","volume":"36","author":"Zhai Guangyao","year":"2023","unstructured":"Guangyao Zhai, Evin Pinar \u00d6rnek, Shun-Cheng Wu, Yan Di, Federico Tombari, Nassir Navab, and Benjamin Busam. 2023. Commonscenes: Generating commonsense 3d indoor scenes with scene graph diffusion. Advances in Neural Information Processing Systems, Vol. 36 (2023), 30026-30038.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02645"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00384"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/3618342"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658146"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00652"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3268115"},{"key":"e_1_3_2_1_84_1","volume-title":"Large scale image completion via co-modulated generative adversarial networks. arXiv preprint arXiv:2103.10428","author":"Zhao Shengyu","year":"2021","unstructured":"Shengyu Zhao, Jonathan Cui, Yilun Sheng, Yue Dong, Xiao Liang, Eric I Chang, and Yan Xu. 2021. Large scale image completion via co-modulated generative adversarial networks. arXiv preprint arXiv:2103.10428 (2021)."},{"key":"e_1_3_2_1_85_1","unstructured":"Zibo Zhao Zeqiang Lai Qingxiang Lin Yunfei Zhao Haolin Liu Shuhui Yang Yifei Feng Mingxin Yang Sheng Zhang Xianghui Yang et al. 2025. Hunyuan3d 2.0: Scaling diffusion models for high resolution textured 3d assets generation. arXiv preprint arXiv:2501.12202 (2025)."},{"key":"e_1_3_2_1_86_1","volume-title":"Michelangelo: Conditional 3d shape generation based on shape-image-text aligned latent representation. Advances in neural information processing systems","author":"Zhao Zibo","year":"2023","unstructured":"Zibo Zhao, Wen Liu, Xin Chen, Xianfang Zeng, Rui Wang, Pei Cheng, Bin Fu, Tao Chen, Gang Yu, and Shenghua Gao. 2023. Michelangelo: Conditional 3d shape generation based on shape-image-text aligned latent representation. Advances in neural information processing systems, Vol. 36 (2023), 73969-73982."},{"key":"e_1_3_2_1_87_1","volume-title":"European Conference on Computer Vision. Springer, 324-342","author":"Zhou Shijie","year":"2024","unstructured":"Shijie Zhou, Zhiwen Fan, Dejia Xu, Haoran Chang, Pradyumna Chari, Tejas Bharadwaj, Suya You, Zhangyang Wang, and Achuta Kadambi. 2024a. Dreamscene360: Unconstrained text-to-3d scene generation with panoramic gaussian splatting. In European Conference on Computer Vision. Springer, 324-342."},{"key":"e_1_3_2_1_88_1","volume-title":"Gala3d: Towards text-to-3d complex scene generation via layout-guided generative gaussian splatting. arXiv preprint arXiv:2402.07207","author":"Zhou Xiaoyu","year":"2024","unstructured":"Xiaoyu Zhou, Xingjian Ran, Yajiao Xiong, Jinlin He, Zhiwei Lin, Yongtao Wang, Deqing Sun, and Ming-Hsuan Yang. 2024b. Gala3d: Towards text-to-3d complex scene generation via layout-guided generative gaussian splatting. arXiv preprint arXiv:2402.07207 (2024)."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.320"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755132","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:01:38Z","timestamp":1765310498000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755132"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":89,"alternative-id":["10.1145\/3746027.3755132","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755132","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}