{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,29]],"date-time":"2026-06-29T17:34:03Z","timestamp":1782754443099,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"MOE AcRF Tier 2 Grant","award":["MOE-T2EP20220-0007"],"award-info":[{"award-number":["MOE-T2EP20220-0007"]}]},{"name":"RIE2020 Industry Alignment Fund","award":["I1901E0052"],"award-info":[{"award-number":["I1901E0052"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680557","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"10814-10823","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["iControl3D: An Interactive System for Controllable 3D Scene Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5765-3852","authenticated-orcid":false,"given":"Xingyi","family":"Li","sequence":"first","affiliation":[{"name":"School of AIA, Huazhong University of Science and Technology &amp; S-Lab, Nanyang Technological University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5335-4919","authenticated-orcid":false,"given":"Yizheng","family":"Wu","sequence":"additional","affiliation":[{"name":"School of AIA, Huazhong University of Science and Technology &amp; S-Lab, Nanyang Technological University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7578-7667","authenticated-orcid":false,"given":"Jun","family":"Cen","sequence":"additional","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5740-2682","authenticated-orcid":false,"given":"Juewen","family":"Peng","sequence":"additional","affiliation":[{"name":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9433-720X","authenticated-orcid":false,"given":"Kewei","family":"Wang","sequence":"additional","affiliation":[{"name":"School of AIA, Huazhong University of Science and Technology &amp; S-Lab, Nanyang Technological University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0884-5126","authenticated-orcid":false,"given":"Ke","family":"Xian","sequence":"additional","affiliation":[{"name":"School of EIC, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0597-4475","authenticated-orcid":false,"given":"Zhe","family":"Wang","sequence":"additional","affiliation":[{"name":"SenseTime Research, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9223-1863","authenticated-orcid":false,"given":"Zhiguo","family":"Cao","sequence":"additional","affiliation":[{"name":"School of AIA, Huazhong University of Science and Technology, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0329-7458","authenticated-orcid":false,"given":"Guosheng","family":"Lin","sequence":"additional","affiliation":[{"name":"S-Lab, Nanyang Technological University, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"25102","article-title":"Gaudi: A neural architect for immersive 3d scene generation","volume":"35","author":"Bautista Miguel Angel","year":"2022","unstructured":"Miguel Angel Bautista, Pengsheng Guo, Samira Abnar, Walter Talbott, Alexander Toshev, Zhuoyuan Chen, Laurent Dinh, Shuangfei Zhai, Hanlin Goh, Daniel Ulbricht, et al. 2022. Gaudi: A neural architect for immersive 3d scene generation. Advances in Neural Information Processing Systems (NeurIPS), Vol. 35 (2022), 25102--25116.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2001.990497"},{"key":"e_1_3_2_2_3_1","volume-title":"Zoedepth: Zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288","author":"Bhat Shariq Farooq","year":"2023","unstructured":"Shariq Farooq Bhat, Reiner Birkl, Diana Wofk, Peter Wonka, and Matthias M\u00fcller. 2023. Zoedepth: Zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_22"},{"key":"e_1_3_2_2_5_1","volume-title":"Songyou Peng, Mohamad Shahbazi, Anton Obukhov, Luc Van Gool, and Gordon Wetzstein.","author":"Cai Shengqu","year":"2022","unstructured":"Shengqu Cai, Eric Ryan Chan, Songyou Peng, Mohamad Shahbazi, Anton Obukhov, Luc Van Gool, and Gordon Wetzstein. 2022. DiffDreamer: Consistent Single-view Perpetual View Generation with Conditional Diffusion Models. arXiv preprint arXiv:2211.12131 (2022)."},{"key":"e_1_3_2_2_6_1","volume-title":"Persistent Nature: A Generative Model of Unbounded 3D Worlds. arXiv preprint arXiv:2303.13515","author":"Chai Lucy","year":"2023","unstructured":"Lucy Chai, Richard Tucker, Zhengqi Li, Phillip Isola, and Noah Snavely. 2023. Persistent Nature: A Generative Model of Unbounded 3D Worlds. arXiv preprint arXiv:2303.13515 (2023)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00574"},{"key":"e_1_3_2_2_9_1","volume-title":"Axel Levy, Miika Aittala, Shalini De Mello, Tero Karras, and Gordon Wetzstein.","author":"Chan Eric R","year":"2023","unstructured":"Eric R Chan, Koki Nagano, Matthew A Chan, Alexander W Bergman, Jeong Joon Park, Axel Levy, Miika Aittala, Shalini De Mello, Tero Karras, and Gordon Wetzstein. 2023. GeNVS: Generative Novel View Synthesis with 3D-Aware Diffusion Models. arXiv preprint arXiv:2304.02602 (2023)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20893-6_7"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555447"},{"key":"e_1_3_2_2_12_1","volume-title":"Scenedreamer: Unbounded 3d scene generation from 2d image collections. arXiv preprint arXiv:2302.01330","author":"Chen Zhaoxi","year":"2023","unstructured":"Zhaoxi Chen, Guangcong Wang, and Ziwei Liu. 2023. Scenedreamer: Unbounded 3d scene generation from 2d image collections. arXiv preprint arXiv:2302.01330 (2023)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01404"},{"key":"e_1_3_2_2_14_1","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in Neural Information Processing Systems (NeurIPS), Vol. 34 (2021), 8780--8794.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_2_15_1","volume-title":"Scenescape: Text-driven consistent scene generation. arXiv preprint arXiv:2302.01133","author":"Fridman Rafail","year":"2023","unstructured":"Rafail Fridman, Amit Abecasis, Yoni Kasten, and Tali Dekel. 2023. Scenescape: Text-driven consistent scene generation. arXiv preprint arXiv:2302.01133 (2023)."},{"key":"e_1_3_2_2_16_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems (NeurIPS), Vol. 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_2_17_1","volume-title":"Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models. arXiv preprint arXiv:2303.11989","author":"H\u00f6llein Lukas","year":"2023","unstructured":"Lukas H\u00f6llein, Ang Cao, Andrew Owens, Justin Johnson, and Matthias Nie\u00dfner. 2023. Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models. arXiv preprint arXiv:2303.11989 (2023)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01229"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2009.2031133"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Alexander Kirillov Eric Mintun Nikhila Ravi Hanzi Mao Chloe Rolland Laura Gustafson Tete Xiao Spencer Whitehead Alexander C Berg Wan-Yen Lo et al. 2023. Segment anything. arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_2_22_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 14738--14748","author":"Koh Jing Yu","year":"2021","unstructured":"Jing Yu Koh, Honglak Lee, Yinfei Yang, Jason Baldridge, and Peter Anderson. 2021. Pathdreamer: A world model for indoor navigation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV). 14738--14748."},{"key":"e_1_3_2_2_23_1","volume-title":"Understanding pure clip guidance for voxel grid nerf models. arXiv preprint arXiv:2209.15172","author":"Lee Han-Hung","year":"2022","unstructured":"Han-Hung Lee and Angel X Chang. 2022. Understanding pure clip guidance for voxel grid nerf models. arXiv preprint arXiv:2209.15172 (2022)."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19769-7_30"},{"key":"e_1_3_2_2_25_1","volume-title":"Magic3D: High-Resolution Text-to-3D Content Creation. arXiv preprint arXiv:2211.10440","author":"Lin Chen-Hsuan","year":"2022","unstructured":"Chen-Hsuan Lin, Jun Gao, Luming Tang, Towaki Takikawa, Xiaohui Zeng, Xun Huang, Karsten Kreis, Sanja Fidler, Ming-Yu Liu, and Tsung-Yi Lin. 2022. Magic3D: High-Resolution Text-to-3D Content Creation. arXiv preprint arXiv:2211.10440 (2022)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01419"},{"key":"e_1_3_2_2_27_1","unstructured":"Minghua Liu Chao Xu Haian Jin Linghao Chen Zexiang Xu Hao Su et al. 2023. One-2--3--45: Any single image to 3d mesh in 45 seconds without per-shape optimization. arXiv preprint arXiv:2306.16928 (2023)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01129"},{"key":"e_1_3_2_2_31_1","volume-title":"Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:2209.14988","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T Barron, and Ben Mildenhall. 2022. Dreamfusion: Text-to-3d using 2d diffusion. arXiv preprint arXiv:2209.14988 (2022)."},{"key":"e_1_3_2_2_32_1","volume-title":"International conference on machine learning (ICML). PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning (ICML). PMLR, 8748--8763."},{"key":"e_1_3_2_2_33_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_2_34_1","volume-title":"Accelerating 3d deep learning with pytorch3d. arXiv preprint arXiv:2007.08501","author":"Ravi Nikhila","year":"2020","unstructured":"Nikhila Ravi, Jeremy Reizenstein, David Novotny, Taylor Gordon, Wan-Yen Lo, Justin Johnson, and Georgia Gkioxari. 2020. Accelerating 3d deep learning with pytorch3d. arXiv preprint arXiv:2007.08501 (2020)."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00355"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_37_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems (NeurIPS), Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_2_38_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"29","author":"Salimans Tim","year":"2016","unstructured":"Tim Salimans, Ian Goodfellow, Wojciech Zaremba, Vicki Cheung, Alec Radford, and Xi Chen. 2016. Improved techniques for training gans. Advances in Neural Information Processing Systems (NeurIPS), Vol. 29 (2016)."},{"key":"e_1_3_2_2_39_1","first-page":"20154","article-title":"Graf: Generative radiance fields for 3d-aware image synthesis","volume":"33","author":"Schwarz Katja","year":"2020","unstructured":"Katja Schwarz, Yiyi Liao, Michael Niemeyer, and Andreas Geiger. 2020. Graf: Generative radiance fields for 3d-aware image synthesis. Advances in Neural Information Processing Systems (NeurIPS), Vol. 33 (2020), 20154--20166.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_2_40_1","volume-title":"Mvdream: Multi-view diffusion for 3d generation. arXiv preprint arXiv:2308.16512","author":"Shi Yichun","year":"2023","unstructured":"Yichun Shi, Peng Wang, Jianglong Ye, Mai Long, Kejie Li, and Xiao Yang. 2023. Mvdream: Multi-view diffusion for 3d generation. arXiv preprint arXiv:2308.16512 (2023)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00805"},{"key":"e_1_3_2_2_42_1","volume-title":"International Conference on Machine Learning (ICML). PMLR, 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning (ICML). PMLR, 2256--2265."},{"key":"e_1_3_2_2_43_1","volume-title":"3D-GPT: Procedural 3D Modeling with Large Language Models. arXiv preprint arXiv:2310.12945","author":"Sun Chunyi","year":"2023","unstructured":"Chunyi Sun, Junlin Han, Weijian Deng, Xinlong Wang, Zishan Qin, and Stephen Gould. 2023. 3D-GPT: Procedural 3D Modeling with Large Language Models. arXiv preprint arXiv:2310.12945 (2023)."},{"key":"e_1_3_2_2_44_1","volume-title":"Kamyar Salahi, et al.","author":"Tancik Matthew","year":"2023","unstructured":"Matthew Tancik, Ethan Weber, Evonne Ng, Ruilong Li, Brent Yi, Justin Kerr, Terrance Wang, Alexander Kristoffersen, Jake Austin, Kamyar Salahi, et al. 2023. Nerfstudio: A modular framework for neural radiance field development. arXiv preprint arXiv:2302.04264 (2023)."},{"key":"e_1_3_2_2_45_1","volume-title":"Consistent View Synthesis with Pose-Guided Diffusion Models. arXiv preprint arXiv:2303.17598","author":"Tseng Hung-Yu","year":"2023","unstructured":"Hung-Yu Tseng, Qinbo Li, Changil Kim, Suhib Alsisan, Jia-Bin Huang, and Johannes Kopf. 2023. Consistent View Synthesis with Pose-Guided Diffusion Models. arXiv preprint arXiv:2303.17598 (2023)."},{"key":"e_1_3_2_2_46_1","volume-title":"Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation. arXiv preprint arXiv:2212.00774","author":"Wang Haochen","year":"2022","unstructured":"Haochen Wang, Xiaodan Du, Jiahao Li, Raymond A Yeh, and Greg Shakhnarovich. 2022. Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation. arXiv preprint arXiv:2212.00774 (2022)."},{"key":"e_1_3_2_2_47_1","volume-title":"ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with Variational Score Distillation. arXiv preprint arXiv:2305.16213","author":"Wang Zhengyi","year":"2023","unstructured":"Zhengyi Wang, Cheng Lu, Yikai Wang, Fan Bao, Chongxuan Li, Hang Su, and Jun Zhu. 2023. ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with Variational Score Distillation. arXiv preprint arXiv:2305.16213 (2023)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00749"},{"key":"e_1_3_2_2_49_1","volume-title":"Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543","author":"Zhang Lvmin","year":"2023","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2023. Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00577"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3197517.3201323","article-title":"Stereo magnification: learning view synthesis using multiplane images","volume":"37","author":"Zhou Tinghui","year":"2018","unstructured":"Tinghui Zhou, Richard Tucker, John Flynn, Graham Fyffe, and Noah Snavely. 2018. Stereo magnification: learning view synthesis using multiplane images. ACM Transactions on Graphics (TOG), Vol. 37, 4 (2018), 1--12.","journal-title":"ACM Transactions on Graphics (TOG)"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680557","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680557","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:45Z","timestamp":1750291425000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680557"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3680557","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680557","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}