{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:26:51Z","timestamp":1765308411994,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","funder":[{"name":"The European Union's Horizon Europe research and innovation program","award":["No. 101120237 (ELIAS)"],"award-info":[{"award-number":["No. 101120237 (ELIAS)"]}]},{"name":"FAIR - Future AI Research","award":["PE00000013"],"award-info":[{"award-number":["PE00000013"]}]},{"name":"Tianjin Natural Science Foundation, Key Project","award":["No. 22JCZDJC0022"],"award-info":[{"award-number":["No. 22JCZDJC0022"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755072","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"10915-10924","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FreeInsert: Disentangled Text-Guided Object Insertion in 3D Gaussian Scene without Spatial Priors"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7417-6837","authenticated-orcid":false,"given":"Chenxi","family":"Li","sequence":"first","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1168-3527","authenticated-orcid":false,"given":"Weijie","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Trento, Trento, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0636-0071","authenticated-orcid":false,"given":"Qiang","family":"Li","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6597-7248","authenticated-orcid":false,"given":"Nicu","family":"Sebe","sequence":"additional","affiliation":[{"name":"University of Trento, Trento, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1275-2333","authenticated-orcid":false,"given":"Bruno","family":"Lepri","sequence":"additional","affiliation":[{"name":"Fondazione Bruno Kessler, Trento, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0578-8138","authenticated-orcid":false,"given":"Weizhi","family":"Nie","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al., 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01517"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00067"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"volume-title":"The Thirty-eighth Annual Conference on Neural Information Processing Systems.","author":"Cao Chenjie","key":"e_1_3_2_1_5_1","unstructured":"Chenjie Cao, Chaohui Yu, Fan Wang, Xiangyang Xue, and Yanwei Fu. [n.d.]. MVInpainter: Learning Multi-View Consistent Inpainting to Bridge 2D and 3D Editing. In The Thirty-eighth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02029"},{"key":"e_1_3_2_1_7_1","volume-title":"European Conference on Computer Vision. 
Springer, 128-146","author":"Chen Yongwei","year":"2024","unstructured":"Yongwei Chen, Tengfei Wang, Tong Wu, Xingang Pan, Kui Jia, and Ziwei Liu. 2024b. Comboverse: Compositional 3d assets creation using spatially-aware diffusion guidance. In European Conference on Computer Vision. Springer, 128-146."},{"key":"e_1_3_2_1_8_1","volume-title":"Mohammadreza Salehi, Niklas Muennighoff, Kyle Lo, Luca Soldaini, et al.","author":"Deitke Matt","year":"2024","unstructured":"Matt Deitke, Christopher Clark, Sangho Lee, Rohun Tripathi, Yue Yang, Jae Sung Park, Mohammadreza Salehi, Niklas Muennighoff, Kyle Lo, Luca Soldaini, et al., 2024. Molmo and pixmo: Open weights and open data for state-of-the-art multimodal models. arXiv preprint arXiv:2409.17146 (2024)."},{"key":"e_1_3_2_1_9_1","first-page":"61466","article-title":"Vica-nerf: View-consistency-aware 3d editing of neural radiance fields","volume":"36","author":"Dong Jiahua","year":"2023","unstructured":"Jiahua Dong and Yu-Xiong Wang. 2023. Vica-nerf: View-consistency-aware 3d editing of neural radiance fields. Advances in Neural Information Processing Systems, Vol. 36 (2023), 61466-61477.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_10_1","volume-title":"Point Cloud Completion Guided by Prior Knowledge via Causal Inference. arXiv preprint arXiv:2305.17770","author":"Gao Songxue","year":"2023","unstructured":"Songxue Gao, Chuanqi Jiao, Ruidong Chen, Weijie Wang, and Weizhi Nie. 2023. Point Cloud Completion Guided by Prior Knowledge via Causal Inference. arXiv preprint arXiv:2305.17770 (2023)."},{"key":"e_1_3_2_1_11_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Alex Vaughan et al. 2024. The llama 3 herd of models. 
arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01808"},{"key":"e_1_3_2_1_13_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Hong Yicong","year":"2023","unstructured":"Yicong Hong, Kai Zhang, Jiuxiang Gu, Sai Bi, Yang Zhou, Difan Liu, Feng Liu, Kalyan Sunkavalli, Trung Bui, and Hao Tan. 2023. Lrm: Large reconstruction model for single image to 3d. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_15_1","volume-title":"LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations.","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, yelong shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_16_1","unstructured":"Aaron Jaech Adam Kalai Adam Lerer Adam Richardson Ahmed El-Kishky Aiden Low Alec Helyar Aleksander Madry Alex Beutel Alex Carney et al. 2024. Openai o1 system card. arXiv preprint arXiv:2412.16720 (2024)."},{"key":"e_1_3_2_1_17_1","volume-title":"European Conference on Computer Vision. Springer, 364-380","author":"Khalid Umar","year":"2024","unstructured":"Umar Khalid, Hasan Iqbal, Nazmul Karim, Muhammad Tayyab, Jing Hua, and Chen Chen. 2024. LatentEditor: text driven local editing of 3D scenes. In European Conference on Computer Vision. 
Springer, 364-380."},{"key":"e_1_3_2_1_18_1","first-page":"73232","article-title":"Collaborative score distillation for consistent visual editing","volume":"36","author":"Kim Subin","year":"2023","unstructured":"Subin Kim, Kyungmin Lee, June Suk Choi, Jongheon Jeong, Kihyuk Sohn, and Jinwoo Shin. 2023. Collaborative score distillation for consistent visual editing. Advances in Neural Information Processing Systems, Vol. 36 (2023), 73232-73257.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01268"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28113"},{"key":"e_1_3_2_1_21_1","unstructured":"Aixin Liu Bei Feng Bing Xue Bingxuan Wang Bochao Wu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00951"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72920-1_7"},{"key":"e_1_3_2_1_24_1","volume-title":"T2TD: Text-3D generation model based on prior knowledge guidance","author":"Nie Weizhi","year":"2024","unstructured":"Weizhi Nie, Ruidong Chen, Weijie Wang, Bruno Lepri, and Nicu Sebe. 2024. T2TD: Text-3D generation model based on prior knowledge guidance. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3344684"},{"key":"e_1_3_2_1_26_1","volume-title":"DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research","author":"Oquab Maxime","year":"2024","unstructured":"Maxime Oquab, Timoth\u00e9e Darcet, Th\u00e9o Moutakanni, Huy V. 
Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel HAZIZA, Francisco Massa, Alaaeldin El-Nouby, Mido Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Herve Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, and Piotr Bojanowski. 2024. DINOv2: Learning Robust Visual Features without Supervision. Transactions on Machine Learning Research (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Park JangHo","year":"2024","unstructured":"JangHo Park, Gihyun Kwon, and Jong Chul Ye. 2024. ED-NeRF: Efficient Text-Guided Editing of 3D Scene With Latent Space NeRF. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T. Barron, and Ben Mildenhall. 2022. DreamFusion: Text-to-3D using 2D Diffusion. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_30_1","volume-title":"Luc Van Gool, and Federico Tombari","author":"Shahbazi Mohamad","year":"2024","unstructured":"Mohamad Shahbazi, Liesbeth Claessens, Michael Niemeyer, Edo Collins, Alessio Tonioni, Luc Van Gool, and Federico Tombari. 2024. InseRF: Text-Driven Generative Object Insertion in Neural 3D Scenes. 
arXiv preprint arXiv:2401.05335 (2024)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00495"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01323"},{"volume-title":"Computer Graphics Forum","author":"Sun Yanhao","key":"e_1_3_2_1_33_1","unstructured":"Yanhao Sun, Runze Tian, Xiao Han, XinYao Liu, Yan Zhang, and Kai Xu. 2024. GSEditPro: 3D Gaussian Splatting Editing with Attention-based Progressive Localization. In Computer Graphics Forum, Vol. 43. Wiley Online Library, e15215."},{"key":"e_1_3_2_1_34_1","volume-title":"European Conference on Computer Vision. Springer, 1-18","author":"Tang Jiaxiang","year":"2024","unstructured":"Jiaxiang Tang, Zhaoxi Chen, Xiaokang Chen, Tengfei Wang, Gang Zeng, and Ziwei Liu. 2024. Lgm: Large multi-view gaussian model for high-resolution 3d content creation. In European Conference on Computer Vision. Springer, 1-18."},{"key":"e_1_3_2_1_35_1","volume-title":"Instruct-gs2gs: Editing 3d gaussian splats with instructions","author":"Vachha Cyrus","year":"2024","unstructured":"Cyrus Vachha and Ayaan Haque. [n.d.]. Instruct-gs2gs: Editing 3d gaussian splats with instructions (2024). URL https:\/\/instruct-gs2gs.github.io ([n.d.])."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01205"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01975"},{"key":"e_1_3_2_1_38_1","volume-title":"Nicu Sebe, and Bruno Lepri.","author":"Wang Weijie","year":"2023","unstructured":"Weijie Wang, Guofeng Mei, Bin Ren, Xiaoshui Huang, Fabio Poiesi, Luc Van Gool, Nicu Sebe, and Bruno Lepri. 2023. Zero-shot point cloud registration. arXiv preprint arXiv:2312.03032 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Fully-Geometric Cross-Attention for Point Cloud Registration. 
arXiv preprint arXiv:2502.08285","author":"Wang Weijie","year":"2025","unstructured":"Weijie Wang, Guofeng Mei, Jian Zhang, Nicu Sebe, Bruno Lepri, and Fabio Poiesi. 2025. Fully-Geometric Cross-Attention for Point Cloud Registration. arXiv preprint arXiv:2502.08285 (2025)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680861"},{"key":"e_1_3_2_1_41_1","volume-title":"European Conference on Computer Vision. Springer, 55-71","author":"Wu Jing","year":"2024","unstructured":"Jing Wu, Jia-Wang Bian, Xinghui Li, Guangrun Wang, Ian Reid, Philip Torr, and Victor Adrian Prisacariu. 2024. Gaussctrl: Multi-view consistent text-driven 3d gaussian splatting editing. In European Conference on Computer Vision. Springer, 55-71."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00461"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681039"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_45_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Zhou Junwei","year":"2025","unstructured":"Junwei Zhou, Xueting Li, Lu Qi, and Ming-Hsuan Yang. 2025. Layout-your-3D: Controllable and Precise 3D Generation with 2D Blueprint. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681289"},{"key":"e_1_3_2_1_47_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Zhou Xiaoyu","year":"2024","unstructured":"Xiaoyu Zhou, Xingjian Ran, Yajiao Xiong, Jinlin He, Zhiwei Lin, Yongtao Wang, Deqing Sun, and Ming-Hsuan Yang. 2024b. GALA3D: Towards Text-to-3D Complex Scene Generation via Layout-guided Generative Gaussian Splatting. 
In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3658205"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618190"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755072","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:23:23Z","timestamp":1765308203000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755072"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":49,"alternative-id":["10.1145\/3746027.3755072","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755072","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}