{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,2]],"date-time":"2025-12-02T03:35:39Z","timestamp":1764646539105,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681289","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"3401-3410","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Edit3D: Elevating 3D Scene Editing with Attention-Driven Multi-Turn Interactivity"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0674-9296","authenticated-orcid":false,"given":"Peng","family":"Zhou","sequence":"first","affiliation":[{"name":"College of Artificial Intelligence MoE Key Lab of Brain-Machine Intelligence Technology, Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0653-3663","authenticated-orcid":false,"given":"Dunbo","family":"Cai","sequence":"additional","affiliation":[{"name":"China Mobile (Suzhou) Software Technology Company Limited, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2101-4046","authenticated-orcid":false,"given":"Yujian","family":"Du","sequence":"additional","affiliation":[{"name":"China Mobile (Suzhou) Software Technology Company Limited, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0503-7254","authenticated-orcid":false,"given":"Runqing","family":"Zhang","sequence":"additional","affiliation":[{"name":"China Mobile (Suzhou) Software Technology Company Limited, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7339-028X","authenticated-orcid":false,"given":"Bingbing","family":"Ni","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0306-534X","authenticated-orcid":false,"given":"Jie","family":"Qin","sequence":"additional","affiliation":[{"name":"College of Artificial Intelligence MoE Key Lab of Brain-Machine Intelligence Technology, Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5661-1722","authenticated-orcid":false,"given":"Ling","family":"Qian","sequence":"additional","affiliation":[{"name":"China Mobile (Suzhou) Software Technology Company Limited, Suzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 18392--18402","author":"Brooks Tim","key":"e_1_3_2_1_2_1","unstructured":"Tim Brooks, Aleksander Holynski, and Alexei A. Efros. 2023. InstructPix2Pix: Learning to Follow Image Editing Instructions. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 18392--18402."},{"key":"e_1_3_2_1_3_1","volume-title":"Language Models Are Few-Shot Learners. arXiv:2005.14165","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models Are Few-Shot Learners. arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_4_1","volume-title":"Detect What You Can: Detecting and Representing Objects Using Holistic Models and Body Parts. arXiv:1406.2031","author":"Chen Xianjie","year":"2014","unstructured":"Xianjie Chen, Roozbeh Mottaghi, Xiaobai Liu, Sanja Fidler, Raquel Urtasun, and Alan Yuille. 2014. Detect What You Can: Detecting and Representing Objects Using Holistic Models and Body Parts. arXiv:1406.2031 (2014)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02029"},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Learning Representations.","author":"Cheng Xinhua","year":"2024","unstructured":"Xinhua Cheng, Tianyu Yang, Jianan Wang, Yu Li, Lei Zhang, Jian Zhang, and Li Yuan. 2024. Progressive3D: Progressively Local Editing for Text-to-3D Content Creation with Complex Semantic Prompts. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5485--5494","author":"de Geus Daan","year":"2021","unstructured":"Daan de Geus, Panagiotis Meletis, Chenyang Lu, Xiaoxiao Wen, and Gijs Dubbelman. 2021. Part-Aware Panoptic Segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 5485--5494."},{"key":"e_1_3_2_1_8_1","first-page":"8780","article-title":"Diffusion Models Beat GANs on Image Synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Advances in Neural Information Processing Systems, Vol. 34. 8780--8794.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 20902--20911","author":"Fang Jiemin","year":"2024","unstructured":"Jiemin Fang, Junjie Wang, Xiaopeng Zhang, Lingxi Xie, and Qi Tian. 2024. GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 20902--20911."},{"key":"e_1_3_2_1_11_1","volume-title":"An Image Is Worth One Word: Personalizing Text-to-Image Generation Using Textual Inversion. arXiv:2208.01618","author":"Gal Rinon","year":"2022","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit H. Bermano, Gal Chechik, and Daniel Cohen-Or. 2022. An Image Is Worth One Word: Personalizing Text-to-Image Generation Using Textual Inversion. arXiv:2208.01618 (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530164"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Learning Representations.","author":"Gu Xiuye","year":"2021","unstructured":"Xiuye Gu, Tsung-Yi Lin, Weicheng Kuo, and Yin Cui. 2021. Open-Vocabulary Object Detection via Vision and Language Knowledge Distillation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01808"},{"key":"e_1_3_2_1_16_1","volume-title":"High-Quality Dataset of Parts. arXiv:2112.00933","author":"He Ju","year":"2022","unstructured":"Ju He, Shuo Yang, Shaokang Yang, Adam Kortylewski, Xiaoding Yuan, Jie-Neng Chen, Shuai Liu, Cheng Yang, Qihang Yu, and Alan Yuille. 2022. PartImageNet: A Large, High-Quality Dataset of Parts. arXiv:2112.00933 (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"Prompt-to-Prompt Image Editing with Cross Attention Control. arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-Prompt Image Editing with Cross Attention Control. arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_1_18_1","first-page":"6840","article-title":"Denoising Diffusion Probabilistic Models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Advances in Neural Information Processing Systems, Vol. 33. 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"Cascaded Diffusion Models for High Fidelity Image Generation. arXiv:2106.15282","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho, Chitwan Saharia, William Chan, David J. Fleet, Mohammad Norouzi, and Tim Salimans. 2021. Cascaded Diffusion Models for High Fidelity Image Generation. arXiv:2106.15282 (2021)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_22_1","volume-title":"arXiv:2304.02643","author":"Kirillov Alexander","year":"2023","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, and Ross Girshick. 2023. Segment Anything. arXiv:2304.02643 (2023)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01062"},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Learning Representations.","author":"Kuo Weicheng","year":"2023","unstructured":"Weicheng Kuo, Yin Cui, Xiuye Gu, A. J. Piergiovanni, and Anelia Angelova. 2023. F-VLM: Open-Vocabulary Object Detection upon Frozen Vision and Language Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_25_1","volume-title":"Language-Driven Semantic Segmentation. In International Conference on Learning Representations.","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian Q. Weinberger, Serge Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-Driven Semantic Segmentation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_26_1","volume-title":"Hoi","author":"Li Dongxu","year":"2023","unstructured":"Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, and Steven C.H. Hoi. 2023. LAVIS: A One-stop Library for Language-Vision Intelligence. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics."},{"key":"e_1_3_2_1_27_1","volume-title":"International Conference on Machine Learning. 19730--19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping Language-Image Pre-Training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning. 19730--19742."},{"key":"e_1_3_2_1_28_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv:2201.12086","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. arXiv:2201.12086 (2022)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_42"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28113"},{"key":"e_1_3_2_1_32_1","volume-title":"Learning Object-Language Alignments for Open-Vocabulary Object Detection. arXiv:2211.14843","author":"Lin Chuang","year":"2022","unstructured":"Chuang Lin, Peize Sun, Yi Jiang, Ping Luo, Lizhen Qu, Gholamreza Haffari, Zehuan Yuan, and Jianfei Cai. 2022. Learning Object-Language Alignments for Open-Vocabulary Object Detection. arXiv:2211.14843 (2022)."},{"key":"e_1_3_2_1_33_1","volume-title":"Visual Instruction Tuning. arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.124"},{"key":"e_1_3_2_1_35_1","first-page":"3497","article-title":"OBJECT 3DIT: Language-guided 3D-aware Image Editing","volume":"36","author":"Michel Oscar","year":"2023","unstructured":"Oscar Michel, Anand Bhattad, Eli VanderBilt, Ranjay Krishna, Aniruddha Kembhavi, and Tanmay Gupta. 2023. OBJECT 3DIT: Language-guided 3D-aware Image Editing. In Advances in Neural Information Processing Systems, Vol. 36. 3497--3516.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"GMNet: Graph Matching Network for Large Scale Part Semantic Segmentation in the Wild. In European Conference on Computer Vision. 397--414","author":"Michieli Umberto","year":"2020","unstructured":"Umberto Michieli, Edoardo Borsato, Luca Rossi, and Pietro Zanuttigh. 2020. GMNet: Graph Matching Network for Large Scale Part Semantic Segmentation in the Wild. In European Conference on Computer Vision. 397--414."},{"key":"e_1_3_2_1_37_1","volume-title":"Commun. ACM","volume":"65","author":"Mildenhall Ben","year":"2021","unstructured":"Ben Mildenhall, Pratul P Srinivasan, Matthew Tancik, Jonathan T Barron, Ravi Ramamoorthi, and Ren Ng. 2021. Nerf: Representing Scenes as Neural Radiance Fields for View Synthesis. Commun. ACM, Vol. 65, 1 (2021)."},{"key":"e_1_3_2_1_38_1","volume-title":"Simple Open-Vocabulary Object Detection with Vision Transformers. In European Conference on Computer Vision.","author":"Minderer Matthias","year":"2022","unstructured":"Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. 2022. Simple Open-Vocabulary Object Detection with Vision Transformers. In European Conference on Computer Vision."},{"key":"e_1_3_2_1_39_1","volume-title":"Watch Your Steps: Local Image and Scene Editing by Text Instructions. arXiv:2308.08947","author":"Mirzaei Ashkan","year":"2023","unstructured":"Ashkan Mirzaei, Tristan Aumentado-Armstrong, Marcus A. Brubaker, Jonathan Kelly, Alex Levinshtein, Konstantinos G. Derpanis, and Igor Gilitschenski. 2023. Watch Your Steps: Local Image and Scene Editing by Text Instructions. arXiv:2308.08947 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_41_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning.","author":"Nichol Alex","year":"2022","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_42_1","volume-title":"Training Language Models to Follow Instructions with Human Feedback. arXiv:2203.02155","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. 2022. Training Language Models to Follow Instructions with Human Feedback. arXiv:2203.02155 (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"DreamFusion: Text-to-3D Using 2D Diffusion. arXiv:2209.14988","author":"Poole Ben","year":"2022","unstructured":"Ben Poole, Ajay Jain, Jonathan T. Barron, and Ben Mildenhall. 2022. DreamFusion: Text-to-3D Using 2D Diffusion. arXiv:2209.14988 (2022)."},{"key":"e_1_3_2_1_44_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving Language Understanding by Generative Pre-Training. (2018). https:\/\/www.cs.ubc.ca\/ amuham01\/LING530\/papers\/radford2018improving.pdf"},{"key":"e_1_3_2_1_45_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language Models Are Unsupervised Multitask Learners. (2019). https:\/\/d4mucfpksywv.cloudfront.net\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf"},{"key":"e_1_3_2_1_46_1","volume-title":"Liu","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 (2023)."},{"key":"e_1_3_2_1_47_1","volume-title":"Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_50_1","first-page":"36479","article-title":"Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S. Sara Mahdavi, Rapha Gontijo Lopes, Tim Salimans, Jonathan Ho, David J. Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Advances in Neural Information Processing Systems, Vol. 35. 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_52_1","volume-title":"Deep Unsupervised Learning Using Nonequilibrium Thermodynamics. In International Conference on Machine Learning. 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric A. Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep Unsupervised Learning Using Nonequilibrium Thermodynamics. In International Conference on Machine Learning. 2256--2265."},{"key":"e_1_3_2_1_53_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_54_1","unstructured":"Yang Song and Stefano Ermon. 2019. Generative Modeling by Estimating Gradients of the Data Distribution. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_55_1","unstructured":"Peize Sun Shoufa Chen and Ping Luo. 2023. Grounded Segment Anything: From Objects to Parts. https:\/\/github.com\/Cheems-Seminar\/grounded-segment-any-parts."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01417"},{"key":"e_1_3_2_1_57_1","volume-title":"Kamyar Salahi, Abhik Ahuja, et al.","author":"Tancik Matthew","year":"2023","unstructured":"Matthew Tancik, Ethan Weber, Evonne Ng, Ruilong Li, Brent Yi, Terrance Wang, Alexander Kristoffersen, Jake Austin, Kamyar Salahi, Abhik Ahuja, et al. 2023. Nerfstudio: A Modular Framework for Neural Radiance Field Development. In SIGGRAPH. 1--12."},{"key":"e_1_3_2_1_58_1","volume-title":"LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. LLaMA: Open and Efficient Foundation Language Models. arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_59_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00905"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00217"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"e_1_3_2_1_64_1","volume-title":"Zixuan Ma, Yufei Xue, Jidong Zhai, Wenguang Chen, Peng Zhang, Yuxiao Dong, and Jie Tang.","author":"Zeng Aohan","year":"2022","unstructured":"Aohan Zeng, Xiao Liu, Zhengxiao Du, Zihan Wang, Hanyu Lai, Ming Ding, Zhuoyi Yang, Yifan Xu, Wendi Zheng, Xiao Xia, Weng Lam Tam, Zixuan Ma, Yufei Xue, Jidong Zhai, Wenguang Chen, Peng Zhang, Yuxiao Dong, and Jie Tang. 2022. GLM-130B: An Open Bilingual Pre-trained Model. arXiv:2210.02414 (2022)."},{"key":"e_1_3_2_1_65_1","volume-title":"Sung-Ho Bae, Seungkyu Lee, and Choong Seon Hong.","author":"Zhang Chaoning","year":"2023","unstructured":"Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, and Choong Seon Hong. 2023. Faster Segment Anything: Towards Lightweight SAM for Mobile Applications. arXiv:2306.14289 (2023)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00089"},{"key":"e_1_3_2_1_67_1","volume-title":"Learning Deep Features for Discriminative Localization. arXiv:1512.04150","author":"Zhou Bolei","year":"2015","unstructured":"Bolei Zhou, Aditya Khosla, Agata Lapedriza, Aude Oliva, and Antonio Torralba. 2015. Learning Deep Features for Discriminative Localization. arXiv:1512.04150 (2015)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00167"},{"key":"e_1_3_2_1_69_1","volume-title":"TIP-Editor","author":"Zhuang Jingyu","year":"2024","unstructured":"Jingyu Zhuang, Di Kang, Yan-Pei Cao, Guanbin Li, Liang Lin, and Ying Shan. 2024. TIP-Editor: An Accurate 3D Editor Following Both Text-Prompts And Image-Prompts. arXiv:2401.14828 (2024)."},{"volume-title":"DreamEditor","author":"Zhuang Jingyu","key":"e_1_3_2_1_70_1","unstructured":"Jingyu Zhuang, Chen Wang, Lingjie Liu, Liang Lin, and Guanbin Li. 2023. DreamEditor: Text-Driven 3D Scene Editing with Neural Fields. In SIGGRAPH Asia. 1--10."},{"key":"e_1_3_2_1_71_1","unstructured":"Xueyan Zou Jianwei Yang Hao Zhang Feng Li Linjie Li Jianfeng Wang Lijuan Wang Jianfeng Gao and Yong Jae Lee. 2023. Segment Everything Everywhere All at Once. In Advances in Neural Information Processing Systems."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681289","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681289","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:43Z","timestamp":1750295863000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681289"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":71,"alternative-id":["10.1145\/3664647.3681289","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681289","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}