{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:46:33Z","timestamp":1772905593500,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":106,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Packard Fellowship"},{"name":"Amazon Faculty Research Award"},{"name":"NSF IIS-2239076"},{"name":"NSF Graduate Research Fellowship","award":["Grant No. DGE2140739"],"award-info":[{"award-number":["Grant No. DGE2140739"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687564","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Customizing Text-to-Image Diffusion with Object Viewpoint Control"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1799-1069","authenticated-orcid":false,"given":"Nupur","family":"Kumari","sequence":"first","affiliation":[{"name":"Carnegie Mellon Uniersity, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5102-9258","authenticated-orcid":false,"given":"Grace","family":"Su","sequence":"additional","affiliation":[{"name":"Carnegie Mellon Uniersity, Pittsburgh, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2507-4674","authenticated-orcid":false,"given":"Richard","family":"Zhang","sequence":"additional","affiliation":[{"name":"Adobe Research, California, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9534-6868","authenticated-orcid":false,"given":"Taesung","family":"Park","sequence":"additional","affiliation":[{"name":"Adobe Research, California, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6783-1795","authenticated-orcid":false,"given":"Eli","family":"Shechtman","sequence":"additional","affiliation":[{"name":"Adobe Research, Seattle, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8504-3410","authenticated-orcid":false,"given":"Jun-Yan","family":"Zhu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon Uniersity, Pittsburgh, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_2_2_1","unstructured":"Adobe. 2023. Generative Fill. https:\/\/www.adobe.com\/products\/photoshop\/generative-fill.html."},{"key":"e_1_3_3_2_3_1","doi-asserted-by":"crossref","unstructured":"Yuval Alaluf Elad Richardson Gal Metzer and Daniel Cohen-Or. 2023. A Neural Space-Time Representation for Text-to-Image Personalization. ACM Transactions on Graphics (TOG) (2023).","DOI":"10.1145\/3618322"},{"key":"e_1_3_3_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618173"},{"key":"e_1_3_3_2_5_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Bar-Tal Omer","year":"2023","unstructured":"Omer Bar-Tal, Lior Yariv, Yaron Lipman, and Tali Dekel. 2023. Multidiffusion: Fusing diffusion paths for controlled image generation. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00580"},{"key":"e_1_3_3_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01804"},{"key":"e_1_3_3_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01245"},{"key":"e_1_3_3_2_9_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Boss Mark","year":"2022","unstructured":"Mark Boss, Andreas Engelhardt, Abhishek Kar, Yuanzhen Li, Deqing Sun, Jonathan Barron, Hendrik Lensch, and Varun Jampani. 2022. Samurai: Shape and material from unconstrained real-world arbitrary image collections. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_10_1","doi-asserted-by":"crossref","unstructured":"Manuel Brack Felix Friedrich Katharina Kornmeier Linoy Tsaban Patrick Schramowski Kristian Kersting and Apolin\u00e1rio Passos. 2024. LEDITS++: Limitless Image Editing using Text-to-Image Models. IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2024).","DOI":"10.1109\/CVPR52733.2024.00846"},{"key":"e_1_3_3_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_2_12_1","doi-asserted-by":"crossref","unstructured":"James Burgess Kuan-Chieh Wang and Serena Yeung-Levy. 2024. Viewpoint Textual Inversion: Discovering Scene Representations and 3D View Control in 2D Diffusion Models. European Conference on Computer Vision (ECCV) (2024).","DOI":"10.1007\/978-3-031-73039-9_24"},{"key":"e_1_3_3_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_3_2_14_1","volume-title":"IEEE International Conference on Computer Vision (ICCV)","author":"Chan Eric\u00a0R","year":"2023","unstructured":"Eric\u00a0R Chan, Koki Nagano, Matthew\u00a0A Chan, Alexander\u00a0W Bergman, Jeong\u00a0Joon Park, Axel Levy, Miika Aittala, Shalini De\u00a0Mello, Tero Karras, and Gordon Wetzstein. 2023. GeNVS: Generative novel view synthesis with 3D-aware diffusion models. In IEEE International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_3_2_15_1","unstructured":"ChatGPT. 2022. ChatGPT. https:\/\/chat.openai.com\/chat."},{"key":"e_1_3_3_2_16_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Yuval Alaluf Yael Vinker Lior Wolf and Daniel Cohen-Or. 2023. Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Transactions on Graphics (TOG) (2023).","DOI":"10.1145\/3592116"},{"key":"e_1_3_3_2_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19824-3_20"},{"key":"e_1_3_3_2_18_1","doi-asserted-by":"crossref","unstructured":"Tao Chen Zhe Zhu Ariel Shamir Shi-Min Hu and Daniel Cohen-Or. 2013. 3-sweep: Extracting editable objects from a single photo. ACM Transactions on graphics (TOG) (2013).","DOI":"10.1145\/2508363.2508378"},{"key":"e_1_3_3_2_19_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Chen Wenhu","year":"2023","unstructured":"Wenhu Chen, Hexiang Hu, Yandong Li, Nataniel Rui, Xuhui Jia, Ming-Wei Chang, and William\u00a0W Cohen. 2023. Subject-driven text-to-image generation via apprenticeship learning. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00645"},{"key":"e_1_3_3_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01254"},{"key":"e_1_3_3_2_22_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_23_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Dong Jiahua","year":"2023","unstructured":"Jiahua Dong and Yu-Xiong Wang. 2023. ViCA-NeRF: View-Consistency-Aware 3D Editing of Neural Radiance Fields. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01201"},{"key":"e_1_3_3_2_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"e_1_3_3_2_26_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Gal Rinon","year":"2023","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit\u00a0H Bermano, Gal Chechik, and Daniel Cohen-Or. 2023a. An image is worth one word: Personalizing text-to-image generation using textual inversion. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_27_1","doi-asserted-by":"crossref","unstructured":"Rinon Gal Moab Arar Yuval Atzmon Amit\u00a0H Bermano Gal Chechik and Daniel Cohen-Or. 2023b. Encoder-based domain tuning for fast personalization of text-to-image models. ACM Transactions on Graphics (TOG) (2023).","DOI":"10.1145\/3592133"},{"key":"e_1_3_3_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00694"},{"key":"e_1_3_3_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00673"},{"key":"e_1_3_3_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01808"},{"key":"e_1_3_3_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_32_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Hertz Amir","year":"2023","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2023. Prompt-to-prompt image editing with cross attention control. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_33_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00482"},{"key":"e_1_3_3_2_35_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Hu Edward\u00a0J","year":"2022","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. Lora: Low-rank adaptation of large language models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_36_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Jampani Varun","year":"2023","unstructured":"Varun Jampani, Kevis-Kokitsi Maninis, Andreas Engelhardt, Arjun Karpur, Karen Truong, Kyle Sargent, Stefan Popov, Andr\u00e9 Araujo, Ricardo Martin\u00a0Brualla, Kaushal Patel, et\u00a0al. 2023. Navi: Category-agnostic image collections with high-quality 3d shape and pose annotations. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"e_1_3_3_2_38_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Karras Tero","year":"2022","unstructured":"Tero Karras, Miika Aittala, Timo Aila, and Samuli Laine. 2022. Elucidating the design space of diffusion-based generative models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_39_1","unstructured":"Tero Karras Miika Aittala Jaakko Lehtinen Janne Hellsten Timo Aila and Samuli Laine. 2023. Analyzing and Improving the Training Dynamics of Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.02696 (2023)."},{"key":"e_1_3_3_2_40_1","doi-asserted-by":"crossref","unstructured":"Kevin Karsch Varsha Hedau David Forsyth and Derek Hoiem. 2011. Rendering synthetic objects into legacy photographs. ACM Transactions on graphics (TOG) (2011).","DOI":"10.1145\/2070752.2024191"},{"key":"e_1_3_3_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_3_2_42_1","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3D Gaussian Splatting for Real-Time Radiance Field Rendering. ACM Transactions on Graphics 42 4 (2023).","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"e_1_3_3_2_44_1","doi-asserted-by":"crossref","unstructured":"Natasha Kholgade Tomas Simon Alexei Efros and Yaser Sheikh. 2014. 3D object manipulation in a single photograph using stock 3D models. ACM Transactions on graphics (TOG) (2014).","DOI":"10.1145\/2601097.2601209"},{"key":"e_1_3_3_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"e_1_3_3_2_46_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Kingma Diederik\u00a0P","year":"2014","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2014. Auto-encoding variational bayes. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_3_2_48_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Li Dongxu","year":"2023","unstructured":"Dongxu Li, Junnan Li, and Steven\u00a0CH Hoi. 2023. Blip-diffusion: Pre-trained subject representation for controllable text-to-image generation and editing. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"e_1_3_3_2_50_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Liu Yuan","year":"2024","unstructured":"Yuan Liu, Cheng Lin, Zijiao Zeng, Xiaoxiao Long, Lingjie Liu, Taku Komura, and Wenping Wang. 2024. Syncdreamer: Generating multiview-consistent images from a single-view image. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_51_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Lu Cheng","year":"2022","unstructured":"Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. 2022. Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_52_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Meng Chenlin","year":"2022","unstructured":"Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2022. Sdedit: Guided image synthesis and editing with stochastic differential equations. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01218"},{"key":"e_1_3_3_2_54_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Michel Oscar","year":"2023","unstructured":"Oscar Michel, Anand Bhattad, Eli VanderBilt, Ranjay Krishna, Aniruddha Kembhavi, and Tanmay Gupta. 2023. Object 3dit: Language-guided 3d-aware image editing. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_55_1","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul\u00a0P Srinivasan Matthew Tancik Jonathan\u00a0T Barron Ravi Ramamoorthi and Ren Ng. 2021. Nerf: Representing scenes as neural radiance fields for view synthesis. Commun. ACM (2021).","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_3_2_57_1","doi-asserted-by":"crossref","unstructured":"Thomas M\u00fcller Alex Evans Christoph Schied and Alexander Keller. 2022. Instant neural graphics primitives with a multiresolution hash encoding. ACM Transactions on Graphics (ToG) (2022).","DOI":"10.1145\/3528223.3530127"},{"key":"e_1_3_3_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00540"},{"key":"e_1_3_3_2_59_1","volume-title":"TMLR","author":"Oquab Maxime","year":"2023","unstructured":"Maxime Oquab, Timoth\u00e9e Darcet, Th\u00e9o Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, et\u00a0al. 2023. Dinov2: Learning robust visual features without supervision. In TMLR."},{"key":"e_1_3_3_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591513"},{"key":"e_1_3_3_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02107"},{"key":"e_1_3_3_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_63_1","unstructured":"Dustin Podell Zion English Kyle Lacey Andreas Blattmann Tim Dockhorn Jonas M\u00fcller Joe Penna and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.01952 (2023)."},{"key":"e_1_3_3_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"e_1_3_3_2_65_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00223"},{"key":"e_1_3_3_2_67_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 (2022)."},{"key":"e_1_3_3_2_68_1","unstructured":"Nikhila Ravi Jeremy Reizenstein David Novotny Taylor Gordon Wan-Yen Lo Justin Johnson and Georgia Gkioxari. 2020. Accelerating 3d deep learning with pytorch3d. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2007.08501 (2020)."},{"key":"e_1_3_3_2_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01072"},{"key":"e_1_3_3_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_71_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"e_1_3_3_2_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_2_73_1","unstructured":"Nataniel Ruiz Yuanzhen Li Varun Jampani Wei Wei Tingbo Hou Yael Pritch Neal Wadhwa Michael Rubinstein and Kfir Aberman. 2023b. Hyperdreambooth: Hypernetworks for fast personalization of text-to-image models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.06949 (2023)."},{"key":"e_1_3_3_2_74_1","unstructured":"Simo Ryu. 2023. LoRA-Stable Diffusion. https:\/\/github.com\/cloneofsimo\/lora."},{"key":"e_1_3_3_2_75_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar\u00a0Seyed Ghasemipour, Burcu\u00a0Karagol Ayan, S\u00a0Sara Mahdavi, Rapha\u00a0Gontijo Lopes, et\u00a0al. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_76_1","doi-asserted-by":"crossref","unstructured":"Kyle Sargent Zizhang Li Tanmay Shah Charles Herrmann Hong-Xing Yu Yunzhi Zhang Eric\u00a0Ryan Chan Dmitry Lagun Li Fei-Fei Deqing Sun et\u00a0al. 2023. Zeronvs: Zero-shot 360-degree view synthesis from a single real image. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.17994 (2023).","DOI":"10.1109\/CVPR52733.2024.00900"},{"key":"e_1_3_3_2_77_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Sauer Axel","year":"2023","unstructured":"Axel Sauer, Tero Karras, Samuli Laine, Andreas Geiger, and Timo Aila. 2023. Stylegan-t: Unlocking the power of gans for fast large-scale text-to-image synthesis. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_2_78_1","unstructured":"Christoph Schuhmann Richard Vencu Romain Beaumont Robert Kaczmarczyk Clayton Mullis Aarush Katta Theo Coombes Jenia Jitsev and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2111.02114 (2021)."},{"key":"e_1_3_3_2_79_1","unstructured":"Jing Shi Wei Xiong Zhe Lin and Hyun\u00a0Joon Jung. 2023. Instantbooth: Personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.03411 (2023)."},{"key":"e_1_3_3_2_80_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Shi Yichun","year":"2024","unstructured":"Yichun Shi, Peng Wang, Jianglong Ye, Mai Long, Kejie Li, and Xiao Yang. 2024. Mvdream: Multi-view diffusion for 3d generation. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_81_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning (ICML). https:\/\/dl.acm.org\/doi\/10.5555\/3045118.3045358"},{"key":"e_1_3_3_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01620"},{"key":"e_1_3_3_2_83_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising diffusion implicit models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00287"},{"key":"e_1_3_3_2_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591516"},{"key":"e_1_3_3_2_86_1","unstructured":"Jiaxiang Tang Jiawei Ren Hang Zhou Ziwei Liu and Gang Zeng. 2023. Dreamgaussian: Generative gaussian splatting for efficient 3d content creation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.16653 (2023)."},{"key":"e_1_3_3_2_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591506"},{"key":"e_1_3_3_2_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618249"},{"key":"e_1_3_3_2_89_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_90_1","unstructured":"Andrey Voynov Qinghao Chu Daniel Cohen-Or and Kfir Aberman. 2023. P + : Extended Textual Conditioning in Text-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.09522 (2023)."},{"key":"e_1_3_3_2_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00661"},{"key":"e_1_3_3_2_92_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"e_1_3_3_2_93_1","unstructured":"Rundi Wu Ben Mildenhall Philipp Henzler Keunhong Park Ruiqi Gao Daniel Watson Pratul\u00a0P Srinivasan Dor Verbin Jonathan\u00a0T Barron Ben Poole et\u00a0al. 2024. ReconFusion: 3D Reconstruction with Diffusion Priors. (2024)."},{"key":"e_1_3_3_2_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00428"},{"key":"e_1_3_3_2_95_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Xu Yinghao","year":"2024","unstructured":"Yinghao Xu, Hao Tan, Fujun Luan, Sai Bi, Peng Wang, Jiahao Li, Zifan Shi, Kalyan Sunkavalli, Gordon Wetzstein, Zexiang Xu, and Kai Zhang. 2024. DMV3D: Denoising Multi-View Diffusion using 3D Large Reconstruction Model. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_96_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Yao Shunyu","year":"2018","unstructured":"Shunyu Yao, Tzu\u00a0Ming Hsu, Jun-Yan Zhu, Jiajun Wu, Antonio Torralba, Bill Freeman, and Josh Tenenbaum. 2018. 3d-aware scene manipulation via inverse graphics. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_2_97_1","unstructured":"Hu Ye Jun Zhang Sibo Liu Xiao Han and Wei Yang. 2023b. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06721 (2023)."},{"key":"e_1_3_3_2_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00823"},{"key":"e_1_3_3_2_99_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00406"},{"key":"e_1_3_3_2_100_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00455"},{"key":"e_1_3_3_2_101_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Yuanzhong Xu, Jing\u00a0Yu Koh, Thang Luong, Gunjan Baid, Zirui Wang, Vijay Vasudevan, Alexander Ku, Yinfei Yang, Burcu\u00a0Karagol Ayan, et\u00a0al. 2022. Scaling autoregressive models for content-rich text-to-image generation. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_2_102_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681396"},{"key":"e_1_3_3_2_103_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zhang Jason\u00a0Y","year":"2024","unstructured":"Jason\u00a0Y Zhang, Amy Lin, Moneish Kumar, Tzu-Hsuan Yang, Deva Ramanan, and Shubham Tulsiani. 2024. Cameras as Rays: Sparse-view Pose Estimation via Ray Diffusion. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_104_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_2_105_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Zhang Yuxuan","year":"2021","unstructured":"Yuxuan Zhang, Wenzheng Chen, Huan Ling, Jun Gao, Yinan Zhang, Antonio Torralba, and Sanja Fidler. 2021. Image gans meet differentiable rendering for inverse graphics and interpretable 3d neural rendering. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_106_1","doi-asserted-by":"crossref","unstructured":"Yuxin Zhang Weiming Dong Fan Tang Nisha Huang Haibin Huang Chongyang Ma Tong-Yee Lee Oliver Deussen and Changsheng Xu. 2023. ProSpect: Prompt Spectrum for Attribute-Aware Personalization of Diffusion Models. ACM Transactions on Graphics (TOG) (2023).","DOI":"10.1145\/3618342"},{"key":"e_1_3_3_2_107_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01211"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687564","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687564","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:26Z","timestamp":1750294706000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687564"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":106,"alternative-id":["10.1145\/3680528.3687564","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687564","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}