{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:15:25Z","timestamp":1778080525590,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100020595","name":"National Science and Technology Council","doi-asserted-by":"publisher","award":["NSTC-112-2634-F-002-006, NSTC-112-2222-E-001-001-MY2"],"award-info":[{"award-number":["NSTC-112-2634-F-002-006, NSTC-112-2222-E-001-001-MY2"]}],"id":[{"id":"10.13039\/100020595","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001869","name":"Academia Sinica","doi-asserted-by":"publisher","award":["AS-CDA-110-M09"],"award-info":[{"award-number":["AS-CDA-110-M09"]}],"id":[{"id":"10.13039\/501100001869","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687635","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["Camera Settings as Tokens: Modeling Photography on Latent Diffusion Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8347-5938","authenticated-orcid":false,"given":"I-Sheng","family":"Fang","sequence":"first","affiliation":[{"name":"Research Center for Information Technology Innovation, Academia Sinica, Taipei City, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1206-9729","authenticated-orcid":false,"given":"Yue-Hua","family":"Han","sequence":"additional","affiliation":[{"name":"Research Center for Information Technology Innovation, Academia Sinica, Taipei City, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0209-8932","authenticated-orcid":false,"given":"Jun-Cheng","family":"Chen","sequence":"additional","affiliation":[{"name":"Research Center for Information Technology Innovation, Academia Sinica, Taipei City, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_3_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_7"},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"crossref","unstructured":"Omri Avrahami Ohad Fried and Dani Lischinski. 2023. Blended Latent Diffusion. SIGGRAPH (2023).","DOI":"10.1145\/3592450"},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"crossref","unstructured":"Omri Avrahami Amir Hertz Yael Vinker Moab Arar Shlomi Fruchter Ohad Fried Daniel Cohen-Or and Dani Lischinski. 2024. The Chosen One: Consistent Characters in Text-to-Image Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.10093 (2024).","DOI":"10.1145\/3641519.3657430"},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_3_3_6_1","unstructured":"Gregor B.2024. Digital Camera Database. https:\/\/www.digicamdb.com\/"},{"key":"e_1_3_3_3_7_1","unstructured":"Jimmy\u00a0Lei Ba Jamie\u00a0Ryan Kiros and Geoffrey\u00a0E Hinton. 2016. Layer normalization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1607.06450 (2016)."},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00091"},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"crossref","unstructured":"Tim Brooks Aleksander Holynski and Alexei\u00a0A Efros. 2022. InstructPix2Pix: Learning to Follow Image Editing Instructions. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2211.09800 (2022).","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_3_10_1","unstructured":"Minwoo Byeon Beomhee Park Haecheon Kim Sungjun Lee Woonhyuk Baek and Saehoon Kim. 2022. COYO-700M: Image-Text Pair Dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset."},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"crossref","unstructured":"Hila Chefer Yuval Alaluf Yael Vinker Lior Wolf and Daniel Cohen-Or. 2023. Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models. SIGGRAPH (2023).","DOI":"10.1145\/3592116"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"crossref","unstructured":"Junsong Chen Jincheng Yu Chongjian Ge Lewei Yao Enze Xie Yue Wu Zhongdao Wang James Kwok Ping Luo Huchuan Lu and Zhenguo Li. 2023. PixArt-\u03b1 : Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis.","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"e_1_3_3_3_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2713168.2713194"},{"key":"e_1_3_3_3_15_1","unstructured":"Gabriel Goh James Betker Li Jing and Aditya Ramesh. 2023. Improving Image Generation with Better Captions. https:\/\/openai.com\/index\/dall-e-3\/."},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00673"},{"key":"e_1_3_3_3_17_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-Prompt Image Editing with Cross Attention Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01626 (2022)."},{"key":"e_1_3_3_3_18_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_3_19_1","volume-title":"ICLR","author":"Hu Edward\u00a0J","year":"2022","unstructured":"Edward\u00a0J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In ICLR. https:\/\/openreview.net\/forum?id=nZeVKeeFYf9"},{"key":"e_1_3_3_3_20_1","doi-asserted-by":"crossref","unstructured":"Mengqi Huang Zhendong Mao Mingcong Liu Qian He and Yongdong Zhang. 2024. RealCustom: Narrowing Real Text Word for Real-Time Open-Domain Text-to-Image Customization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.00483 (2024).","DOI":"10.1109\/CVPR52733.2024.00714"},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","unstructured":"Gabriel Ilharco Mitchell Wortsman Ross Wightman Cade Gordon Nicholas Carlini Rohan Taori Achal Dave Vaishaal Shankar Hongseok Namkoong John Miller Hannaneh Hajishirzi Ali Farhadi and Ludwig Schmidt. 2021. OpenCLIP. 10.5281\/zenodo.5143773","DOI":"10.5281\/zenodo.5143773"},{"key":"e_1_3_3_3_22_1","volume-title":"ICML","author":"Jaegle Andrew","year":"2021","unstructured":"Andrew Jaegle, Felix Gimeno, Andy Brock, Oriol Vinyals, Andrew Zisserman, and Joao Carreira. 2021. Perceiver: General perception with iterative attention. In ICML."},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_3_3_24_1","volume-title":"ICLR","author":"Kingma Diederik\u00a0P","year":"2015","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In ICLR."},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_3_3_26_1","unstructured":"Jaeho Lee. 2024. LoRA DreamBooth - ljh415\/dreambooth. https:\/\/huggingface.co\/ljh415\/dreambooth"},{"key":"e_1_3_3_3_27_1","unstructured":"Jialu Li Jaemin Cho Yi-Lin Sung Jaehong Yoon and Mohit Bansal. 2024a. SELMA: Learning and Merging Skill-Specific Text-to-Image Experts with Auto-Generated Data. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.06952 (2024)."},{"key":"e_1_3_3_3_28_1","volume-title":"ICML","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In ICML."},{"key":"e_1_3_3_3_29_1","unstructured":"Shikai Li Jianglin Fu Kaiyuan Liu Wentao Wang Kwan-Yee Lin and Wayne Wu. 2024b. CosmicMan: A Text-to-Image Foundation Model for Humans. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.01294 (2024)."},{"key":"e_1_3_3_3_30_1","unstructured":"Yanyu Li Xian Liu Anil Kag Ju Hu Yerlan Idelbayev Dhritiman Sagar Yanzhi Wang Sergey Tulyakov and Jian Ren. 2024c. TextCraftor: Your Text Encoder Can be Image Quality Controller. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.18978 (2024)."},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00071"},{"key":"e_1_3_3_3_32_1","volume-title":"International Conference on Learning Representations","author":"Liu Luping","year":"2022","unstructured":"Luping Liu, Yi Ren, Zhijie Lin, and Zhou Zhao. 2022. Pseudo Numerical Methods for Diffusion Models on Manifolds. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=PlKWVd2yBkY"},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00218"},{"key":"e_1_3_3_3_34_1","volume-title":"ICLR","author":"Meng Chenlin","year":"2022","unstructured":"Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2022. SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations. In ICLR."},{"key":"e_1_3_3_3_35_1","volume-title":"AAAI","author":"Mou Chong","year":"2023","unstructured":"Chong Mou, Xintao Wang, Liangbin Xie, Yanze Wu, Jian Zhang, Zhongang Qi, Ying Shan, and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. In AAAI."},{"key":"e_1_3_3_3_36_1","unstructured":"Kartik Narayan Vibashan VS Rama Chellappa and Vishal\u00a0M Patel. 2024. FaceXFormer: A Unified Transformer for Facial Analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12960 (2024)."},{"key":"e_1_3_3_3_37_1","unstructured":"Alex Nichol Prafulla Dhariwal Aditya Ramesh Pranav Shyam Pamela Mishkin Bob McGrew Ilya Sutskever and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2112.10741 (2022)."},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00294"},{"key":"e_1_3_3_3_39_1","doi-asserted-by":"crossref","unstructured":"Gaurav Parmar Krishna Kumar\u00a0Singh Richard Zhang Yijun Li Jingwan Lu and Jun-Yan Zhu. 2023. Zero-shot Image-to-Image Translation. SIGGRAPH (2023).","DOI":"10.1145\/3588432.3591513"},{"key":"e_1_3_3_3_40_1","unstructured":"Maitreya Patel Changhoon Kim Sheng Cheng Chitta Baral and Yezhou Yang. 2023. Eclipse: A resource-efficient text-to-image prior for image generations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.04655 (2023)."},{"key":"e_1_3_3_3_41_1","unstructured":"Dustin Podell Zion English Kyle Lacey Andreas Blattmann Tim Dockhorn Jonas M\u00fcller Joe Penna and Robin Rombach. 2023. Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.01952 (2023)."},{"key":"e_1_3_3_3_42_1","unstructured":"Jie Qin Jie Wu Weifeng Chen Yuxi Ren Huixia Li Hefeng Wu Xuefeng Xiao Rui Wang and Shilei Wen. 2024. DiffusionGPT: LLM-Driven Text-to-Image Generation System. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.10061 (2024)."},{"key":"e_1_3_3_3_43_1","unstructured":"Leigang Qu Wenjie Wang Yongqi Li Hanwang Zhang Liqiang Nie and Tat-Seng Chua. 2024. Discriminative Probing and Tuning for Text-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.04321 (2024)."},{"key":"e_1_3_3_3_44_1","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_3_3_45_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 (2022)."},{"key":"e_1_3_3_3_46_1","unstructured":"Maik Riechert. 2024. https:\/\/letmaik.github.io\/rawpy\/api\/"},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_3_49_1","volume-title":"NeurIPS","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et\u00a0al. 2022. LAION-5B: An open large-scale dataset for training next generation image-text models. In NeurIPS."},{"key":"e_1_3_3_3_50_1","volume-title":"NeurIPSW","author":"Schuhmann Christoph","year":"2021","unstructured":"Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs. In NeurIPSW."},{"key":"e_1_3_3_3_51_1","unstructured":"Kihyuk Sohn Nataniel Ruiz Kimin Lee Daniel\u00a0Castro Chin Irina Blok Huiwen Chang Jarred Barber Lu Jiang Glenn Entis Yuanzhen Li Yuan Hao Irfan Essa Michael Rubinstein and Dilip Krishnan. 2023. StyleDrop: Text-to-Image Generation in Any Style. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.00983 (2023)."},{"key":"e_1_3_3_3_52_1","unstructured":"Ianar\u00e9 S\u00e9vi. 2024. https:\/\/pypi.org\/project\/ExifRead\/"},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01366"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"crossref","unstructured":"Bart Thomee David\u00a0A Shamma Gerald Friedland Benjamin Elizalde Karl Ni Douglas Poland Damian Borth and Li-Jia Li. 2016. YFCC100M: The new data in multimedia research. Commun. ACM 59 2 (2016) 64\u201373.","DOI":"10.1145\/2812802"},{"key":"e_1_3_3_3_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_3_3_56_1","volume-title":"NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS. https:\/\/dl.acm.org\/doi\/10.5555\/3295222.3295349"},{"key":"e_1_3_3_3_57_1","volume-title":"The Eleventh International Conference on Learning Representations","author":"Watson Daniel","year":"2023","unstructured":"Daniel Watson, William Chan, Ricardo\u00a0Martin Brualla, Jonathan Ho, Andrea Tagliasacchi, and Mohammad Norouzi. 2023. Novel View Synthesis with Diffusion Models. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=HtoA0oT30jC"},{"key":"e_1_3_3_3_58_1","unstructured":"Chenfei Wu Shengming Yin Weizhen Qi Xiaodong Wang Zecheng Tang and Nan Duan. 2023. Visual ChatGPT: Talking Drawing and Editing with Visual Foundation Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.04671 (2023)."},{"key":"e_1_3_3_3_59_1","unstructured":"Yanwu Xu Yang Zhao Zhisheng Xiao and Tingbo Hou. 2023. UFOGen: You Forward Once Large Scale Text-to-Image Generation via Diffusion GANs. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.09257 (2023)."},{"key":"e_1_3_3_3_60_1","volume-title":"arXiv preprint arxiv:https:\/\/arXiv.org\/abs\/2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image Diffusion Models. In arXiv preprint arxiv:https:\/\/arXiv.org\/abs\/2308.06721."},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687635","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687635","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:27Z","timestamp":1750294707000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687635"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":60,"alternative-id":["10.1145\/3680528.3687635","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687635","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}