{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T20:04:17Z","timestamp":1772913857447,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3680528.3687652","type":"proceedings-article","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T08:14:37Z","timestamp":1733213677000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["TrailBlazer: Trajectory Control for Diffusion-Based Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9499-2623","authenticated-orcid":false,"given":"Wan-Duo Kurt","family":"Ma","sequence":"first","affiliation":[{"name":"Victoria University of Wellington, Wellington, New Zealand"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6835-7263","authenticated-orcid":false,"given":"J. P.","family":"Lewis","sequence":"additional","affiliation":[{"name":"NVIDIA Research, Santa Clara, United States of America and Victoria University of Wellington, Santa Clara, New Zealand"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1973-3920","authenticated-orcid":false,"given":"W. Bastiaan","family":"Kleijn","sequence":"additional","affiliation":[{"name":"Victoria University of Wellington, Wellington, New Zealand"}]}],"member":"320","published-online":{"date-parts":[[2024,12,3]]},"reference":[{"key":"e_1_3_3_3_2_1","volume-title":"Grammar of the Film Language","author":"Arijon Daniel","year":"1976","unstructured":"Daniel Arijon. 1976. Grammar of the Film Language. Focal Press."},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"publisher","unstructured":"Yogesh Balaji Seungjun Nah Xun Huang Arash Vahdat Jiaming Song Karsten Kreis Miika Aittala Timo Aila Samuli Laine Bryan Catanzaro Tero Karras and Ming-Yu Liu. 2022. eDiff-I: Text-to-Image Diffusion Models with an Ensemble of Expert Denoisers. CoRR abs\/2211.01324 (2022). 10.48550\/arXiv.2211.01324 arXiv:https:\/\/arXiv.org\/abs\/2211.01324","DOI":"10.48550\/arXiv.2211.01324"},{"key":"e_1_3_3_3_4_1","unstructured":"Omer Bar-Tal Lior Yariv Yaron Lipman and Tali Dekel. 2023. MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation. CoRR abs\/2302.08113 (2023). arXiv:https:\/\/arXiv.org\/abs\/2302.08113"},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/280811.281029"},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"e_1_3_3_3_7_1","unstructured":"cerspense. 2023. zeroscope-v2-576w. https:\/\/huggingface.co\/cerspense\/zeroscope-v2-576w Accessed: 2023-10-01."},{"key":"e_1_3_3_3_8_1","unstructured":"Weifeng Chen Jie Wu Pan Xie Hefeng Wu Jiashi Li Xin Xia Xuefeng Xiao and Liang Lin. 2023. Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2305.13840\u00a0[cs.CV]"},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"crossref","unstructured":"Patrick Esser Johnathan Chiu Parmida Atighehchian Jonathan Granskog and Anastasis Germanidis. 2023. Structure and Content-Guided Video Synthesis with Diffusion Models. ArXiv abs\/2302.03011 (2023). https:\/\/api.semanticscholar.org\/CorpusID:256615582","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_3_3_10_1","unstructured":"Songwei Ge Seungjun Nah Guilin Liu Tyler Poon Andrew Tao Bryan Catanzaro David Jacobs Jia-Bin Huang Ming-Yu Liu and Yogesh Balaji. 2023. Preserve Your Own Correlation: A Noise Prior for Video Diffusion Models. Proceedings of the IEEE\/CVF International Conference on Computer Vision 2023 (2023)."},{"key":"e_1_3_3_3_11_1","unstructured":"William Harvey Saeid Naderiparizi Vaden Masrani Christian Weilbach and Frank Wood. 2022. Flexible Diffusion Modeling of Long Videos. arxiv:https:\/\/arXiv.org\/abs\/2205.11495\u00a0[cs.CV]"},{"key":"e_1_3_3_3_12_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-Prompt Image Editing with Cross Attention Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01626 (2022)."},{"key":"e_1_3_3_3_13_1","volume-title":"Advances in Neural Information Processing Systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/8a1d694707eb0fefe65871369074926d-Paper.pdf https:\/\/dl.acm.org\/doi\/10.5555\/3295222.3295408"},{"key":"e_1_3_3_3_14_1","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao Alexey\u00a0A. Gritsenko Diederik\u00a0P. Kingma Ben Poole Mohammad Norouzi David\u00a0J. Fleet and Tim Salimans. 2022a. Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv abs\/2210.02303 (2022). https:\/\/api.semanticscholar.org\/CorpusID:252715883"},{"key":"e_1_3_3_3_15_1","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems 33 (2020)."},{"key":"e_1_3_3_3_16_1","unstructured":"Jonathan Ho Tim Salimans Alexey Gritsenko William Chan Mohammad Norouzi and David\u00a0J. Fleet. 2022b. Video Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2204.03458\u00a0[cs.CV]"},{"key":"e_1_3_3_3_17_1","unstructured":"Zhihao Hu and Dong Xu. 2023. VideoControlNet: A Motion-Guided Video-to-Video Translation Framework by Using Diffusion Model with ControlNet. arxiv:https:\/\/arXiv.org\/abs\/2307.14073\u00a0[cs.CV]"},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","unstructured":"Tobias H\u00f6ppe Arash Mehrjou Stefan Bauer Didrik Nielsen and Andrea Dittadi. 2022. Diffusion Models for Video Prediction and Infilling. 10.48550\/ARXIV.2206.07696","DOI":"10.48550\/ARXIV.2206.07696"},{"key":"e_1_3_3_3_19_1","doi-asserted-by":"crossref","unstructured":"Yash Jain Anshul Nasery Vibhav Vineet and Harkirat Behl. 2023. PEEKABOO: Interactive Video Generation via Masked-Diffusion. arxiv:https:\/\/arXiv.org\/abs\/2312.07509\u00a0[cs.CV]","DOI":"10.1109\/CVPR52733.2024.00772"},{"key":"e_1_3_3_3_20_1","doi-asserted-by":"crossref","unstructured":"Levon Khachatryan Andranik Movsisyan Vahram Tadevosyan Roberto Henschel Zhangyang Wang Shant Navasardyan and Humphrey Shi. 2023. Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.13439 (2023).","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"e_1_3_3_3_21_1","unstructured":"Pengxiang Li Zhili Liu Kai Chen Lanqing Hong Yunzhi Zhuge Dit-Yan Yeung Huchuan Lu and Xu Jia. 2023a. Trackdiffusion: Multi-object tracking data generation via diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.00651 (2023)."},{"key":"e_1_3_3_3_22_1","unstructured":"Yuheng Li Haotian Liu Qingyang Wu Fangzhou Mu Jianwei Yang Jianfeng Gao Chunyuan Li and Yong\u00a0Jae Lee. 2023b. GLIGEN: Open-Set Grounded Text-to-Image Generation. CoRR abs\/2301.07093 (2023). arXiv:https:\/\/arXiv.org\/abs\/2301.07093"},{"key":"e_1_3_3_3_23_1","unstructured":"Long Lian Baifeng Shi Adam Yala Trevor Darrell and Boyi Li. 2023. LLM-grounded Video Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2309.17444\u00a0[cs.CV]"},{"key":"e_1_3_3_3_24_1","unstructured":"Jun\u00a0Hao Liew Hanshu Yan Daquan Zhou and Jiashi Feng. 2022. MagicMix: Semantic Mixing with Diffusion Models. CoRR abs\/2210.16056 (2022)."},{"key":"e_1_3_3_3_25_1","unstructured":"Zhengxiong Luo Dayou Chen Yingya Zhang Yan Huang Liang Wang Yujun Shen Deli Zhao Jingren Zhou and Tieniu Tan. 2023. VideoFusion: Decomposed Diffusion Models for High-Quality Video Generation. arxiv:https:\/\/arXiv.org\/abs\/2303.08320\u00a0[cs.CV]"},{"key":"e_1_3_3_3_26_1","unstructured":"Wan-Duo\u00a0Kurt Ma J.\u00a0P. Lewis Avisek Lahiri Thomas Leung and W.\u00a0Bastiaan Kleijn. 2023. Directed Diffusion: Direct Control of Object Placement through Attention Guidance. arxiv:https:\/\/arXiv.org\/abs\/2302.13153\u00a0[cs.CV]"},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"e_1_3_3_3_28_1","unstructured":"Chong Mou Xintao Wang Liangbin Xie Yanze Wu Jian Zhang Zhongang Qi Ying Shan and Xiaohu Qie. 2023. T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.08453 (2023)."},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"e_1_3_3_3_30_1","unstructured":"Alex Nichol and Prafulla Dhariwal. 2021. Improved Denoising Diffusion Probabilistic Models. arxiv:https:\/\/arXiv.org\/abs\/2102.09672\u00a0[cs.LG]"},{"key":"e_1_3_3_3_31_1","volume-title":"ICML","author":"Nichol Alexander\u00a0Quinn","year":"2022","unstructured":"Alexander\u00a0Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In ICML."},{"key":"e_1_3_3_3_32_1","unstructured":"OpenAI. 2024. Sora: Creating video from text. https:\/\/openai.com\/sora"},{"key":"e_1_3_3_3_33_1","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga Alban Desmaison Andreas K\u00f6pf Edward Yang Zach DeVito Martin Raison Alykhan Tejani Sasank Chilamkurthy Benoit Steiner Lu Fang Junjie Bai and Soumith Chintala. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. arxiv:https:\/\/arXiv.org\/abs\/1912.01703\u00a0[cs.LG]"},{"key":"e_1_3_3_3_34_1","unstructured":"Chenyang Qi Xiaodong Cun Yong Zhang Chenyang Lei Xintao Wang Ying Shan and Qifeng Chen. 2023. FateZero: Fusing Attentions for Zero-shot Text-based Video Editing. arXiv:https:\/\/arXiv.org\/abs\/2303.09535 (2023)."},{"key":"e_1_3_3_3_35_1","volume-title":"Proc.\u00a0ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proc.\u00a0ICML."},{"key":"e_1_3_3_3_36_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. CoRR abs\/2204.06125 (2022). arXiv:https:\/\/arXiv.org\/abs\/2204.06125"},{"key":"e_1_3_3_3_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_38_1","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily Denton Seyed Kamyar\u00a0Seyed Ghasemipour Burcu\u00a0Karagol Ayan S.\u00a0Sara Mahdavi Rapha\u00a0Gontijo Lopes Tim Salimans Jonathan Ho David\u00a0J. Fleet and Mohammad Norouzi. 2022a. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. CoRR abs\/2205.11487 (2022)."},{"key":"e_1_3_3_3_39_1","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L. Denton Seyed Kamyar\u00a0Seyed Ghasemipour Burcu\u00a0Karagol Ayan Seyedeh\u00a0Sara Mahdavi Raphael\u00a0Gontijo Lopes Tim Salimans Jonathan Ho David\u00a0J. Fleet and Mohammad Norouzi. 2022b. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. ArXiv abs\/2205.11487 (2022). https:\/\/api.semanticscholar.org\/CorpusID:248986576"},{"key":"e_1_3_3_3_40_1","unstructured":"Mike Seymour. 2023. She-Hulk: W\u0113t\u0101\u2019s Shade of Green. https:\/\/www.fxguide.com\/fxfeatured\/she-hulk-wetas-shade-of-green\/ Accessed: 2022-10-27."},{"key":"e_1_3_3_3_41_1","unstructured":"Mike Seymour. 2024. Actually Using Sora. https:\/\/www.fxguide.com\/fxfeatured\/actually-using-sora\/"},{"key":"e_1_3_3_3_42_1","unstructured":"Yujun Shi Chuhui Xue Jiachun Pan Wenqing Zhang Vincent\u00a0YF Tan and Song Bai. 2023. DragDiffusion: Harnessing Diffusion Models for Interactive Point-based Image Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.14435 (2023)."},{"key":"e_1_3_3_3_43_1","volume-title":"International Conference on Machine Learning","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Machine Learning."},{"key":"e_1_3_3_3_44_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021."},{"key":"e_1_3_3_3_45_1","volume-title":"NeurIPS","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. In NeurIPS , Vol.\u00a032."},{"key":"e_1_3_3_3_46_1","unstructured":"Wei Sun and Tianfu Wu. 2022. Learning Layout and Style Reconfigurable GANs for Controllable Image Synthesis. TPAMI 44 (2022) 5070\u20135087."},{"key":"e_1_3_3_3_47_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf https:\/\/dl.acm.org\/doi\/10.5555\/3295222.3295349"},{"key":"e_1_3_3_3_48_1","volume-title":"(NeurIPS) Advances in Neural Information Processing Systems","author":"Voleti Vikram","year":"2022","unstructured":"Vikram Voleti, Alexia Jolicoeur-Martineau, and Christopher Pal. 2022. MCVD: Masked Conditional Video Diffusion for Prediction, Generation, and Interpolation. In (NeurIPS) Advances in Neural Information Processing Systems. https:\/\/arxiv.org\/abs\/2205.09853"},{"key":"e_1_3_3_3_49_1","unstructured":"Jiuniu Wang Hangjie Yuan Dayou Chen Yingya Zhang Xiang Wang and Shiwei Zhang. 2023a. ModelScope Text-to-Video Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2308.06571\u00a0[cs.CV]"},{"key":"e_1_3_3_3_50_1","unstructured":"Jiawei Wang Yuchen Zhang Jiaxin Zou Yan Zeng Guoqiang Wei Liping Yuan and Hang Li. 2024b. Boximator: Generating Rich and Controllable Motions for Video Synthesis. arxiv:https:\/\/arXiv.org\/abs\/2402.01566\u00a0[cs.CV]"},{"key":"e_1_3_3_3_51_1","first-page":"7594","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Wang Xiang","year":"2023","unstructured":"Xiang Wang, Hangjie Yuan, Shiwei Zhang, Dayou Chen, Jiuniu Wang, Yingya Zhang, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023b. VideoComposer: Compositional Video Synthesis with Motion Controllability. In Advances in Neural Information Processing Systems , A.\u00a0Oh, T.\u00a0Naumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 7594\u20137611. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/180f6184a3458fa19c28c5483bc61877-Paper-Conference.pdf"},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"e_1_3_3_3_53_1","unstructured":"Lilian Weng. 2021. What are diffusion models?https:\/\/lilianweng.github.io\/posts\/2021-07-11-diffusion-models\/"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"e_1_3_3_3_55_1","unstructured":"Jinheng Xie Yuexiang Li Yawen Huang Haozhe Liu Wentian Zhang Yefeng Zheng and Mike\u00a0Zheng Shou. 2023. BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained Diffusion. CoRR abs\/2307.10816 (2023). arXiv:https:\/\/arXiv.org\/abs\/2307.10816"},{"key":"e_1_3_3_3_56_1","unstructured":"Hanshu Yan Jun\u00a0Hao Liew Long Mai Shanchuan Lin and Jiashi Feng. 2023. MagicProp: Diffusion-based Video Editing via Motion-aware Appearance Propagation. arxiv:https:\/\/arXiv.org\/abs\/2309.00908\u00a0[cs.CV]"},{"key":"e_1_3_3_3_57_1","unstructured":"Ruihan Yang Prakhar Srivastava and Stephan Mandt. 2022b. Diffusion Probabilistic Modeling for Video Generation. arxiv:https:\/\/arXiv.org\/abs\/2203.09481\u00a0[cs.CV]"},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"crossref","unstructured":"Shiyuan Yang Liang Hou Haibin Huang Chongyang Ma Pengfei Wan Di Zhang Xiaodong Chen and Jing Liao. 2024. Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion. arxiv:https:\/\/arXiv.org\/abs\/2402.03162\u00a0[cs.CV]","DOI":"10.1145\/3641519.3657481"},{"key":"e_1_3_3_3_59_1","doi-asserted-by":"crossref","unstructured":"Zuopeng Yang Daqing Liu Chaoyue Wang J. Yang and Dacheng Tao. 2022a. Modeling Image Composition for Complex Scene Generation. CVPR (2022) 7754\u20137763.","DOI":"10.1109\/CVPR52688.2022.00761"},{"key":"e_1_3_3_3_60_1","unstructured":"Shengming Yin Chenfei Wu Jian Liang Jie Shi Houqiang Li Gong Ming and Nan Duan. 2023. DragNUWA: Fine-grained Control in Video Generation by Integrating Text Image and Trajectory. arXiv. https:\/\/www.microsoft.com\/en-us\/research\/publication\/dragnuwa-fine-grained-control-in-video-generation-by-integrating-text-image-and-trajectory\/"},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"crossref","unstructured":"Lvmin Zhang and Maneesh Agrawala. 2023. Adding Conditional Control to Text-to-Image Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2302.05543\u00a0[cs.CV]","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_3_62_1","doi-asserted-by":"publisher","unstructured":"Bo Zhao Weidong Yin Lili Meng and Leonid Sigal. 2020. Layout2image: Image Generation from Layout. Int. J. Comput. Vis. 128 10 (2020) 2418\u20132435. 10.1007\/s11263-020-01300-7 https:\/\/dl.acm.org\/doi\/10.1007\/s11263-020-01300-7","DOI":"10.1007\/s11263-020-01300-7"}],"event":{"name":"SA '24: SIGGRAPH Asia 2024 Conference Papers","location":"Tokyo Japan","acronym":"SA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["SIGGRAPH Asia 2024 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687652","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3680528.3687652","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:20Z","timestamp":1750295900000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3680528.3687652"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":61,"alternative-id":["10.1145\/3680528.3687652","10.1145\/3680528"],"URL":"https:\/\/doi.org\/10.1145\/3680528.3687652","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}