{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:47:48Z","timestamp":1777657668246,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["U23B2048, 62076186 and 62225113"],"award-info":[{"award-number":["U23B2048, 62076186 and 62225113"]}]},{"name":"the Innovative Research Group Project of Hubei Province","award":["2024AFA017"],"award-info":[{"award-number":["2024AFA017"]}]},{"name":"the National Key Research and Development Program of China","award":["2023YFC2705700"],"award-info":[{"award-number":["2023YFC2705700"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680692","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"6979-6988","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["When ControlNet Meets Inexplicit Masks: A Case Study of ControlNet on its Contour-following Ability"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1579-2218","authenticated-orcid":false,"given":"Wenjie","family":"Xuan","sequence":"first","affiliation":[{"name":"School of Computer Science, National Engineering Research Center for Multimedia Software, Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9931-5138","authenticated-orcid":false,"given":"Yufei","family":"Xu","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0682-8645","authenticated-orcid":false,"given":"Shanshan","family":"Zhao","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9002-1029","authenticated-orcid":false,"given":"Chaoyue","family":"Wang","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3907-8820","authenticated-orcid":false,"given":"Juhua","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, National Engineering Research Center for Multimedia Software, Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0059-8458","authenticated-orcid":false,"given":"Bo","family":"Du","sequence":"additional","affiliation":[{"name":"School of Computer Science, National Engineering Research Center for Multimedia Software, Wuhan University, Wuhan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-5449","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01762"},{"key":"e_1_3_2_1_2_1","volume-title":"LooseControl: Lifting ControlNet for Generalized Depth Conditioning. arXiv preprint arXiv:2312.03079","author":"Bhat Shariq Farooq","year":"2023","unstructured":"Shariq Farooq Bhat, Niloy J Mitra, and Peter Wonka. 2023. LooseControl: Lifting ControlNet for Generalized Depth Conditioning. arXiv preprint arXiv:2312.03079 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Z. Cao G. Hidalgo Martinez T. Simon S. Wei and Y. A. Sheikh. 2019. OpenPose: Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields. IEEE Transactions on Pattern Analysis and Machine Intelligence (2019)."},{"key":"e_1_3_2_1_4_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612489"},{"key":"e_1_3_2_1_6_1","unstructured":"Patrick Esser Sumith Kulal Andreas Blattmann Rahim Entezari Jonas M\u00fcller Harry Saini Yam Levi Dominik Lorenz Axel Sauer Frederic Boesel et al. 2024. Scaling rectified flow transformers for high-resolution image synthesis. arXiv preprint arXiv:2403.03206 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"e_1_3_2_1_8_1","volume-title":"Benchmarking spatial relationships in text-to-image generation. arXiv preprint arXiv:2212.10015","author":"Gokhale Tejas","year":"2022","unstructured":"Tejas Gokhale, Hamid Palangi, Besmira Nushi, Vibhav Vineet, Eric Horvitz, Ece Kamar, Chitta Baral, and Yezhou Yang. 2022. Benchmarking spatial relationships in text-to-image generation. arXiv preprint arXiv:2212.10015 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19953"},{"key":"e_1_3_2_1_10_1","volume-title":"SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models. arXiv preprint arXiv:2311.16933","author":"Guo Yuwei","year":"2023","unstructured":"Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai. 2023. SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models. arXiv preprint arXiv:2311.16933 (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"e_1_3_2_1_12_1","volume-title":"HyperNetworks. In International Conference on Learning Representations.","author":"Ha David","year":"2016","unstructured":"David Ha, Andrew M Dai, and Quoc V Le. 2016. HyperNetworks. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_13_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_14_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3021209"},{"key":"e_1_3_2_1_16_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications.","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications."},{"key":"e_1_3_2_1_17_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Hu Minghui","year":"2023","unstructured":"Minghui Hu, Jianbin Zheng, Daqing Liu, Chuanxia Zheng, Chaoyue Wang, Dacheng Tao, and Tat-Jen Cham. 2023. Cocktail: Mixing multi-modality control for text-conditional image generation. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning. 13753--13773","author":"Huang Lianghua","year":"2023","unstructured":"Lianghua Huang, Di Chen, Yu Liu, Yujun Shen, Deli Zhao, and Jingren Zhou. 2023. Composer: creative and controllable image synthesis with composable conditions. In Proceedings of the 40th International Conference on Machine Learning. 13753--13773."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Feng Liang Bichen Wu Jialiang Wang Licheng Yu Kunpeng Li Yinan Zhao Ishan Misra Jia-Bin Huang Peizhao Zhang Peter Vajda et al. 2023. FlowVid: Taming Imperfect Optical Flows for Consistent Video-to-Video Synthesis. arXiv preprint arXiv:2312.17681 (2023).","DOI":"10.1109\/CVPR52733.2024.00784"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_23_1","volume-title":"Dirk Weissenborn, and et.al.","author":"Minderer Matthias","year":"2022","unstructured":"Matthias Minderer, Alexey Gritsenko, Austin Stone Maxim Neumann, Dirk Weissenborn, and et.al. 2022. Simple Open-Vocabulary Object Detection with Vision Transformers. ECCV (2022)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"e_1_3_2_1_25_1","volume-title":"Wuerstchen: An Efficient Architecture for Large-Scale Text-to-Image Diffusion Models. arxiv: 2306.00637 [cs.CV]","author":"Pernias Pablo","year":"2023","unstructured":"Pablo Pernias, Dominic Rampas, Mats L. Richter, Christopher J. Pal, and Marc Aubreville. 2023. Wuerstchen: An Efficient Architecture for Large-Scale Text-to-Image Diffusion Models. arxiv: 2306.00637 [cs.CV]"},{"key":"e_1_3_2_1_26_1","volume-title":"Caiming Xiong, Silvio Savarese, et al.","author":"Qin Can","year":"2024","unstructured":"Can Qin, Shu Zhang, Ning Yu, Yihao Feng, Xinyi Yang, Yingbo Zhou, Huan Wang, Juan Carlos Niebles, Caiming Xiong, Silvio Savarese, et al. 2024. UniControl: A Unified Diffusion Model for Controllable Visual Generation In the Wild. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_1_28_1","volume-title":"International Conference on Machine Learning. PMLR, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821--8831."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3019967"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_31_1","unstructured":"Runway. 2022. Stable Diffusion v1-5. https:\/\/huggingface.co\/runwayml\/stable-diffusion-v1-5"},{"key":"e_1_3_2_1_32_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925954"},{"key":"e_1_3_2_1_34_1","volume-title":"Adversarial diffusion distillation. arXiv preprint arXiv:2311.17042","author":"Sauer Axel","year":"2023","unstructured":"Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach. 2023. Adversarial diffusion distillation. arXiv preprint arXiv:2311.17042 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"SmartMask: Context Aware High-Fidelity Mask Generation for Fine-grained Object Insertion and Layout Control. arXiv preprint arXiv:2312.05039","author":"Singh Jaskirat","year":"2023","unstructured":"Jaskirat Singh, Jianming Zhang, Qing Liu, Cameron Smith, Zhe Lin, and Liang Zheng. 2023. SmartMask: Context Aware High-Fidelity Mask Generation for Fine-grained Object Insertion and Layout Control. arXiv preprint arXiv:2312.05039 (2023)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591560"},{"key":"e_1_3_2_1_37_1","volume-title":"Disco: Disentangled control for referring human dance generation in real world. arXiv preprint arXiv:2307.00040","author":"Wang Tan","year":"2023","unstructured":"Tan Wang, Linjie Li, Kevin Lin, Yuanhao Zhai, Chung-Ching Lin, Zhengyuan Yang, Hanwang Zhang, Zicheng Liu, and Lijuan Wang. 2023. Disco: Disentangled control for referring human dance generation in real world. arXiv preprint arXiv:2307.00040 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Xie Jinheng","year":"2023","unstructured":"Jinheng Xie, Kai Ye, Yudong Li, Yuexiang Li, Kevin Qinghong Lin, Yefeng Zheng, Linlin Shen, and Mike Zheng Shou. 2023. Learning Visual Prior via Generative Pre-Training. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.164"},{"key":"e_1_3_2_1_40_1","volume-title":"Hanshu Yan, Jia-Wei Liu, Chenxu Zhang, Jiashi Feng, and Mike Zheng Shou.","author":"Xu Zhongcong","year":"2023","unstructured":"Zhongcong Xu, Jianfeng Zhang, Jun Hao Liew, Hanshu Yan, Jia-Wei Liu, Chenxu Zhang, Jiashi Feng, and Mike Zheng Shou. 2023. MagicAnimate: Temporally Consistent Human Image Animation using Diffusion Model. In arXiv."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612136"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_43_1","volume-title":"Controllable Text-to-Image Generation with GPT-4. arXiv preprint arXiv:2305.18583","author":"Zhang Tianjun","year":"2023","unstructured":"Tianjun Zhang, Yi Zhang, Vibhav Vineet, Neel Joshi, and Xin Wang. 2023. Controllable Text-to-Image Generation with GPT-4. arXiv preprint arXiv:2305.18583 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zhao Shihao","year":"2024","unstructured":"Shihao Zhao, Dongdong Chen, Yen-Chun Chen, Jianmin Bao, Shaozhe Hao, Lu Yuan, and Kwan-Yee K Wong. 2024. Uni-controlnet: All-in-one control to text-to-image diffusion models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models. NeurIPS","author":"Zhao Wenliang","year":"2023","unstructured":"Wenliang Zhao, Lujia Bai, Yongming Rao, Jie Zhou, and Jiwen Lu. 2023. UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of Diffusion Models. NeurIPS (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680692","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680692","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:57Z","timestamp":1750295877000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680692"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":45,"alternative-id":["10.1145\/3664647.3680692","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680692","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}