{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:43:32Z","timestamp":1777653812112,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755169","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"9862-9870","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["CoCoNO: Attention Contrast-and-Complete for Initial Noise Optimization in Text-to-Image Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7007-006X","authenticated-orcid":false,"given":"Aravindan Kamatchi","family":"Sundaram","sequence":"first","affiliation":[{"name":"Data Science and Artificial Intelligence, Indian Institute of Technology, Madras, Chennai, Tamil Nadu, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3658-4171","authenticated-orcid":false,"given":"Ujjayan","family":"Pal","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Indian Institute of Technology, Madras, Chennai, Tamil Nadu, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2069-1830","authenticated-orcid":false,"given":"Abhimanyu","family":"Chauhan","sequence":"additional","affiliation":[{"name":"Mechanical Engineering, Indian Institute of Technology, Bombay, Mumbai, Maharashtra, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5368-2442","authenticated-orcid":false,"given":"Aishwarya","family":"Agarwal","sequence":"additional","affiliation":[{"name":"Adobe Research, 
Bengaluru, Karnataka, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7627-7765","authenticated-orcid":false,"given":"Srikrishna","family":"Karanam","sequence":"additional","affiliation":[{"name":"Adobe Research, Bengaluru, Karnataka, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00217"},{"key":"e_1_3_2_1_2_1","volume-title":"Language models are few-shot learners. arXiv preprint ArXiv:2005.14165","author":"Brown Tom B","year":"2020","unstructured":"Tom B Brown. Language models are few-shot learners. arXiv preprint ArXiv:2005.14165, 2020."},{"key":"e_1_3_2_1_3_1","volume-title":"Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Transactions on Graphics (TOG), 42(4):1--10","author":"Chefer Hila","year":"2023","unstructured":"Hila Chefer, Yuval Alaluf, Yael Vinker, Lior Wolf, and Daniel Cohen-Or. Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Transactions on Graphics (TOG), 42(4):1--10, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"Reno: Enhancing one-step text-to-image models through reward-based noise optimization. arXiv preprint arXiv:2406.04312","author":"Eyring Luca","year":"2024","unstructured":"Luca Eyring, Shyamgopal Karthik, Karsten Roth, Alexey Dosovitskiy, and Zeynep Akata. Reno: Enhancing one-step text-to-image models through reward-based noise optimization. arXiv preprint arXiv:2406.04312, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"Reno: Enhancing one-step text-to-image models through reward-based noise optimization","author":"Eyring Luca","year":"2024","unstructured":"Luca Eyring, Shyamgopal Karthik, Karsten Roth, Alexey Dosovitskiy, and Zeynep Akata. 
Reno: Enhancing one-step text-to-image models through reward-based noise optimization, 2024."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589335.3651927"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00896"},{"key":"e_1_3_2_1_8_1","first-page":"36","article-title":"Optimizing prompts for text-to-image generation","author":"Hao Yaru","year":"2024","unstructured":"Yaru Hao, Zewen Chi, Li Dong, and Furu Wei. Optimizing prompts for text-to-image generation. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_9_1","volume-title":"Ben Hoover, Pinar Yanardag, and Duen Horng Chau. Conceptattention: Diffusion transformers learn highly interpretable features","author":"Helbling Alec","year":"2025","unstructured":"Alec Helbling, Tuna Han Salih Meral, Ben Hoover, Pinar Yanardag, and Duen Horng Chau. Conceptattention: Diffusion transformers learn highly interpretable features, 2025."},{"key":"e_1_3_2_1_10_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626, 2022."},{"key":"e_1_3_2_1_11_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598, 2022."},{"key":"e_1_3_2_1_12_1","volume-title":"et al. Introvae: Introspective variational autoencoders for photographic image synthesis. Advances in neural information processing systems, 31","author":"Huang Huaibo","year":"2018","unstructured":"Huaibo Huang, Ran He, Zhenan Sun, Tieniu Tan, et al. 
Introvae: Introspective variational autoencoders for photographic image synthesis. Advances in neural information processing systems, 31, 2018."},{"key":"e_1_3_2_1_13_1","first-page":"78723","article-title":"A comprehensive benchmark for open-world compositional text-to-image generation","volume":"36","author":"Huang Kaiyi","year":"2023","unstructured":"Kaiyi Huang, Kaiyue Sun, Enze Xie, Zhenguo Li, and Xihui Liu. T2i-compbench: A comprehensive benchmark for open-world compositional text-to-image generation. Advances in Neural Information Processing Systems, 36:78723--78747, 2023.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_16_1","volume-title":"Pladis: Pushing the limits of attention in diffusion models at inference time by leveraging sparsity","author":"Kim Kwanyoung","year":"2025","unstructured":"Kwanyoung Kim and Byeongsu Sim. Pladis: Pushing the limits of attention in diffusion models at inference time by leveraging sparsity, 2025."},{"key":"e_1_3_2_1_17_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114, 2013."},{"key":"e_1_3_2_1_18_1","volume-title":"The hungarian method for the assignment problem. Naval research logistics quarterly, 2(1--2):83--97","author":"Kuhn Harold W","year":"1955","unstructured":"Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly, 2(1--2):83--97, 1955."},{"key":"e_1_3_2_1_19_1","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 
Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning, pages 12888--12900. PMLR, 2022."},{"key":"e_1_3_2_1_20_1","volume-title":"34th British Machine Vision Conference 2023, BMVC 2023","author":"Li Yumeng","year":"2023","unstructured":"Yumeng Li, Margret Keuper, Dan Zhang, and Anna Khoreva. Divide & bind your attention for improved generative semantic nursing. In 34th British Machine Vision Conference 2023, BMVC 2023, 2023."},{"key":"e_1_3_2_1_21_1","volume-title":"Evaluating text-to-visual generation with image-to-text generation. arXiv preprint arXiv:2404.01291","author":"Lin Zhiqiu","year":"2024","unstructured":"Zhiqiu Lin, Deepak Pathak, Baiqi Li, Jiayao Li, Xide Xia, Graham Neubig, Pengchuan Zhang, and Deva Ramanan. Evaluating text-to-visual generation with image-to-text generation. arXiv preprint arXiv:2404.01291, 2024."},{"key":"e_1_3_2_1_22_1","volume-title":"Improving text-to-image consistency via automatic prompt optimization. arXiv preprint arXiv:2403.17804","author":"Ma\u00f1as Oscar","year":"2024","unstructured":"Oscar Ma\u00f1as, Pietro Astolfi, Melissa Hall, Candace Ross, Jack Urbanek, Adina Williams, Aishwarya Agrawal, Adriana Romero-Soriano, and Michal Drozdzal. Improving text-to-image consistency via automatic prompt optimization. arXiv preprint arXiv:2403.17804, 2024."},{"key":"e_1_3_2_1_23_1","volume-title":"Mohammad Hossein Rohban, and Mahdieh Soleymani Baghshah. Attention overlap is responsible for the entity missing problem in text-to-image diffusion models!","author":"Marioriyad Arash","year":"2025","unstructured":"Arash Marioriyad, Mohammadali Banayeeanzade, Reza Abbasi, Mohammad Hossein Rohban, and Mahdieh Soleymani Baghshah. 
Attention overlap is responsible for the entity missing problem in text-to-image diffusion models!, 2025."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00860"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02514"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00244"},{"key":"e_1_3_2_1_27_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. Learning transferable visual models from natural language supervision. In International conference on machine learning, pages 8748--8763. PMLR, 2021."},{"key":"e_1_3_2_1_28_1","volume-title":"Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, 21(140):1--67","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, 21(140):1--67, 2020."},{"key":"e_1_3_2_1_29_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, 1(2):3","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, 1(2):3, 2022."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_31_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al. Photorealistic text-to-image diffusion models with deep language understanding. 
Advances in neural information processing systems, 35: 36479--36494","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, 35: 36479--36494, 2022."},{"key":"e_1_3_2_1_32_1","volume-title":"Notes on kullback-leibler divergence and likelihood. arXiv preprint arXiv:1404.2000","author":"Shlens Jonathon","year":"2014","unstructured":"Jonathon Shlens. Notes on kullback-leibler divergence and likelihood. arXiv preprint arXiv:1404.2000, 2014."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01602"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_2_1_35_1","volume-title":"Neural discrete representation learning. Advances in neural information processing systems, 30","author":"Den Oord Aaron Van","year":"2017","unstructured":"Aaron Van Den Oord, Oriol Vinyals, et al. Neural discrete representation learning. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_36_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_37_1","volume-title":"Investigating prompt engineering in diffusion models. arXiv preprint arXiv:2211.15462","author":"Witteveen Sam","year":"2022","unstructured":"Sam Witteveen and Martin Andrews. Investigating prompt engineering in diffusion models. 
arXiv preprint arXiv:2211.15462, 2022."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_1_39_1","volume-title":"Improving text-to-image synthesis using contrastive learning. arXiv preprint arXiv:2107.02423","author":"Ye Hui","year":"2021","unstructured":"Hui Ye, Xiulong Yang, Martin Takac, Rajshekhar Sunderraman, and Shihao Ji. Improving text-to-image synthesis using contrastive learning. arXiv preprint arXiv:2107.02423, 2021."},{"key":"e_1_3_2_1_40_1","first-page":"833","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Zhang Han","year":"2021","unstructured":"Han Zhang, Jing Yu Koh, Jason Baldridge, Honglak Lee, and Yinfei Yang. Crossmodal contrastive learning for text-to-image generation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pages 833--842, 2021."},{"key":"e_1_3_2_1_41_1","first-page":"70","volume-title":"European Conference on Computer Vision","author":"Zhang Yang","year":"2024","unstructured":"Yang Zhang, Teoh Tze Tzun, Lim Wei Hern, and Kenji Kawaguchi. Enhancing semantic fidelity in text-to-image synthesis: Attention regulation in diffusion models. In European Conference on Computer Vision, pages 70--86. Springer, 2024."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"},{"key":"e_1_3_2_1_43_1","volume-title":"Toward multimodal image-to-image translation. Advances in neural information processing systems, 30","author":"Zhu Jun-Yan","year":"2017","unstructured":"Jun-Yan Zhu, Richard Zhang, Deepak Pathak, Trevor Darrell, Alexei A Efros, Oliver Wang, and Eli Shechtman. Toward multimodal image-to-image translation. 
Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00595"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755169","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:59Z","timestamp":1765343099000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755169"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":44,"alternative-id":["10.1145\/3746027.3755169","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755169","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}