{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T15:49:18Z","timestamp":1774021758523,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62201484, 62422606, 624B2124"],"award-info":[{"award-number":["62201484, 62422606, 624B2124"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,10]]},"DOI":"10.1145\/3721238.3730684","type":"proceedings-article","created":{"date-parts":[[2025,7,23]],"date-time":"2025-07-23T08:40:47Z","timestamp":1753260047000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DreamMask: Boosting Open-vocabulary Panoptic Segmentation with Synthetic Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-2978-666X","authenticated-orcid":false,"given":"Yuanpeng","family":"Tu","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5008-4720","authenticated-orcid":false,"given":"Xi","family":"Chen","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9525-0288","authenticated-orcid":false,"given":"Ser-Nam","family":"Lim","sequence":"additional","affiliation":[{"name":"University of Central Florida, Orlando, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8277-2706","authenticated-orcid":false,"given":"Hengshuang","family":"Zhao","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,27]]},"reference":[{"key":"e_1_3_3_3_2_1","volume-title":"CVPR","year":"2018","unstructured":"2018. Learning from synthetic data: Addressing domain shift for semantic segmentation. In CVPR."},{"key":"e_1_3_3_3_3_1","volume-title":"CVPR","year":"2019","unstructured":"2019. Unsupervised domain adaptation for semantic segmentation of urban scenes. In CVPR."},{"key":"e_1_3_3_3_4_1","volume-title":"CVPR","year":"2023","unstructured":"2023. Unsupervised Contrastive Learning Framework for Domain Adaptive Semantic Segmentation. In CVPR."},{"key":"e_1_3_3_3_5_1","unstructured":"Antreas Antoniou Amos Storkey and Harrison Edwards. 2017. Data augmentation generative adversarial networks. arXiv:https:\/\/arXiv.org\/abs\/1711.04340 (2017)."},{"key":"e_1_3_3_3_6_1","unstructured":"Shekoofeh Azizi Simon Kornblith Chitwan Saharia Mohammad Norouzi and David\u00a0J Fleet. 2023. Synthetic data from diffusion models improves imagenet classification. TMLR (2023)."},{"key":"e_1_3_3_3_7_1","volume-title":"NeurIPS","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et\u00a0al. 2020. Language models are few-shot learners. In NeurIPS."},{"key":"e_1_3_3_3_8_1","volume-title":"NeurIPS","author":"Bucher Maxime","year":"2019","unstructured":"Maxime Bucher, Tuan-Hung Vu, Matthieu Cord, and Patrick P\u00e9rez. 2019. Zero-shot semantic segmentation. In NeurIPS."},{"key":"e_1_3_3_3_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00132"},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00111"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01249"},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"crossref","unstructured":"Bo Cheng Yuhang Ma Liebucha Wu Shanyuan Liu Ao Ma Xiaoyu Wu Dawei Leng and Yuhui Yin. 2024. HiCo: Hierarchical Controllable Diffusion Model for Layout-to-image Generation. arXiv:https:\/\/arXiv.org\/abs\/2410.14324 (2024).","DOI":"10.52202\/079017-4094"},{"key":"e_1_3_3_3_13_1","unstructured":"Seokju Cho Heeseong Shin Sunghwan Hong Seungjun An Seungjun Lee Anurag Arnab Paul\u00a0Hongsuck Seo and Seungryong Kim. 2023. CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation. arXiv:https:\/\/arXiv.org\/abs\/2303.11797 (2023)."},{"key":"e_1_3_3_3_14_1","unstructured":"Aakanksha Chowdhery Sharan Narang Jacob Devlin Maarten Bosma Gaurav Mishra Adam Roberts Paul Barham Hyung\u00a0Won Chung Charles Sutton Sebastian Gehrmann et\u00a0al. 2023. Palm: Scaling language modeling with pathways. JMLR (2023)."},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"crossref","unstructured":"Son\u00a0D Dao Hengcan Shi Dinh Phung and Jianfei Cai. 2023. Class Enhancement Losses with Pseudo Labels for Open-Vocabulary Semantic Segmentation. IEEE Transactions on Multimedia (2023).","DOI":"10.1109\/TMM.2023.3330102"},{"key":"e_1_3_3_3_16_1","volume-title":"NAACL","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL."},{"key":"e_1_3_3_3_17_1","volume-title":"NeurIPS","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. In NeurIPS."},{"key":"e_1_3_3_3_18_1","volume-title":"ICML","author":"Ding Zheng","year":"2023","unstructured":"Zheng Ding, Jieke Wang, and Zhuowen Tu. 2023. Open-Vocabulary Universal Image Segmentation with MaskCLIP. In ICML."},{"key":"e_1_3_3_3_19_1","volume-title":"ICLR","author":"Dockhorn Tim","year":"2022","unstructured":"Tim Dockhorn, Arash Vahdat, and Karsten Kreis. 2022. Score-based generative modeling with critically-damped langevin diffusion. In ICLR."},{"key":"e_1_3_3_3_20_1","unstructured":"Danny etc Driess. 2023. Palm-e: An embodied multimodal language model. arXiv (2023)."},{"key":"e_1_3_3_3_21_1","unstructured":"Hugo etc Touvron. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv (2023)."},{"key":"e_1_3_3_3_22_1","doi-asserted-by":"crossref","unstructured":"Mark Everingham Luc Van\u00a0Gool Christopher\u00a0KI Williams John Winn and Andrew Zisserman. 2010. The pascal visual object classes (voc) challenge. IJCV (2010).","DOI":"10.1007\/s11263-009-0275-4"},{"key":"e_1_3_3_3_23_1","volume-title":"NeurIPS","author":"Feng Weixi","year":"2023","unstructured":"Weixi Feng, Wanrong Zhu, Tsu-jui Fu, Varun Jampani, Arjun Akula, Xuehai He, Sugato Basu, Xin\u00a0Eric Wang, and William\u00a0Yang Wang. 2023. LayoutGPT: Compositional Visual Planning and Generation with Large Language Models. In NeurIPS."},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00106"},{"key":"e_1_3_3_3_26_1","volume-title":"ICLR","author":"He Ruifei","year":"2023","unstructured":"Ruifei He, Shuyang Sun, Xin Yu, Chuhui Xue, Wenqing Zhang, Philip Torr, Song Bai, and Xiaojuan Qi. 2023b. Is synthetic data from generative models ready for image recognition?. In ICLR."},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01081"},{"key":"e_1_3_3_3_28_1","volume-title":"NeurIPS","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In NeurIPS."},{"key":"e_1_3_3_3_29_1","unstructured":"Linhao Huang and Jing Yu. 2025. ToLo: A Two-Stage Training-Free Layout-To-Image Generation Framework For High-Overlap Layouts. arXiv:https:\/\/arXiv.org\/abs\/2503.01667 (2025)."},{"key":"e_1_3_3_3_30_1","unstructured":"Shaohan Huang Li Dong Wenhui Wang Yaru Hao Saksham Singhal Shuming Ma Tengchao Lv Lei Cui Owais\u00a0Khan Mohammed Barun Patra et\u00a0al. 2024. Language is not all you need: Aligning perception with language models. NeurIPS (2024)."},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1549"},{"key":"e_1_3_3_3_32_1","volume-title":"ECCV","author":"Jiao Siyu","year":"2024","unstructured":"Siyu Jiao, Hongguang Zhu, Jiannan Huang, Yao Zhao, Yunchao Wei, and Shi Humphrey. 2024. Collaborative Vision-Text Representation Optimizing for Open-Vocabulary Segmentation. In ECCV."},{"key":"e_1_3_3_3_33_1","volume-title":"NAACL","author":"Kenton Jacob Devlin Ming-Wei\u00a0Chang","year":"2019","unstructured":"Jacob Devlin Ming-Wei\u00a0Chang Kenton and Lee\u00a0Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In NAACL."},{"key":"e_1_3_3_3_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00656"},{"key":"e_1_3_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_3_36_1","volume-title":"ECCV","author":"Lan Mengcheng","year":"2024","unstructured":"Mengcheng Lan, Chaofeng Chen, Yiping Ke, Xinjiang Wang, Litong Feng, and Wayne Zhang. 2024. ProxyCLIP: Proxy Attention Improves CLIP for Open-Vocabulary Segmentation. In ECCV."},{"key":"e_1_3_3_3_37_1","volume-title":"ICLR","author":"Li Boyi","year":"2022","unstructured":"Boyi Li, Kilian\u00a0Q Weinberger, Serge Belongie, Vladlen Koltun, and Ren\u00e9 Ranftl. 2022. Language-driven semantic segmentation. In ICLR."},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_3_3_39_1","unstructured":"Yuheng Li Haotian Liu Qingyang Wu Fangzhou Mu Jianwei Yang Jianfeng Gao Chunyuan Li and Yong\u00a0Jae Lee. 2023. GLIGEN: Open-Set Grounded Text-to-Image Generation. CVPR (2023)."},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"e_1_3_3_3_41_1","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C\u00a0Lawrence Zitnick. 2014. Microsoft coco: Common objects in context."},{"key":"e_1_3_3_3_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00633"},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_3_44_1","volume-title":"CVPR","author":"Qin Jie","year":"2023","unstructured":"Jie Qin, Jie Wu, Pengxiang Yan, Ming Li, Ren Yuxi, Xuefeng Xiao, Yitong Wang, Rui Wang, Shilei Wen, Xin Pan, et\u00a0al. 2023. FreeSeg: Unified, Universal and Open-Vocabulary Image Segmentation. In CVPR."},{"key":"e_1_3_3_3_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612012"},{"key":"e_1_3_3_3_46_1","volume-title":"ICML","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In ICML."},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00774"},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.606"},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"crossref","unstructured":"Haruka Takahashi and Shigeru Kuriyama. 2024. Text2Layout: Layout Generation From Text Representation Using Transformer. IEEE Access (2024).","DOI":"10.1109\/ACCESS.2024.3452957"},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"crossref","unstructured":"Bart Thomee David\u00a0A Shamma Gerald Friedland Benjamin Elizalde Karl Ni Douglas Poland Damian Borth and Li-Jia Li. 2016. YFCC100M: The new data in multimedia research. Commun. ACM (2016).","DOI":"10.1145\/2812802"},{"key":"e_1_3_3_3_53_1","unstructured":"Brandon Trabucco Kyle Doherty Max Gurinas and Ruslan Salakhutdinov. 2023. Effective data augmentation with diffusion models. arXiv:https:\/\/arXiv.org\/abs\/2302.07944 (2023)."},{"key":"e_1_3_3_3_54_1","unstructured":"Vibashan VS Shubhankar Borse Hyojin Park Debasmit Das Vishal Patel Munawar Hayat and Fatih Porikli. 2024. PosSAM: Panoptic Open-vocabulary Segment Anything. arXiv:https:\/\/arXiv.org\/abs\/2403.09620 (2024)."},{"key":"e_1_3_3_3_55_1","volume-title":"NeurIPS","author":"Wang Xudong","year":"2023","unstructured":"Xudong Wang, Shufan Li, Konstantinos Kallidromitis, Yusuke Kato, Kazuki Kozuka, and Trevor Darrell. 2023a. Hierarchical open-vocabulary universal image segmentation. In NeurIPS."},{"key":"e_1_3_3_3_56_1","unstructured":"Yibin Wang Weizhong Zhang Jianwei Zheng and Cheng Jin. 2023b. Enhancing Object Coherence in Layout-to-Image Synthesis. arXiv:https:\/\/arXiv.org\/abs\/2311.10522 (2023)."},{"key":"e_1_3_3_3_57_1","unstructured":"Zhaoqing Wang Xiaobo Xia Ziye Chen Xiao He Yandong Guo Mingming Gong and Tongliang Liu. 2024. Open-Vocabulary Segmentation with Unpaired Mask-Text Supervision. arXiv:https:\/\/arXiv.org\/abs\/2402.08960 (2024)."},{"key":"e_1_3_3_3_58_1","unstructured":"Size Wu Wenwei Zhang Lumin Xu Sheng Jin Xiangtai Li Wentao Liu and Chen\u00a0Change Loy. 2023a. CLIPSelf: Vision Transformer Distills Itself for Open-Vocabulary Dense Prediction. arXiv:https:\/\/arXiv.org\/abs\/2310.01403 (2023)."},{"key":"e_1_3_3_3_59_1","unstructured":"Weijia Wu Yuzhong Zhao Hao Chen Yuchao Gu Rui Zhao Yefei He Hong Zhou Mike\u00a0Zheng Shou and Chunhua Shen. 2023b. DatasetDM: Synthesizing Data with Perception Annotations Using Diffusion Models. NeurIPS (2023)."},{"key":"e_1_3_3_3_60_1","unstructured":"Weijia Wu Yuzhong Zhao Mike\u00a0Zheng Shou Hong Zhou and Chunhua Shen. 2023c. Diffumask: Synthesizing images with pixel-level annotations for semantic segmentation using diffusion models. ICCV (2023)."},{"key":"e_1_3_3_3_61_1","unstructured":"Yinwei Wu Xianpan Zhou Bing Ma Xuefeng Su Kai Ma and Xinchao Wang. 2024. IFAdapter: Instance Feature Control for Grounded Text-to-Image Generation. arXiv:https:\/\/arXiv.org\/abs\/2409.08240 (2024)."},{"key":"e_1_3_3_3_62_1","unstructured":"Jiahao Xie Wei Li Xiangtai Li Ziwei Liu Yew\u00a0Soon Ong and Chen\u00a0Change Loy. 2023. MosaicFusion: Diffusion Models as Data Augmenters for Large Vocabulary Instance Segmentation. arXiv:https:\/\/arXiv.org\/abs\/2309.13042 (2023)."},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"e_1_3_3_3_64_1","unstructured":"Jiarui Xu Sifei Liu Arash Vahdat Wonmin Byeon Xiaolong Wang and Shalini De\u00a0Mello. 2023a. Open-Vocabulary Panoptic Segmentation with Text-to-Image Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.04803 (2023)."},{"key":"e_1_3_3_3_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"e_1_3_3_3_66_1","volume-title":"NeurIPS","author":"Yang Lihe","year":"2023","unstructured":"Lihe Yang, Xiaogang Xu, Bingyi Kang, Yinghuan Shi, and Hengshuang Zhao. 2023. FreeMask: Synthetic Images with Dense Annotations Make Stronger Segmentation Models. In NeurIPS."},{"key":"e_1_3_3_3_67_1","volume-title":"NeurIPS","author":"Yu Qihang","year":"2023","unstructured":"Qihang Yu, Ju He, Xueqing Deng, Xiaohui Shen, and Liang-Chieh Chen. 2023. Convolutions Die Hard: Open-Vocabulary Segmentation with Single Frozen Convolutional CLIP. In NeurIPS."},{"key":"e_1_3_3_3_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"}],"event":{"name":"SIGGRAPH Conference Papers '25: Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers","location":"Vancouver BC Canada","acronym":"SIGGRAPH Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the Special Interest Group on Computer Graphics and Interactive Techniques Conference Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3721238.3730684","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T14:56:37Z","timestamp":1774018597000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721238.3730684"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,27]]},"references-count":67,"alternative-id":["10.1145\/3721238.3730684","10.1145\/3721238"],"URL":"https:\/\/doi.org\/10.1145\/3721238.3730684","relation":{},"subject":[],"published":{"date-parts":[[2025,7,27]]},"assertion":[{"value":"2025-07-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}