{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T08:22:18Z","timestamp":1758270138867,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["U19A2057,62102384"],"award-info":[{"award-number":["U19A2057,62102384"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["WK3480000008,WK3480000010"],"award-info":[{"award-number":["WK3480000008,WK3480000010"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548154","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:43:12Z","timestamp":1665416592000},"page":"4327-4335","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Background Layout Generation and Object Knowledge Transfer for Text-to-Image Generation"],"prefix":"10.1145","author":[{"given":"Zhuowei","family":"Chen","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Zhendong","family":"Mao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Shancheng","family":"Fang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Bo","family":"Hu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096","author":"Brock Andrew","year":"2018","unstructured":"Andrew Brock , Jeff Donahue , and Karen Simonyan . 2018. Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096 ( 2018 ). Andrew Brock, Jeff Donahue, and Karen Simonyan. 2018. Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096 (2018)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Holger Caesar Jasper Uijlings and Vittorio Ferrari. 2018. COCO-Stuff: Thing and stuff classes in context. In CVPR. 1209--1218.  Holger Caesar Jasper Uijlings and Vittorio Ferrari. 2018. COCO-Stuff: Thing and stuff classes in context. In CVPR. 1209--1218.","DOI":"10.1109\/CVPR.2018.00132"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Kevin Chen Christopher B. Choy Manolis Savva Angel X. Chang Thomas Funkhouser and Silvio Savarese. 2018. Text2Shape: Generating shapes from natural language by learning joint embeddings. In ACCV. 100--116.  Kevin Chen Christopher B. Choy Manolis Savva Angel X. Chang Thomas Funkhouser and Silvio Savarese. 2018. Text2Shape: Generating shapes from natural language by learning joint embeddings. In ACCV. 100--116.","DOI":"10.1007\/978-3-030-20893-6_7"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Jia Deng Wei Dong Richard Socher Li-Jia Li Kai Li and Li Fei-Fei. 2009. ImageNet: A large-scale hierarchical image database. In CVPR. 248--255.  Jia Deng Wei Dong Richard Socher Li-Jia Li Kai Li and Li Fei-Fei. 2009. ImageNet: A large-scale hierarchical image database. In CVPR. 248--255.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_5_1","volume-title":"A learned representation for artistic style. arXiv preprint arXiv:1610.07629","author":"Dumoulin Vincent","year":"2016","unstructured":"Vincent Dumoulin , Jonathon Shlens , and Manjunath Kudlur . 2016. A learned representation for artistic style. arXiv preprint arXiv:1610.07629 ( 2016 ). Vincent Dumoulin, Jonathon Shlens, and Manjunath Kudlur. 2016. A learned representation for artistic style. arXiv preprint arXiv:1610.07629 (2016)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Kamal Gupta Justin Lazarow Alessandro Achille Larry S. Davis Vijay Mahadevan and Abhinav Shrivastava. 2021. LayoutTransformer: Layout generation and completion with self-attention. In ICCV. 1004--1014.  Kamal Gupta Justin Lazarow Alessandro Achille Larry S. Davis Vijay Mahadevan and Abhinav Shrivastava. 2021. LayoutTransformer: Layout generation and completion with self-attention. In ICCV. 1004--1014.","DOI":"10.1109\/ICCV48922.2021.00104"},{"key":"e_1_3_2_2_7_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. GANs trained by a two time-scale update rule converge to a local nash equilibrium. In NeurIPS. 6629--6640.  Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. GANs trained by a two time-scale update rule converge to a local nash equilibrium. In NeurIPS. 6629--6640."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3021209"},{"key":"e_1_3_2_2_9_1","volume-title":"The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751","author":"Holtzman Ari","year":"2019","unstructured":"Ari Holtzman , Jan Buys , Li Du , Maxwell Forbes , and Yejin Choi . 2019. The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751 ( 2019 ). Ari Holtzman, Jan Buys, Li Du, Maxwell Forbes, and Yejin Choi. 2019. The curious case of neural text degeneration. arXiv preprint arXiv:1904.09751 (2019)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Seunghoon Hong Dingdong Yang Jongwook Choi and Honglak Lee. 2018. Inferring semantic layout for hierarchical text-to-image synthesis. In CVPR. 7986-- 7994.  Seunghoon Hong Dingdong Yang Jongwook Choi and Honglak Lee. 2018. Inferring semantic layout for hierarchical text-to-image synthesis. In CVPR. 7986-- 7994.","DOI":"10.1109\/CVPR.2018.00833"},{"key":"e_1_3_2_2_11_1","volume-title":"Michael Ying Yang, and Bodo Rosenhahn","author":"Hu Kai","year":"2021","unstructured":"Kai Hu , Wentong Liao , Michael Ying Yang, and Bodo Rosenhahn . 2021 . Text to image generation with semantic-spatial aware GAN. arXiv preprint arXiv:2104.00567 (2021). Kai Hu,Wentong Liao, Michael Ying Yang, and Bodo Rosenhahn. 2021. Text to image generation with semantic-spatial aware GAN. arXiv preprint arXiv:2104.00567 (2021)."},{"key":"e_1_3_2_2_12_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Jiang Yuming","year":"2021","unstructured":"Yuming Jiang , Ziqi Huang , Xingang Pan , Chen Change Loy, and Ziwei Liu . 2021 . Talk-to-Edit: Fine-grained facial editing via dialog. In ICCV. 13799--13808. Yuming Jiang, Ziqi Huang, Xingang Pan, Chen Change Loy, and Ziwei Liu. 2021. Talk-to-Edit: Fine-grained facial editing via dialog. In ICCV. 13799--13808."},{"key":"e_1_3_2_2_13_1","volume-title":"Progressive growing of GANs for improved quality, stability, and variation. arXiv preprint arXiv:1710.10196","author":"Karras Tero","year":"2017","unstructured":"Tero Karras , Timo Aila , Samuli Laine , and Jaakko Lehtinen . 2017. Progressive growing of GANs for improved quality, stability, and variation. arXiv preprint arXiv:1710.10196 ( 2017 ). Tero Karras, Timo Aila, Samuli Laine, and Jaakko Lehtinen. 2017. Progressive growing of GANs for improved quality, stability, and variation. arXiv preprint arXiv:1710.10196 (2017)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-4012"},{"key":"e_1_3_2_2_15_1","unstructured":"Wenbo Li Pengchuan Zhang Lei Zhang Qiuyuan Huang Xiaodong He Siwei Lyu and Jianfeng Gao. 2019. Object-driven text-to-image synthesis via adversarial training. In CVPR. 12174--12182.  Wenbo Li Pengchuan Zhang Lei Zhang Qiuyuan Huang Xiaodong He Siwei Lyu and Jianfeng Gao. 2019. Object-driven text-to-image synthesis via adversarial training. In CVPR. 12174--12182."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common objects in context. In ECCV. 740--755.  Tsung-Yi Lin Michael Maire Serge Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll\u00e1r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common objects in context. In ECCV. 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Taesung Park Ming-Yu Liu Ting-Chun Wang and Jun-Yan Zhu. 2019. Semantic image synthesis with spatially-adaptive normalization. In CVPR. 2337--2346.  Taesung Park Ming-Yu Liu Ting-Chun Wang and Jun-Yan Zhu. 2019. Semantic image synthesis with spatially-adaptive normalization. In CVPR. 2337--2346.","DOI":"10.1109\/CVPR.2019.00244"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"crossref","unstructured":"Tingting Qiao Jing Zhang Duanqing Xu and Dacheng Tao. 2019. MirrorGAN: Learning text-to-image generation by redescription. In CVPR. 1505--1514.  Tingting Qiao Jing Zhang Duanqing Xu and Dacheng Tao. 2019. MirrorGAN: Learning text-to-image generation by redescription. In CVPR. 1505--1514.","DOI":"10.1109\/CVPR.2019.00160"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Yanyuan Qiao Qi Chen Chaorui Deng Ning Ding Yuankai Qi Mingkui Tan Xincheng Ren and Qi Wu. 2021. R-GAN: Exploring human-like way for reasonable text-to-image synthesis via generative adversarial networks. In ACM MM. 2085--2093.  Yanyuan Qiao Qi Chen Chaorui Deng Ning Ding Yuankai Qi Mingkui Tan Xincheng Ren and Qi Wu. 2021. R-GAN: Exploring human-like way for reasonable text-to-image synthesis via generative adversarial networks. In ACM MM. 2085--2093.","DOI":"10.1145\/3474085.3475363"},{"key":"e_1_3_2_2_21_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford , Jong Wook Kim , Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021 . Learning transferable visual models from natural language supervision. In ICML. 8748--8763. Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_2_22_1","unstructured":"Scott Reed Zeynep Akata Xinchen Yan Lajanugen Logeswaran Bernt Schiele and Honglak Lee. 2016. Generative adversarial text to image synthesis. In ICML. 1060--1069.  Scott Reed Zeynep Akata Xinchen Yan Lajanugen Logeswaran Bernt Schiele and Honglak Lee. 2016. Generative adversarial text to image synthesis. In ICML. 1060--1069."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Shulan Ruan Yong Zhang Kun Zhang Yanbo Fan Fan Tang Qi Liu and Enhong Chen. 2021. DAE-GAN: Dynamic aspect-aware GAN for text-to-image synthesis. In ICCV. 13960--13969.  Shulan Ruan Yong Zhang Kun Zhang Yanbo Fan Fan Tang Qi Liu and Enhong Chen. 2021. DAE-GAN: Dynamic aspect-aware GAN for text-to-image synthesis. In ICCV. 13960--13969.","DOI":"10.1109\/ICCV48922.2021.01370"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Wei Sun and Tianfu Wu. 2019. Image synthesis from reconfigurable layout and style. In ICCV. 10531--10540.  Wei Sun and Tianfu Wu. 2019. Image synthesis from reconfigurable layout and style. In ICCV. 10531--10540.","DOI":"10.1109\/ICCV.2019.01063"},{"key":"e_1_3_2_2_25_1","volume-title":"Learning layout and style reconfigurable gans for controllable image synthesis. arXiv preprint arXiv:2003.11571","author":"Sun Wei","year":"2020","unstructured":"Wei Sun and Tianfu Wu. 2020. Learning layout and style reconfigurable gans for controllable image synthesis. arXiv preprint arXiv:2003.11571 ( 2020 ). Wei Sun and Tianfu Wu. 2020. Learning layout and style reconfigurable gans for controllable image synthesis. arXiv preprint arXiv:2003.11571 (2020)."},{"key":"e_1_3_2_2_26_1","volume-title":"DF-GAN: Deep fusion generative adversarial networks for text-to-image synthesis. arXiv preprint arXiv:2008.05865","author":"Tao Ming","year":"2020","unstructured":"Ming Tao , Hao Tang , Songsong Wu , Nicu Sebe , Xiao-Yuan Jing , Fei Wu , and Bingkun Bao . 2020. DF-GAN: Deep fusion generative adversarial networks for text-to-image synthesis. arXiv preprint arXiv:2008.05865 ( 2020 ). Ming Tao, Hao Tang, Songsong Wu, Nicu Sebe, Xiao-Yuan Jing, Fei Wu, and Bingkun Bao. 2020. DF-GAN: Deep fusion generative adversarial networks for text-to-image synthesis. arXiv preprint arXiv:2008.05865 (2020)."},{"key":"e_1_3_2_2_27_1","unstructured":"Peter Welinder Steve Branson Takeshi Mita Catherine Wah Florian Schroff Serge Belongie and Pietro Perona. 2010. Caltech-UCSD birds 200. (2010).  Peter Welinder Steve Branson Takeshi Mita Catherine Wah Florian Schroff Serge Belongie and Pietro Perona. 2010. Caltech-UCSD birds 200. (2010)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Tao Xu Pengchuan Zhang Qiuyuan Huang Han Zhang Zhe Gan Xiaolei Huang and Xiaodong He. 2018. AttnGAN: Fine-grained text to image generation with attentional generative adversarial networks. In CVPR. 1316--1324.  Tao Xu Pengchuan Zhang Qiuyuan Huang Han Zhang Zhe Gan Xiaolei Huang and Xiaodong He. 2018. AttnGAN: Fine-grained text to image generation with attentional generative adversarial networks. In CVPR. 1316--1324.","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_2_29_1","unstructured":"Guojun Yin Bin Liu Lu Sheng Nenghai Yu XiaogangWang and Jing Shao. 2019. Semantics disentangling for text-to-image generation. In CVPR. 2327--2336.  Guojun Yin Bin Liu Lu Sheng Nenghai Yu XiaogangWang and Jing Shao. 2019. Semantics disentangling for text-to-image generation. In CVPR. 2327--2336."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Mingkuan Yuan and Yuxin Peng. 2018. Text-to-image synthesis via symmetrical distillation networks. In ACM MM. 1407--1415.  Mingkuan Yuan and Yuxin Peng. 2018. Text-to-image synthesis via symmetrical distillation networks. In ACM MM. 1407--1415.","DOI":"10.1145\/3240508.3240559"},{"key":"e_1_3_2_2_31_1","unstructured":"Han Zhang Ian Goodfellow Dimitris Metaxas and Augustus Odena. 2019. Selfattention generative adversarial networks. In ICML. 7354--7363.  Han Zhang Ian Goodfellow Dimitris Metaxas and Augustus Odena. 2019. Selfattention generative adversarial networks. In ICML. 7354--7363."},{"key":"e_1_3_2_2_32_1","volume-title":"Metaxas","author":"Zhang Han","year":"2017","unstructured":"Han Zhang , Tao Xu , Hongsheng Li , Shaoting Zhang , Xiaogang Wang , Xiaolei Huang , and Dimitris N . Metaxas . 2017 . StackGAN: Text to photo-realistic image synthesis with stacked generative adversarial networks. In ICCV. 5907--5915. Han Zhang, Tao Xu, Hongsheng Li, Shaoting Zhang, Xiaogang Wang, Xiaolei Huang, and Dimitris N. Metaxas. 2017. StackGAN: Text to photo-realistic image synthesis with stacked generative adversarial networks. In ICCV. 5907--5915."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2856256"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Bo Zhao Lili Meng Weidong Yin and Leonid Sigal. 2019. Image generation from layout. In CVPR. 8584--8593.  Bo Zhao Lili Meng Weidong Yin and Leonid Sigal. 2019. Image generation from layout. In CVPR. 8584--8593.","DOI":"10.1109\/CVPR.2019.00878"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"crossref","unstructured":"Yutong Zhou. 2021. Generative adversarial network for text-to-face synthesis and manipulation. In ACM MM. 2940--2944.  Yutong Zhou. 2021. Generative adversarial network for text-to-face synthesis and manipulation. In ACM MM. 2940--2944.","DOI":"10.1109\/FG52635.2021.9666791"},{"key":"e_1_3_2_2_36_1","unstructured":"Minfeng Zhu Pingbo Pan Wei Chen and Yi Yang. 2019. DM-GAN: Dynamic memory generative adversarial networks for text-to-image synthesis. In CVPR. 5802--5810.  Minfeng Zhu Pingbo Pan Wei Chen and Yi Yang. 2019. DM-GAN: Dynamic memory generative adversarial networks for text-to-image synthesis. In CVPR. 5802--5810."}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Lisboa Portugal","acronym":"MM '22"},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548154","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548154","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:00:19Z","timestamp":1750186819000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":36,"alternative-id":["10.1145\/3503161.3548154","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548154","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}