{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T07:14:49Z","timestamp":1778224489675,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Natural Science Foundation of China","award":["Grant No. U1836217, Grant No. 62076240, Grant No.62006244, and Grant No. 61721004"],"award-info":[{"award-number":["Grant No. U1836217, Grant No. 62076240, Grant No.62006244, and Grant No. 61721004"]}]},{"name":"the National Key R&D Program of China","award":["Grant 2020AAA0140002"],"award-info":[{"award-number":["Grant 2020AAA0140002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475391","type":"proceedings-article","created":{"date-parts":[[2023,1,5]],"date-time":"2023-01-05T23:03:42Z","timestamp":1672959822000},"page":"2290-2298","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":44,"title":["Multi-caption Text-to-Face Synthesis: Dataset and Algorithm"],"prefix":"10.1145","author":[{"given":"Jianxin","family":"Sun","sequence":"first","affiliation":[{"name":"Center for Research on Intelligent Perception and Computing, NLPR, CASIA &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Li","sequence":"additional","affiliation":[{"name":"Center for Research on Intelligent Perception and Computing, NLPR, CASIA &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weining","family":"Wang","sequence":"additional","affiliation":[{"name":"Center for Research on Intelligent Perception and Computing, NLPR, CASIA, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute of North Electronic Equipment, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenan","family":"Sun","sequence":"additional","affiliation":[{"name":"Center for Research on Intelligent Perception and Computing, NLPR, CASIA &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/2976040.2976058"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"e_1_3_2_1_3_1","volume-title":"FTGAN: A fully-trained generative adversarial networks for text to face generation. arXiv preprint arXiv:1904.05729","author":"Chen Xiang","year":"2019"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01092"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00916"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/70"},{"key":"e_1_3_2_1_7_1","volume-title":"Face2Text: Collecting an Annotated Image Description Corpus for the Generation of Rich Face Descriptions. arXiv preprint arxiv:1803.03827","author":"Gatt Albert","year":"2021"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2916751"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00044"},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Learning Representations (ICLR) .","author":"Karras Tero","year":"2018"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Learning Representations (ICLR) .","author":"Kingma Diederik P","year":"2015"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00559"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454472"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00790"},{"key":"e_1_3_2_1_19_1","volume-title":"CPGAN: Full-spectrum content-parsing generative adversarial networks for text-to-image synthesis. arXiv preprint arXiv:1912.08562","author":"Liang Jiadong","year":"2019"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01215"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.425"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/BigMM.2019.00-42"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_2_1_26_1","volume-title":"Conditional image generation and manipulation for user-specified content. arXiv preprint arXiv:2005.04909","author":"Stap David","year":"2020"},{"key":"e_1_3_2_1_27_1","volume-title":"Attention is all You need in speech separation. arXiv preprint arXiv:2010.13154","author":"Subakan Cem","year":"2020"},{"key":"e_1_3_2_1_28_1","volume-title":"European Conference on Computer Vision (ECCV). 223--240","author":"Suganuma Masanori","year":"2020"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414008"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"S. Belongie J. Hays P. Perona D. Ramanan P. Dollr T.-Y. Lin M. Maire and C. L. Zitnick. 2014. Microsoft coco: Common objects in context. (2014) 740--755.  S. Belongie J. Hays P. Perona D. Ramanan P. Dollr T.-Y. Lin M. Maire and C. L. Zitnick. 2014. Microsoft coco: Common objects in context. (2014) 740--755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_31_1","unstructured":"T2F. [n.d.]. https:\/\/github.com\/akanimax\/T2F. Accessed: 2021-07--22.  T2F. [n.d.]. https:\/\/github.com\/akanimax\/T2F. Accessed: 2021-07--22."},{"key":"e_1_3_2_1_32_1","volume-title":"DF-GAN: Deep fusion generative adversarial networks for Text-to-Image synthesis. arXiv preprint arXiv:2008.05865","author":"Tao Ming","year":"2020"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3045390.3045533"},{"key":"e_1_3_2_1_34_1","volume-title":"Instance normalization: The missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022","author":"Ulyanov Dmitry","year":"2016"},{"key":"e_1_3_2_1_35_1","unstructured":"Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011).  Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2018.2833032"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2856256"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00235"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/3304415.3304583"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.5555\/3294771.3294778"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00556"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00595"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00515"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00480"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475391","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475391","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:32Z","timestamp":1750193312000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475391"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":47,"alternative-id":["10.1145\/3474085.3475391","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475391","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}