{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:04:16Z","timestamp":1750309456173,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176144, and 62076153"],"award-info":[{"award-number":["62176144, and 62076153"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Fundamental Research Project of Shandong, China","award":["ZR2019ZD03, and ZR2024MF043"],"award-info":[{"award-number":["ZR2019ZD03, and ZR2024MF043"]}]},{"name":"Taishan Scholar Project of Shandong, China","award":["ts20190924"],"award-info":[{"award-number":["ts20190924"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700240","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Unified Contrastive Framework with Multi-Granularity Fusion for Text-to-Image Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8899-8319","authenticated-orcid":false,"given":"Yachao","family":"He","sequence":"first","affiliation":[{"name":"Shandong Normal University, Jinan, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9121-5124","authenticated-orcid":false,"given":"Li","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6259-7533","authenticated-orcid":false,"given":"Huaxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8061-1797","authenticated-orcid":false,"given":"Dongmei","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3819-2789","authenticated-orcid":false,"given":"Hongzhen","family":"Li","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, CN"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Shane Barratt and Rishi Sharma. 2018. A note on the inception score. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1801.01973 (2018)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Hila Chefer Yuval Alaluf Yael Vinker Lior Wolf and Daniel Cohen-Or. 2023. Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201310.","DOI":"10.1145\/3592116"},{"key":"e_1_3_3_1_4_2","first-page":"1597","volume-title":"International conference on machine learning","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597\u20131607."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00859"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01092"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Qingrong Cheng Keyu Wen and Xiaodong Gu. 2022. Vision-language matching for text-to-image synthesis via generative adversarial networks. IEEE Transactions on Multimedia (2022).","DOI":"10.1109\/TMM.2022.3217384"},{"key":"e_1_3_3_1_8_2","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021) 8780\u20138794."},{"key":"e_1_3_3_1_9_2","unstructured":"Ming Ding Wendi Zheng Wenyi Hong and Jie Tang. 2022. Cogview2: Faster and better text-to-image generation via hierarchical transformers. Advances in Neural Information Processing Systems 35 (2022) 16890\u201316902."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_12_2","unstructured":"Jonathan Ho Chitwan Saharia William Chan David\u00a0J Fleet Mohammad Norouzi and Tim Salimans. 2022. Cascaded diffusion models for high fidelity image generation. Journal of Machine Learning Research 23 47 (2022) 1\u201333."},{"key":"e_1_3_3_1_13_2","first-page":"8888","volume-title":"International Conference on Machine Learning","author":"Hou Liang","year":"2022","unstructured":"Liang Hou, Qi Cao, Huawei Shen, Siyuan Pan, Xiaoshuang Li, and Xueqi Cheng. 2022. Conditional gans with auxiliary discriminative classifier. In International Conference on Machine Learning. PMLR, 8888\u20138902."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547881"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3481540"},{"key":"e_1_3_3_1_16_2","unstructured":"Yifan Jiang Shiyu Chang and Zhangyang Wang. 2021. Transgan: Two pure transformers can make one strong gan and that can scale up. Advances in Neural Information Processing Systems 34 (2021) 14745\u201314758."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_3_1_18_2","unstructured":"Kwonjoon Lee Huiwen Chang Lu Jiang Han Zhang Zhuowen Tu and Ce Liu. 2021. Vitgan: Training gans with vision transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.04589 (2021)."},{"key":"e_1_3_3_1_19_2","unstructured":"Bowen Li Philip\u00a0HS Torr and Thomas Lukasiewicz. 2022. Memory-driven text-to-image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.07022 (2022)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01765"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_22_2","unstructured":"Aaron van\u00a0den Oord Yazhe Li and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1807.03748 (2018)."},{"key":"e_1_3_3_1_23_2","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)","author":"Park Dong\u00a0Huk","year":"2021","unstructured":"Dong\u00a0Huk Park, Samaneh Azadi, Xihui Liu, Trevor Darrell, and Anna Rohrbach. 2021. Benchmark for compositional text-to-image synthesis. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)."},{"key":"e_1_3_3_1_24_2","first-page":"8821","volume-title":"International conference on machine learning","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821\u20138831."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01370"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Hongchen Tan Xiuping Liu Meng Liu Baocai Yin and Xin Li. 2020. KT-GAN: Knowledge-transfer generative adversarial network for text-to-image synthesis. IEEE Transactions on Image Processing 30 (2020) 1275\u20131290.","DOI":"10.1109\/TIP.2020.3026728"},{"key":"e_1_3_3_1_28_2","unstructured":"Hongchen Tan Baocai Yin Kun Wei Xiuping Liu and Xin Li. 2023. ALR-GAN: Adaptive Layout Refinement for Text-to-Image Synthesis. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01602"},{"key":"e_1_3_3_1_30_2","unstructured":"Zijie\u00a0J Wang Evan Montoya David Munechika Haoyang Yang Benjamin Hoover and Duen\u00a0Horng Chau. 2022. Diffusiondb: A large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.14896 (2022)."},{"key":"e_1_3_3_1_31_2","unstructured":"Peter Welinder Steve Branson Takeshi Mita Catherine Wah Florian Schroff Serge Belongie and Pietro Perona. 2010. Caltech-UCSD birds 200. (2010)."},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547821"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859791"},{"key":"e_1_3_3_1_36_2","unstructured":"Hui Ye Xiulong Yang Martin Takac Rajshekhar Sunderraman and Shihao Ji. 2021. Improving text-to-image synthesis using contrastive learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2107.02423 (2021)."},{"key":"e_1_3_3_1_37_2","unstructured":"Senmao Ye Huan Wang Mingkui Tan and Fei Liu. 2023. Recurrent affine transformation for text-to-image synthesis. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"crossref","unstructured":"Xianhua Zeng Zhengyi Huang Liming Xu and Yicai Xie. 2022. CP-GAN: Meet the high requirements of diagnose report to medical image by content preservation. IET Image Processing 16 1 (2022) 29\u201338.","DOI":"10.1049\/ipr2.12145"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00595"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Auckland New Zealand","acronym":"MMAsia '24"},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700240","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700240","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:16Z","timestamp":1750295416000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700240"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":38,"alternative-id":["10.1145\/3696409.3700240","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700240","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}