{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T05:29:23Z","timestamp":1771046963447,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62272157, No.U21A20518, No.61976086"],"award-info":[{"award-number":["No.62272157, No.U21A20518, No.61976086"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Natural Science Foundation of Changsha","award":["No. kq2202177"],"award-info":[{"award-number":["No. kq2202177"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613781","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"5311-5320","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment for Markup-to-Image Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6054-3694","authenticated-orcid":false,"given":"Guojin","family":"Zhong","sequence":"first","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9600-7789","authenticated-orcid":false,"given":"Jin","family":"Yuan","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2013-4791","authenticated-orcid":false,"given":"Pan","family":"Wang","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1090-667X","authenticated-orcid":false,"given":"Kailun","family":"Yang","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5658-5509","authenticated-orcid":false,"given":"Weili","family":"Guan","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9720-5915","authenticated-orcid":false,"given":"Zhiyong","family":"Li","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01505"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01119"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463615"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548154"},{"key":"e_1_3_2_1_7_1","volume-title":"Chemberta: Large-scale self-supervised pretraining for molecular property prediction. arXiv preprint arXiv:2010.09885","author":"Chithrananda Seyone","year":"2020","unstructured":"Seyone Chithrananda, Gabriel Grand, and Bharath Ramsundar. 2020. Chemberta: Large-scale self-supervised pretraining for molecular property prediction. arXiv preprint arXiv:2010.09885 (2020)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Deng Yuntian","year":"2023","unstructured":"Yuntian Deng, Noriyuki Kojima, and Alexander M Rush. 2023. Markup-to-Image Diffusion Models with Scheduled Sampling. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00520"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS).","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. In Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS).","author":"Dieng Adji Bousso","year":"2017","unstructured":"Adji Bousso Dieng, Dustin Tran, Rajesh Ranganath, John Paisley, and David Blei. 2017. Variational Inference via \u03c7 Upper Bound Minimization. Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS).","author":"Ding Ming","year":"2021","unstructured":"Ming Ding, Zhuoyi Yang, Wenyi Hong, Wendi Zheng, Chang Zhou, Da Yin, Junyang Lin, Xu Zou, Zhou Shao, Hongxia Yang, and Jie Tang. 2021. CogView: Mastering Text-to-Image Generation via Transformers. In Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_13_1","unstructured":"Leo Gao Stella Biderman Sid Black Laurence Golding Travis Hoppe Charles Foster Jason Phang Horace He Anish Thite Noa Nabeshima et al. 2020. The pile: An 800gb dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027 (2020)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02117"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS).","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Proceedings of the International Conference on Advances in Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00069"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS).","author":"Kang Minguk","year":"2020","unstructured":"Minguk Kang and Jaesik Park. 2020. ContraGAN: Contrastive Learning for Conditional Image Generation. In Proceedings of the International Conference on Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21344"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01198"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV).","author":"Liu Nan","unstructured":"Nan Liu, Shuang Li, Yilun Du, Antonio Torralba, and Joshua B. Tenenbaum. 2022. Compositional Visual Generation with Composable Diffusion Models. In Proceedings of the European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Nichol Alexander Quinn","year":"2022","unstructured":"Alexander Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_24_1","volume-title":"Improving Adversarial Robustness by Contrastive Guided Diffusion Process. arXiv preprint arXiv:2210.09643","author":"Ouyang Yidong","year":"2022","unstructured":"Yidong Ouyang, Liyan Xie, and Guang Cheng. 2022. Improving Adversarial Robustness by Contrastive Guided Diffusion Process. arXiv preprint arXiv:2210.09643 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00088"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475363"},{"key":"e_1_3_2_1_27_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_28_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS).","author":"Razavi Ali","year":"2019","unstructured":"Ali Razavi, Aaron Van den Oord, and Oriol Vinyals. 2019. Generating diverse high-fidelity images with vq-vae-2. In Proceedings of the International Conference on Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Reed Scott","year":"2016","unstructured":"Scott Reed, Zeynep Akata, Xinchen Yan, Lajanugen Logeswaran, Bernt Schiele, and Honglak Lee. 2016. Generative Adversarial Text to Image Synthesis. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the International Conference on Advances in Neural Information Processing Systems(NIPS).","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L. Denton, Seyed Kamyar Seyed Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, Jonathan Ho, David J. Fleet, and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. In Proceedings of the International Conference on Advances in Neural Information Processing Systems(NIPS)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548159"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01602"},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the International Conference on Neural Information Processing Systems (NIPS).","author":"van den Oord Aaron","year":"2017","unstructured":"Aaron van den Oord, Oriol Vinyals, and Koray Kavukcuoglu. 2017. Neural Discrete Representation Learning. In Proceedings of the International Conference on Neural Information Processing Systems (NIPS)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475226"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557268"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547821"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Xu Jiarui","year":"2023","unstructured":"Jiarui Xu, Sifei Liu, Arash Vahdat, Wonmin Byeon, Xiaolong Wang, and Shalini De Mello. 2023. ODISE: Open-Vocabulary Panoptic Segmentation with Text-to-Image Diffusion Models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_1_42_1","volume-title":"Improving Text-to-Image Synthesis Using Contrastive Learning. In British Machine Vision Conference.","author":"Ye Hui","year":"2021","unstructured":"Hui Ye, Xiulong Yang, Martin Tak\u00e1c, Rajshekhar Sunderraman, and Shihao Ji. 2021. Improving Text-to-Image Synthesis Using Contrastive Learning. In British Machine Vision Conference."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00089"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision (ICCV).","author":"Zhang Han","unstructured":"Han Zhang, Tao Xu, Hongsheng Li, and et al. 2017. StackGAN: Text to Photo-realistic Image Synthesis with Stacked Generative Adversarial Networks. In Proceedings of the IEEE Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2856256"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02065"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00979"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Zhu Ye","year":"2023","unstructured":"Ye Zhu, Yu Wu, Kyle Olszewski, Jian Ren, Sergey Tulyakov, and Yan Yan. 2023. Discrete Contrastive Diffusion for Cross-Modal Music and Image Generation. In Proceedings of the International Conference on Learning Representations (ICLR)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613781","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613781","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:04:52Z","timestamp":1755821092000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613781"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":48,"alternative-id":["10.1145\/3581783.3613781","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613781","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}