{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T03:10:14Z","timestamp":1755832214742,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024QY1400"],"award-info":[{"award-number":["2024QY1400"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681652","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T02:59:41Z","timestamp":1729911581000},"page":"3558-3567","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Embedding an Ethical Mind: Aligning Text-to-Image Synthesis via Lightweight Value Optimization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-0017-6415","authenticated-orcid":false,"given":"Xingqi","family":"Wang","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2710-1613","authenticated-orcid":false,"given":"Xiaoyuan","family":"Yi","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8608-8482","authenticated-orcid":false,"given":"Xing","family":"Xie","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8449-278X","authenticated-orcid":false,"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"DCST, BNRist, Tsinghua University &amp; Key Laboratory of Pervasive Computing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"volume-title":"SIGIR'94: Proceedings of the Seventeenth Annual International ACM-SIGIR Conference on Research and Development in Information Retrieval, organised by Dublin City University. Springer, 232--241","key":"e_1_3_2_1_1_1","unstructured":"1994. Some simple effective approximations to the 2-poisson model for probabilistic weighted retrieval. In SIGIR'94: Proceedings of the Seventeenth Annual International ACM-SIGIR Conference on Research and Development in Information Retrieval, organised by Dublin City University. Springer, 232--241."},{"key":"e_1_3_2_1_2_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_3_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems 35 (2022) 23716--23736."},{"key":"e_1_3_2_1_4_1","volume-title":"A general theoretical paradigm to understand learning from human preferences. arXiv preprint arXiv:2310.12036","author":"Azar Mohammad Gheshlaghi","year":"2023","unstructured":"Mohammad Gheshlaghi Azar, Mark Rowland, Bilal Piot, Daniel Guo, Daniele Calandriello, Michal Valko, and R\u00e9mi Munos. 2023. A general theoretical paradigm to understand learning from human preferences. arXiv preprint arXiv:2310.12036 (2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Yuntao Bai Andy Jones Kamal Ndousse Amanda Askell Anna Chen Nova DasSarma Dawn Drain Stanislav Fort Deep Ganguli Tom Henighan et al. 2022. Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv preprint arXiv:2204.05862 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"Analytic-dpm: an analytic estimate of the optimal reverse variance in diffusion probabilistic models. arXiv preprint arXiv:2201.06503","author":"Bao Fan","year":"2022","unstructured":"Fan Bao, Chongxuan Li, Jun Zhu, and Bo Zhang. 2022. Analytic-dpm: an analytic estimate of the optimal reverse variance in diffusion probabilistic models. arXiv preprint arXiv:2201.06503 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_11"},{"key":"e_1_3_2_1_8_1","volume-title":"Image synthesis from an ethical perspective. AI & SOCIETY","author":"Bendel Oliver","year":"2023","unstructured":"Oliver Bendel. 2023. Image synthesis from an ethical perspective. AI & SOCIETY (2023), 1--10."},{"key":"e_1_3_2_1_9_1","unstructured":"James Betker Gabriel Goh Li Jing Tim Brooks Jianfeng Wang Linjie Li Long Ouyang Juntang Zhuang Joyce Lee Yufei Guo et al. 2023. Improving image generation with better captions. Computer Science. https:\/\/cdn. openai. com\/papers\/dall-e-3. pdf 2 3 (2023) 8."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600211.3604722"},{"key":"e_1_3_2_1_11_1","unstructured":"Rishi Bommasani Drew A Hudson Ehsan Adeli Russ Altman Simran Arora Sydney von Arx Michael S Bernstein Jeannette Bohg Antoine Bosselut Emma Brunskill et al. 2021. On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.2307\/2334029"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1186\/s41239-023-00411-8"},{"key":"e_1_3_2_1_14_1","volume-title":"Debiasing vision-language models via biased prompts. arXiv preprint arXiv:2302.00070","author":"Chuang Ching-Yao","year":"2023","unstructured":"Ching-Yao Chuang, Varun Jampani, Yuanzhen Li, Antonio Torralba, and Stefanie Jegelka. 2023. Debiasing vision-language models via biased prompts. arXiv preprint arXiv:2302.00070 (2023)."},{"key":"e_1_3_2_1_15_1","unstructured":"The Economist. 2024. Is Google's Gemini chatbot woke by accident or by design? https:\/\/www.economist.com\/united-states\/2024\/02\/28\/is-googles-geminichatbot- woke-by-accident-or-design Last accessed on 2024-04--11."},{"key":"e_1_3_2_1_16_1","volume-title":"Mitigating stereotypical biases in text to image generative systems. arXiv preprint arXiv:2310.06904","author":"Esposito Piero","year":"2023","unstructured":"Piero Esposito, Parmida Atighehchian, Anastasis Germanidis, and Deepti Ghadiyaram. 2023. Mitigating stereotypical biases in text to image generative systems. arXiv preprint arXiv:2310.06904 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Fair diffusion: Instructing text-to-image generation models on fairness. arXiv preprint arXiv:2302.10893","author":"Friedrich Felix","year":"2023","unstructured":"Felix Friedrich, Manuel Brack, Lukas Struppek, Dominik Hintersdorf, Patrick Schramowski, Sasha Luccioni, and Kristian Kersting. 2023. Fair diffusion: Instructing text-to-image generation models on fairness. arXiv preprint arXiv:2302.10893 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618","author":"Gal Rinon","year":"2022","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit H Bermano, Gal Chechik, and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00230"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00503"},{"key":"e_1_3_2_1_21_1","unstructured":"Deep Ganguli Amanda Askell Nicholas Schiefer Thomas I Liao Kamile Lukoi\u00afute Anna Chen Anna Goldie Azalia Mirhoseini Catherine Olsson Danny Hernandez et al. 2023. The capacity for moral self-correction in large language models. arXiv preprint arXiv:2302.07459 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findingsemnlp"},{"key":"e_1_3_2_1_23_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems 27","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_1_24_1","volume-title":"Selective Amnesia: A Continual Learning Approach to Forgetting in Deep Generative Models. In Advances in Neural Information Processing Systems","author":"Heng Alvin","year":"2023","unstructured":"Alvin Heng and Harold Soh. 2023. Selective Amnesia: A Continual Learning Approach to Forgetting in Deep Generative Models. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 17170--17194. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/ 376276a95781fa17c177b1ccdd0a03ac-Paper-Conference.pdf"},{"key":"e_1_3_2_1_25_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_26_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_27_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_28_1","first-page":"72096","article-title":"Language Is Not All You Need: Aligning Perception with Language Models","volume":"36","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Barun Patra, Qiang Liu, Kriti Aggarwal, Zewen Chi, Nils Bjorck, Vishrav Chaudhary, Subhojit Som, XIA SONG, and Furu Wei. 2023. Language Is Not All You Need: Aligning Perception with Language Models. In Advances in Neural Information Processing Systems, Vol. 36. 72096--72109.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"RSDPO: A Hybrid Rejection Sampling and Direct Preference Optimization Method for Alignment of Large Language Models. arXiv preprint arXiv:2402.10038","author":"Khaki Saeed","year":"2024","unstructured":"Saeed Khaki, JinJin Li, Lan Ma, Liu Yang, and Prathap Ramachandra. 2024. RSDPO: A Hybrid Rejection Sampling and Direct Preference Optimization Method for Alignment of Large Language Models. arXiv preprint arXiv:2402.10038 (2024)."},{"key":"e_1_3_2_1_30_1","unstructured":"Eunji Kim Siwon Kim Chaehun Shin and Sungroh Yoon. 2023. De-stereotyping text-to-image models through prompt tuning. (2023)."},{"key":"e_1_3_2_1_31_1","first-page":"8067","article-title":"Glow-tts: A generative flow for text-to-speech via monotonic alignment search","volume":"33","author":"Kim Jaehyeon","year":"2020","unstructured":"Jaehyeon Kim, Sungwon Kim, Jungil Kong, and Sungroh Yoon. 2020. Glow-tts: A generative flow for text-to-speech via monotonic alignment search. Advances in Neural Information Processing Systems 33 (2020), 8067--8077.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02074"},{"key":"e_1_3_2_1_34_1","volume-title":"Rlaif: Scaling reinforcement learning from human feedback with ai feedback. arXiv preprint arXiv:2309.00267","author":"Lee Harrison","year":"2023","unstructured":"Harrison Lee, Samrat Phatale, Hassan Mansoor, Kellie Lu, Thomas Mesnard, Colton Bishop, Victor Carbune, and Abhinav Rastogi. 2023. Rlaif: Scaling reinforcement learning from human feedback with ai feedback. arXiv preprint arXiv:2309.00267 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Lee Kenton","year":"2023","unstructured":"Kenton Lee, Mandar Joshi, Iulia Raluca Turc, Hexiang Hu, Fangyu Liu, Julian Martin Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, and Kristina Toutanova. 2023. Pix2struct: Screenshot parsing as pretraining for visual language understanding. In International Conference on Machine Learning. PMLR, 18893--18912."},{"key":"e_1_3_2_1_36_1","volume-title":"Aligning text-to-image models using human feedback. arXiv preprint arXiv:2302.12192","author":"Lee Kimin","year":"2023","unstructured":"Kimin Lee, Hao Liu, Moonkyung Ryu, OliviaWatkins, Yuqing Du, Craig Boutilier, Pieter Abbeel, Mohammad Ghavamzadeh, and Shixiang Shane Gu. 2023. Aligning text-to-image models using human feedback. arXiv preprint arXiv:2302.12192 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester Brian","year":"2021","unstructured":"Brian Lester, Rami Al-Rfou, and Noah Constant. 2021. The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_38_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730--19742."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26538"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference on Machine Learning. PMLR, 6565--6576","author":"Liang Paul Pu","year":"2021","unstructured":"Paul Pu Liang, Chiyu Wu, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2021. Towards understanding and mitigating social biases in language models. In International Conference on Machine Learning. PMLR, 6565--6576."},{"key":"e_1_3_2_1_41_1","volume-title":"The unlocking spell on base llms: Rethinking alignment via in-context learning. arXiv preprint arXiv:2312.01552","author":"Lin Bill Yuchen","year":"2023","unstructured":"Bill Yuchen Lin, Abhilasha Ravichander, Ximing Lu, Nouha Dziri, Melanie Sclar, Khyathi Chandu, Chandra Bhagavatula, and Yejin Choi. 2023. The unlocking spell on base llms: Rethinking alignment via in-context learning. arXiv preprint arXiv:2312.01552 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Improved Baselines with Visual Instruction Tuning. In NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following.","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved Baselines with Visual Instruction Tuning. In NeurIPS 2023 Workshop on Instruction Tuning and Instruction Following."},{"key":"e_1_3_2_1_43_1","first-page":"34892","article-title":"Visual Instruction Tuning","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Advances in Neural Information Processing Systems, Vol. 36. 34892--34916.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_44_1","volume-title":"Safety of Multimodal Large Language Models on Images and Text. arXiv preprint arXiv:2402.00357","author":"Liu Xin","year":"2024","unstructured":"Xin Liu, Yichen Zhu, Yunshi Lan, Chao Yang, and Yu Qiao. 2024. Safety of Multimodal Large Language Models on Images and Text. arXiv preprint arXiv:2402.00357 (2024)."},{"key":"e_1_3_2_1_45_1","first-page":"5775","article-title":"Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps","volume":"35","author":"Lu Cheng","year":"2022","unstructured":"Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. 2022. Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. Advances in Neural Information Processing Systems 35 (2022), 5775--5787.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","volume-title":"Maskocr: Text recognition with masked encoder-decoder pretraining. arXiv preprint arXiv:2206.00311","author":"Lyu Pengyuan","year":"2022","unstructured":"Pengyuan Lyu, Chengquan Zhang, Shanshan Liu, Meina Qiao, Yangliu Xu, Liang Wu, Kun Yao, Junyu Han, Errui Ding, and Jingdong Wang. 2022. Maskocr: Text recognition with masked encoder-decoder pretraining. arXiv preprint arXiv:2206.00311 (2022)."},{"key":"e_1_3_2_1_47_1","volume-title":"Jimmy Lei Ba, and Ruslan Salakhutdinov","author":"Mansimov Elman","year":"2015","unstructured":"Elman Mansimov, Emilio Parisotto, Jimmy Lei Ba, and Ruslan Salakhutdinov. 2015. Generating images from captions with attention. arXiv preprint arXiv:1511.02793 (2015)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600211.3604711"},{"key":"e_1_3_2_1_49_1","volume-title":"International conference on machine learning. PMLR, 8162--8171","author":"Nichol Alexander Quinn","year":"2021","unstructured":"Alexander Quinn Nichol and Prafulla Dhariwal. 2021. Improved denoising diffusion probabilistic models. In International conference on machine learning. PMLR, 8162--8171."},{"key":"e_1_3_2_1_50_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Advances in neural information processing systems 35 (2022) 27730--27744."},{"key":"e_1_3_2_1_51_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_53_1","volume-title":"Levine (Eds.)","volume":"36","author":"Rafailov Rafael","year":"2023","unstructured":"Rafael Rafailov, Archit Sharma, Eric Mitchell, Christopher D Manning, Stefano Ermon, and Chelsea Finn. 2023. Direct Preference Optimization: Your Language Model is Secretly a Reward Model. In Advances in Neural Information Processing Systems, A. Oh, T. Neumann, A. Globerson, K. Saenko, M. Hardt, and S. Levine (Eds.), Vol. 36. Curran Associates, Inc., 53728--53741."},{"key":"e_1_3_2_1_54_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_55_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1, 2","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1, 2 (2022), 3."},{"key":"e_1_3_2_1_56_1","volume-title":"International conference on machine learning. Pmlr, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821--8831."},{"key":"e_1_3_2_1_57_1","volume-title":"International conference on machine learning. PMLR, 1060--1069","author":"Reed Scott","year":"2016","unstructured":"Scott Reed, Zeynep Akata, Xinchen Yan, Lajanugen Logeswaran, Bernt Schiele, and Honglak Lee. 2016. Generative adversarial text to image synthesis. In International conference on machine learning. PMLR, 1060--1069."},{"key":"e_1_3_2_1_58_1","volume-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2020. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 33rd International Conference on Machine Learning.","author":"Romoff Joshua","year":"2016","unstructured":"Joshua Romoff, Nicolas Angelard-Gontier, and Prasanna Parthasarathi. 2016. Variational encoder decoder for image generation conditioned on captions. In Proceedings of the 33rd International Conference on Machine Learning."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_62_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems 35 (2022), 36479--36494."},{"key":"e_1_3_2_1_63_1","volume-title":"Improved techniques for training gans. Advances in neural information processing systems 29","author":"Salimans Tim","year":"2016","unstructured":"Tim Salimans, Ian Goodfellow, Wojciech Zaremba, Vicki Cheung, Alec Radford, and Xi Chen. 2016. Improved techniques for training gans. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_64_1","volume-title":"Self-critiquing models for assisting human evaluators. arXiv preprint arXiv:2206.05802","author":"Saunders William","year":"2022","unstructured":"William Saunders, Catherine Yeh, Jeff Wu, Steven Bills, Long Ouyang, Jonathan Ward, and Jan Leike. 2022. Self-critiquing models for assisting human evaluators. arXiv preprint arXiv:2206.05802 (2022)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02157"},{"key":"e_1_3_2_1_66_1","first-page":"25278","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann Christoph","year":"2022","unstructured":"Christoph Schuhmann, Romain Beaumont, Richard Vencu, Cade Gordon, Ross Wightman, Mehdi Cherti, Theo Coombes, Aarush Katta, Clayton Mullis, Mitchell Wortsman, et al. 2022. Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems 35 (2022), 25278--25294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29865"},{"key":"e_1_3_2_1_69_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_70_1","volume-title":"Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_71_1","volume-title":"Generative pretraining in multimodality. arXiv preprint arXiv:2307.05222","author":"Sun Quan","year":"2023","unstructured":"Quan Sun, Qiying Yu, Yufeng Cui, Fan Zhang, Xiaosong Zhang, Yueze Wang, Hongcheng Gao, Jingjing Liu, Tiejun Huang, and XinlongWang. 2023. Generative pretraining in multimodality. arXiv preprint arXiv:2307.05222 (2023)."},{"key":"e_1_3_2_1_72_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_73_1","volume-title":"Diffusion idea exploration for art generation. arXiv preprint arXiv:2307.04978","author":"Verma Nikhil","year":"2023","unstructured":"Nikhil Verma. 2023. Diffusion idea exploration for art generation. arXiv preprint arXiv:2307.04978 (2023)."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00786"},{"key":"e_1_3_2_1_75_1","volume-title":"Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.213"},{"key":"e_1_3_2_1_77_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems 35 (2022), 24824--24837."},{"key":"e_1_3_2_1_78_1","volume-title":"Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval. In International Conference on Learning Representations.","author":"Xiong Lee","year":"2020","unstructured":"Lee Xiong, Chenyan Xiong, Ye Li, Kwok-Fung Tang, Jialin Liu, Paul N Bennett, Junaid Ahmed, and Arnold Overwijk. 2020. Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_1_80_1","volume-title":"A New Creative Generation Pipeline for Click-Through Rate with Stable Diffusion Model. arXiv preprint arXiv:2401.10934","author":"Yang Hao","year":"2024","unstructured":"Hao Yang, Jianxin Yuan, Shuai Yang, Linhe Xu, Shuo Yuan, and Yifan Zeng. 2024. A New Creative Generation Pipeline for Click-Through Rate with Stable Diffusion Model. arXiv preprint arXiv:2401.10934 (2024)."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3375709"},{"key":"e_1_3_2_1_82_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Yang Zonghan","year":"2022","unstructured":"Zonghan Yang, Xiaoyuan Yi, Peng Li, Yang Liu, and Xing Xie. 2022. Unified Detoxifying and Debiasing in Language Generation via Inference-time Adaptive Optimization. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_83_1","unstructured":"Lili Yu Bowen Shi Ramakanth Pasunuru Benjamin Muller Olga Golovneva Tianlu Wang Arun Babu Binh Tang Brian Karrer Shelly Sheynin et al. 2023. Scaling autoregressive multi-modal models: Pretraining and instruction tuning. arXiv preprint arXiv:2309.02591 (2023)."},{"key":"e_1_3_2_1_84_1","volume-title":"Forget-me-not: Learning to forget in text-to-image diffusion models. arXiv preprint arXiv:2303.17591","author":"Zhang Eric","year":"2023","unstructured":"Eric Zhang, Kai Wang, Xingqian Xu, Zhangyang Wang, and Humphrey Shi. 2023. Forget-me-not: Learning to forget in text-to-image diffusion models. arXiv preprint arXiv:2303.17591 (2023)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.629"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00595"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681652","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681652","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T21:26:51Z","timestamp":1755811611000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681652"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":86,"alternative-id":["10.1145\/3664647.3681652","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681652","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}