{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:06:58Z","timestamp":1765339618965,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755820","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:56:43Z","timestamp":1761371803000},"page":"5208-5217","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Causality-aligned Prompt Learning via Diffusion-based Counterfactual Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1202-3993","authenticated-orcid":false,"given":"Xinshu","family":"Li","sequence":"first","affiliation":[{"name":"University of New South Wales, Sydney, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6617-0369","authenticated-orcid":false,"given":"Ruoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"University of New South Wales, Sydney, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1736-2764","authenticated-orcid":false,"given":"Erdun","family":"Gao","sequence":"additional","affiliation":[{"name":"The University of Adelaide, Adelaide, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7147-5589","authenticated-orcid":false,"given":"Mingming","family":"Gong","sequence":"additional","affiliation":[{"name":"University of Melbourne, Melbourne, Australia and Mohamed bin Zayed University of Artificial Intelligence, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4149-839X","authenticated-orcid":false,"given":"Lina","family":"Yao","sequence":"additional","affiliation":[{"name":"CSIRO's Data61, Sydney, Australia and University of New South Wales, Sydney, Australia"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_3_1","first-page":"364","article-title":"Diffusion visual counterfactual explanations","volume":"35","author":"Augustin Maximilian","year":"2022","unstructured":"Maximilian Augustin, Valentyn Boreiko, Francesco Croce, and Matthias Hein. 2022. Diffusion visual counterfactual explanations. Advances in Neural Information Processing Systems, Vol. 35 (2022), 364-377.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01055"},{"key":"e_1_3_2_1_5_1","first-page":"446","volume-title":"Switzerland","author":"Bossard Lukas","year":"2014","unstructured":"Lukas Bossard, Matthieu Guillaumin, and Luc Van Gool. 2014. Food-101-mining discriminative components with random forests. In Computer vision-ECCV 2014: 13th European conference, zurich, Switzerland, September 6-12, 2014, proceedings, part VI 13. Springer, 446-461."},{"key":"e_1_3_2_1_6_1","volume-title":"Interventional and counterfactual inference with diffusion models. arXiv preprint arXiv:2302.00860","author":"Chao Patrick","year":"2023","unstructured":"Patrick Chao, Patrick Bl\u00f6baum, and Shiva Prasad Kasiviswanathan. 2023. Interventional and counterfactual inference with diffusion models. arXiv preprint arXiv:2302.00860 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"International conference on machine learning. PMLR, 1597-1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597-1607."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems, Vol. 34 (2021), 8780-8794."},{"key":"e_1_3_2_1_10_1","first-page":"71","volume-title":"UK","author":"Fu Tsu-Jui","year":"2020","unstructured":"Tsu-Jui Fu, Xin Eric Wang, Matthew F Peterson, Scott T Grafton, Miguel P Eckstein, and William Yang Wang. 2020. Counterfactual vision-and-language navigation via adversarial path sampler. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part VI 16. Springer, 71-86."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Machine Learning. PMLR, 2376-2384","author":"Goyal Yash","year":"2019","unstructured":"Yash Goyal, Ziyan Wu, Jan Ernst, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Counterfactual visual explanations. In International Conference on Machine Learning. PMLR, 2376-2384."},{"key":"e_1_3_2_1_13_1","unstructured":"Gregory Griffin Alex Holub Pietro Perona et al. 2007. Caltech-256 object category dataset. Technical Report. Technical Report 7694 California Institute of Technology Pasadena."},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning. PMLR, 1414-1423","author":"Hartford Jason","year":"2017","unstructured":"Jason Hartford, Greg Lewis, Kevin Leyton-Brown, and Matt Taddy. 2017. Deep IV: A flexible approach for counterfactual prediction. In International Conference on Machine Learning. PMLR, 1414-1423."},{"key":"e_1_3_2_1_15_1","volume-title":"William Yang Wang, and Xin Eric Wang","author":"He Xuehai","year":"2022","unstructured":"Xuehai He, Diji Yang, Weixi Feng, Tsu-Jui Fu, Arjun Akula, Varun Jampani, Pradyumna Narayana, Sugato Basu, William Yang Wang, and Xin Eric Wang. 2022. Cpl: Counterfactual prompt learning for vision and language models. arXiv preprint arXiv:2210.10362 (2022)."},{"key":"e_1_3_2_1_16_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_17_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_23"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the Asian Conference on Computer Vision. 858-876","author":"Jeanneret Guillaume","year":"2022","unstructured":"Guillaume Jeanneret, Lo\u00efc Simon, and Fr\u00e9d\u00e9ric Jurie. 2022. Diffusion models for counterfactual explanations. In Proceedings of the Asian Conference on Computer Vision. 858-876."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01576"},{"key":"e_1_3_2_1_21_1","volume-title":"International conference on machine learning. PMLR, 3020-3029","author":"Johansson Fredrik","year":"2016","unstructured":"Fredrik Johansson, Uri Shalit, and David Sontag. 2016. Learning representations for counterfactual inference. In International conference on machine learning. PMLR, 3020-3029."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00996"},{"key":"e_1_3_2_1_25_1","volume-title":"Supervised contrastive learning. Advances in neural information processing systems","author":"Khosla Prannay","year":"2020","unstructured":"Prannay Khosla, Piotr Teterwak, Chen Wang, Aaron Sarna, Yonglong Tian, Phillip Isola, Aaron Maschinot, Ce Liu, and Dilip Krishnan. 2020. Supervised contrastive learning. Advances in neural information processing systems, Vol. 33 (2020), 18661-18673."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01053"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_30_1","volume-title":"Supporting vision-language model inference with causality-pruning knowledge prompt. arXiv preprint arXiv:2205.11100","author":"Li Jiangmeng","year":"2022","unstructured":"Jiangmeng Li, Wenyi Mo, Wenwen Qiang, Bing Su, and Changwen Zheng. 2022b. Supporting vision-language model inference with causality-pruning knowledge prompt. arXiv preprint arXiv:2205.11100 (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72775-7_10"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671782"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM54844.2022.00130"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i12.29271"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.eacl-long.181"},{"key":"e_1_3_2_1_36_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.110064"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"e_1_3_2_1_39_1","volume-title":"Kun Zhang, and Bernhard Sch\u00f6lkopf.","author":"Lu Chaochao","year":"2020","unstructured":"Chaochao Lu, Biwei Huang, Ke Wang, Jos\u00e9 Miguel Hern\u00e1ndez-Lobato, Kun Zhang, and Bernhard Sch\u00f6lkopf. 2020. Sample-efficient reinforcement learning via counterfactual-based data augmentation. arXiv preprint arXiv:2012.09092 (2020)."},{"key":"e_1_3_2_1_40_1","volume-title":"Psychologically-inspired causal prompts. arXiv preprint arXiv:2305.01764","author":"Lyu Zhiheng","year":"2023","unstructured":"Zhiheng Lyu, Zhijing Jin, Justus Mattern, Rada Mihalcea, Mrinmaya Sachan, and Bernhard Sch\u00f6lkopf. 2023. Psychologically-inspired causal prompts. arXiv preprint arXiv:2305.01764 (2023)."},{"key":"e_1_3_2_1_41_1","volume-title":"arXiv preprint arXiv:2301.09031","author":"Nasr-Esfahany Arash","year":"2023","unstructured":"Arash Nasr-Esfahany and Emre Kiciman. 2023. Counterfactual (non-) identifiability of learned structural causal models. arXiv preprint arXiv:2301.09031 (2023)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"e_1_3_2_1_44_1","volume-title":"Daniel Coelho de Castro, and Ben Glocker","author":"Pawlowski Nick","year":"2020","unstructured":"Nick Pawlowski, Daniel Coelho de Castro, and Ben Glocker. 2020. Deep structural causal models for tractable counterfactual inference. Advances in neural information processing systems, Vol. 33 (2020), 857-869."},{"key":"e_1_3_2_1_45_1","unstructured":"Judea Pearl. 2009. Causality. Cambridge university press."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_47_1","first-page":"25165","article-title":"Lance: Stress-testing visual models by generating language-guided counterfactual images","volume":"36","author":"Prabhu Viraj","year":"2023","unstructured":"Viraj Prabhu, Sriram Yenamandra, Prithvijit Chattopadhyay, and Judy Hoffman. 2023. Lance: Stress-testing visual models by generating language-guided counterfactual images. Advances in Neural Information Processing Systems, Vol. 36 (2023), 25165-25184.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_49_1","volume-title":"Liu","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2023. Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. arXiv:1910.10683 [cs.LG] https:\/\/arxiv.org\/abs\/1910.10683"},{"key":"e_1_3_2_1_50_1","volume-title":"Contrastive learning with hard negative samples. arXiv preprint arXiv:2010.04592","author":"Robinson Joshua","year":"2020","unstructured":"Joshua Robinson, Ching-Yao Chuang, Suvrit Sra, and Stefanie Jegelka. 2020. Contrastive learning with hard negative samples. arXiv preprint arXiv:2010.04592 (2020)."},{"key":"e_1_3_2_1_51_1","volume-title":"Causal Learning and Reasoning","author":"Sanchez Pedro","year":"2022","unstructured":"Pedro Sanchez and Sotirios A Tsaftaris. 2022. Diffusion Causal Models for Counterfactual Estimation. In Causal Learning and Reasoning 2022."},{"key":"e_1_3_2_1_52_1","volume-title":"On Causal and Anticausal Learning. In 29th International Conference on Machine Learning (ICML 2012). International Conference on Machine Learning.","author":"Sch\u00f6lkopf Bernhard","year":"2012","unstructured":"Bernhard Sch\u00f6lkopf, Dominik Janzing, Jonas Peters, Eleni Sgouritsa, Kun Zhang, and Joris Mooij. 2012. On Causal and Anticausal Learning. In 29th International Conference on Machine Learning (ICML 2012). International Conference on Machine Learning."},{"key":"e_1_3_2_1_53_1","volume-title":"Eric Wallace, and Sameer Singh.","author":"Shin Taylor","year":"2020","unstructured":"Taylor Shin, Yasaman Razeghi, Robert L Logan IV, Eric Wallace, and Sameer Singh. 2020. Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980 (2020)."},{"volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","key":"e_1_3_2_1_54_1","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. [n.d.]. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_55_1","volume-title":"The self-normalized estimator for counterfactual learning. advances in neural information processing systems","author":"Swaminathan Adith","year":"2015","unstructured":"Adith Swaminathan and Thorsten Joachims. 2015. The self-normalized estimator for counterfactual learning. advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_56_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in neural information processing systems, Vol. 35 (2022), 24824-24837."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_20"},{"key":"e_1_3_2_1_58_1","first-page":"3485","article-title":"Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"Xiao Jianxiong","year":"2010","unstructured":"Jianxiong Xiao, James Hays, Krista A Ehinger, Aude Oliva, and Antonio Torralba. 2010. Sun database: Large-scale scene recognition from abbey to zoo. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3485-3492.","journal-title":"IEEE"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00947"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01446"},{"key":"e_1_3_2_1_61_1","volume-title":"Causal prompting: Debiasing large language model prompting based on front-door adjustment. arXiv preprint arXiv:2403.02738","author":"Zhang Congzhi","year":"2024","unstructured":"Congzhi Zhang, Linhai Zhang, Jialong Wu, Deyu Zhou, and Yulan He. 2024. Causal prompting: Debiasing large language model prompting based on front-door adjustment. arXiv preprint arXiv:2403.02738 (2024)."},{"key":"e_1_3_2_1_62_1","volume-title":"On the identifiability of the post-nonlinear causal model. arXiv preprint arXiv:1205.2599","author":"Zhang Kun","year":"2012","unstructured":"Kun Zhang and Aapo Hyvarinen. 2012. On the identifiability of the post-nonlinear causal model. arXiv preprint arXiv:1205.2599 (2012)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_65_1","volume-title":"International Conference on Machine Learning. PMLR, 12979-12990","author":"Zimmermann Roland S","year":"2021","unstructured":"Roland S Zimmermann, Yash Sharma, Steffen Schneider, Matthias Bethge, and Wieland Brendel. 2021. Contrastive learning inverts the data generating process. In International Conference on Machine Learning. PMLR, 12979-12990."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755820","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:03:15Z","timestamp":1765339395000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755820"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":65,"alternative-id":["10.1145\/3746027.3755820","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755820","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}