{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T19:45:37Z","timestamp":1768074337154,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFF0905402"],"award-info":[{"award-number":["2023YFF0905402"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Zhidemai Technology Co., Ltd"},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276268"],"award-info":[{"award-number":["62276268"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Natural Science Foundation","award":["L233008"],"award-info":[{"award-number":["L233008"]}]},{"name":"Migu Culture Technology Co., Ltd"},{"name":"Tencent AI Lab Rhino-Bird Focused Research Program"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681458","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"965-974","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["See or Guess: Counterfactually Regularized Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3288-1714","authenticated-orcid":false,"given":"Qian","family":"Cao","sequence":"first","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0144-1775","authenticated-orcid":false,"given":"Xu","family":"Chen","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6036-9035","authenticated-orcid":false,"given":"Ruihua","family":"Song","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5768-1095","authenticated-orcid":false,"given":"Xiting","family":"Wang","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6827-7426","authenticated-orcid":false,"given":"Xinting","family":"Huang","sequence":"additional","affiliation":[{"name":"Tencent AI Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3301-6113","authenticated-orcid":false,"given":"Yuchen","family":"Ren","sequence":"additional","affiliation":[{"name":"Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10044--10054","author":"Abbasnejad Ehsan","unstructured":"Ehsan Abbasnejad, Damien Teney, Amin Parvaneh, Javen Shi, and Anton van den Hengel. 2020. Counterfactual vision and language learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 10044--10054."},{"key":"e_1_3_2_1_2_1","volume-title":"Counterfactuals and Policy Analysis in Structural Models. In UAI '95: Proceedings of the Eleventh Annual Conference on Uncertainty in Artificial Intelligence","author":"Balke Alexander","year":"1995","unstructured":"Alexander Balke and Judea Pearl. 1995. Counterfactuals and Policy Analysis in Structural Models. In UAI '95: Proceedings of the Eleventh Annual Conference on Uncertainty in Artificial Intelligence, Montreal, Quebec, Canada, August 18--20, 1995, Philippe Besnard and Steve Hanks (Eds.). Morgan Kaufmann, 11--18. https:\/\/dslpitt.org\/uai\/displayArticleDetails.jsp?mmnu=1&smnu=2&article_id=414&proceeding_id=11"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00282"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00253"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548189"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-main.156"},{"key":"e_1_3_2_1_8_1","volume-title":"Causal inference in statistics: A primer","author":"Glymour Madelyn","unstructured":"Madelyn Glymour, Judea Pearl, and Nicholas P Jewell. 2016. Causal inference in statistics: A primer. John Wiley & Sons."},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Machine Learning. PMLR, 2376--2384","author":"Goyal Yash","year":"2019","unstructured":"Yash Goyal, Ziyan Wu, Jan Ernst, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Counterfactual visual explanations. In International Conference on Machine Learning. PMLR, 2376--2384."},{"key":"e_1_3_2_1_10_1","volume-title":"Generating Counterfactual Explanations with Natural Language. CoRR","author":"Hendricks Lisa Anne","year":"2018","unstructured":"Lisa Anne Hendricks, Ronghang Hu, Trevor Darrell, and Zeynep Akata. 2018. Generating Counterfactual Explanations with Natural Language. CoRR, Vol. abs\/1806.09809 (2018). showeprint[arXiv]1806.09809 http:\/\/arxiv.org\/abs\/1806.09809"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00353"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00902"},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Machine Learning, ICML 2023","volume":"19742","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven C. H. Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning, ICML 2023, 23--29 July 2023, Honolulu, Hawaii, USA (Proceedings of Machine Learning Research, Vol. 202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 19730--19742. https:\/\/proceedings.mlr.press\/v202\/li23q.html"},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings, Part XXX 16","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.265"},{"key":"e_1_3_2_1_17_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01751"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.08485"},{"key":"e_1_3_2_1_21_1","volume-title":"Bermano","author":"Mokady Ron","year":"2021","unstructured":"Ron Mokady, Amir Hertz, and Amit H. Bermano. 2021. ClipCap: CLIP Prefix for Image Captioning. CoRR, Vol. abs\/2111.09734 (2021). showeprint[arXiv]2111.09734 https:\/\/arxiv.org\/abs\/2111.09734"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_24_1","volume-title":"Direct and Indirect Effects. In UAI '01: Proceedings of the 17th Conference in Uncertainty in Artificial Intelligence","author":"Pearl Judea","year":"2001","unstructured":"Judea Pearl. 2001. Direct and Indirect Effects. In UAI '01: Proceedings of the 17th Conference in Uncertainty in Artificial Intelligence, University of Washington, Seattle, Washington, USA, August 2--5, 2001, Jack S. Breese and Daphne Koller (Eds.). Morgan Kaufmann, 411--420. https:\/\/dslpitt.org\/uai\/displayArticleDetails.jsp?mmnu=1&smnu=2&article_id=126&proceeding_id=17"},{"key":"e_1_3_2_1_25_1","volume-title":"Causal inference. Causality: objectives and assessment","author":"Pearl Judea","year":"2010","unstructured":"Judea Pearl. 2010. Causal inference. Causality: objectives and assessment (2010), 39--58."},{"key":"e_1_3_2_1_26_1","volume-title":"Cambridge, UK","author":"Judea Pearl","year":"2000","unstructured":"Judea Pearl et al. 2000. Models, reasoning and inference. Cambridge, UK: CambridgeUniversityPress, Vol. 19, 2 (2000)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00564"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_32_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Santurkar Shibani","year":"2023","unstructured":"Shibani Santurkar, Yann Dubois, Rohan Taori, Percy Liang, and Tatsunori Hashimoto. 2023. Is a caption worth a thousand images? a study on representation learning. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_1_33_1","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019","author":"Schwab Patrick","year":"2019","unstructured":"Patrick Schwab and Walter Karlen. 2019. CXPlain: Causal Explanations for Model Interpretation under Uncertainty. In Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8--14, 2019, Vancouver, BC, Canada, Hanna M. Wallach, Hugo Larochelle, Alina Beygelzimer, Florence d'Alch\u00e9-Buc, Emily B. Fox, and Roman Garnett (Eds.). 10220--10230. https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/3ab6be46e1d6b21d59a3c3a0b9d0f6ef-Abstract.html"},{"key":"e_1_3_2_1_34_1","volume-title":"From show to tell: A survey on deep learning-based image captioning","author":"Stefanini Matteo","year":"2022","unstructured":"Matteo Stefanini, Marcella Cornia, Lorenzo Baraldi, Silvia Cascianelli, Giuseppe Fiameni, and Rita Cucchiara. 2022. From show to tell: A survey on deep learning-based image captioning. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 1 (2022), 539--559."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_34"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_38_1","volume-title":"Counterfactual Maximum Likelihood Estimation for Training Deep Networks. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021","author":"Wang Xinyi","year":"2021","unstructured":"Xinyi Wang, Wenhu Chen, Michael Saxon, and William Yang Wang. 2021. Counterfactual Maximum Likelihood Estimation for Training Deep Networks. In Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021, December 6--14, 2021, virtual, Marc'Aurelio Ranzato, Alina Beygelzimer, Yann N. Dauphin, Percy Liang, and Jennifer Wortman Vaughan (Eds.). 25072--25085. https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/d30d0f522a86b3665d8e3a9a91472e28-Abstract.html"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.visinf.2022.09.003"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599240"},{"key":"e_1_3_2_1_41_1","volume-title":"International conference on machine learning. PMLR","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. PMLR, 2048--2057."},{"key":"e_1_3_2_1_42_1","volume-title":"Foundation models meet visualizations: Challenges and opportunities. Computational Visual Media","author":"Yang Weikai","year":"2024","unstructured":"Weikai Yang, Mengchen Liu, Zheng Wang, and Shixia Liu. 2024. Foundation models meet visualizations: Challenges and opportunities. Computational Visual Media (2024), 1--26."},{"key":"e_1_3_2_1_43_1","first-page":"12996","article-title":"Deconfounded image captioning: A causal retrospect","volume":"45","author":"Yang Xu","year":"2021","unstructured":"Xu Yang, Hanwang Zhang, and Jianfei Cai. 2021. Deconfounded image captioning: A causal retrospect. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 11 (2021), 12996--13010.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01515"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING","author":"Zhang Hanyu","year":"2024","unstructured":"Hanyu Zhang, Xiting Wang, Xiang Ao, and Qing He. 2024. Distillation with Explanations from Large Language Models. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024). 5018--5028."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/AAAI.V35I4.16452"},{"key":"e_1_3_2_1_49_1","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020. Counterfactual contrastive learning for weakly-supervised vision-language grounding. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18123--18134.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2304.10592"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681458","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681458","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681458"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3681458","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681458","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}