{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:21:09Z","timestamp":1773246069772,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Natural Science Foundation of Hunan Province","award":["2022JJ30159"],"award-info":[{"award-number":["2022JJ30159"]}]},{"name":"Natural Science Foundation of Hunan Province","award":["2023JJ20013"],"award-info":[{"award-number":["2023JJ20013"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802121"],"award-info":[{"award-number":["61802121"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Fundamental Research Funds for the Central Universities"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681263","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"4331-4340","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Causal-driven Large Language Models with Faithful Reasoning for Knowledge Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6601-2958","authenticated-orcid":false,"given":"Jiawei","family":"Wang","sequence":"first","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2611-2559","authenticated-orcid":false,"given":"Da","family":"Cao","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2183-4314","authenticated-orcid":false,"given":"Shaofei","family":"Lu","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7705-3639","authenticated-orcid":false,"given":"Zhanchang","family":"Ma","sequence":"additional","affiliation":[{"name":"Hunan University, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5573-6195","authenticated-orcid":false,"given":"Junbin","family":"Xiao","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.761"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","volume-title":"GPT-4 Can't Reason. arXiv preprint arXiv:2308.03762","author":"Arkoudas Konstantine","year":"2023","unstructured":"Konstantine Arkoudas. 2023. GPT-4 Can't Reason. arXiv preprint arXiv:2308.03762 (2023)."},{"key":"e_1_3_2_1_4_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. 2020. Language models are few-shot learners. In Advances in Neural Information Processing Systems, Vol. 33. 1877--1901.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611987"},{"key":"e_1_3_2_1_6_1","volume-title":"Faithful reasoning using large language models. arXiv preprint arXiv:2208.14271","author":"Creswell Antonia","year":"2022","unstructured":"Antonia Creswell and Murray Shanahan. 2022. Faithful reasoning using large language models. arXiv preprint arXiv:2208.14271 (2022)."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_8_1","volume-title":"Testing GPT-4 with Wolfram Alpha and Code Interpreter plug-ins on math and science problems. arXiv preprint arXiv:2308.05713","author":"Davis Ernest","year":"2023","unstructured":"Ernest Davis and Scott Aaronson. 2023. Testing GPT-4 with Wolfram Alpha and Code Interpreter plug-ins on math and science problems. arXiv preprint arXiv:2308.05713 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00680"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.2307\/2024268"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29771"},{"key":"e_1_3_2_1_12_1","volume-title":"Rethinking with retrieval: Faithful large language model inference. arXiv preprint arXiv:2301.00303","author":"He Hangfeng","year":"2022","unstructured":"Hangfeng He, Hongming Zhang, and Dan Roth. 2022. Rethinking with retrieval: Faithful large language model inference. arXiv preprint arXiv:2301.00303 (2022)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.68"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.386"},{"key":"e_1_3_2_1_15_1","volume-title":"Discriminator-Guided Multi-step Reasoning with Language Models. arXiv preprint arXiv:2305.14934","author":"Khalifa Muhammad","year":"2023","unstructured":"Muhammad Khalifa, Lajanugen Logeswaran, Moontae Lee, Honglak Lee, and Lu Wang. 2023. Discriminator-Guided Multi-step Reasoning with Language Models. arXiv preprint arXiv:2305.14934 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Unifiedqa: Crossing format boundaries with a single qa system. arXiv preprint arXiv:2005.00700","author":"Khashabi Daniel","year":"2020","unstructured":"Daniel Khashabi, Sewon Min, Tushar Khot, Ashish Sabharwal, Oyvind Tafjord, Peter Clark, and Hannaneh Hajishirzi. 2020. Unifiedqa: Crossing format boundaries with a single qa system. arXiv preprint arXiv:2005.00700 (2020)."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Machine Learning. PMLR, 5583--5594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583--5594."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612389"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.397"},{"key":"e_1_3_2_1_20_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_22_1","volume-title":"Confidence Matters: Revisiting Intrinsic Self-Correction Capabilities of Large Language Models. arXiv preprint arXiv:2402.12563","author":"Li Loka","year":"2024","unstructured":"Loka Li, Guangyi Chen, Yusheng Su, Zhenhao Chen, Yixuan Zhang, Eric Xing, and Kun Zhang. 2024. Confidence Matters: Revisiting Intrinsic Self-Correction Capabilities of Large Language Models. arXiv preprint arXiv:2402.12563 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.291"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3626772.3657807"},{"key":"e_1_3_2_1_26_1","first-page":"2507","article-title":"Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering","volume":"35","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tanglin Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering. In Advances in Neural Information Processing Systems, Vol. 35. 2507--2521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","volume-title":"Iconqa: A new benchmark for abstract diagram understanding and visual language reasoning. arXiv preprint arXiv:2110.13214","author":"Lu Pan","year":"2021","unstructured":"Pan Lu, Liang Qiu, Jiaqi Chen, Tony Xia, Yizhou Zhao, Wei Zhang, Zhou Yu, Xiaodan Liang, and Song-Chun Zhu. 2021. Iconqa: A new benchmark for abstract diagram understanding and visual language reasoning. arXiv preprint arXiv:2110.13214 (2021)."},{"key":"e_1_3_2_1_28_1","volume-title":"AutoM3L: An Automated Multimodal Machine Learning Framework with Large Language Models. arXiv preprint arXiv:2408.00665","author":"Luo Daqin","year":"2024","unstructured":"Daqin Luo, Chengjian Feng, Yuxuan Nong, and Yiqing Shen. 2024. AutoM3L: An Automated Multimodal Machine Learning Framework with Large Language Models. arXiv preprint arXiv:2408.00665 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645316"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583451"},{"key":"e_1_3_2_1_31_1","unstructured":"Long Ouyang Jeffrey Wu Xu Jiang Diogo Almeida Carroll Wainwright Pamela Mishkin Chong Zhang Sandhini Agarwal Katarina Slama Alex Ray et al. 2022. Training language models to follow instructions with human feedback. Vol. 35 (2022) 27730--27744."},{"key":"e_1_3_2_1_32_1","volume-title":"Cambridge University Press","volume":"19","author":"Judea","year":"2000","unstructured":"Judea Pearl et al. 2000. Models, reasoning and inference. Cambridge University Press, Vol. 19, 2 (2000)."},{"key":"e_1_3_2_1_33_1","volume-title":"IMAGDressing-v1: Customizable Virtual Dressing. arXiv preprint arXiv:2407.12705","author":"Shen Fei","year":"2024","unstructured":"Fei Shen, Xin Jiang, Xin He, Hu Ye, Cong Wang, Xiaoyu Du, Zechao Li, and Jinghui Tang. 2024. IMAGDressing-v1: Customizable Virtual Dressing. arXiv preprint arXiv:2407.12705 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612009"},{"key":"e_1_3_2_1_35_1","volume-title":"Reflexion: Language agents with verbal reinforcement learning. In Advances in Neural Information Processing Systems.","author":"Shinn Noah","year":"2023","unstructured":"Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik R Narasimhan, and Shunyu Yao. 2023. Reflexion: Language agents with verbal reinforcement learning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.330"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351087"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613822"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Wang Xuezhi","year":"2023","unstructured":"Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc V Le, Ed H Chi, Sharan Narang, Aakanksha Chowdhery, and Denny Zhou. 2023. Self-Consistency Improves Chain of Thought Reasoning in Language Models. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_40_1","volume-title":"Denny Zhou, et al.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. , Vol. 35 (2022), 24824--24837."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611873"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing.","author":"Weng Yixuan","year":"2023","unstructured":"Yixuan Weng, Minjun Zhu, Fei Xia, Bin Li, Shizhu He, Kang Liu, and Jun Zhao. 2023. Large Language Models are Better Reasoners with Self-Verification. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing."},{"key":"e_1_3_2_1_43_1","volume-title":"Next-gpt: Any-to-any multimodal llm. arXiv preprint arXiv:2309.05519","author":"Wu Shengqiong","year":"2023","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2023. Next-gpt: Any-to-any multimodal llm. arXiv preprint arXiv:2309.05519 (2023)."},{"key":"e_1_3_2_1_44_1","unstructured":"Yuxi Xie Kenji Kawaguchi Yiran Zhao Xu Zhao Min-Yen Kan Junxian He and Qizhe Xie. 2023. Self-Evaluation Guided Beam Search for Reasoning. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_45_1","volume-title":"An Empirical Evaluation of Confidence Elicitation in LLMs. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=gjeQKFxFpZ","author":"Xiong Miao","year":"2024","unstructured":"Miao Xiong, Zhiyuan Hu, Xinyang Lu, YIFEI LI, Jie Fu, Junxian He, and Bryan Hooi. 2024. Can LLMs Express Their Uncertainty? An Empirical Evaluation of Confidence Elicitation in LLMs. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=gjeQKFxFpZ"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00972"},{"key":"e_1_3_2_1_47_1","volume-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421","author":"Yang Zhengyuan","year":"2023","unstructured":"Zhengyuan Yang, Linjie Li, Kevin Lin, Jianfeng Wang, Chung-Ching Lin, Zicheng Liu, and Lijuan Wang. 2023. The dawn of lmms: Preliminary explorations with gpt-4v (ision). arXiv preprint arXiv:2309.17421, Vol. 9 (2023)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_49_1","first-page":"11809","article-title":"Tree of Thoughts: Deliberate Problem Solving with Large Language Models","volume":"36","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2023. Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In Advances in Neural Information Processing Systems, Vol. 36. 11809--11822.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_50_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik R Narasimhan, and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_51_1","volume-title":"A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_2_1_53_1","volume-title":"Multimodal Chain-of-Thought Reasoning in Language Models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023. Multimodal Chain-of-Thought Reasoning in Language Models. arXiv preprint arXiv:2302.00923 (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Zhou Denny","year":"2022","unstructured":"Denny Zhou, Nathanael Sch\u00e4rli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Claire Cui, Olivier Bousquet, Quoc V Le, et al. 2022. Least-to-Most Prompting Enables Complex Reasoning in Large Language Models. In Proceedings of the International Conference on Learning Representations."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681263","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681263","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681263"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3681263","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681263","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}