{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T19:18:46Z","timestamp":1776107926535,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant No.: 62372314"],"award-info":[{"award-number":["Grant No.: 62372314"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Hong Kong RGC Theme-based Research Scheme","award":["PolyU No.: T43-513\/23-N"],"award-info":[{"award-number":["PolyU No.: T43-513\/23-N"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681102","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"419-428","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["A Picture Is Worth a Graph: A Blueprint Debate Paradigm for Multimodal Reasoning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2945-8248","authenticated-orcid":false,"given":"Changmeng","family":"Zheng","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University, Kowloon, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0847-3416","authenticated-orcid":false,"given":"Dayong","family":"Liang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},
{"ORCID":"https:\/\/orcid.org\/0009-0001-2347-4183","authenticated-orcid":false,"given":"Wengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Kowloon, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5706-5177","authenticated-orcid":false,"given":"Xiao-Yong","family":"Wei","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Kowloon, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3370-471X","authenticated-orcid":false,"given":"Qing","family":"Li","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University, Kowloon, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al.","author":"Achiam Josh","year":"2023","unstructured":"Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, et al. 2023. Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},
{"key":"e_1_3_2_2_3_1","volume-title":"The Twelfth International Conference on Learning Representations.","author":"Chan Chi-Min","year":"2023","unstructured":"Chi-Min Chan, Weize Chen, Yusheng Su, Jianxuan Yu, Wei Xue, Shanghang Zhang, Jie Fu, and Zhiyuan Liu. 2023. ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_4_1","volume-title":"Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325","author":"Chen Xinlei","year":"2015","unstructured":"Xinlei Chen, Hao Fang, Tsung-Yi Lin, Ramakrishna Vedantam, Saurabh Gupta, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_2_5_1","unstructured":"Karl Cobbe Vineet Kosaraju Mohammad Bavarian Mark Chen Heewoo Jun Lukasz Kaiser Matthias Plappert Jerry Tworek Jacob Hilton Reiichiro Nakano et al. 2021. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)."},{"key":"e_1_3_2_2_6_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Albert Li, Pascale Fung, and Steven C. H. Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Albert Li, Pascale Fung, and Steven C. H. Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. ArXiv, Vol. abs\/2305.06500 (2023)."},{"key":"e_1_3_2_2_7_1","volume-title":"Improving Factuality and Reasoning in Language Models through Multiagent Debate. arXiv preprint arXiv:2305.14325","author":"Du Yilun","year":"2023","unstructured":"Yilun Du, Shuang Li, Antonio Torralba, Joshua B Tenenbaum, and Igor Mordatch. 2023. Improving Factuality and Reasoning in Language Models through Multiagent Debate. arXiv preprint arXiv:2305.14325 (2023)."},
{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.327"},{"key":"e_1_3_2_2_10_1","volume-title":"Measuring Massive Multitask Language Understanding. In International Conference on Learning Representations.","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring Massive Multitask Language Understanding. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_11_1","volume-title":"Scitune: Aligning large language models with scientific multimodal instructions. arXiv preprint arXiv:2307.01139","author":"Horawalavithana Sameera","year":"2023","unstructured":"Sameera Horawalavithana, Sai Munikoti, Ian Stewart, and Henry Kvinge. 2023. Scitune: Aligning large language models with scientific multimodal instructions. arXiv preprint arXiv:2307.01139 (2023)."},{"key":"e_1_3_2_2_12_1","volume-title":"A survey of knowledge enhanced pre-trained language models","author":"Hu Linmei","year":"2023","unstructured":"Linmei Hu, Zeyi Liu, Ziwang Zhao, Lei Hou, Liqiang Nie, and Juanzi Li. 2023. A survey of knowledge enhanced pre-trained language models. IEEE Transactions on Knowledge and Data Engineering (2023)."},{"key":"e_1_3_2_2_13_1","volume-title":"Large Language Models Cannot Self-Correct Reasoning Yet. In The Twelfth International Conference on Learning Representations.","author":"Huang Jie","year":"2023","unstructured":"Jie Huang, Xinyun Chen, Swaroop Mishra, Huaixiu Steven Zheng, Adams Wei Yu, Xinying Song, and Denny Zhou. 2023. Large Language Models Cannot Self-Correct Reasoning Yet. In The Twelfth International Conference on Learning Representations."},
{"key":"e_1_3_2_2_14_1","volume-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3587251"},{"key":"e_1_3_2_2_16_1","volume-title":"Encouraging Divergent Thinking in Large Language Models through Multi-Agent Debate. arXiv preprint arXiv:2305.19118","author":"Liang Tian","year":"2023","unstructured":"Tian Liang, Zhiwei He, Wenxiang Jiao, Xing Wang, Yan Wang, Rui Wang, Yujiu Yang, Zhaopeng Tu, and Shuming Shi. 2023. Encouraging Divergent Thinking in Large Language Models through Multi-Agent Debate. arXiv preprint arXiv:2305.19118 (2023)."},{"key":"e_1_3_2_2_17_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_18_1","volume-title":"Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281","author":"Liu Yuan","year":"2023","unstructured":"Yuan Liu, Haodong Duan, Yuanhan Zhang, Bo Li, Songyang Zhang, Wangbo Zhao, Yike Yuan, Jiaqi Wang, Conghui He, Ziwei Liu, et al. 2023. Mmbench: Is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)."},
{"key":"e_1_3_2_2_19_1","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume":"35","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tanglin Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems, Vol. 35 (2022), 2507--2521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_20_1","volume-title":"Song-Chun Zhu, and Jianfeng Gao.","author":"Lu Pan","year":"2024","unstructured":"Pan Lu, Baolin Peng, Hao Cheng, Michel Galley, Kai-Wei Chang, Ying Nian Wu, Song-Chun Zhu, and Jianfeng Gao. 2024. Chameleon: Plug-and-play compositional reasoning with large language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583451"},{"key":"e_1_3_2_2_22_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Madaan Aman","year":"2024","unstructured":"Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et al. 2024. Self-refine: Iterative refinement with self-feedback. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_23_1","volume-title":"Compositional chain-of-thought prompting for large multimodal models. arXiv preprint arXiv:2311.17076","author":"Mitra Chancharik","year":"2023","unstructured":"Chancharik Mitra, Brandon Huang, Trevor Darrell, and Roei Herzig. 2023. Compositional chain-of-thought prompting for large multimodal models. arXiv preprint arXiv:2311.17076 (2023)."},
{"key":"e_1_3_2_2_24_1","volume-title":"KAM-CoT: Knowledge Augmented Multimodal Chain-of-Thoughts Reasoning. arXiv preprint arXiv:2401.12863","author":"Mondal Debjyoti","year":"2024","unstructured":"Debjyoti Mondal, Suraj Modi, Subhadarshi Panda, Rituraj Singh, and Godawari Sudhakar Rao. 2024. KAM-CoT: Knowledge Augmented Multimodal Chain-of-Thoughts Reasoning. arXiv preprint arXiv:2401.12863 (2024)."},{"key":"e_1_3_2_2_25_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Shinn Noah","year":"2024","unstructured":"Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. 2024. Reflexion: Language agents with verbal reinforcement learning. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_2_26_1","volume-title":"Think-on-Graph: Deep and Responsible Reasoning of Large Language Model on Knowledge Graph. In The Twelfth International Conference on Learning Representations.","author":"Sun Jiashuo","year":"2023","unstructured":"Jiashuo Sun, Chengjin Xu, Lumingyuan Tang, Saizhuo Wang, Chen Lin, Yeyun Gong, Lionel Ni, Heung-Yeung Shum, and Jian Guo. 2023. Think-on-Graph: Deep and Responsible Reasoning of Large Language Model on Knowledge Graph. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_2_27_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"Knowledge-driven cot: Exploring faithful reasoning in llms for knowledge-intensive question answering. arXiv preprint arXiv:2308.13259","author":"Wang Keheng","year":"2023","unstructured":"Keheng Wang, Feiyu Duan, Sirui Wang, Peiguang Li, Yunsen Xian, Chuantao Yin, Wenge Rong, and Zhang Xiong. 2023. Knowledge-driven cot: Exploring faithful reasoning in llms for knowledge-intensive question answering. arXiv preprint arXiv:2308.13259 (2023)."},
{"key":"e_1_3_2_2_29_1","volume-title":"T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large Language Model Signals for Science Question Answering. arXiv preprint arXiv:2305.03453","author":"Wang Lei","year":"2023","unstructured":"Lei Wang, Yi Hu, Jiabang He, Xing Xu, Ning Liu, Hui Liu, and Heng Tao Shen. 2023. T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large Language Model Signals for Science Question Answering. arXiv preprint arXiv:2305.03453 (2023)."},{"key":"e_1_3_2_2_30_1","volume-title":"Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079","author":"Wang Weihan","year":"2023","unstructured":"Weihan Wang, Qingsong Lv, Wenmeng Yu, Wenyi Hong, Ji Qi, Yan Wang, Junhui Ji, Zhuoyi Yang, Lei Zhao, Xixuan Song, et al. 2023. Cogvlm: Visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01973"},{"key":"e_1_3_2_2_32_1","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny Zhou, et al. 2022. Chain-of-thought prompting elicits reasoning in large language models. Advances in Neural Information Processing Systems, Vol. 35 (2022), 24824--24837.","journal-title":"Advances in Neural Information Processing Systems"},
{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/1459359.1459371"},{"key":"e_1_3_2_2_34_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Welleck Sean","year":"2022","unstructured":"Sean Welleck, Ximing Lu, Peter West, Faeze Brahman, Tianxiao Shen, Daniel Khashabi, and Yejin Choi. 2022. Generating Sequences by Learning to Self-Correct. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413650"},{"key":"e_1_3_2_2_36_1","volume-title":"Image and Video. In International Conference on Machine Learning.","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Mingshi Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qiuchen Qian, Wei Wang, Guohai Xu, Ji Zhang, Songfang Huang, Feiran Huang, and Jingren Zhou. 2023. mPLUG-2: A Modularized Multi-modal Foundation Model Across Text, Image and Video. In International Conference on Machine Learning."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.936"},{"key":"e_1_3_2_2_38_1","volume-title":"Exploring collaboration mechanisms for llm agents: A social psychology view. arXiv preprint arXiv:2310.02124","author":"Zhang Jintian","year":"2023","unstructured":"Jintian Zhang, Xin Xu, and Shumin Deng. 2023. Exploring collaboration mechanisms for llm agents: A social psychology view. arXiv preprint arXiv:2310.02124 (2023)."},{"key":"e_1_3_2_2_39_1","volume-title":"Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023. Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)."},
{"key":"e_1_3_2_2_40_1","volume-title":"Boosting entity-aware image captioning with multi-modal knowledge graph","author":"Zhao Wentian","year":"2023","unstructured":"Wentian Zhao and Xinxiao Wu. 2023. Boosting entity-aware image captioning with multi-modal knowledge graph. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3476968"},{"key":"e_1_3_2_2_42_1","volume-title":"Progressive-hint prompting improves reasoning in large language models. arXiv preprint arXiv:2304.09797","author":"Zheng Chuanyang","year":"2023","unstructured":"Chuanyang Zheng, Zhengying Liu, Enze Xie, Zhenguo Li, and Yu Li. 2023. Progressive-hint prompting improves reasoning in large language models. arXiv preprint arXiv:2304.09797 (2023)."},{"key":"e_1_3_2_2_43_1","volume-title":"DDCoT: Duty-Distinct Chain-of-Thought Prompting for Multimodal Reasoning in Language Models. In Thirty-seventh Conference on Neural Information Processing Systems.","author":"Zheng Ge","year":"2023","unstructured":"Ge Zheng, Bin Yang, Jiajin Tang, Hong-Yu Zhou, and Sibei Yang. 2023. DDCoT: Duty-Distinct Chain-of-Thought Prompting for Multimodal Reasoning in Language Models. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_44_1","volume-title":"The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=1tZbq88f27","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=1tZbq88f27"}],
"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681102","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681102","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:52Z","timestamp":1750294672000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681102"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":44,"alternative-id":["10.1145\/3664647.3681102","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681102","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}