{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:20:14Z","timestamp":1777656014829,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2037, U22B2051, U23A20383, U21A20472, 62176222, 62176223, 62176226, 62072386, 62072387, 62072389, 62002305, 62272401"],"award-info":[{"award-number":["U21B2037, U22B2051, U23A20383, U21A20472, 62176222, 62176223, 62176226, 62072386, 62072387, 62072389, 62002305, 62272401"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Natural Science Foundation of Fujian Province of China","award":["2022J06001"],"award-info":[{"award-number":["2022J06001"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","award":["62025603"],"award-info":[{"award-number":["62025603"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2022ZD0118201"],"award-info":[{"award-number":["2022ZD0118201"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681249","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"9096-9105","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["Cantor: Inspiring Multimodal Chain-of-Thought of MLLM"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-4034-2865","authenticated-orcid":false,"given":"Timin","family":"Gao","sequence":"first","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8015-5366","authenticated-orcid":false,"given":"Peixian","family":"Chen","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2911-5369","authenticated-orcid":false,"given":"Mengdan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0079-7668","authenticated-orcid":false,"given":"Chaoyou","family":"Fu","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University &amp; School of Intelligence Science and Technology, Nanjing University, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3970-7519","authenticated-orcid":false,"given":"Yunhang","family":"Shen","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1642-0758","authenticated-orcid":false,"given":"Yan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0800-0609","authenticated-orcid":false,"given":"Shengchuan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6855-5403","authenticated-orcid":false,"given":"Xiawu","family":"Zheng","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8132-9083","authenticated-orcid":false,"given":"Xing","family":"Sun","sequence":"additional","affiliation":[{"name":"Tencent Youtu Lab, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7645-9606","authenticated-orcid":false,"given":"Liujuan","family":"Cao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9163-2932","authenticated-orcid":false,"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[{"name":"Key Laboratory of Multimedia Trusted Perception and Efficient Computing, Ministry of Education of China, Xiamen University, Xiamen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29720"},{"key":"e_1_3_2_1_2_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_3_1","volume-title":"Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. arXiv preprint arXiv:2211.12588","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Xueguang Ma, Xinyi Wang, and William W Cohen. 2022. Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. arXiv preprint arXiv:2211.12588 (2022)."},{"key":"e_1_3_2_1_4_1","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery Aakanksha","year":"2023","unstructured":"Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. 2023. Palm: Scaling language modeling with pathways. Journal of Machine Learning Research, Vol. 24, 240 (2023), 1--113.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_5_1","volume-title":"Active prompting with chain-of-thought for large language models. arXiv preprint arXiv:2302.12246","author":"Diao Shizhe","year":"2023","unstructured":"Shizhe Diao, Pengcheng Wang, Yong Lin, and Tong Zhang. 2023. Active prompting with chain-of-thought for large language models. arXiv preprint arXiv:2302.12246 (2023)."},{"key":"e_1_3_2_1_6_1","volume-title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Peixian Chen, Yunhang Shen, Yulei Qin, Mengdan Zhang, Xu Lin, Jinrui Yang, Xiawu Zheng, Ke Li, Xing Sun, Yunsheng Wu, and Rongrong Ji. 2023. MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2306.13394 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise. arXiv preprint arXiv:2312.12436","author":"Fu Chaoyou","year":"2023","unstructured":"Chaoyou Fu, Renrui Zhang, Zihan Wang, Yubo Huang, Zhengye Zhang, Longtian Qiu, Gaoxiang Ye, Yunhang Shen, Mengdan Zhang, Peixian Chen, Sirui Zhao, Shaohui Lin, Deqiang Jiang, Di Yin, Peng Gao, Ke Li, Hongsheng Li, and Xing Sun. 2023. A Challenger to GPT-4V? Early Explorations of Gemini in Visual Expertise. arXiv preprint arXiv:2312.12436 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"International Conference on Machine Learning. PMLR, 10764--10799","author":"Gao Luyu","year":"2023","unstructured":"Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig. 2023. Pal: Program-aided language models. In International Conference on Machine Learning. PMLR, 10764--10799."},{"key":"e_1_3_2_1_9_1","first-page":"6704","article-title":"Cyclip: Cyclic contrastive language-image pretraining","volume":"35","author":"Goel Shashank","year":"2022","unstructured":"Shashank Goel, Hritik Bansal, Sumit Bhatia, Ryan Rossi, Vishwa Vinay, and Aditya Grover. 2022. Cyclip: Cyclic contrastive language-image pretraining. Advances in Neural Information Processing Systems, Vol. 35 (2022), 6704--6719.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29776"},{"key":"e_1_3_2_1_11_1","volume-title":"Large language models are reasoning teachers. arXiv preprint arXiv:2212.10071","author":"Ho Namgyu","year":"2022","unstructured":"Namgyu Ho, Laura Schmid, and Se-Young Yun. 2022. Large language models are reasoning teachers. arXiv preprint arXiv:2212.10071 (2022)."},{"key":"e_1_3_2_1_12_1","volume-title":"Scitune: Aligning large language models with scientific multimodal instructions. arXiv preprint arXiv:2307.01139","author":"Horawalavithana Sameera","year":"2023","unstructured":"Sameera Horawalavithana, Sai Munikoti, Ian Stewart, and Henry Kvinge. 2023. Scitune: Aligning large language models with scientific multimodal instructions. arXiv preprint arXiv:2307.01139 (2023)."},{"key":"e_1_3_2_1_13_1","volume-title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning into Vision-Language Models. arXiv preprint arXiv:2312.03052","author":"Hu Yushi","year":"2023","unstructured":"Yushi Hu, Otilia Stretcu, Chun-Ta Lu, Krishnamurthy Viswanathan, Kenji Hata, Enming Luo, Ranjay Krishna, and Ariel Fuxman. 2023. Visual Program Distillation: Distilling Tools and Programmatic Reasoning into Vision-Language Models. arXiv preprint arXiv:2312.03052 (2023)."},{"key":"e_1_3_2_1_14_1","volume-title":"DP-INNet: Dual-Path Implicit Neural Network for Spatial and Spectral Features Fusion in Pan-Sharpening. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 268--279","author":"Huang Jingjia","year":"2023","unstructured":"Jingjia Huang, Ge Meng, Yingying Wang, Yunlong Lin, Yue Huang, and Xinghao Ding. 2023. DP-INNet: Dual-Path Implicit Neural Network for Spatial and Spectral Features Fusion in Pan-Sharpening. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 268--279."},{"key":"e_1_3_2_1_15_1","volume-title":"Neil Zhenqiang Gong, et al","author":"Huang Yue","year":"2023","unstructured":"Yue Huang, Jiawen Shi, Yuan Li, Chenrui Fan, Siyuan Wu, Qihui Zhang, Yixin Liu, Pan Zhou, Yao Wan, Neil Zhenqiang Gong, et al. 2023. Metatool benchmark for large language models: Deciding whether to use tools and which to use. arXiv preprint arXiv:2310.03128 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Mathprompter: Mathematical reasoning using large language models. arXiv preprint arXiv:2303.05398","author":"Imani Shima","year":"2023","unstructured":"Shima Imani, Liang Du, and Harsh Shrivastava. 2023. Mathprompter: Mathematical reasoning using large language models. arXiv preprint arXiv:2303.05398 (2023)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Daniel Khashabi Sewon Min Tushar Khot Ashish Sabharwal Oyvind Tafjord Peter Clark and Hannaneh Hajishirzi. 2020. UnifiedQA: Crossing Format Boundaries with a Single QA System. In Findings of the Association for Computational Linguistics (EMNLP). 1896--1907.","DOI":"10.18653\/v1\/2020.findings-emnlp.171"},{"key":"e_1_3_2_1_18_1","volume-title":"Internet-augmented dialogue generation. arXiv preprint arXiv:2107.07566","author":"Komeili Mojtaba","year":"2021","unstructured":"Mojtaba Komeili, Kurt Shuster, and Jason Weston. 2021. Internet-augmented dialogue generation. arXiv preprint arXiv:2107.07566 (2021)."},{"key":"e_1_3_2_1_19_1","unstructured":"Bin Lei Chunhua Liao Caiwen Ding et al. 2023. Boosting logical reasoning in large language models through a new framework: The graph of thought. arXiv preprint arXiv:2308.08614 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"International conference on machine learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_21_1","volume-title":"Api-bank: A benchmark for tool-augmented llms. arXiv preprint arXiv:2304.08244","author":"Li Minghao","year":"2023","unstructured":"Minghao Li, Feifan Song, Bowen Yu, Haiyang Yu, Zhoujun Li, Fei Huang, and Yongbin Li. 2023. Api-bank: A benchmark for tool-augmented llms. arXiv preprint arXiv:2304.08244 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575","author":"Lin Ziyi","year":"2023","unstructured":"Ziyi Lin, Chris Liu, Renrui Zhang, Peng Gao, Longtian Qiu, Han Xiao, Han Qiu, Chen Lin, Wenqi Shao, Keqin Chen, et al. 2023. Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models. arXiv preprint arXiv:2311.07575 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Visual Instruction Tuning. arXiv preprint arXiv:2304.08485","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. arXiv preprint arXiv:2304.08485 (2023)."},{"key":"e_1_3_2_1_25_1","volume-title":"Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255","author":"Lu Pan","year":"2023","unstructured":"Pan Lu, Hritik Bansal, Tony Xia, Jiacheng Liu, Chunyuan Li, Hannaneh Hajishirzi, Hao Cheng, Kai-Wei Chang, Michel Galley, and Jianfeng Gao. 2023. Mathvista: Evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"The 36th Conference on Neural Information Processing Systems (NeurIPS).","author":"Lu Pan","year":"2022","unstructured":"Pan Lu, Swaroop Mishra, Tony Xia, Liang Qiu, Kai-Wei Chang, Song-Chun Zhu, Oyvind Tafjord, Peter Clark, and Ashwin Kalyan. 2022. Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering. In The 36th Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_27_1","volume-title":"Song-Chun Zhu, and Jianfeng Gao.","author":"Lu Pan","year":"2024","unstructured":"Pan Lu, Baolin Peng, Hao Cheng, Michel Galley, Kai-Wei Chang, Ying Nian Wu, Song-Chun Zhu, and Jianfeng Gao. 2024. Chameleon: Plug-and-play compositional reasoning with large language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28214"},{"key":"e_1_3_2_1_29_1","volume-title":"Compositional chain-of-thought prompting for large multimodal models. arXiv preprint arXiv:2311.17076","author":"Mitra Chancharik","year":"2023","unstructured":"Chancharik Mitra, Brandon Huang, Trevor Darrell, and Roei Herzig. 2023. Compositional chain-of-thought prompting for large multimodal models. arXiv preprint arXiv:2311.17076 (2023)."},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/blog\/chatgpt"},{"key":"e_1_3_2_1_31_1","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume":"35","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, et al. 2022. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, Vol. 35 (2022), 27730--27744.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_33_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_34_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Schick Timo","year":"2024","unstructured":"Timo Schick, Jane Dwivedi-Yu, Roberto Dess\u00ec, Roberta Raileanu, Maria Lomeli, Eric Hambro, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. 2024. Toolformer: Language models can teach themselves to use tools. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Shen Yongliang","year":"2024","unstructured":"Yongliang Shen, Kaitao Song, Xu Tan, Dongsheng Li, Weiming Lu, and Yueting Zhuang. 2024. Hugginggpt: Solving ai tasks with chatgpt and its friends in hugging face. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_36_1","volume-title":"Siamak Shakeri, Dara Bahri","author":"Tay Yi","year":"2022","unstructured":"Yi Tay, Mostafa Dehghani, Vinh Q Tran, Xavier Garcia, Jason Wei, Xuezhi Wang, Hyung Won Chung, Siamak Shakeri, Dara Bahri, Tal Schuster, et al. 2022. Ul2: Unifying language learning paradigms. arXiv preprint arXiv:2205.05131 (2022)."},{"key":"e_1_3_2_1_37_1","volume-title":"Mllm-tool: A multimodal large language model for tool agent learning. arXiv preprint arXiv:2401.10727","author":"Wang Chenyu","year":"2024","unstructured":"Chenyu Wang, Weixin Luo, Qianyu Chen, Haonan Mai, Jindi Guo, Sixun Dong, XM Xuan, Zhengxin Li, Lin Ma, and Shenghua Gao. 2024. Mllm-tool: A multimodal large language model for tool agent learning. arXiv preprint arXiv:2401.10727, Vol. 4 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29884"},{"key":"e_1_3_2_1_39_1","volume-title":"Chi, and Denny Zhou","author":"Wang Xuezhi","year":"2022","unstructured":"Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc Le, Ed Chi, and Denny Zhou. 2022. Self-consistency improves chain of thought reasoning in language models. arXiv preprint arXiv:2203.11171 (2022)."},{"key":"e_1_3_2_1_40_1","volume-title":"Chi, Quoc Le, and Denny Zhou","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Chi, Quoc Le, and Denny Zhou. 2022. Chain of thought prompting elicits reasoning in large language models. arXiv preprint arXiv:2201.11903 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Visual chatgpt: Talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671","author":"Wu Chenfei","year":"2023","unstructured":"Chenfei Wu, Shengming Yin, Weizhen Qi, Xiaodong Wang, Zecheng Tang, and Nan Duan. 2023. Visual chatgpt: Talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)."},{"key":"e_1_3_2_1_42_1","volume-title":"Good questions help zero-shot image reasoning. arXiv preprint arXiv:2312.01598","author":"Yang Kaiwen","year":"2023","unstructured":"Kaiwen Yang, Tao Shen, Xinmei Tian, Xiubo Geng, Chongyang Tao, Dacheng Tao, and Tianyi Zhou. 2023. Good questions help zero-shot image reasoning. arXiv preprint arXiv:2312.01598 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Yao Shunyu","year":"2024","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2024. Tree of thoughts: Deliberate problem solving with large language models. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Beyond chain-of-thought, effective graph-of-thought reasoning in large language models. arXiv preprint arXiv:2305.16582","author":"Yao Yao","year":"2023","unstructured":"Yao Yao, Zuchao Li, and Hai Zhao. 2023. Beyond chain-of-thought, effective graph-of-thought reasoning in large language models. arXiv preprint arXiv:2305.16582 (2023)."},{"key":"e_1_3_2_1_45_1","volume-title":"A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_46_1","volume-title":"Woodpecker: Hallucination Correction for Multimodal Large Language Models. arXiv preprint arXiv:2310.16045","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Tong Xu, Hao Wang, Dianbo Sui, Yunhang Shen, Ke Li, Xing Sun, and Enhong Chen. 2023. Woodpecker: Hallucination Correction for Multimodal Large Language Models. arXiv preprint arXiv:2310.16045 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2023.3317321"},{"key":"e_1_3_2_1_48_1","volume-title":"Cocot: Contrastive chain-of-thought prompting for large multimodal models with multiple image inputs. arXiv preprint arXiv:2401.02582","author":"Zhang Daoan","year":"2024","unstructured":"Daoan Zhang, Junming Yang, Hanjia Lyu, Zijian Jin, Yuan Yao, Mingkai Chen, and Jiebo Luo. 2024. Cocot: Contrastive chain-of-thought prompting for large multimodal models with multiple image inputs. arXiv preprint arXiv:2401.02582 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention. arXiv preprint arXiv:2303.16199","author":"Zhang Renrui","year":"2023","unstructured":"Renrui Zhang, Jiaming Han, Aojun Zhou, Xiangfei Hu, Shilin Yan, Pan Lu, Hongsheng Li, Peng Gao, and Qiao Yu. 2023. LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention. arXiv preprint arXiv:2303.16199 (2023)."},{"key":"e_1_3_2_1_50_1","volume-title":"The Eleventh international conference on learning representations.","author":"Zhang Shaokun","year":"2023","unstructured":"Shaokun Zhang, Feiran Jia, Chi Wang, and Qingyun Wu. 2023. Targeted hyperparameter optimization with lexicographic preferences over multiple objectives. In The Eleventh international conference on learning representations."},{"key":"e_1_3_2_1_51_1","volume-title":"Hypertime: Hyperparameter optimization for combating temporal distribution shifts. arXiv preprint arXiv:2305.18421","author":"Zhang Shaokun","year":"2023","unstructured":"Shaokun Zhang, Yiran Wu, Zhonghua Zheng, Qingyun Wu, and Chi Wang. 2023. Hypertime: Hyperparameter optimization for combating temporal distribution shifts. arXiv preprint arXiv:2305.18421 (2023)."},{"key":"e_1_3_2_1_52_1","volume-title":"2023 d. Ideal: Influence-driven selective annotations empower in-context learners in large language models. arXiv preprint arXiv:2310.10873","author":"Zhang Shaokun","year":"2023","unstructured":"Shaokun Zhang, Xiaobo Xia, Zhaoqing Wang, Ling-Hao Chen, Jiale Liu, Qingyun Wu, and Tongliang Liu. 2023 d. Ideal: Influence-driven selective annotations empower in-context learners in large language models. arXiv preprint arXiv:2310.10873 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Training Language Model Agents without Modifying Language Models. arXiv preprint arXiv:2402.11359","author":"Zhang Shaokun","year":"2024","unstructured":"Shaokun Zhang, Jieyu Zhang, Jiale Liu, Linxin Song, Chi Wang, Ranjay Krishna, and Qingyun Wu. 2024. Training Language Model Agents without Modifying Language Models. arXiv preprint arXiv:2402.11359 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Automatic chain of thought prompting in large language models. arXiv preprint arXiv:2210.03493","author":"Zhang Zhuosheng","year":"2022","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, and Alex Smola. 2022. Automatic chain of thought prompting in large language models. arXiv preprint arXiv:2210.03493 (2022)."},{"key":"e_1_3_2_1_55_1","volume-title":"2023 e. Multimodal Chain-of-Thought Reasoning in Language Models. arXiv preprint arXiv:2302.00923","author":"Zhang Zhuosheng","year":"2023","unstructured":"Zhuosheng Zhang, Aston Zhang, Mu Li, Hai Zhao, George Karypis, and Alex Smola. 2023 e. Multimodal Chain-of-Thought Reasoning in Language Models. arXiv preprint arXiv:2302.00923 (2023)."},{"key":"e_1_3_2_1_56_1","first-page":"5168","article-title":"Ddcot: Duty-distinct chain-of-thought prompting for multimodal reasoning in language models","volume":"36","author":"Zheng Ge","year":"2023","unstructured":"Ge Zheng, Bin Yang, Jiajin Tang, Hong-Yu Zhou, and Sibei Yang. 2023. Ddcot: Duty-distinct chain-of-thought prompting for multimodal reasoning in language models. Advances in Neural Information Processing Systems, Vol. 36 (2023), 5168--5191.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01753-6"},{"key":"e_1_3_2_1_58_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zhuang Yuchen","year":"2024","unstructured":"Yuchen Zhuang, Yue Yu, Kuan Wang, Haotian Sun, and Chao Zhang. 2024. Toolqa: A dataset for llm question answering with external tools. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02225"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413849"},{"key":"e_1_3_2_1_61_1","volume-title":"Margin-based few-shot class-incremental learning with class-level overfitting mitigation. Advances in neural information processing systems","author":"Zou Yixiong","year":"2022","unstructured":"Yixiong Zou, Shanghang Zhang, Yuhua Li, and Ruixuan Li. 2022. Margin-based few-shot class-incremental learning with class-level overfitting mitigation. Advances in neural information processing systems, Vol. 35 (2022), 27267--27279."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475243"},{"key":"e_1_3_2_1_63_1","volume-title":"Compositional Few-Shot Class-Incremental Learning. arXiv preprint arXiv:2405.17022","author":"Zou Yixiong","year":"2024","unstructured":"Yixiong Zou, Shanghang Zhang, Haichen Zhou, Yuhua Li, and Ruixuan Li. 2024. Compositional Few-Shot Class-Incremental Learning. arXiv preprint arXiv:2405.17022 (2024)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681249","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681249","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681249"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":63,"alternative-id":["10.1145\/3664647.3681249","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681249","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}