{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,16]],"date-time":"2026-07-16T05:17:45Z","timestamp":1784179065526,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,7]]},"DOI":"10.1145\/3767695.3769486","type":"proceedings-article","created":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T17:14:58Z","timestamp":1764782098000},"page":"2-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["CogPlanner: Unveiling the Potential of Agentic Multimodal Retrieval Augmented Generation with Planning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3560-1230","authenticated-orcid":false,"given":"Xiaohan","family":"Yu","sequence":"first","affiliation":[{"name":"Huawei Cloud BU, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0600-5833","authenticated-orcid":false,"given":"Zhihan","family":"Yang","sequence":"additional","affiliation":[{"name":"Huawei Cloud BU, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1417-2295","authenticated-orcid":false,"given":"Chong","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Cloud BU, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Intent Detection in the Age of LLMs. arXiv preprint arXiv:2410.01627","author":"Arora Gaurav","year":"2024","unstructured":"Gaurav Arora, Shreya Jain, and Srujana Merugu. 2024. Intent Detection in the Age of LLMs. arXiv preprint arXiv:2410.01627 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511","author":"Asai Akari","year":"2023","unstructured":"Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. arXiv preprint arXiv:2310.11511 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00188"},{"key":"e_1_3_2_1_5_1","volume-title":"Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga, and William W Cohen. 2022. Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928 (2022)."},{"key":"e_1_3_2_1_6_1","unstructured":"Xingyu Chen Jiahao Xu Tian Liang Zhiwei He Jianhui Pang Dian Yu Linfeng Song Qiuzhi Liu Mengfei Zhou Zhuosheng Zhang et al. 2024. Do NOT Think That Much for 2 3=? On the Overthinking of o1-Like LLMs. arXiv preprint arXiv:2412.21187 (2024)."},{"key":"e_1_3_2_1_7_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models. arXiv preprint arXiv:2410.08182","author":"Hu Wenbo","year":"2024","unstructured":"Wenbo Hu, Jia-Chen Gu, Zi-Yi Dou, Mohsen Fayyaz, Pan Lu, Kai-Wei Chang, and Nanyun Peng. 2024. MRAG-Bench: Vision-Centric Evaluation for Retrieval-Augmented Multimodal Models. arXiv preprint arXiv:2410.08182 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Leveraging passage retrieval with generative models for open domain question answering. arXiv preprint arXiv:2007.01282","author":"Izacard Gautier","year":"2020","unstructured":"Gautier Izacard and Edouard Grave. 2020. Leveraging passage retrieval with generative models for open domain question answering. arXiv preprint arXiv:2007.01282 (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al.","author":"Jiang Albert Q","year":"2024","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al., 2024a. Mixtral of experts. arXiv preprint arXiv:2401.04088 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Mmsearch: Benchmarking the potential of large models as multi-modal search engines. arXiv preprint arXiv:2409.12959","author":"Jiang Dongzhi","year":"2024","unstructured":"Dongzhi Jiang, Renrui Zhang, Ziyu Guo, Yanmin Wu, Jiayi Lei, Pengshuo Qiu, Pan Lu, Zehui Chen, Guanglu Song, Peng Gao, et al., 2024b. Mmsearch: Benchmarking the potential of large models as multi-modal search engines. arXiv preprint arXiv:2409.12959 (2024)."},{"key":"e_1_3_2_1_13_1","volume-title":"II-MMR: Identifying and improving multi-modal multi-hop reasoning in visual question answering. arXiv preprint arXiv:2402.11058","author":"Kil Jihyung","year":"2024","unstructured":"Jihyung Kil, Farideh Tavazoee, Dongyeop Kang, and Joo-Kyung Kim. 2024. II-MMR: Identifying and improving multi-modal multi-hop reasoning in visual question answering. arXiv preprint arXiv:2402.11058 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Learning dense representations of phrases at scale. arXiv preprint arXiv:2012.12624","author":"Lee Jinhyuk","year":"2020","unstructured":"Jinhyuk Lee, Mujeen Sung, Jaewoo Kang, and Danqi Chen. 2020. Learning dense representations of phrases at scale. arXiv preprint arXiv:2012.12624 (2020)."},{"key":"e_1_3_2_1_15_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al., 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9459-9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","volume-title":"A survey on retrieval-augmented text generation. arXiv preprint arXiv:2202.01110","author":"Li Huayang","year":"2022","unstructured":"Huayang Li, Yixuan Su, Deng Cai, Yan Wang, and Lemao Liu. 2022. A survey on retrieval-augmented text generation. arXiv preprint arXiv:2202.01110 (2022)."},{"key":"e_1_3_2_1_17_1","unstructured":"Kuan Li Zhongwang Zhang Huifeng Yin Liwen Zhang Litu Ou Jialong Wu Wenbiao Yin Baixuan Li Zhengwei Tao Xinyu Wang et al. 2025. WebSailor: Navigating Super-human Reasoning for Web Agent. arXiv preprint arXiv:2507.02592 (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74-81."},{"key":"e_1_3_2_1_19_1","volume-title":"Retrieval augmented visual question answering with outside knowledge. arXiv preprint arXiv:2210.03809","author":"Lin Weizhe","year":"2022","unstructured":"Weizhe Lin and Bill Byrne. 2022. Retrieval augmented visual question answering with outside knowledge. arXiv preprint arXiv:2210.03809 (2022)."},{"key":"e_1_3_2_1_20_1","volume-title":"Query rewriting for retrieval-augmented large language models. arXiv preprint arXiv:2305.14283","author":"Ma Xinbei","year":"2023","unstructured":"Xinbei Ma, Yeyun Gong, Pengcheng He, Hai Zhao, and Nan Duan. 2023. Query rewriting for retrieval-augmented large language models. arXiv preprint arXiv:2305.14283 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Multi-modal Retrieval Augmented Multi-modal Generation: A Benchmark, Evaluate Metrics and Strong Baselines. arXiv preprint arXiv:2411.16365","author":"Ma Zi-Ao","year":"2024","unstructured":"Zi-Ao Ma, Tian Lan, Rong-Cheng Tu, Yong Hu, Heyan Huang, and Xian-Ling Mao. 2024. Multi-modal Retrieval Augmented Multi-modal Generation: A Benchmark, Evaluate Metrics and Strong Baselines. arXiv preprint arXiv:2411.16365 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_23_1","volume-title":"Beyond Text: Optimizing RAG with Multimodal Inputs for Industrial Applications. arXiv preprint arXiv:2410.21943","author":"Riedler Monica","year":"2024","unstructured":"Monica Riedler and Stefan Langer. 2024. Beyond Text: Optimizing RAG with Multimodal Inputs for Industrial Applications. arXiv preprint arXiv:2410.21943 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"Ragchecker: A fine-grained framework for diagnosing retrieval-augmented generation. arXiv preprint arXiv:2408.08067","author":"Ru Dongyu","year":"2024","unstructured":"Dongyu Ru, Lin Qiu, Xiangkun Hu, Tianhang Zhang, Peng Shi, Shuaichen Chang, Cheng Jiayang, Cunxiang Wang, Shichao Sun, Huanyu Li, et al., 2024. Ragchecker: A fine-grained framework for diagnosing retrieval-augmented generation. arXiv preprint arXiv:2408.08067 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Machine Learning. PMLR, 31210-31227","author":"Shi Freda","year":"2023","unstructured":"Freda Shi, Xinyun Chen, Kanishka Misra, Nathan Scales, David Dohan, Ed H Chi, Nathanael Sch\u00e4rli, and Denny Zhou. 2023. Large language models can be easily distracted by irrelevant context. In International Conference on Machine Learning. PMLR, 31210-31227."},{"key":"e_1_3_2_1_26_1","volume-title":"Plug-and-play vqa: Zero-shot vqa by conjoining large pretrained models with zero training. arXiv preprint arXiv:2210.08773","author":"Huat Tiong Anthony Meng","year":"2022","unstructured":"Anthony Meng Huat Tiong, Junnan Li, Boyang Li, Silvio Savarese, and Steven CH Hoi. 2022. Plug-and-play vqa: Zero-shot vqa by conjoining large pretrained models with zero training. arXiv preprint arXiv:2210.08773 (2022)."},{"key":"e_1_3_2_1_27_1","volume-title":"Query2doc: Query expansion with large language models. arXiv preprint arXiv:2303.07678","author":"Wang Liang","year":"2023","unstructured":"Liang Wang, Nan Yang, and Furu Wei. 2023. Query2doc: Query expansion with large language models. arXiv preprint arXiv:2303.07678 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191","author":"Wang Peng","year":"2024","unstructured":"Peng Wang, Shuai Bai, Sinan Tan, Shijie Wang, Zhihao Fan, Jinze Bai, Keqin Chen, Xuejing Liu, Jialin Wang, Wenbin Ge, Yang Fan, Kai Dang, Mengfei Du, Xuancheng Ren, Rui Men, Dayiheng Liu, Chang Zhou, Jingren Zhou, and Junyang Lin. 2024. Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution. arXiv preprint arXiv:2409.12191 (2024)."},{"key":"e_1_3_2_1_29_1","unstructured":"Jialong Wu Baixuan Li Runnan Fang Wenbiao Yin Liwen Zhang Zhengwei Tao Dingchu Zhang Zekun Xi Gang Fu Yong Jiang et al. 2025. WebDancer: Towards Autonomous Information Seeking Agency. arXiv preprint arXiv:2505.22648 (2025)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.05.001"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611830"},{"key":"e_1_3_2_1_32_1","volume-title":"Hongjian Gu, Jiaming Zhou, Yaochen Hu, Bin Wang, Qun Liu, Mark Coates, Yingxue Zhang, and Jianye Hao.","author":"Zhang Ge","year":"2024","unstructured":"Ge Zhang, Mohammad Ali Alomrani, Hongjian Gu, Jiaming Zhou, Yaochen Hu, Bin Wang, Qun Liu, Mark Coates, Yingxue Zhang, and Jianye Hao. 2024a. Path-of-Thoughts: Extracting and Following Paths for Robust Relational Reasoning with Large Language Models. arXiv preprint arXiv:2412.17963 (2024)."},{"key":"e_1_3_2_1_33_1","unstructured":"Tao Zhang Ziqi Zhang Zongyang Ma Yuxin Chen Zhongang Qi Chunfeng Yuan Bing Li Junfu Pu Yuxuan Zhao Zehua Xie et al. 2024b. mR2AG: Multimodal Retrieval-Reflection-Augmented Generation for Knowledge-Based VQA. arXiv preprint arXiv:2411.15041 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Sinno Jialin Pan, and Lidong Bing","author":"Zhang Wenxuan","year":"2023","unstructured":"Wenxuan Zhang, Yue Deng, Bing Liu, Sinno Jialin Pan, and Lidong Bing. 2023. Sentiment analysis in the era of large language models: A reality check. arXiv preprint arXiv:2305.15005 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Chengwei Qin, Bosheng Ding, Xiaobao Guo, Minzhi Li, Xingxuan Li, et al.","author":"Zhao Ruochen","year":"2023","unstructured":"Ruochen Zhao, Hailin Chen, Weishi Wang, Fangkai Jiao, Xuan Long Do, Chengwei Qin, Bosheng Ding, Xiaobao Guo, Minzhi Li, Xingxuan Li, et al., 2023a. Retrieving multimodal information for augmented generation: A survey. arXiv preprint arXiv:2303.10868 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"Chengwei Qin, Bosheng Ding, Xiaobao Guo, Minzhi Li, Xingxuan Li, et al.","author":"Zhao Ruochen","year":"2023","unstructured":"Ruochen Zhao, Hailin Chen, Weishi Wang, Fangkai Jiao, Xuan Long Do, Chengwei Qin, Bosheng Ding, Xiaobao Guo, Minzhi Li, Xingxuan Li, et al., 2023b. Retrieving multimodal information for augmented generation: A survey. arXiv preprint arXiv:2303.10868 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.38"},{"key":"e_1_3_2_1_38_1","volume-title":"Loic Feujio, Akash Maharaj, and Yunyao Li.","author":"Zhu Zhengyuan","year":"2024","unstructured":"Zhengyuan Zhu, Daniel Lee, Hong Zhang, Sai Sree Harsha, Loic Feujio, Akash Maharaj, and Yunyao Li. 2024. Murar: A simple and effective multimodal retrieval and answer refinement framework for multimodal question answering. arXiv preprint arXiv:2408.08521 (2024)."}],"event":{"name":"SIGIR-AP 2025:Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region","location":"Xi'an China","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 2025 Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region"],"original-title":[],"deposited":{"date-parts":[[2025,12,3]],"date-time":"2025-12-03T17:17:10Z","timestamp":1764782230000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3767695.3769486"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":38,"alternative-id":["10.1145\/3767695.3769486","10.1145\/3767695"],"URL":"https:\/\/doi.org\/10.1145\/3767695.3769486","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}