{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T16:20:49Z","timestamp":1774369249377,"version":"3.50.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s11432-024-4251-x","type":"journal-article","created":{"date-parts":[[2024,12,19]],"date-time":"2024-12-19T02:52:24Z","timestamp":1734576744000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":60,"title":["Woodpecker: hallucination correction for multimodal large language models"],"prefix":"10.1007","volume":"67","author":[{"given":"Shukang","family":"Yin","sequence":"first","affiliation":[]},{"given":"Chaoyou","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Sirui","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Tong","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Dianbo","family":"Sui","sequence":"additional","affiliation":[]},{"given":"Yunhang","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Ke","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xing","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Enhong","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,13]]},"reference":[{"key":"4251_CR1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr\/nwae403","volume-title":"Natl Sci Open","author":"S Yin","year":"2024","unstructured":"Yin S, Fu C, Zhao S, et al. A survey on multimodal large language models. Natl Sci Open, 2024. doi: https:\/\/doi.org\/10.1093\/nsr\/nwae403"},{"key":"4251_CR2","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"H Liu","year":"2023","unstructured":"Liu H, Li C, Wu Q, et al. Visual instruction tuning. In: Proceedings of Conference on Neural Information Processing Systems, 2023"},{"key":"4251_CR3","volume-title":"mPLUG-Owl: modularization empowers large language models with multimodality","author":"Q Ye","year":"2023","unstructured":"Ye Q, Xu H, Xu G, et al. mPLUG-Owl: modularization empowers large language models with multimodality. 2023. ArXiv:2304.14178"},{"key":"4251_CR4","volume-title":"Proceedings of International Conference on Learning Representations","author":"D Zhu","year":"2024","unstructured":"Zhu D, Chen J, Shen X, et al. MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: Proceedings of International Conference on Learning Representations, 2024"},{"key":"4251_CR5","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"A Zhang","year":"2023","unstructured":"Zhang A, Fei H, Yao Y, et al. Transfer visual prompt generator across LLMs. In: Proceedings of Conference on Neural Information Processing Systems, 2023"},{"key":"4251_CR6","volume-title":"Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond","author":"J Bai","year":"2023","unstructured":"Bai J, Bai S, Yang S, et al. Qwen-VL: a versatile vision-language model for understanding, localization, text reading, and beyond. 2023. ArXiv:2308.12966"},{"key":"4251_CR7","volume-title":"Proceedings of International Conference on Learning Representations","author":"F Liu","year":"2023","unstructured":"Liu F, Lin K, Li L, et al. Mitigating hallucination in large multi-modal models via robust instruction tuning. In: Proceedings of International Conference on Learning Representations, 2023"},{"key":"4251_CR8","volume-title":"Proceedings of AAAI","author":"B Wang","year":"2024","unstructured":"Wang B, Wu F, Han X, et al. VIGC: visual instruction generation and correction. In: Proceedings of AAAI, 2024"},{"key":"4251_CR9","volume-title":"Proceedings of Conference on Empirical Methods in Natural Language Processing","author":"Y Li","year":"2023","unstructured":"Li Y, Du Y, Zhou K, et al. Evaluating object hallucination in large vision-language models. In: Proceedings of Conference on Empirical Methods in Natural Language Processing, 2023"},{"key":"4251_CR10","volume-title":"MME: a comprehensive evaluation benchmark for multimodal large language models","author":"C Fu","year":"2023","unstructured":"Fu C, Chen P, Shen Y, et al. MME: a comprehensive evaluation benchmark for multimodal large language models. 2023. ArXiv:2306.13394"},{"key":"4251_CR11","volume-title":"Evaluation and analysis of hallucination in large vision-language models","author":"J Wang","year":"2023","unstructured":"Wang J, Zhou Y, Xu G, et al. Evaluation and analysis of hallucination in large vision-language models. 2023. ArXiv:2308.15126"},{"key":"4251_CR12","volume-title":"Proceedings of AAAI","author":"A Gunjal","year":"2024","unstructured":"Gunjal A, Yin J, Bas E. Detecting and preventing hallucinations in large vision language models. In: Proceedings of AAAI, 2024"},{"key":"4251_CR13","volume-title":"Proceedings of AAAI (Workshop on ReLM)","author":"J Lu","year":"2024","unstructured":"Lu J, Rao J, Chen K, et al. Evaluation and mitigation of agnosia in multimodal large language models. In: Proceedings of AAAI (Workshop on ReLM), 2024"},{"key":"4251_CR14","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Z Li","year":"2024","unstructured":"Li Z, Yang B, Liu Q, et al. Monkey: image resolution and text label are important things for large multi-modal models. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4251_CR15","volume-title":"Halle-switch: rethinking and controlling object existence hallucinations in large vision language models for detailed caption","author":"B Zhai","year":"2023","unstructured":"Zhai B, Yang S, Zhao X, et al. Halle-switch: rethinking and controlling object existence hallucinations in large vision language models for detailed caption. 2023. ArXiv:2310.01779"},{"key":"4251_CR16","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"S Leng","year":"2024","unstructured":"Leng S, Zhang H, Chen G, et al. Mitigating object hallucinations in large vision-language models through visual contrastive decoding. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4251_CR17","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Q Huang","year":"2024","unstructured":"Huang Q, Dong X, Zhang P, et al. OPERA: alleviating hallucination in multi-modal large language models via over-trust penalty and retrospection-allocation. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4251_CR18","doi-asserted-by":"publisher","first-page":"112106","DOI":"10.1007\/s11432-022-3718-5","volume":"67","author":"X X Xie","year":"2024","unstructured":"Xie X X, Cheng G, Li Q Y, et al. Fewer is more: efficient object detection in large aerial images. Sci China Inf Sci, 2024, 67: 112106","journal-title":"Sci China Inf Sci"},{"key":"4251_CR19","doi-asserted-by":"publisher","first-page":"149101","DOI":"10.1007\/s11432-023-3943-6","volume":"67","author":"D Y Zhang","year":"2024","unstructured":"Zhang D Y, Liang D K, Yang H C, et al. SAM3D: zero-shot 3D object detection via the segment anything model. Sci China Inf Sci, 2024, 67: 149101","journal-title":"Sci China Inf Sci"},{"key":"4251_CR20","doi-asserted-by":"publisher","first-page":"222104","DOI":"10.1007\/s11432-021-3558-y","volume":"65","author":"Y N Song","year":"2022","unstructured":"Song Y N, Gao L, Li X Y, et al. A novel vision-based multi-task robotic grasp detection method for multi-object scenes. Sci China Inf Sci, 2022, 65: 222104","journal-title":"Sci China Inf Sci"},{"key":"4251_CR21","doi-asserted-by":"publisher","first-page":"222102","DOI":"10.1007\/s11432-021-3530-7","volume":"66","author":"Y Yang","year":"2023","unstructured":"Yang Y, Bao R, Guo W L, et al. Deep visual-linguistic fusion network considering cross-modal inconsistency for rumor detection. Sci China Inf Sci, 2023, 66: 222102","journal-title":"Sci China Inf Sci"},{"key":"4251_CR22","doi-asserted-by":"publisher","first-page":"139101","DOI":"10.1007\/s11432-023-3935-8","volume":"67","author":"K Xiao","year":"2024","unstructured":"Xiao K, Zhu A N, Iwana B K, et al. Scene text recognition via dual character counting-aware visual and semantic modeling network. Sci China Inf Sci, 2024, 67: 139101","journal-title":"Sci China Inf Sci"},{"key":"4251_CR23","volume-title":"Proceedings of International Conference on Learning Representations","author":"E Dinan","year":"2019","unstructured":"Dinan E, Roller S, Shuster K, et al. Wizard of Wikipedia: knowledge-powered conversational agents. In: Proceedings of International Conference on Learning Representations, 2019"},{"key":"4251_CR24","volume-title":"Proceedings of North American Chapter of the Association for Computational Linguistics","author":"F Petroni","year":"2020","unstructured":"Petroni F, Piktus A, Fan A, et al. KILT: a benchmark for knowledge intensive language tasks. In: Proceedings of North American Chapter of the Association for Computational Linguistics, 2020"},{"key":"4251_CR25","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"P Lewis","year":"2020","unstructured":"Lewis P, Perez E, Piktus A, et al. Retrieval-augmented generation for knowledge-intensive NLP tasks. In: Proceedings of Conference on Neural Information Processing Systems, 2020"},{"key":"4251_CR26","volume-title":"Proceedings of International Conference on Machine Learning","author":"S Borgeaud","year":"2022","unstructured":"Borgeaud S, Mensch A, Hoffmann J, et al. Improving language models by retrieving from trillions of tokens. In: Proceedings of International Conference on Machine Learning, 2022"},{"key":"4251_CR27","volume-title":"BlenderBot 3: a deployed conversational agent that continually learns to responsibly engage","author":"K Shuster","year":"2022","unstructured":"Shuster K, Xu J, Komeili M, et al. BlenderBot 3: a deployed conversational agent that continually learns to responsibly engage. 2022. ArXiv:2208.03188"},{"key":"4251_CR28","volume-title":"The web is your Oyster-knowledge-intensive NLP against a very large web corpus","author":"A Piktus","year":"2021","unstructured":"Piktus A, Petroni F, Karpukhin V, et al. The web is your Oyster-knowledge-intensive NLP against a very large web corpus. 2021. ArXiv:2112.09924"},{"key":"4251_CR29","volume-title":"Proceedings of Association for Computational Linguistics","author":"K H Huang","year":"2023","unstructured":"Huang K H, Chan H P, Ji H. Zero-shot faithful factual error correction. In: Proceedings of Association for Computational Linguistics, 2023"},{"key":"4251_CR30","volume-title":"Check your facts and try again: improving large language models with external knowledge and automated feedback","author":"B Peng","year":"2023","unstructured":"Peng B, Galley M, He P, et al. Check your facts and try again: improving large language models with external knowledge and automated feedback. 2023. ArXiv:2302.12813"},{"key":"4251_CR31","volume-title":"HuggingGPT: solving AI tasks with ChatGPT and its friends in hugging face","author":"Y Shen","year":"2023","unstructured":"Shen Y, Song K, Tan X, et al. HuggingGPT: solving AI tasks with ChatGPT and its friends in hugging face. 2023. ArXiv:2303.17580"},{"key":"4251_CR32","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"R Yang","year":"2023","unstructured":"Yang R, Song L, Li Y, et al. GPT4Tools: teaching large language model to use tools via self-instruction. In: Proceedings of Conference on Neural Information Processing Systems, 2023"},{"key":"4251_CR33","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"P Lu","year":"2023","unstructured":"Lu P, Peng B, Cheng H, et al. Chameleon: plug-and-play compositional reasoning with large language models. In: Proceedings of Conference on Neural Information Processing Systems, 2023"},{"key":"4251_CR34","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"T Gupta","year":"2023","unstructured":"Gupta T, Kembhavi A. Visual programming: compositional visual reasoning without training. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2023"},{"key":"4251_CR35","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"H You","year":"2023","unstructured":"You H, Sun R, Wang Z, et al. IdealGPT: iteratively decomposing vision and language reasoning via large language models. In: Proceedings of Findings of the Association for Computational Linguistics, 2023"},{"key":"4251_CR36","volume-title":"Transactions on Machine Learning Research","author":"D Zhu","year":"2024","unstructured":"Zhu D, Chen J, Haydarov K, et al. ChatGPT asks, BLIP-2 answers: automatic questioning towards enriched visual descriptions. Transactions on Machine Learning Research, 2024. https:\/\/openreview.net\/forum?id=1LoVwFkZNo"},{"key":"4251_CR37","volume-title":"MM-REACT: prompting ChatGPT for multimodal reasoning and action","author":"Z Yang","year":"2023","unstructured":"Yang Z, Li L, Wang J, et al. MM-REACT: prompting ChatGPT for multimodal reasoning and action. 2023. ArXiv:2303.11381"},{"key":"4251_CR38","volume-title":"Visual ChatGPT: talking, drawing and editing with visual foundation models","author":"C Wu","year":"2023","unstructured":"Wu C, Yin S, Qi W, et al. Visual ChatGPT: talking, drawing and editing with visual foundation models. 2023. ArXiv:2303.04671"},{"key":"4251_CR39","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"R Zhang","year":"2023","unstructured":"Zhang R, Hu X, Li B, et al. Prompt, generate, then cache: cascade of foundation models makes strong few-shot learners. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2023"},{"key":"4251_CR40","volume-title":"Proceedings of International Conference on Computer Vision","author":"X Zhu","year":"2023","unstructured":"Zhu X, Zhang R, He B, et al. PointCLIP V2: prompting CLIP and GPT for powerful 3D open-world learning. In: Proceedings of International Conference on Computer Vision, 2023"},{"key":"4251_CR41","volume-title":"Proceedings of International Conference on Computer Vision","author":"A Kirillov","year":"2023","unstructured":"Kirillov A, Mintun E, Ravi N, et al. Segment anything. In: Proceedings of International Conference on Computer Vision, 2023"},{"key":"4251_CR42","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Z Wang","year":"2023","unstructured":"Wang Z, Li Y, Chen X, et al. Detecting everything in the open world: towards universal object detection. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2023"},{"key":"4251_CR43","volume-title":"Proceedings of European Conference on Computer Vision","author":"S Liu","year":"2023","unstructured":"Liu S, Zeng Z, Ren T, et al. Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. In: Proceedings of European Conference on Computer Vision, 2023"},{"key":"4251_CR44","volume-title":"Proceedings of International Conference on Machine Learning","author":"J Li","year":"2023","unstructured":"Li J, Li D, Savarese S, et al. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of International Conference on Machine Learning, 2023"},{"key":"4251_CR45","volume-title":"GPT-4 Technical Report","author":"OpenAI","year":"2023","unstructured":"OpenAI. GPT-4 Technical Report. 2023. ArXiv:2303.08774"},{"key":"4251_CR46","volume-title":"VDGD: mitigating LVLM hallucinations in cognitive prompts by bridging the visual perception gap","author":"S Ghosh","year":"2024","unstructured":"Ghosh S, Evuru C K R, Kumar S, et al. VDGD: mitigating LVLM hallucinations in cognitive prompts by bridging the visual perception gap. 2024. ArXiv:2405.15683"},{"key":"4251_CR47","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"H Hu","year":"2023","unstructured":"Hu H, Luan Y, Chen Y, et al. Open-domain visual entity recognition: towards recognizing millions of Wikipedia entities. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2023"},{"key":"4251_CR48","volume-title":"Otter: a multi-modal model with in-context instruction tuning","author":"B Li","year":"2023","unstructured":"Li B, Zhang Y, Chen L, et al. Otter: a multi-modal model with in-context instruction tuning. 2023. ArXiv:2305.03726"},{"key":"4251_CR49","volume-title":"Proceedings of International Conference on Machine Learning","author":"A Jaegle","year":"2021","unstructured":"Jaegle A, Gimeno F, Brock A, et al. Perceiver: general perception with iterative attention. In: Proceedings of International Conference on Machine Learning, 2021"},{"key":"4251_CR50","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"T Brown","year":"2020","unstructured":"Brown T, Mann B, Ryder N, et al. Language models are few-shot learners. In: Proceedings of Conference on Neural Information Processing Systems, 2020"},{"key":"4251_CR51","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"H Liu","year":"2024","unstructured":"Liu H, Li C, Li Y, et al. Improved baselines with visual instruction tuning. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4251_CR52","volume-title":"Proceedings of Conference on Computer Vision and Pattern Recognition","author":"Q Ye","year":"2024","unstructured":"Ye Q, Xu H, Ye J, et al. mPLUG-Owl2: revolutionizing multi-modal large language model with modality collaboration. In: Proceedings of Conference on Computer Vision and Pattern Recognition, 2024"},{"key":"4251_CR53","volume-title":"Proceedings of Conference on Neural Information Processing Systems","author":"W Wang","year":"2024","unstructured":"Wang W, Lv Q, Yu W, et al. CogVLM: visual expert for pretrained language models. In: Proceedings of Conference on Neural Information Processing Systems, 2024"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4251-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-024-4251-x","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-024-4251-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,19]],"date-time":"2026-01-19T22:03:06Z","timestamp":1768860186000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-024-4251-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":53,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["4251"],"URL":"https:\/\/doi.org\/10.1007\/s11432-024-4251-x","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]},"assertion":[{"value":"8 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 October 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 November 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 December 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"220105"}}