{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T10:13:42Z","timestamp":1776161622622,"version":"3.50.1"},"reference-count":65,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100005015","name":"South China University of Technology","doi-asserted-by":"publisher","award":["x2rjD2250190"],"award-info":[{"award-number":["x2rjD2250190"]}],"id":[{"id":"10.13039\/501100005015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2023B1515120078"],"award-info":[{"award-number":["2023B1515120078"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476097"],"award-info":[{"award-number":["62476097"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.eswa.2026.132010","type":"journal-article","created":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T03:13:10Z","timestamp":1773457990000},"page":"132010","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Self-improving multi-agent framework for zero-shot multimodal information extraction"],"prefix":"10.1016","volume":"318","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1629-2233","authenticated-orcid":false,"given":"Runwei","family":"Situ","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1767-789X","authenticated-orcid":false,"given":"Yi","family":"Cai","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2026.132010_bib0001","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2026.132010_bib0002","series-title":"Findings of the association for computational linguistics: EMNLP 2023","first-page":"2969","article-title":"In-context learning for few-shot multimodal named entity recognition","author":"Cai","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0003","series-title":"Proceedings of the 31st ACM international conference on multimedia","first-page":"4555","article-title":"Learning implicit entity-object relations by bidirectional generative alignment for multimodal NER","author":"Chen","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0004","series-title":"Findings of the association for computational linguistics: NAACL 2022","first-page":"1607","article-title":"Good visual guidance make a better extractor: Hierarchical visual prefix for multimodal entity and relation extraction","author":"Chen","year":"2022"},{"issue":"240","key":"10.1016\/j.eswa.2026.132010_bib0005","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.eswa.2026.132010_bib0006","series-title":"Findings of the association for computational linguistics: ACL 2023","first-page":"4005","article-title":"Why can GPT learn in-context? language models secretly perform gradient descent as meta-optimizers","author":"Dai","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0007","series-title":"Proceedings of the 41st international conference on machine learning","first-page":"11733","article-title":"Improving factuality and reasoning in language models through multiagent debate","author":"Du","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0008","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.125608","article-title":"Ce-dcvsi: Multimodal relational extraction based on collaborative enhancement of dual-channel visual semantic information","volume":"262","author":"Gong","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132010_bib0009","series-title":"Ccf international conference on natural language processing and chinese computing","first-page":"30","article-title":"Retrieval-augmented code generation for universal information extraction","author":"Guo","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0010","series-title":"12th international conference on learning representations, ICLR 2024","article-title":"MetaGPT: Meta programming for a multi-agent collaborative framework","author":"Hong","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0011","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"8032","article-title":"Mner-qg: An end-to-end mrc framework for multimodal named entity recognition with query grounding","volume":"vol. 37","author":"Jia","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0012","series-title":"Proceedings of the 30th ACM international conference on multimedia","first-page":"3549","article-title":"Query prior matters: A MRC framework for multimodal named entity recognition","author":"Jia","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0013","series-title":"Proceedings of the 41st international conference on machine learning","first-page":"22099","article-title":"Llm maybe longlm: Selfextend llm context window without tuning","author":"Jin","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0014","series-title":"International conference on machine learning","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0015","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"16420","article-title":"Clip-event: Connecting text and images with event structures","author":"Li","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0016","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"2557","article-title":"Cross-media structured common space for multimedia event extraction","author":"Li","year":"2020"},{"key":"10.1016\/j.eswa.2026.132010_bib0017","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","first-page":"292","article-title":"Evaluating object hallucination in large vision-language models","author":"Li","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0018","series-title":"Proceedings of the 30th ACM international conference on multimedia","first-page":"1945","article-title":"Multimedia event extraction from news with a unified contrastive learning framework","author":"Liu","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0019","series-title":"Proceedings of deep learning inside out (deeLIO 2022): The 3rd workshop on knowledge extraction and integration for deep learning architectures","first-page":"100","article-title":"What makes good in-context examples for GPT-3?","author":"Liu","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0020","doi-asserted-by":"crossref","first-page":"157","DOI":"10.1162\/tacl_a_00638","article-title":"Lost in the middle: How language models use long contexts","volume":"12","author":"Liu","year":"2024","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"10.1016\/j.eswa.2026.132010_bib0021","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"18680","article-title":"Hierarchical aligned multimodal learning for NER on tweet posts","volume":"vol. 38","author":"Liu","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0022","unstructured":"Liu, W., Zhong, X., Hou, J., Li, S., Huang, H., & Fang, Y. (2023). Integrating large pre-trained models into multimodal named entity recognition with evidential fusion. arXiv preprint arXiv: 2306.16991."},{"issue":"3","key":"10.1016\/j.eswa.2026.132010_bib0023","doi-asserted-by":"crossref","first-page":"1053","DOI":"10.1162\/coli_a_00523","article-title":"Large language model instruction following: A survey of progresses and challenges","volume":"50","author":"Lou","year":"2024","journal-title":"Computational Linguistics"},{"key":"10.1016\/j.eswa.2026.132010_bib0024","series-title":"Proceedings of the 56th annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"1990","article-title":"Visual attention model for name tagging in multimodal social media","author":"Lu","year":"2018"},{"key":"10.1016\/j.eswa.2026.132010_bib0025","series-title":"Proceedings of the 29th international conference on computational linguistics","first-page":"2055","article-title":"Flat multi-modal interaction transformer for named entity recognition","author":"Lu","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0026","series-title":"2024 IEEE international conference on robotics and automation (ICRA)","first-page":"286","article-title":"Roco: Dialectic multi-robot collaboration with large language models","author":"Mandi","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0027","series-title":"Proceedings of NAACL-HLT","first-page":"852","article-title":"Multimodal named entity recognition for short social media posts","author":"Moon","year":"2018"},{"key":"10.1016\/j.eswa.2026.132010_bib0028","unstructured":"OpenAI (2023). Gpt-4 technical report. arXiv preprint arXiv: 2303.08774."},{"key":"10.1016\/j.eswa.2026.132010_bib0029","series-title":"Proceedings of the 36th annual acm symposium on user interface software and technology","first-page":"1","article-title":"Generative agents: Interactive simulacra of human behavior","author":"Park","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0030","series-title":"Findings of the association for computational linguistics ACL 2024","first-page":"13025","article-title":"Infobench: Evaluating instruction following ability in large language models","author":"Qin","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0031","series-title":"Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"6090","article-title":"Learning or self-aligning? rethinking instruction fine-tuning","author":"Ren","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0032","series-title":"Proceedings of the 2022 conference of the north american chapter of the association for computational linguistics: human language technologies","first-page":"2655","article-title":"Learning to retrieve prompts for in-context learning","author":"Rubin","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0033","series-title":"Proceedings of the 33rd ACM international conference on multimedia","first-page":"4418","article-title":"Ground and reconstruct: Entity-region bidirectional alignment pre-training for low-resource GMNER","author":"Situ","year":"2025"},{"key":"10.1016\/j.eswa.2026.132010_bib0034","series-title":"International conference on learning representations","article-title":"Evaluating the zero-shot robustness of instruction-tuned language models","author":"Sun","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0035","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"13860","article-title":"RpBERT: a text-image relation propagation-based BERT model for multimodal NER","volume":"vol. 35","author":"Sun","year":"2021"},{"key":"10.1016\/j.eswa.2026.132010_bib0036","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"19062","article-title":"Umie: Unified multimodal information extraction with instruction tuning","volume":"vol. 38","author":"Sun","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0037","unstructured":"Sun, Y., Zhang, K., & Su, Y. (2023). Multimodal question answering for unified information extraction. arXiv preprint arXiv: 2310.03017."},{"key":"10.1016\/j.eswa.2026.132010_bib0038","unstructured":"Thoppilan, R., De Freitas, D., Hall, J., Shazeer, N., Kulshreshtha, A., Cheng, H.-T., Jin, A., Bos, T., Baker, L., Du, Y. et al. (2022). Lamda: Language models for dialog applications. arXiv preprint arXiv: 2201.08239."},{"key":"10.1016\/j.eswa.2026.132010_bib0039","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.127104","article-title":"Fewrelex: Exploring multimodal few-shot relation extraction with enhanced visual-textual mapping","volume":"277","author":"Tian","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132010_bib0040","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"9040","article-title":"Image enhanced event detection in news articles","volume":"vol. 34","author":"Tong","year":"2020"},{"key":"10.1016\/j.eswa.2026.132010_bib0041","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","article-title":"Gpt-re: In-context learning for relation extraction using large language models","author":"Wan","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0042","series-title":"Findings of the association for computational linguistics: NAACL 2025","first-page":"4257","article-title":"Gpt-ner: Named entity recognition via large language models","author":"Wang","year":"2025"},{"key":"10.1016\/j.eswa.2026.132010_bib0043","series-title":"Findings of the association for computational linguistics: EMNLP 2022","first-page":"5925","article-title":"Named entity and relation extraction with multi-modal retrieval","author":"Wang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0044","series-title":"Proceedings of the 2022 conference of the north american chapter of the association for computational linguistics: Human language technologies","first-page":"3176","article-title":"Ita: Image-text alignments for multi-modal named entity recognition","author":"Wang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_sbref0045","series-title":"The eleventh international conference on learning representations, ICLR 2023, kigali, rwanda, may 1-5, 2023","article-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0046","series-title":"2022 IEEE international conference on multimedia and expo (ICME)","first-page":"1","article-title":"Cat-mner: multimodal named entity recognition with knowledge-refined cross-modal attention","author":"Wang","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0047","unstructured":"Wang, X., Zhou, W., Zu, C., Xia, H., Chen, T., Zhang, Y., Zheng, R., Ye, J., Zhang, Q., Gui, T. et al. (2023b). Instructuie: Multi-task instruction tuning for unified information extraction. arXiv preprint arXiv: 2304.08085."},{"key":"10.1016\/j.eswa.2026.132010_bib0048","unstructured":"Wei, X., Cui, X., Cheng, N., Wang, X., Zhang, X., Huang, S., Xie, P., Xu, J., Chen, Y., Zhang, M. et al. (2023). Zero-shot information extraction via chatting with chatgpt. arXiv e-prints arXiv\u20132302."},{"key":"10.1016\/j.eswa.2026.132010_bib0049","unstructured":"Xiao, X., Wang, Y., Xu, N., Wang, Y., Yang, H., Wang, M., Luo, Y., Wang, L., Mao, W., & Zeng, D. (2023). Yayi-uie: A chat-enhanced instruction tuning framework for universal information extraction. arXiv preprint arXiv: 2312.15548."},{"key":"10.1016\/j.eswa.2026.132010_bib0050","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","first-page":"7935","article-title":"Empirical study of zero-shot NER with chatGPT","author":"Xie","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0051","series-title":"Proceedings of the 2024 conference of the north american chapter of the association for computational linguistics: Human language technologies (volume 2: Short papers)","first-page":"583","article-title":"Self-improving for zero-shot named entity recognition with large language models","author":"Xie","year":"2024"},{"key":"10.1016\/j.eswa.2026.132010_bib0052","series-title":"Findings of the association for computational linguistics: EMNLP 2023","first-page":"7572","article-title":"Examining inter-consistency of large language models collaboration: An in-depth analysis via debate","author":"Xiong","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0053","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"3342","article-title":"Improving multimodal named entity recognition via entity span detection with unified multimodal transformer","author":"Yu","year":"2020"},{"issue":"1","key":"10.1016\/j.eswa.2026.132010_bib0054","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TKDE.2024.3485107","article-title":"A fine-grained network for joint multimodal entity-relation extraction","volume":"37","author":"Yuan","year":"2024","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"10.1016\/j.eswa.2026.132010_bib0055","doi-asserted-by":"crossref","first-page":"781","DOI":"10.1109\/TASLPRO.2026.3651977","article-title":"Visual knowledge-enhanced LLaVA for fine-grained multimodal named entity recognition and grounding","volume":"34","author":"Yuan","year":"2026","journal-title":"IEEE Transactions on Audio, Speech and Language Processing"},{"key":"10.1016\/j.eswa.2026.132010_bib0056","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.124867","article-title":"Icka: An instruction construction and knowledge alignment framework for multimodal named entity recognition","volume":"255","author":"Zeng","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132010_bib0057","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"14347","article-title":"Multi-modal graph fusion for named entity recognition with targeted visual guidance","volume":"vol. 35","author":"Zhang","year":"2021"},{"key":"10.1016\/j.eswa.2026.132010_bib0058","series-title":"Findings of the association for computational linguistics: ACL 2023","first-page":"794","article-title":"Aligning instruction tasks unlocks large language models as zero-shot relation extractors","author":"Zhang","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0059","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"Adaptive co-attention network for named entity recognition in tweets","volume":"vol. 32","author":"Zhang","year":"2018"},{"key":"10.1016\/j.eswa.2026.132010_bib0060","series-title":"Findings of the association for computational linguistics: EMNLP 2023","first-page":"13088","article-title":"Llmaaa: Making large language models as active annotators","author":"Zhang","year":"2023"},{"key":"10.1016\/j.eswa.2026.132010_bib0061","series-title":"Proceedings of the 25th ACM international conference on multimedia","first-page":"270","article-title":"Improving event extraction via multimodal integration","author":"Zhang","year":"2017"},{"key":"10.1016\/j.eswa.2026.132010_bib0062","series-title":"Proceedings of the 30th ACM international conference on multimedia","first-page":"3983","article-title":"Learning from different text-image pairs: A relation-enhanced graph convolutional network for multimodal NER","author":"Zhao","year":"2022"},{"key":"10.1016\/j.eswa.2026.132010_bib0063","series-title":"Proceedings of the 29th ACM international conference on multimedia","first-page":"5298","article-title":"Multimodal relation extraction with efficient graph alignment","author":"Zheng","year":"2021"},{"key":"10.1016\/j.eswa.2026.132010_bib0064","series-title":"2021 IEEE international conference on multimedia and expo (ICME)","first-page":"1","article-title":"Mnre: A challenge multimodal dataset for neural relation extraction with visual evidence in social media posts","author":"Zheng","year":"2021"},{"key":"10.1016\/j.eswa.2026.132010_bib0065","series-title":"The twelfth international conference on learning representations, ICLR 2024, vienna, austria, may 7-11, 2024","article-title":"UniversalNER: Targeted distillation from large language models for open named entity recognition","author":"Zhou","year":"2024"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426009231?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0957417426009231?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T09:14:24Z","timestamp":1776158064000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0957417426009231"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":65,"alternative-id":["S0957417426009231"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132010","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Self-improving multi-agent framework for zero-shot multimodal information extraction","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132010","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"132010"}}