{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T18:51:34Z","timestamp":1772823094869,"version":"3.50.1"},"reference-count":100,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Natural Science Foundation of China through the Key Project","award":["62431020"],"award-info":[{"award-number":["62431020"]}]},{"name":"National Natural Science Foundation of China through the Key Project","award":["62231027"],"award-info":[{"award-number":["62231027"]}]},{"name":"National Natural Science Foundation of China through the Joint Fund Project","award":["U22B2054"],"award-info":[{"award-number":["U22B2054"]}]},{"name":"China Postdoctoral Science Foundation (CPSF) through the Postdoctoral Fellowship Program","award":["GZC20232033"],"award-info":[{"award-number":["GZC20232033"]}]},{"name":"Fund for Foreign Scholars in University Research and Teaching Programs through the 111 Project","award":["B07048"],"award-info":[{"award-number":["B07048"]}]},{"name":"Program for Cheung Kong Scholars and Innovative Research Team in University","award":["IRT_15R53"],"award-info":[{"award-number":["IRT_15R53"]}]},{"name":"Key Scientific Technological Innovation Research Project by the Ministry of Education"},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["YJSJ25004"],"award-info":[{"award-number":["YJSJ25004"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["ZYTS25211"],"award-info":[{"award-number":["ZYTS25211"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Innovation Fund of Xidian University"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tnnls.2025.3576486","type":"journal-article","created":{"date-parts":[[2025,6,12]],"date-time":"2025-06-12T13:43:23Z","timestamp":1749735803000},"page":"19024-19038","source":"Crossref","is-referenced-by-count":2,"title":["Chain-of-Situation Aware Progressive Inference Learning"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2604-4381","authenticated-orcid":false,"given":"Yang","family":"Liu","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5669-9354","authenticated-orcid":false,"given":"Fang","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3354-9617","authenticated-orcid":false,"given":"Licheng","family":"Jiao","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6025-3881","authenticated-orcid":false,"given":"Qianyue","family":"Bao","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2002-3894","authenticated-orcid":false,"given":"Shuo","family":"Li","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6130-2518","authenticated-orcid":false,"given":"Lingling","family":"Li","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8780-5455","authenticated-orcid":false,"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5472-1426","authenticated-orcid":false,"given":"Puhua","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8872-2195","authenticated-orcid":false,"given":"Wenping","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, the Key Laboratory of Intelligent Perception and Image Understanding, Ministry of Education, the International Research Center for Intelligent Perception and Computation, and the Joint International Research Laboratory of Intelligent Perception and Computation, Xidian University, Xi&#x2019;an, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","author":"Brown"},{"key":"ref2","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"OpenAI","year":"2023"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3053249"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548199"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3401711"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2994057"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2567069"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20028"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3007412"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.2022.3194869"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3070605"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2939201"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3132366"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3185320"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3212909"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3386339"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111452"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3086066"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3226871"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3131154"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2023.3309104"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.597"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_19"},{"key":"ref24","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lee"},{"key":"ref25","article-title":"Least-to-most prompting enables complex reasoning in large language models","author":"Zhou","year":"2022","journal-title":"arXiv:2205.10625"},{"key":"ref26","article-title":"Automatic chain of thought prompting in large language models","author":"Zhang","year":"2022","journal-title":"arXiv:2210.03493"},{"key":"ref27","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref28","article-title":"Visual instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv:2304.08485"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1044"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17514"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.49"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.627"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17515"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3115\/1629235.1629236"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1034"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612096"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00554"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.863"},{"key":"ref39","article-title":"M3D: A multimodal, multilingual and multitask dataset for grounded document-level information extraction","author":"Liu","year":"2024","journal-title":"arXiv:2412.04026"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3402242"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.215"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20167"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01904"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547943"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00200"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28347"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01250"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3053941"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3389454"},{"key":"ref51","article-title":"Making pre-trained language models better few-shot learners","author":"Gao","year":"2020","journal-title":"arXiv:2012.15723"},{"key":"ref52","article-title":"It\u2019s not just size that matters: Small language models are also few-shot learners","author":"Schick","year":"2020","journal-title":"arXiv:2009.07118"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00468"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref56","article-title":"ChainLM: Empowering large language models with improved chain-of-thought prompting","author":"Cheng","year":"2024","journal-title":"arXiv:2403.14312"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00057"},{"key":"ref58","first-page":"5168","article-title":"DDCoT: Duty-distinct chain-of-thought prompting for multimodal reasoning in language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zheng"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10687914"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27888"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3454237"},{"key":"ref62","article-title":"Multimodal chain-of-thought reasoning: A comprehensive survey","author":"Wang","year":"2025","journal-title":"arXiv:2503.12605"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-eacl.83"},{"key":"ref64","article-title":"Show your work: Scratchpads for intermediate computation with language models","author":"Nye","year":"2021","journal-title":"arXiv:2112.00114"},{"key":"ref65","article-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2022","journal-title":"arXiv:2203.11171"},{"key":"ref66","article-title":"Generating sequences by learning to self-correct","author":"Welleck","year":"2022","journal-title":"arXiv:2211.00053"},{"key":"ref67","article-title":"Caption anything: Interactive image description with diverse multimodal controls","author":"Wang","year":"2023","journal-title":"arXiv:2305.02677"},{"key":"ref68","article-title":"Chain of thought prompt tuning in vision language models","author":"Ge","year":"2023","journal-title":"arXiv:2304.07919"},{"key":"ref69","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref70","article-title":"Multimodal chain-of-thought reasoning in language models","author":"Zhang","year":"2023","journal-title":"arXiv:2302.00923"},{"key":"ref71","first-page":"13109","article-title":"Video-of-thought: Step-by-step video reasoning from perception to cognition","volume-title":"Proc. 41st Int. Conf. Mach. Learn.","volume":"235","author":"Fei"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-short.101"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.720"},{"key":"ref74","article-title":"CoMT: A novel benchmark for chain of multi-modal thought on large vision-language models","author":"Cheng","year":"2024","journal-title":"arXiv:2412.12932"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3672398"},{"key":"ref76","first-page":"1","article-title":"From multimodal LLM to human-level AI: Modality, instruction, reasoning, efficiency and beyond","volume-title":"Proc. Joint Int. Conf. Comput. Linguistics, Lang. Resour. Eval. (LREC-COLING), Tutorial Summaries","author":"Fei"},{"key":"ref77","article-title":"Scaling instruction-finetuned language models","author":"Won Chung","year":"2022","journal-title":"arXiv:2210.11416"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3136907"},{"key":"ref79","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Alayrac"},{"key":"ref80","article-title":"OpenFlamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023","journal-title":"arXiv:2308.01390"},{"key":"ref81","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"arXiv:2301.12597"},{"key":"ref82","article-title":"Otter: A multi-modal model with in-context instruction tuning","author":"Li","year":"2023","journal-title":"arXiv:2305.03726"},{"key":"ref83","article-title":"InstructBLIP: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023","journal-title":"arXiv:2305.06500"},{"key":"ref84","article-title":"NExT-GPT: Any-to-any multimodal LLM","author":"Wu","year":"2023","journal-title":"arXiv:2309.05519"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref86","article-title":"LLaVA-NeXT: Improved reasoning, OCR, and world knowledge","author":"Liu","year":"2024"},{"key":"ref87","article-title":"LLaMA-adapter: Efficient fine-tuning of language models with zero-init attention","author":"Zhang","year":"2023","journal-title":"arXiv:2303.16199"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32913"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/3689090.3689388"},{"key":"ref90","first-page":"1234","article-title":"Vitron: A unified pixel-level vision LLM for understanding, generating, segmenting, editing","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Fei"},{"key":"ref91","article-title":"LLaVA-grounding: Grounded visual chat with large multimodal models","author":"Zhang","year":"2023","journal-title":"arXiv:2312.02949"},{"key":"ref92","article-title":"GPT4RoI: Instruction tuning large language model on region-of-interest","author":"Zhang","year":"2023","journal-title":"arXiv:2307.03601"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02664"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2025.3548047"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3522802"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3492259"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3491176"},{"key":"ref98","article-title":"Unified vision and language prompt learning","author":"Zang","year":"2022","journal-title":"arXiv:2210.07225"},{"key":"ref99","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"139","author":"Radford"},{"key":"ref100","article-title":"Multitask prompted training enables zero-shot task generalization","author":"Sanh","year":"2021","journal-title":"arXiv:2110.08207"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5962385\/11195929\/11031239.pdf?arnumber=11031239","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T17:38:27Z","timestamp":1759945107000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11031239\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":100,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2025.3576486","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}