{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T02:02:14Z","timestamp":1780020134931,"version":"3.53.1"},"reference-count":88,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116075","type":"journal-article","created":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T16:13:57Z","timestamp":1778861637000},"page":"116075","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["EvoCap: Enhancing video captioning via self-evolving video-LLMs with knowledge consolidation"],"prefix":"10.1016","volume":"346","author":[{"given":"Yangliu","family":"Hu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minye","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junqing","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4122-3767","authenticated-orcid":false,"given":"Yiping Phoebe","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1189-1254","authenticated-orcid":false,"given":"Wei","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.116075_b1","doi-asserted-by":"crossref","unstructured":"J. Xu, T. Mei, T. Yao, Y. Rui, Msr-vtt: A large video description dataset for bridging video and language, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"10.1016\/j.knosys.2026.116075_b2","doi-asserted-by":"crossref","unstructured":"A. Miech, D. Zhukov, J.-B. Alayrac, M. Tapaswi, I. Laptev, J. Sivic, Howto100m: Learning a text-video embedding by watching hundred million narrated video clips, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 2630\u20132640.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"10.1016\/j.knosys.2026.116075_b3","first-page":"49428","article-title":"Vidchapters-7m: Video chapters at scale","volume":"36","author":"Yang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b4","series-title":"Internvideo: General video foundation models via generative and discriminative learning","author":"Wang","year":"2022"},{"key":"10.1016\/j.knosys.2026.116075_b5","doi-asserted-by":"crossref","unstructured":"M. Maaz, H. Rasheed, S. Khan, F. Khan, Video-chatgpt: Towards detailed video understanding via large vision and language models, in: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2024, pp. 12585\u201312602.","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"10.1016\/j.knosys.2026.116075_b6","doi-asserted-by":"crossref","unstructured":"C. Sun, A. Myers, C. Vondrick, K. Murphy, C. Schmid, Videobert: A joint model for video and language representation learning, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 7464\u20137473.","DOI":"10.1109\/ICCV.2019.00756"},{"key":"10.1016\/j.knosys.2026.116075_b7","doi-asserted-by":"crossref","unstructured":"M. Bain, A. Nagrani, G. Varol, A. Zisserman, Frozen in time: A joint video and image encoder for end-to-end retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 1728\u20131738.","DOI":"10.1109\/ICCV48922.2021.00175"},{"issue":"10","key":"10.1016\/j.knosys.2026.116075_b8","doi-asserted-by":"crossref","DOI":"10.1007\/s11432-024-4321-9","article-title":"Videochat: Chat-centric video understanding","volume":"68","author":"Li","year":"2025","journal-title":"Sci. China Inf. Sci."},{"key":"10.1016\/j.knosys.2026.116075_b9","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b10","doi-asserted-by":"crossref","unstructured":"B. Lin, Y. Ye, B. Zhu, J. Cui, M. Ning, P. Jin, L. Yuan, Video-llava: Learning united visual representation by alignment before projection, in: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 2024, pp. 5971\u20135984.","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"10.1016\/j.knosys.2026.116075_b11","article-title":"Knowledge enhancement and disentanglement learning for video captioning","author":"Wang","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116075_b12","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114033","article-title":"INQUIRER: Harnessing internal knowledge graphs for video question generation","author":"Choi","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116075_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114717","article-title":"MLLMs-MR: Multi-modal recognition based on multi-modal large language models","author":"Fu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116075_b14","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114050","article-title":"LLM-GAODE: Large-language-model augmented neural ordinary differential equation network for video nystagmography classification","author":"Qiu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116075_b15","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114298","article-title":"MAGE-fend: Multimodal adaptive fusion with guidance from LLM expertise for fake news detection on short video platforms","author":"Hu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116075_b16","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114190","article-title":"LLVM-drone: A synergistic framework integrating large language models and vision models for visual tasks in unmanned aerial vehicles","author":"Hu","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2026.116075_b17","series-title":"Fine-tuning language models from human preferences","author":"Ziegler","year":"2019"},{"key":"10.1016\/j.knosys.2026.116075_b18","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b19","series-title":"Video-llama: An instruction-tuned audio-visual language model for video understanding","author":"Zhang","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b20","series-title":"Eventhallusion: Diagnosing event hallucinations in video llms","author":"Zhang","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b21","doi-asserted-by":"crossref","unstructured":"C. Li, E.W. Im, P. Fazli, Vidhalluc: Evaluating temporal hallucinations in multimodal large language models for video understanding, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 13723\u201313733.","DOI":"10.1109\/CVPR52734.2025.01281"},{"key":"10.1016\/j.knosys.2026.116075_b22","doi-asserted-by":"crossref","unstructured":"K. Bae, J. Kim, S. Lee, S. Lee, G. Lee, J. Choi, MASH-VLM: Mitigating Action-Scene Hallucination in Video-LLMs through Disentangled Spatial-Temporal Representations, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 13744\u201313753.","DOI":"10.1109\/CVPR52734.2025.01283"},{"key":"10.1016\/j.knosys.2026.116075_b23","series-title":"SEASON: Mitigating temporal hallucination in video large language models via self-diagnostic contrastive decoding","author":"Wu","year":"2025"},{"key":"10.1016\/j.knosys.2026.116075_b24","series-title":"Mitigating hallucination in videollms via temporal-aware activation engineering","author":"Cai","year":"2025"},{"key":"10.1016\/j.knosys.2026.116075_b25","series-title":"ResNetVLLM-2: Addressing ResNetVLLM\u2019s multi-modal hallucinations","author":"Khalil","year":"2025"},{"key":"10.1016\/j.knosys.2026.116075_b26","doi-asserted-by":"crossref","unstructured":"Y. Wang, Y. Kordi, S. Mishra, A. Liu, N.A. Smith, D. Khashabi, H. Hajishirzi, Self-instruct: Aligning language models with self-generated instructions, in: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2023, pp. 13484\u201313508.","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"10.1016\/j.knosys.2026.116075_b27","first-page":"46534","article-title":"Self-refine: Iterative refinement with self-feedback","volume":"36","author":"Madaan","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b28","doi-asserted-by":"crossref","unstructured":"J. Huang, S. Gu, L. Hou, Y. Wu, X. Wang, H. Yu, J. Han, Large language models can self-improve, in: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, 2023, pp. 1051\u20131068.","DOI":"10.18653\/v1\/2023.emnlp-main.67"},{"key":"10.1016\/j.knosys.2026.116075_b29","series-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2022"},{"key":"10.1016\/j.knosys.2026.116075_b30","first-page":"15476","article-title":"Star: Bootstrapping reasoning with reasoning","volume":"35","author":"Zelikman","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b31","first-page":"8634","article-title":"Reflexion: Language agents with verbal reinforcement learning","volume":"36","author":"Shinn","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b32","series-title":"Reinforced self-training (rest) for language modeling","author":"Gulcehre","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b33","doi-asserted-by":"crossref","unstructured":"L. Yao, A. Torabi, K. Cho, N. Ballas, C. Pal, H. Larochelle, A. Courville, Describing videos by exploiting temporal structure, in: Proceedings of the IEEE International Conference on Computer Vision, 2015, pp. 4507\u20134515.","DOI":"10.1109\/ICCV.2015.512"},{"key":"10.1016\/j.knosys.2026.116075_b34","first-page":"9127","article-title":"Activitynet-qa: A dataset for understanding complex web videos via question answering","volume":"vol. 33","author":"Yu","year":"2019"},{"key":"10.1016\/j.knosys.2026.116075_b35","doi-asserted-by":"crossref","unstructured":"R. Goyal, S. Ebrahimi Kahou, V. Michalski, J. Materzynska, S. Westphal, H. Kim, V. Haenel, I. Fruend, P. Yianilos, M. Mueller-Freitag, et al., The\u201c something something\u201d video database for learning and evaluating visual common sense, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"10.1016\/j.knosys.2026.116075_b36","doi-asserted-by":"crossref","unstructured":"K. Li, Y. Wang, Y. He, Y. Li, Y. Wang, Y. Liu, Z. Wang, J. Xu, G. Chen, P. Luo, et al., Mvbench: A comprehensive multi-modal video understanding benchmark, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22195\u201322206.","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"10.1016\/j.knosys.2026.116075_b37","series-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b38","doi-asserted-by":"crossref","unstructured":"L. Zhu, Y. Yang, Actbert: Learning global-local video-text representations, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 8746\u20138755.","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"10.1016\/j.knosys.2026.116075_b39","series-title":"Clip-vip: Adapting pre-trained image-text model to video-language representation alignment","author":"Xue","year":"2022"},{"key":"10.1016\/j.knosys.2026.116075_b40","doi-asserted-by":"crossref","unstructured":"D. Li, J. Li, H. Li, J.C. Niebles, S.C. Hoi, Align and prompt: Video-and-language pre-training with entity prompts, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 4953\u20134963.","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"10.1016\/j.knosys.2026.116075_b41","series-title":"European Conference on Computer Vision","first-page":"323","article-title":"Llama-vid: An image is worth 2 tokens in large language models","author":"Li","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b42","doi-asserted-by":"crossref","unstructured":"Z. Chen, J. Wu, W. Wang, W. Su, G. Chen, S. Xing, M. Zhong, Q. Zhang, X. Zhu, L. Lu, et al., Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 24185\u201324198.","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"10.1016\/j.knosys.2026.116075_b43","doi-asserted-by":"crossref","unstructured":"P. Jin, R. Takanobu, W. Zhang, X. Cao, L. Yuan, Chat-univi: Unified visual representation empowers large language models with image and video understanding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 13700\u201313710.","DOI":"10.1109\/CVPR52733.2024.01300"},{"key":"10.1016\/j.knosys.2026.116075_b44","series-title":"Valley: Video assistant with large language model enhanced ability","author":"Luo","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b45","doi-asserted-by":"crossref","first-page":"19472","DOI":"10.52202\/079017-0614","article-title":"Sharegpt4video: Improving video understanding and generation with better captions","volume":"37","author":"Chen","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b46","doi-asserted-by":"crossref","unstructured":"E. Song, W. Chai, G. Wang, Y. Zhang, H. Zhou, F. Wu, H. Chi, X. Guo, T. Ye, Y. Zhang, et al., Moviechat: From dense token to sparse memory for long video understanding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 18221\u201318232.","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"10.1016\/j.knosys.2026.116075_b47","doi-asserted-by":"crossref","unstructured":"B. He, H. Li, Y.K. Jang, M. Jia, X. Cao, A. Shah, A. Shrivastava, S.-N. Lim, Ma-lmm: Memory-augmented large multimodal model for long-term video understanding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 13504\u201313514.","DOI":"10.1109\/CVPR52733.2024.01282"},{"key":"10.1016\/j.knosys.2026.116075_b48","doi-asserted-by":"crossref","unstructured":"S. Ren, L. Yao, S. Li, X. Sun, L. Hou, Timechat: A time-sensitive multimodal large language model for long video understanding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14313\u201314323.","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"10.1016\/j.knosys.2026.116075_b49","doi-asserted-by":"crossref","unstructured":"B. Huang, X. Wang, H. Chen, Z. Song, W. Zhu, Vtimellm: Empower llm to grasp video moments, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 14271\u201314280.","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"10.1016\/j.knosys.2026.116075_b50","doi-asserted-by":"crossref","unstructured":"Y. Li, Y. Song, L. Cao, J. Tetreault, L. Goldberg, A. Jaimes, J. Luo, TGIF: A new dataset and benchmark on animated GIF description, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 4641\u20134650.","DOI":"10.1109\/CVPR.2016.502"},{"key":"10.1016\/j.knosys.2026.116075_b51","doi-asserted-by":"crossref","unstructured":"R. Krishna, K. Hata, F. Ren, L. Fei-Fei, J. Carlos Niebles, Dense-captioning events in videos, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 706\u2013715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"10.1016\/j.knosys.2026.116075_b52","unstructured":"K. Grauman, A. Westbury, E. Byrne, Z. Chavis, A. Furnari, R. Girdhar, J. Hamburger, H. Jiang, M. Liu, X. Liu, et al., Ego4d: Around the world in 3,000 hours of egocentric video, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 18995\u201319012."},{"key":"10.1016\/j.knosys.2026.116075_b53","doi-asserted-by":"crossref","unstructured":"C. Li, H. Xu, J. Tian, W. Wang, M. Yan, B. Bi, J. Ye, H. Chen, G. Xu, Z. Cao, et al., mplug: Effective and efficient vision-language learning by cross-modal skip-connections, in: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, 2022, pp. 7241\u20137259.","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"10.1016\/j.knosys.2026.116075_b54","series-title":"Video-salmonn: Speech-enhanced audio-visual large language models","author":"Sun","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b55","series-title":"Holmes-vad: Towards unbiased and explainable video anomaly detection via multi-modal llm","author":"Zhang","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b56","series-title":"Internvideo2: Scaling video foundation models for multimodal video understanding. arxiv 2024","author":"Wang","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b57","first-page":"10078","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b58","doi-asserted-by":"crossref","unstructured":"R. Girdhar, A. El-Nouby, M. Singh, K.V. Alwala, A. Joulin, I. Misra, Omnimae: Single model masked pretraining on images and videos, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 10406\u201310417.","DOI":"10.1109\/CVPR52729.2023.01003"},{"key":"10.1016\/j.knosys.2026.116075_b59","series-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval","author":"Luo","year":"2021"},{"key":"10.1016\/j.knosys.2026.116075_b60","series-title":"Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms","author":"Cheng","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b61","series-title":"Stanford alpaca: an instruction-following llama model (2023)","author":"Taori","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b62","series-title":"Self-alignment with instruction backtranslation","author":"Li","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b63","series-title":"Codet: Code generation with generated tests","author":"Chen","year":"2022"},{"key":"10.1016\/j.knosys.2026.116075_b64","series-title":"Critic: Large language models can self-correct with tool-interactive critiquing","author":"Gou","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b65","series-title":"Constitutional ai: Harmlessness from ai feedback","author":"Bai","year":"2022"},{"key":"10.1016\/j.knosys.2026.116075_b66","first-page":"64735","article-title":"Rest-mcts*: Llm self-training via process reward guided tree search","volume":"37","author":"Zhang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b67","unstructured":"W. Yuan, R.Y. Pang, K. Cho, X. Li, S. Sukhbaatar, J. Xu, J.E. Weston, Self-rewarding language models, in: Forty-First International Conference on Machine Learning, 2024."},{"key":"10.1016\/j.knosys.2026.116075_b68","series-title":"MemGPT: Towards LLMs as operating systems","author":"Packer","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b69","first-page":"19724","article-title":"Memorybank: Enhancing large language models with long-term memory","volume":"vol. 38","author":"Zhong","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b70","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.129063","article-title":"Learning to break: Knowledge-enhanced reasoning in multi-agent debate system","volume":"618","author":"Wang","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.116075_b71","series-title":"European Conference on Computer Vision","first-page":"527","article-title":"Shuffle and learn: unsupervised learning using temporal order verification","author":"Misra","year":"2016"},{"key":"10.1016\/j.knosys.2026.116075_b72","doi-asserted-by":"crossref","unstructured":"Y. Yao, C. Liu, D. Luo, Y. Zhou, Q. Ye, Video playback rate perception for self-supervised spatio-temporal representation learning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6548\u20136557.","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"10.1016\/j.knosys.2026.116075_b73","series-title":"European Conference on Computer Vision","first-page":"504","article-title":"Self-supervised video representation learning by pace prediction","author":"Wang","year":"2020"},{"key":"10.1016\/j.knosys.2026.116075_b74","first-page":"46595","article-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","volume":"36","author":"Zheng","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116075_b75","doi-asserted-by":"crossref","unstructured":"Z. Li, C. Wang, P. Ma, D. Wu, S. Wang, C. Gao, Y. Liu, Split and merge: Aligning position biases in LLM-based evaluators, in: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 2024, pp. 11084\u201311108.","DOI":"10.18653\/v1\/2024.emnlp-main.621"},{"key":"10.1016\/j.knosys.2026.116075_b76","series-title":"Are large language models reliable judges? a study on the factuality evaluation capabilities of llms","author":"Fu","year":"2023"},{"key":"10.1016\/j.knosys.2026.116075_b77","series-title":"Is your video language model a reliable judge?","author":"Liu","year":"2025"},{"key":"10.1016\/j.knosys.2026.116075_b78","series-title":"Preference leakage: A contamination problem in LLM-as-a-judge","author":"Li","year":"2025"},{"key":"10.1016\/j.knosys.2026.116075_b79","first-page":"1","article-title":"TimeJudge: empowering video-LLMs as zero-shot judges for temporal consistency in video captions","author":"Hu","year":"2025","journal-title":"Front. Inf. Technol. Electron. Eng."},{"key":"10.1016\/j.knosys.2026.116075_b80","series-title":"Video-LevelGauge: Investigating contextual positional bias in large video language models","author":"Xia","year":"2025"},{"key":"10.1016\/j.knosys.2026.116075_b81","series-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"10.1016\/j.knosys.2026.116075_b82","series-title":"Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b83","series-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b84","series-title":"mplug-owl3: Towards long image-sequence understanding in multi-modal large language models","author":"Ye","year":"2024"},{"key":"10.1016\/j.knosys.2026.116075_b85","series-title":"European Conference on Computer Vision","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"Wang","year":"2016"},{"issue":"5","key":"10.1016\/j.knosys.2026.116075_b86","doi-asserted-by":"crossref","first-page":"3755","DOI":"10.1109\/TCSVT.2023.3319330","article-title":"MC-blur: A comprehensive benchmark for image deblurring","volume":"34","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116075_b87","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111628","article-title":"LLDiffusion: Learning degradation representations in diffusion models for low-light image enhancement","volume":"166","author":"Wang","year":"2025","journal-title":"Pattern Recognit."},{"issue":"10","key":"10.1016\/j.knosys.2026.116075_b88","doi-asserted-by":"crossref","first-page":"4541","DOI":"10.1007\/s11263-024-02056-0","article-title":"Gridformer: Residual dense transformer with grid structure for image restoration in adverse weather conditions","volume":"132","author":"Wang","year":"2024","journal-title":"Int. J. Comput. Vis."}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008014?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126008014?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T01:10:18Z","timestamp":1780017018000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126008014"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":88,"alternative-id":["S0950705126008014"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116075","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"EvoCap: Enhancing video captioning via self-evolving video-LLMs with knowledge consolidation","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116075","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116075"}}