{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T07:42:45Z","timestamp":1777880565015,"version":"3.51.4"},"reference-count":110,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.knosys.2026.115568","type":"journal-article","created":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T06:50:36Z","timestamp":1772347836000},"page":"115568","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Towards generalized video captioning: An effective multi-modal knowledge graph perspective"],"prefix":"10.1016","volume":"340","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0271-5515","authenticated-orcid":false,"given":"Haoying","family":"Sun","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6264-9006","authenticated-orcid":false,"given":"Shuyi","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3181-6761","authenticated-orcid":false,"given":"Zeyu","family":"Xi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7209-0215","authenticated-orcid":false,"given":"Lifang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.115568_bib0001","doi-asserted-by":"crossref","unstructured":"Z. Yue, Q. Zhang, A. Hu, L. Zhang, Z. Wang, Q. Jin, Movie101: A new movie understanding benchmark, (2023). arXiv: 2305.12140.","DOI":"10.18653\/v1\/2023.acl-long.257"},{"key":"10.1016\/j.knosys.2026.115568_bib0002","series-title":"YouTube2Text: recognizing and describing arbitrary activities using semantic hierarchiesand zero-shot recognition","author":"Guadarrama","year":"2013"},{"key":"10.1016\/j.knosys.2026.115568_bib0003","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"433","article-title":"Translating video content to natural language descriptions","author":"Rohrbach","year":"2013"},{"key":"10.1016\/j.knosys.2026.115568_bib0004","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.129177","article-title":"A simple yet effective knowledge guided method for entity-aware video captioning on a basketball benchmark","volume":"619","author":"Xi","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.115568_bib0005","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","first-page":"24330","article-title":"Player-centric multimodal prompt generation for large language model based identity-aware basketball video captioning","author":"Xi","year":"2025"},{"key":"10.1016\/j.knosys.2026.115568_bib0006","series-title":"Proceedings of the 32nd ACM International Conference on Information and Knowledge Management","first-page":"5391","article-title":"GOAL: a challenging knowledge-grounded video captioning benchmark for real-time soccer commentary generation","author":"Qi","year":"2023"},{"key":"10.1016\/j.knosys.2026.115568_bib0007","unstructured":"Y. He, Y. Lin, J. Wu, H. Zhang, Y. Zhang, R. Le, StoryTeller: improving long video description through global audio-visual character identification, arXiv: 2411.07076(2024)."},{"key":"10.1016\/j.knosys.2026.115568_bib0008","unstructured":"A. Nadeem, F. Sardari, R. Dawes, S.S. Husain, A. Hilton, A. Mustafa, NarrativeBridge: enhancing video captioning with causal-temporal narrative, arXiv: 2406.06499(2024)."},{"key":"10.1016\/j.knosys.2026.115568_bib0009","unstructured":"N. Ballas, L. Yao, C. Pal, A. Courville, Delving deeper into convolutional networks for learning video representations, arXiv: 1511.06432(2015)."},{"key":"10.1016\/j.knosys.2026.115568_bib0010","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1029","article-title":"Hierarchical recurrent neural encoder for video representation with application to captioning","author":"Pan","year":"2016"},{"key":"10.1016\/j.knosys.2026.115568_bib0011","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"4507","article-title":"Describing videos by exploiting temporal structure","author":"Yao","year":"2015"},{"key":"10.1016\/j.knosys.2026.115568_bib0012","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4584","article-title":"Video paragraph captioning using hierarchical recurrent neural networks","author":"Yu","year":"2016"},{"key":"10.1016\/j.knosys.2026.115568_bib0013","series-title":"Proceedings of the 26th ACM International Conference on Multimedia","first-page":"63","article-title":"Hierarchical memory modelling for video captioning","author":"Wang","year":"2018"},{"key":"10.1016\/j.knosys.2026.115568_bib0014","unstructured":"A. Dosovitskiy, An image is worth 16x16 words: transformers for image recognition at scale, (2020). arXiv: 2010.11929."},{"key":"10.1016\/j.knosys.2026.115568_bib0015","unstructured":"H. Luo, L. Ji, B. Shi, H. Huang, N. Duan, T. Li, J. Li, T. Bharti, M. Zhou, UniVL: a unified video and language pre-training model for multimodal understanding and generation, (2020). arXiv: 2002.06353."},{"key":"10.1016\/j.knosys.2026.115568_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8746","article-title":"ActBERT: learning global-local video-text representations","author":"Zhu","year":"2020"},{"key":"10.1016\/j.knosys.2026.115568_bib0017","unstructured":"B. Korbar, F. Petroni, R. Girdhar, L. Torresani, Video understanding as machine translation, (2020). arXiv: 2006.07203."},{"key":"10.1016\/j.knosys.2026.115568_bib0018","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"15558","article-title":"Accurate and fast compressed video captioning","author":"Shen","year":"2023"},{"key":"10.1016\/j.knosys.2026.115568_bib0019","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"18941","article-title":"Text with knowledge graph augmented transformer for video captioning","author":"Gu","year":"2023"},{"key":"10.1016\/j.knosys.2026.115568_bib0020","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10870","article-title":"Spatio-temporal graph for video captioning with knowledge distillation","author":"Pan","year":"2020"},{"key":"10.1016\/j.knosys.2026.115568_bib0021","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13278","article-title":"Object relational graph with teacher-recommended learning for video captioning","author":"Zhang","year":"2020"},{"key":"10.1016\/j.knosys.2026.115568_bib0022","doi-asserted-by":"crossref","first-page":"2726","DOI":"10.1109\/TIP.2022.3158546","article-title":"Long short-term relation transformer with global gating for video captioning","volume":"31","author":"Li","year":"2022","journal-title":"IEEE Trans. Image Process."},{"issue":"1s","key":"10.1016\/j.knosys.2026.115568_bib0023","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3539225","article-title":"Retrieval augmented convolutional encoder-decoder networks for video captioning","volume":"19","author":"Chen","year":"2023","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"10.1016\/j.knosys.2026.115568_bib0024","doi-asserted-by":"crossref","first-page":"5366","DOI":"10.1109\/TIP.2023.3307969","article-title":"Concept-aware video captioning: describing videos with effective prior information","volume":"32","author":"Yang","year":"2023","journal-title":"IEEE Trans. Image Process."},{"issue":"9","key":"10.1016\/j.knosys.2026.115568_bib0025","doi-asserted-by":"crossref","first-page":"2407","DOI":"10.1109\/TMM.2019.2896515","article-title":"Generating video descriptions with latent topic guidance","volume":"21","author":"Chen","year":"2019","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.knosys.2026.115568_bib0026","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3724","article-title":"Refined semantic enhancement towards frequency diffusion for video captioning","volume":"37","author":"Zhong","year":"2023"},{"key":"10.1016\/j.knosys.2026.115568_bib0027","series-title":"Proceedings of the 2025 International Conference on Multimedia Retrieval","first-page":"2038","article-title":"DSSM-KG: dual-stream state-space modeling with adaptive knowledge injection for video captioning","author":"Sun","year":"2025"},{"key":"10.1016\/j.knosys.2026.115568_bib0028","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Conceptnet 5.5: an open multilingual graph of general knowledge","volume":"31","author":"Speer","year":"2017"},{"key":"10.1016\/j.knosys.2026.115568_bib0029","series-title":"2019IEEE Winter Conference on Applications of Computer Vision (WACV)","first-page":"283","article-title":"Improving image captioning by leveraging knowledge graphs","author":"Zhou","year":"2019"},{"key":"10.1016\/j.knosys.2026.115568_bib0030","article-title":"Scene adaptive dynamic multi-modal knowledge for video captioning","volume":"305","author":"Sun","year":"2025","journal-title":"Expert Syst. Appl."},{"issue":"9","key":"10.1016\/j.knosys.2026.115568_bib0031","doi-asserted-by":"crossref","first-page":"4484","DOI":"10.1109\/TCSVT.2023.3277827","article-title":"Concept parser with multimodal graph learning for video captioning","volume":"33","author":"Wu","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.115568_bib0032","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"17949","article-title":"SwinBERT: end-to-end transformers with sparse attention for video captioning","author":"Lin","year":"2022"},{"key":"10.1016\/j.knosys.2026.115568_bib0033","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.knosys.2026.115568_bib0034","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3202","article-title":"Video swin transformer","author":"Liu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115568_bib0035","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"4858","article-title":"Clip4Caption: clip for video caption","author":"Tang","year":"2021"},{"key":"10.1016\/j.knosys.2026.115568_bib0036","series-title":"2006IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR\u201906)","first-page":"1735","article-title":"Dimensionality reduction by learning an invariant mapping","volume":"2","author":"Hadsell","year":"2006"},{"key":"10.1016\/j.knosys.2026.115568_bib0037","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103856","article-title":"Unified hierarchical contrastive learning for video captioning","volume":"127","author":"Sun","year":"2026","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.knosys.2026.115568_bib0038","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107702","article-title":"Enhancing the alignment between target words and corresponding frames for video captioning","volume":"111","author":"Tu","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115568_bib0039","series-title":"International Conference on Multimedia Modeling","first-page":"42","article-title":"Hierarchical vision-language alignment for video captioning","author":"Zhang","year":"2018"},{"issue":"2","key":"10.1016\/j.knosys.2026.115568_bib0040","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3546828","article-title":"Learning video-text aligned representations for video captioning","volume":"19","author":"Shi","year":"2023","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl."},{"key":"10.1016\/j.knosys.2026.115568_bib0041","series-title":"Chinese Conference on Pattern Recognition and Computer Vision (PRCV)","first-page":"368","article-title":"Clip meets video captioning: concept-aware representation learning does matter","author":"Yang","year":"2022"},{"key":"10.1016\/j.knosys.2026.115568_bib0042","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13525","article-title":"Retrieval-augmented egocentric video captioning","author":"Xu","year":"2024"},{"key":"10.1016\/j.knosys.2026.115568_bib0043","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9837","article-title":"Open-book video captioning with retrieve-copy-generate network","author":"Zhang","year":"2021"},{"key":"10.1016\/j.knosys.2026.115568_bib0044","doi-asserted-by":"crossref","first-page":"1122","DOI":"10.1109\/TIP.2024.3359045","article-title":"Emotional video captioning with vision-based emotion interpretation network","volume":"33","author":"Song","year":"2024","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.115568_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.121601","article-title":"Contrastive topic-enhanced network for video captioning","volume":"237","author":"Zeng","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.115568_bib0046","doi-asserted-by":"crossref","first-page":"602","DOI":"10.1038\/s42256-025-01014-w","article-title":"A comprehensive large-scale biomedical knowledge graph for AI-powered data-driven biomedical research","volume":"7","author":"Zhang","year":"2025","journal-title":"Nat. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115568_bib0047","doi-asserted-by":"crossref","unstructured":"Z. Zhang, J. Chen, X. Chen, H. Liu, Y. Xiang, B. Liu, Y. Zheng, An industry evaluation of embedding-based entity alignment, arXiv: 2010.11522(2020).","DOI":"10.18653\/v1\/2020.coling-industry.17"},{"key":"10.1016\/j.knosys.2026.115568_bib0048","unstructured":"J. Cui, M. Ning, Z. Li, B. Chen, Y. Yan, H. Li, B. Ling, Y. Tian, L. Yuan, Chatlaw: a multi-agent collaborative legal assistant with knowledge graph enhanced mixture-of-experts large language model, arXiv: 2306.16092(2023)."},{"issue":"4","key":"10.1016\/j.knosys.2026.115568_bib0049","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1023\/B:BTTJ.0000047600.45421.6d","article-title":"ConceptNet\u2013a practical commonsense reasoning tool-kit","volume":"22","author":"Liu","year":"2004","journal-title":"BT Technol. J."},{"key":"10.1016\/j.knosys.2026.115568_bib0050","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"4290","article-title":"Improved image captioning via knowledge graph-augmented models","author":"Santiesteban","year":"2024"},{"key":"10.1016\/j.knosys.2026.115568_bib0051","doi-asserted-by":"crossref","first-page":"43","DOI":"10.1016\/j.patrec.2020.12.020","article-title":"Image captioning with transformer and knowledge graph","volume":"143","author":"Zhang","year":"2021","journal-title":"Pattern Recognit. Lett."},{"key":"10.1016\/j.knosys.2026.115568_bib0052","unstructured":"N. Xu, Y. Wang, T. Zhang, H. Tian, M. Kankanhalli, A.-A. Liu, How to understand named entities: using common sense for news captioning, arXiv: 2403.06520(2024)."},{"key":"10.1016\/j.knosys.2026.115568_bib0053","doi-asserted-by":"crossref","first-page":"2659","DOI":"10.1109\/TMM.2023.3301279","article-title":"Boosting entity-aware image captioning with multi-modal knowledge graph","volume":"26","author":"Zhao","year":"2023","journal-title":"IEEE Trans. Multimedia."},{"key":"10.1016\/j.knosys.2026.115568_bib0054","series-title":"The Semantic Web: 16th International Conference, ESWC 2019, PortoroVz, Slovenia, June 2\u20136, 2019, Proceedings 16","first-page":"459","article-title":"MMKG: multi-modal knowledge graphs","author":"Liu","year":"2019"},{"key":"10.1016\/j.knosys.2026.115568_bib0055","doi-asserted-by":"crossref","unstructured":"Z. Chen, Y. Zhang, Y. Fang, Y. Geng, L. Guo, X. Chen, Q. Li, W. Zhang, J. Chen, Y. Zhu, et al., Knowledge graphs meet multi-modal learning: a comprehensive survey, arXiv: 2402.05391(2024).","DOI":"10.2139\/ssrn.5044404"},{"key":"10.1016\/j.knosys.2026.115568_bib0056","unstructured":"X. Yi, Y. Li, L. Zou, MMPKUBase: a comprehensive and high-quality Chinese multi-modal knowledge graph, (2024). arXiv: 2408.01679."},{"key":"10.1016\/j.knosys.2026.115568_bib0057","series-title":"Proceedings of the 33rd ACM International Conference on Information and Knowledge Management","first-page":"5360","article-title":"VHAKG: a multi-modal knowledge graph based on synchronized multi-view videos of daily activities","author":"Egami","year":"2024"},{"key":"10.1016\/j.knosys.2026.115568_bib0058","series-title":"Proceedings of the 32nd ACM International Conference on Information and Knowledge Management","first-page":"3361","article-title":"AspectMMKG: a multi-modal knowledge graph with aspect-aware entities","author":"Zhang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115568_bib0059","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"18653","article-title":"Beyond entities: a large-scale multi-modal knowledge graph with triplet fact grounding","volume":"38","author":"Liu","year":"2024"},{"key":"10.1016\/j.knosys.2026.115568_bib0060","unstructured":"R. Xie, Z. Liu, H. Luan, M. Sun, Image-embodied knowledge representation learning, arXiv preprint arXiv: 1609.07028(2016)."},{"key":"10.1016\/j.knosys.2026.115568_bib0061","unstructured":"K. Simonyan, A. Zisserman, Very deep convolutional networks for large-scale image recognition, (2014). arXiv: 1409.1556."},{"key":"10.1016\/j.knosys.2026.115568_bib0062","first-page":"2787","article-title":"Translating embeddings for modeling multi-relational data","volume":"26","author":"Bordes","year":"2013","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.115568_bib0063","series-title":"2019 International Joint Conference on Neural Networks (IJCNN)","first-page":"1","article-title":"Multimodal data enhanced representation learning for knowledge graphs","author":"Wang","year":"2019"},{"key":"10.1016\/j.knosys.2026.115568_bib0064","series-title":"Proceedings of the Seventh Joint Conference on Lexical and Computational Semantics","first-page":"225","article-title":"A multimodal translation-based approach for knowledge graph representation learning","author":"Mousselly-Sergieh","year":"2018"},{"key":"10.1016\/j.knosys.2026.115568_bib0065","doi-asserted-by":"crossref","unstructured":"P. Pezeshkpour, L. Chen, S. Singh, Embedding multimodal relational data for knowledge base completion, (2018). arXiv: 1809.01341.","DOI":"10.18653\/v1\/D18-1359"},{"key":"10.1016\/j.knosys.2026.115568_bib0066","series-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)","first-page":"1532","article-title":"Glove: global vectors for word representation","author":"Pennington","year":"2014"},{"key":"10.1016\/j.knosys.2026.115568_bib0067","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"15005","article-title":"APKGC: noise-enhanced multi-modal knowledge graph completion with attention penalty","volume":"39","author":"Jian","year":"2025"},{"issue":"8","key":"10.1016\/j.knosys.2026.115568_bib0068","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3760786","article-title":"Adaptive modality interaction transformer for multimodal knowledge graph completion","volume":"19","author":"Jian","year":"2025","journal-title":"ACM Trans. Knowl. Discov. Data"},{"key":"10.1016\/j.knosys.2026.115568_bib0069","series-title":"Proceedings of the 30th ACM International Conference on Information & Knowledge Management","first-page":"2140","article-title":"DisenKGAT: knowledge graph embedding with disentangled graph attention network","author":"Wu","year":"2021"},{"key":"10.1016\/j.knosys.2026.115568_bib0070","series-title":"International Conference on Machine Learning","first-page":"4212","article-title":"Disentangled graph convolutional networks","author":"Ma","year":"2019"},{"key":"10.1016\/j.knosys.2026.115568_bib0071","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.128614","article-title":"Decoupled semantic graph neural network for knowledge graph embedding","volume":"611","author":"Li","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.115568_bib0072","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.121278","article-title":"MMKDGAT: multi-modal knowledge graph-aware deep graph attention network for remote sensing image recommendation","volume":"235","author":"Wang","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.115568_bib0073","unstructured":"A. Pesaranghader, T. Sajed, RECipe: Does a multi-modal recipe knowledge graph fit a multi-purpose recommendation system?, (2023). arXiv: 2308.04579."},{"key":"10.1016\/j.knosys.2026.115568_bib0074","doi-asserted-by":"crossref","unstructured":"F. Xia, B. Li, Y. Weng, S. He, K. Liu, B. Sun, S. Li, J. Zhao, LingYi: medical conversational question answering system based on multi-modal knowledge graphs, (2022). arXiv: 2204.09220.","DOI":"10.18653\/v1\/2022.emnlp-demos.15"},{"key":"10.1016\/j.knosys.2026.115568_bib0075","unstructured":"S. Cheng, X. Liang, Z. Bi, H. Chen, N. Zhang, Multi-modal protein knowledge graph construction and applications, (2022). arXiv: 2207.10080."},{"key":"10.1016\/j.knosys.2026.115568_bib0076","series-title":"Machine Learning for Health (ML4H)","first-page":"52","article-title":"Multi-modal graph learning over umls knowledge graphs","author":"Burger","year":"2023"},{"key":"10.1016\/j.knosys.2026.115568_bib0077","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113766","article-title":"Noise-enhanced graph contrastive learning for multimodal recommendation systems","volume":"324","author":"Shi","year":"2025","journal-title":"Knowl. Based Syst."},{"issue":"2","key":"10.1016\/j.knosys.2026.115568_bib0078","doi-asserted-by":"crossref","first-page":"715","DOI":"10.1109\/TKDE.2022.3224228","article-title":"Multi-modal knowledge graph construction and application: a survey","volume":"36","author":"Zhu","year":"2022","journal-title":"IEEE Trans. Knowl. Data Eng."},{"issue":"6","key":"10.1016\/j.knosys.2026.115568_bib0079","doi-asserted-by":"crossref","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","article-title":"Faster R-CNN: towards real-time object detection with region proposal networks","volume":"39","author":"Ren","year":"2016","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115568_bib0080","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.115568_bib0081","doi-asserted-by":"crossref","first-page":"5150","DOI":"10.1109\/TIP.2022.3192709","article-title":"Cross-modal graph with meta concepts for video captioning","volume":"31","author":"Wang","year":"2022","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.115568_bib0082","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"815","article-title":"FaceNet: a unified embedding for face recognition and clustering","author":"Schroff","year":"2015"},{"key":"10.1016\/j.knosys.2026.115568_bib0083","unstructured":"J. Robinson, C.-Y. Chuang, S. Sra, S. Jegelka, Contrastive learning with hard negative samples, (2020). arXiv: 2010.04592."},{"key":"10.1016\/j.knosys.2026.115568_bib0084","series-title":"2022IEEE 5th International Conference on Electronics and Communication Engineering (ICECE)","first-page":"202","article-title":"Hard contrastive learning for video captioning","author":"Wu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115568_bib0085","first-page":"32731","article-title":"TripletClip: improving compositional reasoning of clip via synthetic vision-language negatives","volume":"37","author":"Patel","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.115568_bib0086","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"7463","article-title":"Structure-aware contrastive learning for diagram understanding of multimodal models","author":"Sasaki","year":"2025"},{"key":"10.1016\/j.knosys.2026.115568_bib0087","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5288","article-title":"MSR-VTT: a large video description dataset for bridging video and language","author":"Xu","year":"2016"},{"key":"10.1016\/j.knosys.2026.115568_bib0088","series-title":"Proceedings of the 49thAnnual Meeting of the Association for Computational Linguistics: Human Language Technologies","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"Chen","year":"2011"},{"key":"10.1016\/j.knosys.2026.115568_bib0089","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.knosys.2026.115568_bib0090","series-title":"Proceedings of the Acl Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation And\/or Summarization","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","author":"Banerjee","year":"2005"},{"key":"10.1016\/j.knosys.2026.115568_bib0091","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.knosys.2026.115568_bib0092","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4566","article-title":"CIDEr: consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.knosys.2026.115568_bib0093","unstructured":"D.P. Kingma, Adam: a method for stochastic optimization, (2014). arXiv: 1412.6980."},{"key":"10.1016\/j.knosys.2026.115568_bib0094","doi-asserted-by":"crossref","first-page":"222","DOI":"10.1016\/j.neucom.2018.06.096","article-title":"Fused GRU with semantic-temporal attention for video captioning","volume":"395","author":"Gao","year":"2020","journal-title":"Neurocomputing"},{"key":"10.1016\/j.knosys.2026.115568_bib0095","doi-asserted-by":"crossref","first-page":"4013","DOI":"10.1109\/TIP.2020.2969330","article-title":"Image captioning with end-to-end attribute detection and subsequent attributes prediction","volume":"29","author":"Huang","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.115568_bib0096","doi-asserted-by":"crossref","unstructured":"L. Yan, Q. Wang, Y. Cui, F. Feng, X. Quan, X. Zhang, D. Liu, GL-RG: global-local representation granularity for video captioning, (2022). arXiv: 2205.10706.","DOI":"10.24963\/ijcai.2022\/384"},{"key":"10.1016\/j.knosys.2026.115568_bib0097","first-page":"2514","article-title":"Semantic grouping network for video captioning","volume":"35","author":"Ryu","year":"2021","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.115568_bib0098","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1543","article-title":"Motion guided region message passing for video captioning","author":"Chen","year":"2021"},{"key":"10.1016\/j.knosys.2026.115568_bib0099","doi-asserted-by":"crossref","first-page":"202","DOI":"10.1109\/TIP.2021.3120867","article-title":"Hierarchical representation network with auxiliary tasks for video captioning and video question answering","volume":"31","author":"Gao","year":"2021","journal-title":"IEEE Trans. Image Process."},{"issue":"10","key":"10.1016\/j.knosys.2026.115568_bib0100","doi-asserted-by":"crossref","first-page":"6753","DOI":"10.1109\/TCSVT.2022.3169894","article-title":"Towards knowledge-aware video captioning via transitive visual relationship detection","volume":"32","author":"Wu","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.115568_bib0101","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.114003","article-title":"Knowledge enhancement and disentanglement learning for video captioning","volume":"326","author":"Wang","year":"2025","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115568_bib0102","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2025.107817","article-title":"MGTR-MISS: more ground truth retrieving based multimodal interaction and semantic supervision for video description","volume":"192","author":"Zhang","year":"2025","journal-title":"Neural Netw."},{"key":"10.1016\/j.knosys.2026.115568_bib0103","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Inception-v4, inception-resnet and the impact of residual connections on learning","volume":"31","author":"Szegedy","year":"2017"},{"key":"10.1016\/j.knosys.2026.115568_bib0104","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.knosys.2026.115568_bib0105","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6546","article-title":"Can spatiotemporal 3D CNNs retrace the history of 2D CNNs and imagenet?","author":"Hara","year":"2018"},{"issue":"11","key":"10.1016\/j.knosys.2026.115568_bib0106","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.knosys.2026.115568_bib0107","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"26160","article-title":"Video-Xl: extra-long vision language model for hour-scale video understanding","author":"Shu","year":"2025"},{"key":"10.1016\/j.knosys.2026.115568_bib0108","unstructured":"X. Liu, Y. Shu, Z. Liu, A. Li, Y. Tian, B. Zhao, Video-XL-Pro: reconstructive token compression for extremely long video understanding, (2025). arXiv: 2503.18478."},{"key":"10.1016\/j.knosys.2026.115568_bib0109","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"18221","article-title":"MovieChat: from dense token to sparse memory for long video understanding","author":"Song","year":"2024"},{"key":"10.1016\/j.knosys.2026.115568_bib0110","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13504","article-title":"MA-LMM: memory-augmented large multimodal model for long-term video understanding","author":"He","year":"2024"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126003102?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126003102?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:15:19Z","timestamp":1777594519000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126003102"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":110,"alternative-id":["S0950705126003102"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115568","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Towards generalized video captioning: An effective multi-modal knowledge graph perspective","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115568","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115568"}}