{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T02:12:33Z","timestamp":1778724753327,"version":"3.51.4"},"reference-count":39,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62132006"],"award-info":[{"award-number":["62132006"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62562035"],"award-info":[{"award-number":["62562035"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U24A20220"],"award-info":[{"award-number":["U24A20220"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100022957","name":"Double Thousand Plan of Jiangxi Province","doi-asserted-by":"publisher","award":["jxsq2023101092"],"award-info":[{"award-number":["jxsq2023101092"]}],"id":[{"id":"10.13039\/100022957","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013064","name":"Key Research and Development Program of Jiangxi Province","doi-asserted-by":"publisher","award":["20252BCE310034"],"award-info":[{"award-number":["20252BCE310034"]}],"id":[{"id":"10.13039\/501100013064","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004479","name":"Jiangxi Provincial Natural Science Foundation","doi-asserted-by":"publisher","award":["20242BAB23012"],"award-info":[{"award-number":["20242BAB23012"]}],"id":[{"id":"10.13039\/501100004479","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004479","name":"Jiangxi Provincial Natural Science 
Foundation","doi-asserted-by":"publisher","award":["20252BAC200182"],"award-info":[{"award-number":["20252BAC200182"]}],"id":[{"id":"10.13039\/501100004479","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Displays"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.displa.2026.103420","type":"journal-article","created":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T16:24:58Z","timestamp":1772555098000},"page":"103420","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Retrieval Augmented video captioning with quality-aware re-ranking and cross-gating fusion"],"prefix":"10.1016","volume":"93","author":[{"given":"Chengyang","family":"Fang","sequence":"first","affiliation":[]},{"given":"Jian","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yilong","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Wenhui","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Gangyan","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Zhixuan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yuming","family":"Fang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.displa.2026.103420_b1","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"17928","article-title":"Swinbert: End-to-end transformers with sparse attention for video captioning","author":"Lin","year":"2022"},{"key":"10.1016\/j.displa.2026.103420_b2","series-title":"2023 IEEE\/CVF International Conference on Computer Vision","first-page":"15512","article-title":"Accurate and fast compressed video captioning","author":"Shen","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b3","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"20105","article-title":"Meltr: Meta loss transformer for learning to fine-tune video foundation models","author":"Ko","year":"2023"},{"issue":"3","key":"10.1016\/j.displa.2026.103420_b4","first-page":"2552","article-title":"Comprehensive visual grounding for video description","volume":"38","author":"Jiang","year":"2024","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"issue":"1s","key":"10.1016\/j.displa.2026.103420_b5","doi-asserted-by":"crossref","DOI":"10.1145\/3539225","article-title":"Retrieval augmented convolutional encoder\u2013decoder networks for video captioning","volume":"19","author":"Chen","year":"2023","journal-title":"ACM Trans. Multimed. Comput. Commun. 
Appl."},{"key":"10.1016\/j.displa.2026.103420_b6","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13525","article-title":"Retrieval-augmented egocentric video captioning","author":"Xu","year":"2024"},{"key":"10.1016\/j.displa.2026.103420_b7","series-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","first-page":"20715","article-title":"IFCap: Image-like retrieval and frequency-based entity filtering for zero-shot captioning","author":"Lee","year":"2024"},{"key":"10.1016\/j.displa.2026.103420_b8","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2840","article-title":"Smallcap: Lightweight image captioning prompted with retrieval augmentation","author":"Ramos","year":"2023"},{"issue":"4","key":"10.1016\/j.displa.2026.103420_b9","first-page":"4320","article-title":"Vipcap: Retrieval text-based visual prompts for lightweight image captioning","volume":"39","author":"Kim","year":"2025","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.displa.2026.103420_b10","series-title":"Proceedings of the 19th International Conference on Content-Based Multimedia Indexing","first-page":"1","article-title":"Retrieval-augmented transformer for image captioning","author":"Sarto","year":"2022"},{"key":"10.1016\/j.displa.2026.103420_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.112170","article-title":"Retta: Retrieval-enhanced test-time adaptation for zero-shot video captioning","volume":"171","author":"Ma","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.displa.2026.103420_b12","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","first-page":"543","article-title":"Video-LLaMA: An instruction-tuned audio-visual language model for video understanding","author":"Zhang","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b13","series-title":"Computer Vision \u2013 ECCV 2022","first-page":"1","article-title":"Expanding language-image pretrained models for general video recognition","author":"Ni","year":"2022"},{"key":"10.1016\/j.displa.2026.103420_b14","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5288","article-title":"Msr-vtt: A large video description dataset for bridging video and language","author":"Xu","year":"2016"},{"key":"10.1016\/j.displa.2026.103420_b15","series-title":"Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"Chen","year":"2011"},{"key":"10.1016\/j.displa.2026.103420_b16","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.displa.2026.103420_b17","series-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/Or Summarization","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","author":"Banerjee","year":"2005"},{"key":"10.1016\/j.displa.2026.103420_b18","series-title":"2015 IEEE Conference on Computer Vision and Pattern 
Recognition","first-page":"4566","article-title":"Cider: Consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.displa.2026.103420_b19","series-title":"International Conference on Learning Representations","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"issue":"3","key":"10.1016\/j.displa.2026.103420_b20","first-page":"2514","article-title":"Semantic grouping network for video captioning","volume":"35","author":"Ryu","year":"2021","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.displa.2026.103420_b21","series-title":"2021 IEEE\/CVF International Conference on Computer Vision","first-page":"1523","article-title":"Motion guided region message passing for video captioning","author":"Chen","year":"2021"},{"issue":"2","key":"10.1016\/j.displa.2026.103420_b22","doi-asserted-by":"crossref","first-page":"880","DOI":"10.1109\/TCSVT.2021.3063423","article-title":"Syntax-guided hierarchical attention network for video captioning","volume":"32","author":"Deng","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"2","key":"10.1016\/j.displa.2026.103420_b23","doi-asserted-by":"crossref","first-page":"1049","DOI":"10.1109\/TPAMI.2023.3327677","article-title":"Learning hierarchical modular networks for video captioning","volume":"46","author":"Li","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"3","key":"10.1016\/j.displa.2026.103420_b24","first-page":"3724","article-title":"Refined semantic enhancement towards frequency diffusion for video captioning","volume":"37","author":"Zhong","year":"2023","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"10.1016\/j.displa.2026.103420_b25","doi-asserted-by":"crossref","first-page":"4389","DOI":"10.1109\/TMM.2023.3322329","article-title":"Icocap: Improving video captioning by compounding images","volume":"26","author":"Liang","year":"2024","journal-title":"IEEE Trans. 
Multimed."},{"key":"10.1016\/j.displa.2026.103420_b26","series-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13275","article-title":"Object relational graph with teacher-recommended learning for video captioning","author":"Zhang","year":"2020"},{"key":"10.1016\/j.displa.2026.103420_b27","series-title":"Computer Vision \u2013 ECCV 2024","first-page":"396","article-title":"Internvideo2: Scaling foundation models for multimodal video understanding","author":"Wang","year":"2025"},{"key":"10.1016\/j.displa.2026.103420_b28","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3192","article-title":"Video swin transformer","author":"Liu","year":"2022"},{"key":"10.1016\/j.displa.2026.103420_b29","series-title":"Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence","first-page":"1622","article-title":"Prompt learns prompt: Exploring knowledge-aware generative prompt collaboration for video captioning","author":"Yan","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b30","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19358","article-title":"Eva: Exploring the limits of masked visual representation learning at scale","author":"Fang","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b31","series-title":"The ava-kinetics localized human actions video dataset","author":"Li","year":"2020"},{"key":"10.1016\/j.displa.2026.103420_b32","series-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90% chatgpt quality","author":"V. Team","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b33","series-title":"2023 IEEE\/CVF International Conference on Computer Vision","first-page":"15359","article-title":"Hitea: Hierarchical temporal-aware video-language pre-training","author":"Ye","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b34","series-title":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10714","article-title":"Vid2seq: Large-scale pretraining of a visual language model for dense video captioning","author":"Yang","year":"2023"},{"key":"10.1016\/j.displa.2026.103420_b35","series-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"18209","article-title":"Omnivid: A generative framework for universal video understanding","author":"Wang","year":"2024"},{"issue":"4","key":"10.1016\/j.displa.2026.103420_b36","doi-asserted-by":"crossref","first-page":"3383","DOI":"10.1109\/TCSVT.2024.3502736","article-title":"Action-driven semantic representation and aggregation for video captioning","volume":"35","author":"Han","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"10.1016\/j.displa.2026.103420_b37","doi-asserted-by":"crossref","first-page":"6357","DOI":"10.1109\/TCSVT.2025.3541965","article-title":"Frame-by-frame multi-object tracking-guided video captioning","volume":"35","author":"Luo","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"1","key":"10.1016\/j.displa.2026.103420_b38","doi-asserted-by":"crossref","first-page":"1092","DOI":"10.1109\/TNNLS.2023.3323491","article-title":"Visual commonsense-aware representation network for video captioning","volume":"36","author":"Zeng","year":"2025","journal-title":"IEEE Trans. Neural Netw. Learn. 
Syst."},{"key":"10.1016\/j.displa.2026.103420_b39","series-title":"International Conference on Learning Representations","article-title":"Bertscore: Evaluating text generation with bert","author":"Zhang","year":"2020"}],"container-title":["Displays"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226000831?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226000831?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T02:06:53Z","timestamp":1778724413000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0141938226000831"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":39,"alternative-id":["S0141938226000831"],"URL":"https:\/\/doi.org\/10.1016\/j.displa.2026.103420","relation":{},"ISSN":["0141-9382"],"issn-type":[{"value":"0141-9382","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Retrieval Augmented video captioning with quality-aware re-ranking and cross-gating fusion","name":"articletitle","label":"Article Title"},{"value":"Displays","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.displa.2026.103420","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103420"}}