{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T13:03:23Z","timestamp":1780923803793,"version":"3.54.1"},"reference-count":66,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004735","name":"Hunan Provincial Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100014472","name":"Scientific Research Foundation of Hunan Provincial Education Department","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100014472","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.knosys.2026.116187","type":"journal-article","created":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T04:10:55Z","timestamp":1778818255000},"page":"116187","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Multi-scene topic-aware for novel single continuous shot multiple scenes endoscopy report generation"],"prefix":"10.1016","volume":"347","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9509-0755","authenticated-orcid":false,"given":"Xinpan","family":"Yuan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9955-8913","authenticated-orcid":false,"given":"Junhua","family":"Kuang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4411-1085","authenticated-orcid":false,"given":"Changhong","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2756-1540","authenticated-orcid":false,"given":"Liujie","family":"Hua","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4033-1843","authenticated-orcid":false,"given":"Guihu","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2295-9114","authenticated-orcid":false,"given":"Siming","family":"Jin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"10s","key":"10.1016\/j.knosys.2026.116187_b1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3522747","article-title":"A survey on deep learning and explainability for automatic report generation from medical images","volume":"54","author":"Messina","year":"2022","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.knosys.2026.116187_b2","article-title":"Automated radiology report generation: A review of recent advances","author":"Sloan","year":"2024","journal-title":"IEEE Rev. Biomed. Eng."},{"key":"10.1016\/j.knosys.2026.116187_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.imu.2023.101273","article-title":"Deep learning approaches to automatic radiology report generation: A systematic review","volume":"39","author":"Liao","year":"2023","journal-title":"Informatics Med. Unlocked"},{"issue":"1","key":"10.1016\/j.knosys.2026.116187_b4","doi-asserted-by":"crossref","first-page":"48","DOI":"10.1186\/s12938-023-01113-y","article-title":"A survey on automatic generation of medical imaging reports based on deep learning","volume":"22","author":"Pang","year":"2023","journal-title":"BioMedical Eng. OnLine"},{"issue":"1","key":"10.1016\/j.knosys.2026.116187_b5","article-title":"Automatic medical report generation: Methods and applications","volume":"13","author":"Guo","year":"2024","journal-title":"APSIPA Trans. Signal Inf. Process."},{"key":"10.1016\/j.knosys.2026.116187_b6","article-title":"Overview of ImageCLEFcaption 2017: image caption prediction and concept detection for biomedical images","author":"Eickhoff","year":"2017","journal-title":"Proc. CLEF 2017 Work. Notes"},{"key":"10.1016\/j.knosys.2026.116187_b7","article-title":"Overview of the imageclef 2018 caption prediction tasks","volume":"vol. 2125","author":"Garc\u00eda Seco de Herrera","year":"2018"},{"key":"10.1016\/j.knosys.2026.116187_b8","doi-asserted-by":"crossref","unstructured":"J. Pavlopoulos, V. Kougia, I. Androutsopoulos, A survey on biomedical image captioning, in: Proceedings of the Second Workshop on Shortcomings in Vision and Language, 2019, pp. 26\u201336.","DOI":"10.18653\/v1\/W19-1803"},{"issue":"1","key":"10.1016\/j.knosys.2026.116187_b9","doi-asserted-by":"crossref","first-page":"253","DOI":"10.1007\/s11280-022-01013-6","article-title":"Auxiliary signal-guided knowledge encoder-decoder for medical report generation","volume":"26","author":"Li","year":"2023","journal-title":"World Wide Web"},{"key":"10.1016\/j.knosys.2026.116187_b10","series-title":"MIMIC-CXR-jpg, a large publicly available database of labeled chest radiographs","author":"Johnson","year":"2019"},{"key":"10.1016\/j.knosys.2026.116187_b11","first-page":"590","article-title":"Chexpert: A large chest radiograph dataset with uncertainty labels and expert comparison","volume":"vol. 33","author":"Irvin","year":"2019"},{"key":"10.1016\/j.knosys.2026.116187_b12","doi-asserted-by":"crossref","unstructured":"X. Wang, Y. Peng, L. Lu, Z. Lu, M. Bagheri, R.M. Summers, Chestx-ray8: Hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 2097\u20132106.","DOI":"10.1109\/CVPR.2017.369"},{"key":"10.1016\/j.knosys.2026.116187_b13","doi-asserted-by":"crossref","unstructured":"O. Vinyals, A. Toshev, S. Bengio, D. Erhan, Show and tell: A neural image caption generator, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015, pp. 3156\u20133164.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"10.1016\/j.knosys.2026.116187_b14","doi-asserted-by":"crossref","unstructured":"Z. Chen, Y. Song, T.-H. Chang, X. Wan, Generating Radiology Reports via Memory-driven Transformer, in: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, EMNLP, 2020, pp. 1439\u20131449.","DOI":"10.18653\/v1\/2020.emnlp-main.112"},{"key":"10.1016\/j.knosys.2026.116187_b15","series-title":"Cross-modal memory networks for radiology report generation","author":"Chen","year":"2022"},{"key":"10.1016\/j.knosys.2026.116187_b16","series-title":"Causalvlr: A toolbox and benchmark for visual-linguistic causal reasoning","author":"Liu","year":"2023"},{"key":"10.1016\/j.knosys.2026.116187_b17","first-page":"2607","article-title":"Promptmrg: Diagnosis-driven prompts for medical report generation","volume":"vol. 38","author":"Jin","year":"2024"},{"key":"10.1016\/j.knosys.2026.116187_b18","doi-asserted-by":"crossref","unstructured":"F. Liu, X. Wu, S. Ge, W. Fan, Y. Zou, Exploring and distilling posterior and prior knowledge for radiology report generation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 13753\u201313762.","DOI":"10.1109\/CVPR46437.2021.01354"},{"key":"10.1016\/j.knosys.2026.116187_b19","series-title":"Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2019: 22nd International Conference, Shenzhen, China, October 13\u201317, 2019, Proceedings, Part VI 22","first-page":"721","article-title":"Automatic radiology report generation based on multi-view image fusion and medical concept enrichment","author":"Yuan","year":"2019"},{"key":"10.1016\/j.knosys.2026.116187_b20","doi-asserted-by":"crossref","unstructured":"B. Jing, Z. Wang, E. Xing, Show, Describe and Conclude: On Exploiting the Structure Information of Chest X-ray Reports, in: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, 2019, pp. 6570\u20136580.","DOI":"10.18653\/v1\/P19-1657"},{"key":"10.1016\/j.knosys.2026.116187_b21","doi-asserted-by":"crossref","unstructured":"M. Cornia, M. Stefanini, L. Baraldi, R. Cucchiara, Meshed-memory transformer for image captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 10578\u201310587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"10.1016\/j.knosys.2026.116187_b22","first-page":"13041","article-title":"Unified vision-language pre-training for image captioning and vqa","volume":"vol. 34","author":"Zhou","year":"2020"},{"key":"10.1016\/j.knosys.2026.116187_b23","doi-asserted-by":"crossref","unstructured":"Z. Shi, H. Liu, X. Zhu, Enhancing descriptive image captioning with natural language inference, in: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), 2021, pp. 269\u2013277.","DOI":"10.18653\/v1\/2021.acl-short.36"},{"key":"10.1016\/j.knosys.2026.116187_b24","doi-asserted-by":"crossref","unstructured":"L. Huang, W. Wang, J. Chen, X.-Y. Wei, Attention on attention for image captioning, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4634\u20134643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"10.1016\/j.knosys.2026.116187_b25","doi-asserted-by":"crossref","unstructured":"K. Yan, L. Ji, H. Luo, M. Zhou, N. Duan, S. Ma, Control image captioning spatially and temporally, in: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 2021, pp. 2014\u20132025.","DOI":"10.18653\/v1\/2021.acl-long.157"},{"key":"10.1016\/j.knosys.2026.116187_b26","doi-asserted-by":"crossref","unstructured":"J. You, C. Hu, H. Kamigaito, H. Takamura, M. Okumura, Abstractive document summarization with word embedding reconstruction, in: Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021), 2021, pp. 1586\u20131596.","DOI":"10.26615\/978-954-452-072-4_178"},{"key":"10.1016\/j.knosys.2026.116187_b27","series-title":"Generating radiology reports via memory-driven transformer","author":"Chen","year":"2020"},{"key":"10.1016\/j.knosys.2026.116187_b28","doi-asserted-by":"crossref","unstructured":"X. Wang, L. Ma, Y. Fu, X. Xue, Neural symbolic representation learning for image captioning, in: Proceedings of the 2021 International Conference on Multimedia Retrieval, 2021, pp. 312\u2013321.","DOI":"10.1145\/3460426.3463637"},{"key":"10.1016\/j.knosys.2026.116187_b29","series-title":"European Conference on Computer Vision","first-page":"679","article-title":"RadioTransformer: A cascaded global-focal transformer for visual attention\u2013guided disease classification","author":"Bhattacharya","year":"2022"},{"key":"10.1016\/j.knosys.2026.116187_b30","article-title":"Adaptively aligned image captioning via adaptive attention time","volume":"32","author":"Huang","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.116187_b31","doi-asserted-by":"crossref","unstructured":"C.-W. Kuo, Z. Kira, Beyond a pre-trained object detector: Cross-modal textual and visual context for image captioning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 17969\u201317979.","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"10.1016\/j.knosys.2026.116187_b32","doi-asserted-by":"crossref","first-page":"379","DOI":"10.1109\/TMM.2023.3265842","article-title":"Prompt-based learning for unpaired image captioning","volume":"26","author":"Zhu","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2026.116187_b33","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part VIII 16","first-page":"153","article-title":"Learning visual representations with caption annotations","author":"Sariyildiz","year":"2020"},{"key":"10.1016\/j.knosys.2026.116187_b34","series-title":"European Conference on Computer Vision","first-page":"752","article-title":"Contrastive learning for weakly supervised phrase grounding","author":"Gupta","year":"2020"},{"key":"10.1016\/j.knosys.2026.116187_b35","article-title":"Unpaired image-text matching via multimodal aligned conceptual knowledge","author":"Huang","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116187_b36","doi-asserted-by":"crossref","unstructured":"H. Akbari, S. Karaman, S. Bhargava, B. Chen, C. Vondrick, S.-F. Chang, Multi-level multimodal common semantic space for image-phrase grounding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 12476\u201312486.","DOI":"10.1109\/CVPR.2019.01276"},{"key":"10.1016\/j.knosys.2026.116187_b37","article-title":"PFedLAH: Personalized federated learning with lookahead for adaptive cross-modal hashing","author":"Chen","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116187_b38","doi-asserted-by":"crossref","unstructured":"J. Krause, J. Johnson, R. Krishna, L. Fei-Fei, A hierarchical approach for generating descriptive image paragraphs, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 317\u2013325.","DOI":"10.1109\/CVPR.2017.356"},{"key":"10.1016\/j.knosys.2026.116187_b39","doi-asserted-by":"crossref","unstructured":"X. Liang, Z. Hu, H. Zhang, C. Gan, E.P. Xing, Recurrent topic-transition gan for visual paragraph generation, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 3362\u20133371.","DOI":"10.1109\/ICCV.2017.364"},{"key":"10.1016\/j.knosys.2026.116187_b40","doi-asserted-by":"crossref","unstructured":"H. Yu, J. Wang, Z. Huang, Y. Yang, W. Xu, Video paragraph captioning using hierarchical recurrent neural networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 4584\u20134593.","DOI":"10.1109\/CVPR.2016.496"},{"key":"10.1016\/j.knosys.2026.116187_b41","doi-asserted-by":"crossref","unstructured":"R. Sun, Y. Li, T. Zhang, Z. Mao, F. Wu, Y. Zhang, Lesion-aware transformers for diabetic retinopathy grading, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 10938\u201310947.","DOI":"10.1109\/CVPR46437.2021.01079"},{"key":"10.1016\/j.knosys.2026.116187_b42","series-title":"2022 44th Annual International Conference of the IEEE Engineering in Medicine & Biology Society","first-page":"480","article-title":"Vision transformers for classification of breast ultrasound images","author":"Gheflati","year":"2022"},{"issue":"1","key":"10.1016\/j.knosys.2026.116187_b43","doi-asserted-by":"crossref","first-page":"184","DOI":"10.5715\/jnlp.30.184","article-title":"Joint learning-based heterogeneous graph attention network for timeline summarization","volume":"30","author":"You","year":"2023","journal-title":"J. Nat. Lang. Process."},{"key":"10.1016\/j.knosys.2026.116187_b44","doi-asserted-by":"crossref","unstructured":"H.-C. Shin, K. Roberts, L. Lu, D. Demner-Fushman, J. Yao, R.M. Summers, Learning to read chest x-rays: Recurrent neural cascade model for automated image annotation, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 2497\u20132506.","DOI":"10.1109\/CVPR.2016.274"},{"key":"10.1016\/j.knosys.2026.116187_b45","unstructured":"J. You, D. Li, M. Okumura, K. Suzuki, Jpg-jointly learn to align: Automated disease prediction and radiology report generation, in: Proceedings of the 29th International Conference on Computational Linguistics, 2022, pp. 5989\u20136001."},{"key":"10.1016\/j.knosys.2026.116187_b46","first-page":"722","article-title":"Cross modal global local representation learning from radiology reports and x-ray chest images","volume":"vol. 12465","author":"Hadjiyski","year":"2023"},{"key":"10.1016\/j.knosys.2026.116187_b47","doi-asserted-by":"crossref","DOI":"10.1016\/j.compbiomed.2024.108388","article-title":"CECT: Controllable ensemble CNN and transformer for COVID-19 image classification","volume":"173","author":"Liu","year":"2024","journal-title":"Comput. Biol. Med."},{"key":"10.1016\/j.knosys.2026.116187_b48","series-title":"Pseudo-prompt generating in pre-trained vision-language models for multi-label medical image classification","author":"Ye","year":"2024"},{"key":"10.1016\/j.knosys.2026.116187_b49","article-title":"Multi-dimensional semantic self-learning hashing for multimedia retrieval","author":"Chen","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.knosys.2026.116187_b50","series-title":"Handbook of Medical Image Computing and Computer Assisted Intervention","author":"Zhou","year":"2019"},{"key":"10.1016\/j.knosys.2026.116187_b51","first-page":"12910","article-title":"When radiology report generation meets knowledge graph","volume":"vol. 34","author":"Zhang","year":"2020"},{"key":"10.1016\/j.knosys.2026.116187_b52","doi-asserted-by":"crossref","unstructured":"J. Lovelace, B. Mortazavi, Learning to generate clinically coherent chest X-ray reports, in: Findings of the Association for Computational Linguistics: EMNLP 2020, 2020, pp. 1235\u20131243.","DOI":"10.18653\/v1\/2020.findings-emnlp.110"},{"key":"10.1016\/j.knosys.2026.116187_b53","series-title":"Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021","first-page":"269","article-title":"Contrastive attention for automatic chest X-ray report generation","author":"Liu","year":"2021"},{"key":"10.1016\/j.knosys.2026.116187_b54","series-title":"Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2020: 23rd International Conference, Lima, Peru, October 4\u20138, 2020, Proceedings, Part II 23","first-page":"561","article-title":"Chest x-ray report generation through fine-grained label learning","author":"Syeda-Mahmood","year":"2020"},{"issue":"4","key":"10.1016\/j.knosys.2026.116187_b55","doi-asserted-by":"crossref","first-page":"2152","DOI":"10.1109\/JBHI.2024.3350077","article-title":"TSGET: two-stage global enhanced transformer for automatic radiology report generation","volume":"28","author":"Yi","year":"2024","journal-title":"IEEE J. Biomed. Health Inform."},{"issue":"3","key":"10.1016\/j.knosys.2026.116187_b56","doi-asserted-by":"crossref","first-page":"1494","DOI":"10.1109\/TMI.2024.3507073","article-title":"Lhr-rfl: Linear hybrid-reward-based reinforced focal learning for automatic radiology report generation","volume":"44","author":"Yi","year":"2024","journal-title":"IEEE Trans. Med. Imaging"},{"key":"10.1016\/j.knosys.2026.116187_b57","article-title":"Radiology report generation via visual-semantic ambivalence-aware network and focal self-critical sequence training","author":"Yi","year":"2025","journal-title":"Neural Netw."},{"key":"10.1016\/j.knosys.2026.116187_b58","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2023.105742","article-title":"Unsupervised disease tags for automatic radiology report generation","volume":"89","author":"Yi","year":"2024","journal-title":"Biomed. Signal Process. Control."},{"key":"10.1016\/j.knosys.2026.116187_b59","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"A novel single continuous shot multiple lesions endoscopy report generation","author":"Yuan","year":"2025"},{"key":"10.1016\/j.knosys.2026.116187_b60","doi-asserted-by":"crossref","unstructured":"A. Bhowmik, S. Gumhold, C. Rother, E. Brachmann, Reinforced feature points: Optimizing feature detection and description for a high-level task, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 4948\u20134957.","DOI":"10.1109\/CVPR42600.2020.00500"},{"issue":"11","key":"10.1016\/j.knosys.2026.116187_b61","doi-asserted-by":"crossref","first-page":"2227","DOI":"10.1109\/TPAMI.2014.2321376","article-title":"Scalable nearest neighbor algorithms for high dimensional data","volume":"36","author":"Muja","year":"2014","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.116187_b62","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.knosys.2026.116187_b63","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.knosys.2026.116187_b64","series-title":"Hierarchical pre-training for sequence labelling in spoken dialog","author":"Chapuis","year":"2020"},{"key":"10.1016\/j.knosys.2026.116187_b65","article-title":"Improving image-text matching with bidirectional consistency of cross-modal alignment","author":"Li","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.116187_b66","doi-asserted-by":"crossref","DOI":"10.1016\/j.artmed.2023.102633","article-title":"Improving chest X-ray report generation by leveraging warm starting","volume":"144","author":"Nicolson","year":"2023","journal-title":"Artif. Intell. Med."}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126009135?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126009135?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T12:21:26Z","timestamp":1780921286000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126009135"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":66,"alternative-id":["S0950705126009135"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116187","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multi-scene topic-aware for novel single continuous shot multiple scenes endoscopy report generation","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.116187","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"116187"}}