{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:45:32Z","timestamp":1777873532838,"version":"3.51.4"},"reference-count":79,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Expert Systems with Applications"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.eswa.2026.132566","type":"journal-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T06:54:11Z","timestamp":1776927251000},"page":"132566","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["A knowledge-guided multimodal network for video summarization"],"prefix":"10.1016","volume":"324","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9278-1032","authenticated-orcid":false,"given":"Xiaoyan","family":"Tian","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8491-1391","authenticated-orcid":false,"given":"Ye","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9466-016X","authenticated-orcid":false,"given":"Zhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6568-1335","authenticated-orcid":false,"given":"Peng","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0184-1421","authenticated-orcid":false,"given":"Fenglei","family":"Ni","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.eswa.2026.132566_bib0001","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.eswa.2026.132566_bib0002","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.127337","article-title":"SUSAN: A deep learning-based architecture for violence detection against women in surveillance videos","volume":"280","author":"Andrade","year":"2025","journal-title":"Expert Systems with Applications"},{"issue":"11","key":"10.1016\/j.eswa.2026.132566_bib0003","doi-asserted-by":"crossref","first-page":"1838","DOI":"10.1109\/JPROC.2021.3117472","article-title":"Video summarization using deep neural networks: A survey","volume":"109","author":"Apostolidis","year":"2021","journal-title":"Proceedings of the IEEE"},{"key":"10.1016\/j.eswa.2026.132566_bib0004","series-title":"Lrec","first-page":"2200","article-title":"SENTIWORDNET 3.0: An enhanced lexical resource for sentiment analysis and opinion mining","volume":"vol. 10","author":"Baccianella","year":"2010"},{"key":"10.1016\/j.eswa.2026.132566_bib0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.124013","article-title":"Diffusion-based normality pre-training for weakly supervised video anomaly detection","volume":"251","author":"Basak","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132566_bib0006","series-title":"International conference on human-computer interaction","first-page":"197","article-title":"SenticNet 8: Fusing emotion AI and commonsense AI for interpretable, trustworthy, and explainable affective computing","author":"Cambria","year":"2024"},{"key":"10.1016\/j.eswa.2026.132566_bib0007","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"6299","article-title":"Quo vadis, action recognition? a new model and the kinetics dataset","author":"Carreira","year":"2017"},{"issue":"8","key":"10.1016\/j.eswa.2026.132566_bib0008","doi-asserted-by":"crossref","first-page":"1126","DOI":"10.1038\/s41562-022-01346-2","article-title":"Explicit knowledge of task structure is a primary determinant of human model-based action","volume":"6","author":"Castro-Rodrigues","year":"2022","journal-title":"Nature Human Behaviour"},{"key":"10.1016\/j.eswa.2026.132566_bib0009","series-title":"Proceedings of the 33rd ACM international conference on information and knowledge management","first-page":"4382","article-title":"Personalized video summarization by multimodal video understanding","author":"Chen","year":"2024"},{"key":"10.1016\/j.eswa.2026.132566_bib0010","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.104064","article-title":"Implicit and explicit commonsense for multi-sentence video captioning","volume":"247","author":"Chou","year":"2024","journal-title":"Computer Vision and Image Understanding"},{"issue":"5","key":"10.1016\/j.eswa.2026.132566_bib0011","doi-asserted-by":"crossref","first-page":"2342","DOI":"10.1109\/TCSVT.2022.3222906","article-title":"AO2-DETR: Arbitrary-oriented object detection transformer","volume":"33","author":"Dai","year":"2022","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"7","key":"10.1016\/j.eswa.2026.132566_bib0012","doi-asserted-by":"crossref","DOI":"10.1145\/3716820","article-title":"Deep learning based image aesthetic quality assessment- a review","volume":"57","author":"Daryanavard Chounchenani","year":"2025","journal-title":"ACM Comput. Surv."},{"issue":"1","key":"10.1016\/j.eswa.2026.132566_bib0013","doi-asserted-by":"crossref","first-page":"56","DOI":"10.1016\/j.patrec.2010.08.004","article-title":"VSUMM: A mechanism designed to produce static video summaries and a novel evaluation method","volume":"32","author":"de Avila","year":"2011","journal-title":"Pattern Recognition Letters"},{"key":"10.1016\/j.eswa.2026.132566_bib0014","series-title":"2009\u202fIEEE Conference on computer vision and pattern recognition","first-page":"248","article-title":"ImageNet: A large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.eswa.2026.132566_bib0015","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.eswa.2026.132566_bib0016","series-title":"WordNet: An electronic lexical database","author":"Fellbaum","year":"1998"},{"key":"10.1016\/j.eswa.2026.132566_bib0017","series-title":"Computer vision\u2013ECCV 2014: 13th european conference, zurich, switzerland, september 6\u201312, 2014, proceedings, part VII 13","first-page":"505","article-title":"Creating summaries from user videos","author":"Gygli","year":"2014"},{"key":"10.1016\/j.eswa.2026.132566_bib0018","doi-asserted-by":"crossref","first-page":"3013","DOI":"10.1109\/TIP.2023.3275069","article-title":"Video summarization with spatiotemporal vision transformer","volume":"32","author":"Hsu","year":"2023","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.eswa.2026.132566_bib0019","series-title":"2017\u202fIEEE Third international conference on multimedia big data (bigMM)","first-page":"117","article-title":"Unsupervised video summaries using multiple features and image quality","author":"Hu","year":"2017"},{"key":"10.1016\/j.eswa.2026.132566_bib0020","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.127128","article-title":"Multi-temporal granularity concept induction for semantically driven video summarization","volume":"276","author":"Huang","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132566_bib0021","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"2630","article-title":"Causalainer: Causal explainer for automatic video summarization","author":"Huang","year":"2023"},{"key":"10.1016\/j.eswa.2026.132566_bib0022","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.120739","article-title":"Motion context guided edge-preserving network for video salient object detection","volume":"233","author":"Huang","year":"2023","journal-title":"Expert Systems with Applications"},{"issue":"6","key":"10.1016\/j.eswa.2026.132566_bib0023","doi-asserted-by":"crossref","first-page":"1709","DOI":"10.1109\/TCSVT.2019.2904996","article-title":"Video summarization with attention-based encoder-decoder networks","volume":"30","author":"Ji","year":"2019","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132566_bib0024","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.104962","article-title":"C2F: An effective coarse-to-fine network for video summarization","volume":"144","author":"Jin","year":"2024","journal-title":"Image and Vision Computing"},{"key":"10.1016\/j.eswa.2026.132566_bib0025","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.121288","article-title":"Deep multi-scale pyramidal features network for supervised video summarization","volume":"237","author":"Khan","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132566_bib0026","doi-asserted-by":"crossref","first-page":"2880","DOI":"10.1109\/TASLP.2020.3030497","article-title":"PANNs: Large-scale pretrained audio neural networks for audio pattern recognition","volume":"28","author":"Kong","year":"2020","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"10.1016\/j.eswa.2026.132566_bib0027","series-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision","first-page":"5584","article-title":"Progressive video summarization via multimodal self-Supervised learning","author":"Li","year":"2023"},{"issue":"3","key":"10.1016\/j.eswa.2026.132566_bib0028","first-page":"3904","article-title":"Video joint modelling based on hierarchical transformer for co-summarization","volume":"45","author":"Li","year":"2023","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132566_bib0029","series-title":"International conference on machine learning","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"issue":"12","key":"10.1016\/j.eswa.2026.132566_bib0030","doi-asserted-by":"crossref","first-page":"7413","DOI":"10.1109\/TCSVT.2023.3272984","article-title":"Image aesthetics assessment with attribute-assisted multimodal memory network","volume":"33","author":"Li","year":"2023","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"6","key":"10.1016\/j.eswa.2026.132566_bib0031","doi-asserted-by":"crossref","first-page":"902","DOI":"10.1631\/FITEE.2000429","article-title":"Video summarization with a graph convolutional attention network","volume":"22","author":"Li","year":"2021","journal-title":"Frontiers of Information Technology & Electronic Engineering"},{"key":"10.1016\/j.eswa.2026.132566_bib0032","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107677","article-title":"Exploring global diverse attention via pairwise temporal relation for video summarization","volume":"111","author":"Li","year":"2021","journal-title":"Pattern Recognition"},{"issue":"8","key":"10.1016\/j.eswa.2026.132566_bib0033","doi-asserted-by":"crossref","first-page":"3652","DOI":"10.1109\/TIP.2017.2695887","article-title":"A general framework for edited video and raw video summarization","volume":"26","author":"Li","year":"2017","journal-title":"IEEE Transactions on Image Processing"},{"issue":"8","key":"10.1016\/j.eswa.2026.132566_bib0034","doi-asserted-by":"crossref","first-page":"10555","DOI":"10.1109\/TPAMI.2023.3257546","article-title":"When object detection meets knowledge distillation: a survey","volume":"45","author":"Li","year":"2023","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.eswa.2026.132566_bib0035","doi-asserted-by":"crossref","first-page":"5548","DOI":"10.1109\/TMM.2023.3335875","article-title":"VideoXum: Cross-modal visual and textural summarization of videos","volume":"26","author":"Lin","year":"2024","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.eswa.2026.132566_bib0036","doi-asserted-by":"crossref","first-page":"1573","DOI":"10.1109\/TIP.2022.3143699","article-title":"Video summarization through reinforcement learning with a 3D spatio-temporal U-Net","volume":"31","author":"Liu","year":"2022","journal-title":"IEEE transactions on image processing"},{"key":"10.1016\/j.eswa.2026.132566_bib0037","doi-asserted-by":"crossref","first-page":"197","DOI":"10.1016\/j.neucom.2019.07.108","article-title":"Video summarization via block sparse dictionary selection","volume":"378","author":"Ma","year":"2020","journal-title":"Neurocomputing"},{"key":"10.1016\/j.eswa.2026.132566_bib0038","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"202","article-title":"Unsupervised video summarization with adversarial LSTM networks","author":"Mahasseni","year":"2017"},{"key":"10.1016\/j.eswa.2026.132566_bib0039","doi-asserted-by":"crossref","DOI":"10.3389\/fcomm.2024.1347788","article-title":"Multimodal cohesion and viewers\u2019 comprehension of scene transitions in film: An empirical investigation","volume":"9","author":"Markhabayeva","year":"2024","journal-title":"Frontiers in Communication"},{"key":"10.1016\/j.eswa.2026.132566_bib0040","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2022.105667","article-title":"A review on video summarization techniques","volume":"118","author":"Meena","year":"2023","journal-title":"Engineering Applications of Artificial Intelligence"},{"key":"10.1016\/j.eswa.2026.132566_bib0041","first-page":"13988","article-title":"CLIP-It! Language-Guided video summarization","volume":"34","author":"Narasimhan","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"7","key":"10.1016\/j.eswa.2026.132566_bib0042","doi-asserted-by":"crossref","first-page":"1037","DOI":"10.1109\/TCSVT.2012.2189689","article-title":"What makes a professional video? A computational aesthetics approach","volume":"22","author":"Niu","year":"2012","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132566_bib0043","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR)","article-title":"Rethinking the evaluation of video summaries","author":"Otani","year":"2019"},{"issue":"12","key":"10.1016\/j.eswa.2026.132566_bib0044","doi-asserted-by":"crossref","first-page":"8617","DOI":"10.1007\/s11263-025-02577-2","article-title":"Language-guided recursive spatiotemporal graph modeling for video summarization","volume":"133","author":"Park","year":"2025","journal-title":"International Journal of Computer Vision"},{"key":"10.1016\/j.eswa.2026.132566_bib0045","doi-asserted-by":"crossref","first-page":"98","DOI":"10.1016\/j.inffus.2017.02.003","article-title":"A review of affective computing: From unimodal analysis to multimodal fusion","volume":"37","author":"Poria","year":"2017","journal-title":"Information Fusion"},{"key":"10.1016\/j.eswa.2026.132566_bib0046","series-title":"Computer vision\u2013ECCV 2014: 13th european conference, zurich, switzerland, september 6\u201312, 2014, proceedings, part VI 13","first-page":"540","article-title":"Category-Specific video summarization","author":"Potapov","year":"2014"},{"key":"10.1016\/j.eswa.2026.132566_bib0047","doi-asserted-by":"crossref","first-page":"196","DOI":"10.1016\/j.ins.2021.06.077","article-title":"QHSL: A quantum hue, saturation, and lightness color model","volume":"577","year":"2021","journal-title":"Information Sciences"},{"key":"10.1016\/j.eswa.2026.132566_bib0048","doi-asserted-by":"crossref","first-page":"15","DOI":"10.1007\/978-3-031-56537-3_2","article-title":"Major theories and constructs in media psychology","author":"Rutledge","year":"2024","journal-title":"Handbook of Media Psychology: The Science and The Practice"},{"key":"10.1016\/j.eswa.2026.132566_bib0049","unstructured":"Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. arXiv preprint arXiv: 1910.01108."},{"key":"10.1016\/j.eswa.2026.132566_bib0050","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"6776","article-title":"Video summarization using denoising diffusion probabilistic model","volume":"vol. 39","author":"Shang","year":"2025"},{"key":"10.1016\/j.eswa.2026.132566_bib0051","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"5179","article-title":"TVSum: Summarizing web videos using titles","author":"Song","year":"2015"},{"key":"10.1016\/j.eswa.2026.132566_bib0052","series-title":"Icassp 2024-2024 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"2740","article-title":"MTIDNet: A multimodal temporal interest detection network for video summarization","author":"Tian","year":"2024"},{"key":"10.1016\/j.eswa.2026.132566_bib0053","series-title":"2024 International conference on advances in data engineering and intelligent computing systems (ADICS)","first-page":"1","article-title":"YOLOv8: A novel object detection algorithm with enhanced performance and robustness","author":"Varghese","year":"2024"},{"key":"10.1016\/j.eswa.2026.132566_bib0054","series-title":"Icassp 2024-2024 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"3795","article-title":"Flow dynamics correction for action recognition","author":"Wang","year":"2024"},{"key":"10.1016\/j.eswa.2026.132566_bib0055","series-title":"Proceedings of the 31st ACM international conference on multimedia","first-page":"2391","article-title":"TIVA-KG: A multimodal knowledge graph with text, image, video and audio","author":"Wang","year":"2023"},{"key":"10.1016\/j.eswa.2026.132566_bib0056","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"Video summarization via semantic attended networks","volume":"vol. 32","author":"Wei","year":"2018"},{"key":"10.1016\/j.eswa.2026.132566_bib0057","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123860","article-title":"Reconstructive network under contrastive graph rewards for video summarization","volume":"250","author":"Wu","year":"2024","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132566_bib0058","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2025.126906","article-title":"EIKA: Explicit & implicit knowledge-augmented network for entity-aware sports video captioning","volume":"274","author":"Xi","year":"2025","journal-title":"Expert Systems with Applications"},{"key":"10.1016\/j.eswa.2026.132566_bib0059","series-title":"2024\u202fIEEE International conference on multimedia and expo (ICME)","first-page":"1","article-title":"An aesthetic-guided multimodal framework for video summarization","author":"Xie","year":"2024"},{"key":"10.1016\/j.eswa.2026.132566_bib0060","series-title":"Proceedings of the 30th ACM international conference on multimedia","first-page":"740","article-title":"A knowledge augmented and multimodal-based framework for video summarization","author":"Xie","year":"2022"},{"key":"10.1016\/j.eswa.2026.132566_bib0061","doi-asserted-by":"crossref","first-page":"4894","DOI":"10.1109\/TMM.2022.3183394","article-title":"Multimodal-based and aesthetic-guided narrative video summarization","volume":"25","author":"Xie","year":"2022","journal-title":"IEEE Transactions on Multimedia"},{"key":"10.1016\/j.eswa.2026.132566_bib0062","series-title":"Icassp 2023-2023 ieee international conference on acoustics, speech and signal processing (icassp)","first-page":"1","article-title":"MHSCNET: A multimodal hierarchical shot-aware convolutional network for video summarization","author":"Xu","year":"2023"},{"issue":"9","key":"10.1016\/j.eswa.2026.132566_bib0063","doi-asserted-by":"crossref","first-page":"9487","DOI":"10.1109\/TCSVT.2025.3557254","article-title":"Hybrid siamese masked autoencoders as unsupervised video summarizer","volume":"35","author":"Xu","year":"2025","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"issue":"10","key":"10.1016\/j.eswa.2026.132566_bib0064","doi-asserted-by":"crossref","first-page":"2711","DOI":"10.1109\/TMM.2019.2959451","article-title":"Unsupervised video summarization with cycle-Consistent adversarial LSTM networks","volume":"22","author":"Yuan","year":"2019","journal-title":"IEEE Transactions on Multimedia"},{"issue":"8","key":"10.1016\/j.eswa.2026.132566_bib0065","doi-asserted-by":"crossref","first-page":"8104","DOI":"10.1109\/TCSVT.2025.3544331","article-title":"Dual graph inference network for weakly supervised semantic segmentation","volume":"35","author":"Zhang","year":"2025","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132566_bib0066","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)","article-title":"Summary transfer: Exemplar-based subset selection for video summarization","author":"Zhang","year":"2016"},{"key":"10.1016\/j.eswa.2026.132566_bib0067","series-title":"Computer vision\u2013ECCV 2016: 14th european conference, amsterdam, the netherlands, october 11\u201314, 2016, proceedings, part VII 14","first-page":"766","article-title":"Video summarization with long short-term memory","author":"Zhang","year":"2016"},{"issue":"4","key":"10.1016\/j.eswa.2026.132566_bib0068","doi-asserted-by":"crossref","first-page":"2775","DOI":"10.1109\/TCSVT.2023.3312325","article-title":"VSS-Net: Visual semantic self-mining network for video summarization","volume":"34","author":"Zhang","year":"2023","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.eswa.2026.132566_bib0069","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.123568","article-title":"Attention-guided multi-granularity fusion model for video summarization","volume":"249","author":"Zhang","year":"2024","journal-title":"Expert Systems with Applications"},{"issue":"2","key":"10.1016\/j.eswa.2026.132566_bib0070","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2659520","article-title":"Aesthetics-guided summarization from multiple user generated videos","volume":"11","author":"Zhang","year":"2015","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM)"},{"issue":"8","key":"10.1016\/j.eswa.2026.132566_bib0071","doi-asserted-by":"crossref","first-page":"5181","DOI":"10.1109\/TNNLS.2021.3119969","article-title":"Audiovisual video summarization","volume":"34","author":"Zhao","year":"2021","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"10.1016\/j.eswa.2026.132566_bib0072","doi-asserted-by":"crossref","first-page":"360","DOI":"10.1016\/j.neucom.2021.10.039","article-title":"Hierarchical multimodal transformer to summarize videos","volume":"468","author":"Zhao","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.eswa.2026.132566_bib0073","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"7405","article-title":"HSA-RNN: Hierarchical structure-adaptive RNN for video summarization","author":"Zhao","year":"2018"},{"key":"10.1016\/j.eswa.2026.132566_bib0074","unstructured":"Zheng, Q., Fan, Y., Huang, L., Zhu, T., Liu, J., Hao, Z., Xing, S., Chen, C.-J., Min, X., Bovik, A. C. et al. (2024). Video quality assessment: A comprehensive survey. arXiv preprint arXiv: 2412.04508."},{"key":"10.1016\/j.eswa.2026.132566_bib0075","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"Deep reinforcement learning for unsupervised video summarization with diversity-representativeness reward","volume":"vol. 32","author":"Zhou","year":"2018"},{"key":"10.1016\/j.eswa.2026.132566_bib0076","doi-asserted-by":"crossref","first-page":"3017","DOI":"10.1109\/TIP.2022.3163855","article-title":"Relational reasoning over spatial-temporal graphs for video summarization","volume":"31","author":"Zhu","year":"2022","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.eswa.2026.132566_bib0077","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.108312","article-title":"Learning multiscale hierarchical attention for video summarization","volume":"122","author":"Zhu","year":"2022","journal-title":"Pattern Recognition"},{"key":"10.1016\/j.eswa.2026.132566_bib0078","doi-asserted-by":"crossref","first-page":"948","DOI":"10.1109\/TIP.2020.3039886","article-title":"DSNet: A flexible detect-to-summarize network for video summarization","volume":"30","author":"Zhu","year":"2020","journal-title":"IEEE Transactions on Image Processing"},{"key":"10.1016\/j.eswa.2026.132566_bib0079","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109578","article-title":"Topic-aware video summarization using multimodal transformer","volume":"140","author":"Zhu","year":"2023","journal-title":"Pattern Recognition"}],"container-title":["Expert Systems with Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S095741742601479X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S095741742601479X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:01:28Z","timestamp":1777572088000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S095741742601479X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":79,"alternative-id":["S095741742601479X"],"URL":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132566","relation":{},"ISSN":["0957-4174"],"issn-type":[{"value":"0957-4174","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A knowledge-guided multimodal network for video summarization","name":"articletitle","label":"Article Title"},{"value":"Expert Systems with Applications","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.eswa.2026.132566","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"132566"}}