{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T08:31:12Z","timestamp":1772094672228,"version":"3.50.1"},"reference-count":57,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62166006"],"award-info":[{"award-number":["62166006"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.knosys.2026.115356","type":"journal-article","created":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T16:15:00Z","timestamp":1769271300000},"page":"115356","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Multimodal summarization via coarse-and-fine granularity synergy and region counterfactual reasoning filter"],"prefix":"10.1016","volume":"337","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-4575-2295","authenticated-orcid":false,"given":"Rulong","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1801-9901","authenticated-orcid":false,"given":"Qing","family":"He","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7634-2184","authenticated-orcid":false,"given":"Yuji","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3026-1741","authenticated-orcid":false,"given":"Nisuo","family":"Du","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0796-5538","authenticated-orcid":false,"given":"Zhihao","family":"Yang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.115356_bib0001","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2022.118442","article-title":"Deepsumm: exploiting topic models and sequence to sequence networks for extractive text summarization","volume":"211","author":"Joshi","year":"2023","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.115356_bib0002","series-title":"Efficient and interpretable compressive text summarisation with unsupervised dual-agent reinforcement learning","author":"Tang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115356_bib0003","series-title":"Ijcai","first-page":"4152","article-title":"Multi-modal sentence summarization with modality attention and image filtering","author":"Li","year":"2018"},{"key":"10.1016\/j.knosys.2026.115356_bib0004","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14867","article-title":"Align and attend: multimodal summarization with dual contrastive losses","author":"He","year":"2023"},{"key":"10.1016\/j.knosys.2026.115356_bib0005","series-title":"Vmsmo: Learning to generate multimodal summary for video-based news articles","author":"Li","year":"2020"},{"issue":"13s","key":"10.1016\/j.knosys.2026.115356_bib0006","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3584700","article-title":"A survey on multi-modal summarization","volume":"55","author":"Jangra","year":"2023","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.knosys.2026.115356_bib0007","series-title":"Multimodal abstractive summarization for how2 videos","author":"Palaskar","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0008","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"11676","article-title":"Hierarchical cross-modality semantic correlation learning model for multimodal summarization","volume":"36","author":"Zhang","year":"2022"},{"key":"10.1016\/j.knosys.2026.115356_bib0009","series-title":"Vision guided generative pre-trained language models for multimodal abstractive summarization","author":"Yu","year":"2021"},{"key":"10.1016\/j.knosys.2026.115356_bib0010","series-title":"D 2 TV: Dual Knowledge Distillation and Target-oriented Vision Modeling for Many-to-Many Multimodal Summarization","author":"Liang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115356_bib0011","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"19297","article-title":"Diusum: dynamic image utilization for multimodal summarization","volume":"38","author":"Xiao","year":"2024"},{"key":"10.1016\/j.knosys.2026.115356_bib0012","series-title":"Proceedings of the 28Th International Conference on Computational Linguistics","first-page":"5655","article-title":"Multimodal sentence summarization via multimodal selective encoding","author":"Li","year":"2020"},{"key":"10.1016\/j.knosys.2026.115356_bib0013","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2020.113465","article-title":"A deceptive review detection framework: combination of coarse and fine-grained features","volume":"156","author":"Cao","year":"2020","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.115356_bib0014","series-title":"When Large Vision-Language Model Meets Large Remote Sensing Imagery: Coarse-to-Fine Text-Guided Token Pruning","author":"Luo","year":"2025"},{"key":"10.1016\/j.knosys.2026.115356_bib0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2024.121787","article-title":"Multi-scale interaction network for multimodal entity and relation extraction","volume":"699","author":"Zhang","year":"2025","journal-title":"Inf. Sci."},{"key":"10.1016\/j.knosys.2026.115356_bib0016","doi-asserted-by":"crossref","first-page":"1274","DOI":"10.1109\/TASLP.2023.3345146","article-title":"Enhancing multimodal entity and relation extraction with variational information bottleneck","volume":"32","author":"Cui","year":"2024","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.knosys.2026.115356_bib0017","series-title":"Cfsum: A coarse-to-fine contribution network for multimodal summarization","author":"Xiao","year":"2023"},{"key":"10.1016\/j.knosys.2026.115356_bib0018","series-title":"Proceedings of the 1St Workshop on Large Generative Models Meet Multimodal Applications","first-page":"45","article-title":"CGSMP: Controllable generative summarization via multimodal prompt","author":"Yong","year":"2023"},{"key":"10.1016\/j.knosys.2026.115356_bib0019","doi-asserted-by":"crossref","first-page":"1553","DOI":"10.1109\/TMM.2013.2267205","article-title":"Multimodal saliency and fusion for movie summarization based on aural, visual, and textual attention","volume":"15","author":"Evangelopoulos","year":"2013","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.knosys.2026.115356_bib0020","series-title":"Keep meeting summaries on topic: abstractive multi-modal meeting summarization","first-page":"2190","author":"Li","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0021","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"8188","article-title":"Aspect-aware multimodal summarization for chinese e-commerce products","volume":"34","author":"Li","year":"2020"},{"key":"10.1016\/j.knosys.2026.115356_bib0022","article-title":"SMSMO: Learning to generate multimodal summary for scientific papers","volume":"310","author":"Zhonga","year":"2025","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115356_bib0023","series-title":"Proceedings of the 46Th International ACM SIGIR Conference on Research and Development in Information Retrieval","first-page":"195","article-title":"Adapting generative pretrained language model for open-domain multimodal sentence summarization","author":"Lin","year":"2023"},{"key":"10.1016\/j.knosys.2026.115356_bib0024","series-title":"Objects as points","author":"Zhou","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0025","series-title":"European Conference on Computer Vision","first-page":"213","article-title":"End-to-end object detection with transformers","author":"Carion","year":"2020"},{"issue":"1","key":"10.1016\/j.knosys.2026.115356_bib0026","article-title":"Hybrid approach for named entity recognition","volume":"118","author":"Bajwa","year":"2015","journal-title":"Int. J. Comput. Appl."},{"key":"10.1016\/j.knosys.2026.115356_bib0027","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","first-page":"4171","article-title":"Bert: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0028","series-title":"Recent advances in named entity recognition: a comprehensive survey and comparative study","author":"Keraghel","year":"2024"},{"key":"10.1016\/j.knosys.2026.115356_bib0029","series-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0030","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110084","article-title":"MPCCT: Multimodal vision-language learning paradigm with context-based compact transformer","volume":"147","author":"Chen","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115356_bib0031","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.110706","article-title":"CLVIN: Complete language-vision interaction network for visual question answering","volume":"275","author":"Chen","year":"2023","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115356_bib0032","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"10.1016\/j.knosys.2026.115356_bib0033","series-title":"Learning Transferable Visual Models From Natural Language Supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.knosys.2026.115356_bib0034","series-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"Lewis","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0035","series-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)","first-page":"1532","article-title":"Glove: global vectors for word representation","author":"Pennington","year":"2014"},{"key":"10.1016\/j.knosys.2026.115356_bib0036","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6995","article-title":"Multimodal contrastive training for visual representation learning","author":"Yuan","year":"2021"},{"key":"10.1016\/j.knosys.2026.115356_bib0037","first-page":"9564","article-title":"Multimodal contrastive learning with limoe: the language-image mixture of experts","volume":"35","author":"Mustafa","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.115356_bib0038","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"4154","article-title":"MSMO: Multimodal summarization with multimodal output","author":"Zhu","year":"2018"},{"key":"10.1016\/j.knosys.2026.115356_bib0039","series-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"10.1016\/j.knosys.2026.115356_bib0040","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"Rouge: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.knosys.2026.115356_bib0041","series-title":"Proceedings of the 40Th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.knosys.2026.115356_bib0042","series-title":"Bertscore: Evaluating text generation with bert","author":"Zhang","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0043","series-title":"MoverScore: Text generation evaluating with contextualized embeddings and earth mover distance","author":"Zhao","year":"2019"},{"key":"10.1016\/j.knosys.2026.115356_bib0044","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"7514","article-title":"Clipscore: a reference-free evaluation metric for image captioning","author":"Hessel","year":"2021"},{"key":"10.1016\/j.knosys.2026.115356_bib0045","doi-asserted-by":"crossref","first-page":"399","DOI":"10.1613\/jair.2433","article-title":"Global inference for sentence compression: an integer linear programming approach","volume":"31","author":"Clarke","year":"2008","journal-title":"Journal of Artificial Intelligence Research"},{"key":"10.1016\/j.knosys.2026.115356_bib0046","series-title":"A neural attention model for abstractive sentence summarization","author":"Nallapati","year":"2016"},{"key":"10.1016\/j.knosys.2026.115356_bib0047","series-title":"Selective encoding for abstractive sentence summarization","author":"Zhou","year":"2017"},{"key":"10.1016\/j.knosys.2026.115356_bib0048","series-title":"Proceedings of the 55Th Annual Meeting of the Association for Computational Linguistics Association for Computational Linguistics","first-page":"1073-1083","article-title":"Get to the point: summarization with pointer-generator networks","author":"See","year":"2017"},{"key":"10.1016\/j.knosys.2026.115356_bib0049","series-title":"Doubly-attentive decoder for multi-modal neural machine translation","author":"Calixto","year":"2017"},{"key":"10.1016\/j.knosys.2026.115356_bib0050","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"9749","article-title":"Multimodal summarization with guidance of multimodal reference","volume":"34","author":"Zhu","year":"2020"},{"issue":"4","key":"10.1016\/j.knosys.2026.115356_bib0051","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3445794","article-title":"Graph-based multimodal ranking models for multimodal summarization","volume":"20","author":"Zhu","year":"2021","journal-title":"Transactions on Asian and low-resource language information processing"},{"key":"10.1016\/j.knosys.2026.115356_bib0052","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"19440","article-title":"Cross-view topology based consistent and complementary information for deep multi-view clustering","author":"Dong","year":"2023"},{"issue":"5","key":"10.1016\/j.knosys.2026.115356_bib0053","doi-asserted-by":"crossref","first-page":"3755","DOI":"10.1109\/TCSVT.2023.3319330","article-title":"MC-Blur: A comprehensive benchmark for image deblurring","volume":"34","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.115356_bib0054","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111628","article-title":"LLDiffusion: Learning degradation representations in diffusion models for low-light image enhancement","volume":"166","author":"Wang","year":"2025","journal-title":"Pattern Recognit."},{"issue":"10","key":"10.1016\/j.knosys.2026.115356_bib0055","doi-asserted-by":"crossref","first-page":"4541","DOI":"10.1007\/s11263-024-02056-0","article-title":"Gridformer: residual dense transformer with grid structure for image restoration in adverse weather conditions","volume":"132","author":"Wang","year":"2024","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.knosys.2026.115356_bib0056","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2071","article-title":"Vision transformers are robust learners","volume":"36","author":"Paul","year":"2022"},{"key":"10.1016\/j.knosys.2026.115356_bib0057","series-title":"European Conference on Computer Vision","first-page":"404","article-title":"Are vision transformers robust to patch perturbations?","author":"Gu","year":"2022"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126000997?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126000997?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T07:42:43Z","timestamp":1772091763000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126000997"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":57,"alternative-id":["S0950705126000997"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115356","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multimodal summarization via coarse-and-fine granularity synergy and region counterfactual reasoning filter","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115356","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115356"}}