{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T17:22:58Z","timestamp":1770830578771,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T00:00:00Z","timestamp":1749168000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T00:00:00Z","timestamp":1749168000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of 
China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62166042"],"award-info":[{"award-number":["62166042"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]},{"name":"Natural Science Foundation of Xinjiang, China","award":["2021D01C076"],"award-info":[{"award-number":["2021D01C076"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia 
Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01865-8","type":"journal-article","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T05:48:15Z","timestamp":1749188895000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Vef-BART: an effective method to mitigate hallucinations through vision enhancement and fusion in BART-based multimodal abstractive summarization"],"prefix":"10.1007","volume":"31","author":[{"given":"Debin","family":"Wang","sequence":"first","affiliation":[]},{"given":"Turdi","family":"Tohti","sequence":"additional","affiliation":[]},{"given":"Dongfang","family":"Han","sequence":"additional","affiliation":[]},{"given":"Zicheng","family":"Zuo","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Yuanyuan","family":"Liao","sequence":"additional","affiliation":[]},{"given":"Qingwen","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,6]]},"reference":[{"issue":"10","key":"1865_CR1","doi-asserted-by":"publisher","first-page":"1290","DOI":"10.3390\/sym11101290","volume":"11","author":"MM Rahman","year":"2019","unstructured":"Rahman, M.M., Siddiqui, F.H.: An optimized abstractive text summarization model using peephole convolutional LSTM. Symmetry 11(10), 1290 (2019)","journal-title":"Symmetry"},{"issue":"2","key":"1865_CR2","doi-asserted-by":"publisher","first-page":"288","DOI":"10.4218\/etrij.2019-0016","volume":"43","author":"MM Rahman","year":"2021","unstructured":"Rahman, M.M., Siddiqui, F.H.: Multi-layered attentional peephole convolutional LSTM for abstractive text summarization. ETRI J. 
43(2), 288\u2013298 (2021)","journal-title":"ETRI J."},{"key":"1865_CR3","doi-asserted-by":"crossref","unstructured":"Yu, T., Dai, W., Liu, Z., Fung, P.: Vision guided generative pre-trained language models for multimodal abstractive summarization. In: Moens, M., Huang, X., Specia, L., Yih, S.W. (eds.) Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, EMNLP 2021, Virtual Event, 7\u201311 November, 2021, pp. 3995\u20134007. Association for Computational Linguistics, Punta Cana, Dominican Republic (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.326"},{"key":"1865_CR4","doi-asserted-by":"crossref","unstructured":"He, B., Wang, J., Qiu, J., Bui, T., Shrivastava, A., Wang, Z.: Align and attend: multimodal summarization with dual contrastive losses. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17\u201324, 2023, pp. 14867\u201314878. IEEE, Punta Cana, Dominican Republic (2023)","DOI":"10.1109\/CVPR52729.2023.01428"},{"key":"1865_CR5","doi-asserted-by":"crossref","unstructured":"Matsuura, K., Ashihara, T., Moriya, T., Tanaka, T., Ogawa, A., Delcroix, M., Masumura, R.: Leveraging large text corpora for end-to-end speech summarization. In: IEEE International Conference on Acoustics, Speech and Signal Processing ICASSP 2023, Rhodes Island, Greece, June 4\u201310, 2023, pp. 1\u20135. IEEE, Rhodes Island, Greece (2023)","DOI":"10.1109\/ICASSP49357.2023.10094993"},{"key":"1865_CR6","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Meng, X., Wang, Y., Jiang, X., Liu, Q., Yang, Z.: Unims: A unified framework for multimodal summarization with knowledge distillation. In: Thirty-Sixth AAAI Conference on Artificial Intelligence, pp. 11757\u201311764. 
AAAI Press, Virtual Event (2022)","DOI":"10.1609\/aaai.v36i10.21431"},{"key":"1865_CR7","doi-asserted-by":"crossref","unstructured":"Xu, Z., Meng, X., Wang, Y., Su, Q., Qiu, Z., Jiang, X., Liu, Q.: Learning summary-worthy visual representation for abstractive summarization in video. In: Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI 2023, 19th\u201325th August 2023, pp. 5242\u20135250. ijcai.org, Macao, SAR, China (2023)","DOI":"10.24963\/ijcai.2023\/582"},{"key":"1865_CR8","first-page":"24516","volume":"35","author":"X Chen","year":"2022","unstructured":"Chen, X., Li, M., Gao, X., Zhang, X.: Towards improving faithfulness in abstractive summarization. Adv. Neural. Inf. Process. Syst. 35, 24516\u201324528 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1865_CR9","first-page":"15807","volume-title":"Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, December 6\u201310, 2023","author":"J Shen","year":"2023","unstructured":"Shen, J., Xuan, J., Liang, C.J.: Mitigating intrinsic named entity-related hallucinations of abstractive text summarization. In: Bouamor, H., Pino, J., Bali, K. (eds.) Findings of the Association for Computational Linguistics: EMNLP 2023, Singapore, December 6\u201310, 2023, pp. 15807\u201315824. Association for Computational Linguistics, Singapore (2023)"},{"key":"1865_CR10","doi-asserted-by":"crossref","unstructured":"Rush, A.M., Chopra, S., Weston, J.: A neural attention model for abstractive sentence summarization. In: M\u00e0rquez, L., Callison-Burch, C., Su, J., Pighin, D., Marton, Y. (eds.) Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, EMNLP 2015, September 17\u201321, 2015, pp. 379\u2013389. 
The Association for Computational Linguistics, Lisbon, Portugal (2015)","DOI":"10.18653\/v1\/D15-1044"},{"key":"1865_CR11","doi-asserted-by":"crossref","unstructured":"Li, H., Zhu, J., Zhang, J., He, X., Zong, C.: Multimodal sentence summarization via multimodal selective encoding. In: Scott, D., Bel, N., Zong, C. (eds.) Proceedings of the 28th International Conference on Computational Linguistics, COLING 2020, December 8\u201313, 2020, pp. 5655\u20135667. International Committee on Computational Linguistics, Barcelona, Spain (Online) (2020)","DOI":"10.18653\/v1\/2020.coling-main.496"},{"key":"1865_CR12","doi-asserted-by":"crossref","unstructured":"Lewis, M., Liu, Y., Goyal, N., Ghazvininejad, M., Mohamed, A., Levy, O., Stoyanov, V., Zettlemoyer, L.: Bart: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J.R. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, ACL 2020, Online, July 5\u201310, 2020, pp. 7871\u20137880. Association for Computational Linguistics, Online (2020)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"1865_CR13","doi-asserted-by":"publisher","first-page":"3296","DOI":"10.1109\/TMM.2022.3157993","volume":"25","author":"N Liu","year":"2023","unstructured":"Liu, N., Sun, X., Yu, H., Yao, F., Xu, G., Fu, K.: Abstractive summarization for video: a revisit in multistage fusion network with forget gate. IEEE Trans. Multimedia 25, 3296\u20133310 (2023)","journal-title":"IEEE Trans. Multimedia"},{"issue":"2","key":"1865_CR14","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s11633-022-1372-x","volume":"20","author":"L Jing","year":"2023","unstructured":"Jing, L., Li, Y., Xu, J., Yu, Y., Shen, P., Song, X.: Vision enhanced generative pre-trained language model for multimodal sentence summarization. Mach. Intell. Res. 20(2), 289\u2013298 (2023)","journal-title":"Mach. Intell. 
Res."},{"key":"1865_CR15","doi-asserted-by":"crossref","unstructured":"Li, H., Ke, Q., Gong, M., Drummond, T.: Progressive video summarization via multimodal self-supervised learning. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5584\u20135593. IEEE, Waikoloa, HI, USA (2023)","DOI":"10.1109\/WACV56688.2023.00554"},{"key":"1865_CR16","doi-asserted-by":"crossref","unstructured":"Chen, X., Li, M., Gao, S., Cheng, X., Yang, Q., Zhang, Q., Gao, X., Zhang, X.: A topic-aware summarization framework with different modal side information. In: Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1416\u20131425. ACM, Taipei, Taiwan (2023)","DOI":"10.1145\/3539618.3591630"},{"key":"1865_CR17","doi-asserted-by":"crossref","unstructured":"Liang, Y., Meng, F., Xu, J., Wang, J., Chen, Y., Zhou, J.: Summary-oriented vision modeling for multimodal abstractive summarization. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2023, July 9\u201314, 2023, pp. 2934\u20132951. Association for Computational Linguistics, Toronto, Canada (2023)","DOI":"10.18653\/v1\/2023.acl-long.165"},{"key":"1865_CR18","doi-asserted-by":"crossref","unstructured":"Liu, N., Sun, X., Yu, H., Zhang, W., Xu, G.: Multistage fusion with forget gate for multimodal summarization in open-domain videos. In: Webber, B., Cohn, T., He, Y., Liu, Y. (eds.) Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1834\u20131845. Association for Computational Linguistics, Online (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.144"},{"key":"1865_CR19","doi-asserted-by":"crossref","unstructured":"Wu, S., Dai, D., Qin, Z., Liu, T., Lin, B., Cao, Y., Sui, Z.: Denoising bottleneck with mutual information maximization for video multimodal fusion. In: Rogers, A., Boyd-Graber, J.L., Okazaki, N. (eds.) 
Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2023, July 9\u201314, 2023, pp. 2231\u20132243. Association for Computational Linguistics, Toronto, Canada (2023)","DOI":"10.18653\/v1\/2023.acl-long.124"},{"key":"1865_CR20","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shu, C., Chen, Y., Xiao, J., Zhang, Q., Zheng, L.: ICAF: iterative contrastive alignment framework for multimodal abstractive summarization. In: International Joint Conference on Neural Networks, IJCNN 2022, July 18\u201323, 2022, pp. 1\u20138. IEEE, Padua, Italy (2022)","DOI":"10.1109\/IJCNN55064.2022.9892884"},{"key":"1865_CR21","doi-asserted-by":"crossref","unstructured":"Xiao, M., Zhu, J., Lin, H., Zhou, Y., Zong, C.: CFSUM: a coarse-to-fine contribution network for multimodal summarization. arXiv:2307.02716 (2023)","DOI":"10.18653\/v1\/2023.acl-long.476"},{"key":"1865_CR22","doi-asserted-by":"publisher","first-page":"126692","DOI":"10.1016\/j.eswa.2025.126692","volume":"272","author":"MS Hossain","year":"2025","unstructured":"Hossain, M.S., Aktar, S., Gu, N., Liu, W., Huang, Z.: GEOSCN: a novel multimodal self-attention to integrate geometric information on spatial-channel network for fine-grained image captioning. Expert Syst. Appl. 272, 126692 (2025)","journal-title":"Expert Syst. Appl."},{"issue":"1","key":"1865_CR23","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s00530-024-01608-1","volume":"31","author":"MS Hossain","year":"2025","unstructured":"Hossain, M.S., Aktar, S., Liu, W., Gu, N., Huang, Z.: IGINET: integrating geometric information to enhance inter-modal interaction for fine-grained image captioning. Multimedia Syst. 
31(1), 33 (2025)","journal-title":"Multimedia Syst."},{"key":"1865_CR24","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-024-20220-Z","author":"MS Hossain","year":"2025","unstructured":"Hossain, M.S., Aktar, S., Hossen, M.B., Hossain, M.A., Gu, N., Huang, Z.: CSDNET: cross-sketch with dual gated attention for fine-grained image captioning network. Multimedia Tools Appl. (2024). https:\/\/doi.org\/10.1007\/s11042-024-20220-Z","journal-title":"Multimedia Tools Appl."},{"key":"1865_CR25","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2024.102941","volume":"87","author":"MS Hossain","year":"2025","unstructured":"Hossain, M.S., Aktar, S., Hossain, M.A., Gu, N., Huang, Z.: CM-SC: cross-modal spatial-channel attention network for image captioning. Displays 87, 102941 (2025)","journal-title":"Displays"},{"key":"1865_CR26","doi-asserted-by":"crossref","unstructured":"Wu, Y., Zhang, Z., Peng, P., Zhao, Y., Qin, B.: Leveraging multi-modal interactions among the intermediate representations of deep transformers for emotion recognition. In: Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge, pp. 101\u2013109. ACM, Lisboa, Portugal (2022)","DOI":"10.1145\/3551876.3554813"},{"key":"1865_CR27","doi-asserted-by":"crossref","unstructured":"Yang, D., Huang, S., Kuang, H., Du, Y., Zhang, L.: Disentangled representation learning for multimodal emotion recognition. CoRR arXiv:2312.13567 (2023)","DOI":"10.1145\/3503161.3547754"},{"key":"1865_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Y., Sun, W., Zhang, X., Qin, Y.: Improving dimensional emotion recognition via feature-wise fusion. In: Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge, pp. 55\u201360. ACM, Lisboa, Portugal (2022)","DOI":"10.1145\/3551876.3554804"},{"key":"1865_CR29","doi-asserted-by":"crossref","unstructured":"Cao, M., Dong, Y., Cheung, J.C.K.: Hallucinated but factual! 
inspecting the factuality of hallucinations in abstractive summarization. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2022, May 22\u201327, 2022, pp. 3340\u20133354. Association for Computational Linguistics, Dublin, Ireland (2022)","DOI":"10.18653\/v1\/2022.acl-long.236"},{"key":"1865_CR30","doi-asserted-by":"crossref","unstructured":"Zhu, C., Hinthorn, W., Xu, R., Zeng, Q., Zeng, M., Huang, X., Jiang, M.: Enhancing factual consistency of abstractive summarization. In: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2021, June 6\u201311, 2021, pp. 718\u2013733. Association for Computational Linguistics, Online (2021)","DOI":"10.18653\/v1\/2021.naacl-main.58"},{"key":"1865_CR31","doi-asserted-by":"crossref","unstructured":"Song, K., Lebanoff, L., Guo, Q., Qiu, X., Xue, X., Li, C., Yu, D., Liu, F.: Joint parsing and generation for abstractive summarization. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2020, February 7\u201312, 2020, vol. 34, pp. 8894\u20138901. AAAI Press, New York, NY, USA (2020)","DOI":"10.1609\/aaai.v34i05.6419"},{"key":"1865_CR32","doi-asserted-by":"crossref","unstructured":"Tang, X., Nair, A., Wang, B., Wang, B., Desai, J., Wade, A., Li, H., Celikyilmaz, A., Mehdad, Y., Radev, D.: Confit: Toward faithful dialogue summarization with linguistically-informed contrastive fine-tuning. In: Carpuat, M., Marneffe, M., Ru\u00edz, I.V.M. (eds.) Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL 2022, July 10\u201315, 2022, pp. 5657\u20135668. 
Association for Computational Linguistics, Seattle, WA, United States (2022)","DOI":"10.18653\/v1\/2022.naacl-main.415"},{"key":"1865_CR33","doi-asserted-by":"crossref","unstructured":"Cao, S., Wang, L.: CLIFF: contrastive learning for improving faithfulness and factuality in abstractive summarization. In: Moens, M., Huang, X., Specia, L., Yih, S.W. (eds.) Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, EMNLP 2021, 7\u201311 November, 2021, pp. 6633\u20136649. Association for Computational Linguistics, Virtual Event\/Punta Cana, Dominican Republic (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.532"},{"key":"1865_CR34","first-page":"5998","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30, 5998\u20136008 (2017)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1865_CR35","unstructured":"Sanabria, R., Caglayan, O., Palaskar, S., Elliott, D., Barrault, L., Specia, L., Metze, F.: HOW2: a large-scale dataset for multimodal language understanding. CoRR arXiv:1811.00347 (2018)"},{"key":"1865_CR36","doi-asserted-by":"crossref","unstructured":"Li, H., Zhu, J., Liu, T., Zhang, J., Zong, C.: Multi-modal sentence summarization with modality attention and image filtering. In: Lang, J. (ed.) Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence, IJCAI 2018, July 13\u201319, 2018, pp. 4152\u20134158. ijcai.org, Stockholm, Sweden (2018)","DOI":"10.24963\/ijcai.2018\/577"},{"key":"1865_CR37","doi-asserted-by":"crossref","unstructured":"Lin, C.-Y., Hovy, E.: Automatic evaluation of summaries using n-gram co-occurrence statistics. In: Hearst, M.A., Ostendorf, M. (eds.) 
Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics, HLT-NAACL 2003, May 27\u2013June 1, 2003. The Association for Computational Linguistics, Edmonton, Canada (2003)","DOI":"10.3115\/1073445.1073465"},{"key":"1865_CR38","doi-asserted-by":"crossref","unstructured":"Palaskar, S., Libovick\u00fd, J., Gella, S., Metze, F.: Multimodal abstractive summarization for HOW2 videos. In: Korhonen, A., Traum, D.R., M\u00e0rquez, L. (eds.) Proceedings of the 57th Conference of the Association for Computational Linguistics, ACL 2019, July 28\u2013August 2, 2019, Volume 1: Long Papers, pp. 6587\u20136596. Association for Computational Linguistics, Florence, Italy (2019)","DOI":"10.18653\/v1\/P19-1659"},{"key":"1865_CR39","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, July 6\u201312, 2002, Philadelphia, PA, USA, pp. 311\u2013318. ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1865_CR40","unstructured":"Kingma, D.P., Ba, J.: ADAM: a method for stochastic optimization. In: 3rd International Conference on Learning Representations, ICLR 2015, May 7\u20139, 2015, Conference Track Proceedings. San Diego, CA, USA (2015)"},{"key":"1865_CR41","doi-asserted-by":"crossref","unstructured":"Xiao, W., Carenini, G.: Extractive summarization of long documents by combining global and local context. In: Inui, K., Jiang, J., Ng, V., Wan, X. (eds.) Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, November 3\u20137, 2019, pp. 3009\u20133019. 
Association for Computational Linguistics, Hong Kong, China (2019)","DOI":"10.18653\/v1\/D19-1298"},{"key":"1865_CR42","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1016\/j.neucom.2021.04.072","volume":"456","author":"N Liu","year":"2021","unstructured":"Liu, N., Sun, X., Yu, H., Zhang, W., Xu, G.: D-MMT: a concise decoder-only multi-modal transformer for abstractive summarization in videos. Neurocomputing 456, 179\u2013189 (2021)","journal-title":"Neurocomputing"},{"key":"1865_CR43","doi-asserted-by":"crossref","unstructured":"Atri, Y.K., Goyal, V., Chakraborty, T.: Fusing multimodal signals on hyper-complex space for extreme abstractive text summarization (tl;dr) of scientific contents. In: Singh, A.K., Sun, Y., Akoglu, L., Gunopulos, D., Yan, X., Kumar, R., Ozcan, F., Ye, J. (eds.) Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD 2023, August 6\u201310, 2023, pp. 3724\u20133736. ACM, Long Beach, CA, USA (2023)","DOI":"10.1145\/3580305.3599830"},{"key":"1865_CR44","doi-asserted-by":"crossref","unstructured":"Libovick\u00fd, J., Helcl, J.: Attention strategies for multi-source sequence-to-sequence learning. In: Barzilay, R., Kan, M. (eds.) Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, ACL 2017, July 30\u2013August 4, Volume 2: Short Papers, pp. 379\u2013389. Association for Computational Linguistics, Vancouver, Canada (2017)","DOI":"10.18653\/v1\/P17-2031"},{"key":"1865_CR45","doi-asserted-by":"crossref","unstructured":"Calixto, I., Liu, Q., Campbell, N.: Doubly-attentive decoder for multi-modal neural machine translation. In: Barzilay, R., Kan, M. (eds.) Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, ACL 2017, July 30\u2013August 4, Volume 1: Long Papers, pp. 1913\u20131924. 
Association for Computational Linguistics, Vancouver, Canada (2017)","DOI":"10.18653\/v1\/P17-1175"},{"key":"1865_CR46","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., Zhu, C.: GPTEVAL: NLG evaluation using GPT-4 with better human alignment. arXiv preprint arXiv:2303.16634 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"1865_CR47","unstructured":"Pu, X., Gao, M., Wan, X.: Is summary useful or not? An extrinsic human evaluation of text summaries on downstream tasks. arXiv preprint arXiv:2305.15044 (2023)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01865-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01865-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01865-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:03:40Z","timestamp":1757927020000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01865-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,6]]},"references-count":47,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1865"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01865-8","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,6]]},"assertion":[{"value":"18 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article 
History"}},{"value":"23 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This article does not contain any studies with human participants or animals performed by the author. We obtain ethical and informed consent from data subjects before collecting, using, or disclosing their personal data.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical and informed consent"}}],"article-number":"282"}}