{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T15:21:57Z","timestamp":1772119317842,"version":"3.50.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T00:00:00Z","timestamp":1738627200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T00:00:00Z","timestamp":1738627200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00371-025-03824-w","type":"journal-article","created":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T08:56:58Z","timestamp":1738659418000},"page":"7567-7584","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["SCAP: enhancing image captioning through lightweight feature sifting and hierarchical decoding"],"prefix":"10.1007","volume":"41","author":[{"given":"Yuhao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Jiaqi","family":"Tong","sequence":"additional","affiliation":[]},{"given":"Honglin","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,4]]},"reference":[{"key":"3824_CR1","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Wallach, H., Larochelle, H., Beygelzimer, A., Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 32. 
Curran Associates, Inc., (2019)"},{"key":"3824_CR2","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C., Chang, K.: Visualbert: A simple and performant baseline for vision and language. CoRR 1908.03557 (2019)"},{"key":"3824_CR3","doi-asserted-by":"crossref","unstructured":"Long, S., Cao, F., Han, S.C., Yang, H.: Vision-and-language pretrained models: a survey (2022)","DOI":"10.24963\/ijcai.2022\/773"},{"key":"3824_CR4","doi-asserted-by":"publisher","first-page":"1183","DOI":"10.1613\/jair.1.11688","volume":"71","author":"A Mogadala","year":"2021","unstructured":"Mogadala, A., Kalimuthu, M., Klakow, D.: Trends in integration of vision and language research: a survey of tasks, datasets, and methods. J. Artific. Intell. Res. 71, 1183\u20131317 (2021)","journal-title":"J. Artific. Intell. Res."},{"key":"3824_CR5","doi-asserted-by":"crossref","unstructured":"Rotstein, N., Bensaid, D., Brody, S., Ganz, R., Kimmel, R.: FuseCap: leveraging large language models to fuse visual data into enriched image captions (2023)","DOI":"10.1109\/WACV57701.2024.00559"},{"key":"3824_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2023.111343","volume":"285","author":"S Mardieva","year":"2024","unstructured":"Mardieva, S., Ahmad, S., Umirzakova, S., Rasool, M.J.A., Whangbo, T.K.: Lightweight image super-resolution for IoT devices using deep residual feature distillation network. Knowl.-Based Syst. 285, 111343 (2024). 
https:\/\/doi.org\/10.1016\/j.knosys.2023.111343","journal-title":"Knowl.-Based Syst."},{"key":"3824_CR7","doi-asserted-by":"crossref","unstructured":"Ferraro, F., Mostafazadeh, N., Ting-Hao, Huang, Vanderwende, L., Devlin, J., Galley, M., Mitchell, M.: A survey of current datasets for vision and language research (2015)","DOI":"10.18653\/v1\/D15-1021"},{"key":"3824_CR8","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: Generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) Computer Vision - ECCV 2010, pp. 15\u201329. Springer, Berlin (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"3824_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3824_CR10","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"3824_CR11","unstructured":"Gupta, S., Malik, J.: Visual Semantic Role Labeling (2015)"},{"key":"3824_CR12","doi-asserted-by":"crossref","unstructured":"Afraz, A., Yamins, D.L., DiCarlo, J.J.: Neural mechanisms underlying visual object recognition. In: Cold Spring Harbor Symposia on Quantitative Biology, vol. 79, pp. 99\u2013107 (2014). Cold Spring Harbor Laboratory Press","DOI":"10.1101\/sqb.2014.79.024729"},{"key":"3824_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"3824_CR14","unstructured":"Zhang, F., Leitner, J., Milford, M., Upcroft, B., Corke, P.: Towards Vision-Based Deep Reinforcement Learning for Robotic Motion Control (2015)"},{"key":"3824_CR15","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Gordon, D., Kolve, E., Fox, D., Fei-Fei, L., Gupta, A., Mottaghi, R., Farhadi, A.: Visual semantic planning using deep successor representations. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.60"},{"key":"3824_CR16","doi-asserted-by":"crossref","unstructured":"Gurari, D., Li, Q., Stangl, A.J., Guo, A., Lin, C., Grauman, K., Luo, J., Bigham, J.P.: Vizwiz grand challenge: Answering visual questions from blind people. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018)","DOI":"10.1109\/CVPR.2018.00380"},{"issue":"1","key":"3824_CR17","doi-asserted-by":"publisher","first-page":"532","DOI":"10.1109\/TNNLS.2022.3175775","volume":"35","author":"A Karambakhsh","year":"2022","unstructured":"Karambakhsh, A., Sheng, B., Li, P., Li, H., Kim, J., Jung, Y.: Sparsevoxnet: 3-d object recognition with sparsely aggregation of 3-d dense blocks. IEEE Trans. Neural Netw. Learn. Syst. 35(1), 532\u2013546 (2022)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3824_CR18","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L.u., Polosukhin, I.: Attention is all you need. In: Guyon, I., Luxburg, U.V., Bengio, S., Wallach, H., Fergus, R., Vishwanathan, S., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol. 30. 
Curran Associates, Inc., (2017)"},{"key":"3824_CR19","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: Neural image caption generation with visual attention. In: Bach, F., Blei, D. (eds.) Proceedings of the 32nd International Conference on Machine Learning. Proceedings of Machine Learning Research, vol. 37, pp. 2048\u20132057. PMLR, Lille, France (2015). https:\/\/proceedings.mlr.press\/v37\/xuc15.html"},{"key":"3824_CR20","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Wang, M., Liu, D., Hu, Z., Zhang, H.: More grounded image captioning by distilling image-text matching model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00483"},{"key":"3824_CR21","unstructured":"Cortes, C., Lawarence, N., Lee, D., Sugiyama, M., Garnett, R.: Advances in neural information processing systems 28. In: Proceedings of the 29th Annual Conference on Neural Information Processing Systems (2015)"},{"key":"3824_CR22","doi-asserted-by":"crossref","unstructured":"Ma, C.-Y., Kalantidis, Y., AlRegib, G., Vajda, P., Rohrbach, M., Kira, Z.: Learning to generate grounded visual captions without localization supervision. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision - ECCV 2020, pp. 353\u2013370. Springer, Cham (2020)","DOI":"10.1007\/978-3-030-58523-5_21"},{"key":"3824_CR23","doi-asserted-by":"crossref","unstructured":"Chen, N., Pan, X., Chen, R., Yang, L., Lin, Z., Ren, Y., Yuan, H., Guo, X., Huang, F., Wang, W.: Distributed attention for grounded image captioning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 
1966\u20131975 (2021)","DOI":"10.1145\/3474085.3475354"},{"key":"3824_CR24","doi-asserted-by":"publisher","first-page":"5400","DOI":"10.1109\/TMM.2022.3192729","volume":"25","author":"L Wang","year":"2023","unstructured":"Wang, L., Li, H., Hu, W., Zhang, X., Qiu, H., Meng, F., Wu, Q.: What happens in crowd scenes: a new dataset about crowd scenes for image captioning. IEEE Trans. Multimed. 25, 5400\u20135412 (2023). https:\/\/doi.org\/10.1109\/TMM.2022.3192729","journal-title":"IEEE Trans. Multimed."},{"key":"3824_CR25","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"3824_CR26","doi-asserted-by":"publisher","unstructured":"Henry\u00a0Senior, S.Y..L.R. Gregory\u00a0Slabaugh: Graph neural networks in vision-language image understanding: a survey. Vis. Comput.(2024) https:\/\/doi.org\/10.1007\/s00371-024-03343-0","DOI":"10.1007\/s00371-024-03343-0"},{"issue":"8","key":"3824_CR27","doi-asserted-by":"publisher","first-page":"4499","DOI":"10.1109\/TNNLS.2021.3116209","volume":"34","author":"Z Xie","year":"2021","unstructured":"Xie, Z., Zhang, W., Sheng, B., Li, P., Chen, C.L.P.: Bagfn: Broad attentive graph fusion network for high-order feature interactions. IEEE Trans. Neural Netw. Learn. Syst. 34(8), 4499\u20134513 (2021)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"3824_CR28","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014). https:\/\/doi.org\/10.1162\/tacl_a_00166","journal-title":"Trans. Assoc. 
Comput. Linguist."},{"issue":"1","key":"3824_CR29","doi-asserted-by":"publisher","first-page":"24","DOI":"10.1016\/j.vrih.2022.06.001","volume":"5","author":"W Shao","year":"2023","unstructured":"Shao, W., et al.: Covad: Content-oriented video anomaly detection using a self attention-based deep learning model. Virt. Real. Intell. Hardware 5(1), 24\u201341 (2023)","journal-title":"Virt. Real. Intell. Hardware"},{"issue":"1","key":"3824_CR30","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1016\/j.vrih.2022.07.006","volume":"5","author":"M Zhang","year":"2023","unstructured":"Zhang, M., Tian, X.: Transformer architecture based on mutual attention for image-anomaly detection. Virt. Real. Intell. Hardware 5(1), 57\u201367 (2023)","journal-title":"Virt. Real. Intell. Hardware"},{"key":"3824_CR31","unstructured":"Devlin, J., Chang, M.-W., Lee, P., Toutanova, K.: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding (2019)"},{"key":"3824_CR32","doi-asserted-by":"crossref","unstructured":"Nguyen, V.-Q., Suganuma, M., Okatani, T.: GRIT: Faster and Better Image captioning Transformer Using Dual Visual Features (2022)","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"3824_CR33","doi-asserted-by":"crossref","unstructured":"Zhou, L., Kalantidis, Y., Chen, X., Corso, J.J., Rohrbach, M.: Grounded video description. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00674"},{"key":"3824_CR34","doi-asserted-by":"publisher","unstructured":"Schmid, C., Soatto, S., Tomasi, C.: Conference on Computer Vision and Pattern Recognition, p. 1211. 
IEEE Computer Society, (2005)https:\/\/doi.org\/10.1109\/CVPR.2005.277","DOI":"10.1109\/CVPR.2005.277"},{"key":"3824_CR35","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Dollar, P., Zitnick, C.L.: Microsoft COCO Captions: Data Collection and Evaluation Server (2015)"},{"key":"3824_CR36","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.131"},{"issue":"1","key":"3824_CR37","first-page":"8909458","volume":"2020","author":"Y Chu","year":"2020","unstructured":"Chu, Y., Yue, X., Yu, L., Sergei, M., Wang, Z.: Automatic image captioning based on resnet50 and lstm with soft attention. Wirel. Commun. Mob. Comput. 2020(1), 8909458 (2020)","journal-title":"Wirel. Commun. Mob. Comput."},{"key":"3824_CR38","doi-asserted-by":"crossref","unstructured":"Gu, J., Cai, J., Wang, G., Chen, T.: Stack-captioning: Coarse-to-fine learning for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"3824_CR39","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"3824_CR40","doi-asserted-by":"crossref","unstructured":"Jiang, W., Ma, L., Jiang, Y.-G., Liu, W., Zhang, T.: Recurrent fusion network for image captioning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 
499\u2013515 (2018)","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"3824_CR41","doi-asserted-by":"crossref","unstructured":"Ke, L., Pei, W., Li, R., Shen, X., Tai, Y.-W.: Reflective decoding network for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8888\u20138897 (2019)","DOI":"10.1109\/ICCV.2019.00898"},{"key":"3824_CR42","doi-asserted-by":"publisher","unstructured":"Sarto, S., Cornia, M., Baraldi, L., Nicolosi, A., Cucchiara, R.: Towards retrieval-augmented architectures for image captioning 20(8) (2024) https:\/\/doi.org\/10.1145\/3663667","DOI":"10.1145\/3663667"},{"key":"3824_CR43","doi-asserted-by":"publisher","unstructured":"Li, Y., Ji, J., Sun, X., Zhou, Y., Luo, Y., Ji, R.: M3ixup: A multi-modal data augmentation approach for image captioning. Pattern Recogn. 158, 110941 (2025) https:\/\/doi.org\/10.1016\/j.patcog.2024.110941","DOI":"10.1016\/j.patcog.2024.110941"},{"key":"3824_CR44","doi-asserted-by":"publisher","unstructured":"Shetty, A., Kale, Y., Patil, Y., Patil, R., Sharma, S.: Optimal transformers based image captioning using beam search. Multimed. Tools Appl. 83 (2024) https:\/\/doi.org\/10.1007\/s11042-023-17359-6","DOI":"10.1007\/s11042-023-17359-6"},{"key":"3824_CR45","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.123847","volume":"250","author":"X Yang","year":"2024","unstructured":"Yang, X., Yang, Y., Wu, J., Sun, W., Ma, S., Hou, Z.: Ca-captioner: A novel concentrated attention for image captioning. Exp. Syst. Appl. 250, 123847 (2024). https:\/\/doi.org\/10.1016\/j.eswa.2024.123847","journal-title":"Exp. Syst. Appl."},{"key":"3824_CR46","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-023-00693-9","author":"R Sasibhooshan","year":"2023","unstructured":"Sasibhooshan, R., Kumaraswamy, S., Sasidharan, S.: Image caption generation using visual attention prediction and contextual spatial relation extraction. J. Big Data (2023). 
https:\/\/doi.org\/10.1186\/s40537-023-00693-9","journal-title":"J. Big Data"},{"key":"3824_CR47","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-022-00571-w","author":"MA Al-Malla","year":"2022","unstructured":"Al-Malla, M.A., Jafar, A., Ghneim, N.: Image captioning model using attention and object features to mimic human image understanding. J. Big Data (2022). https:\/\/doi.org\/10.1186\/s40537-022-00571-w","journal-title":"J. Big Data"},{"key":"3824_CR48","doi-asserted-by":"publisher","DOI":"10.3389\/fnins.2023.1270850","author":"T Bai","year":"2023","unstructured":"Bai, T., Zhou, S., Pang, Y., Luo, J., Wang, H., Du, Y.: An image caption model based on attention mechanism and deep reinforcement learning. Front. Neurosci. (2023). https:\/\/doi.org\/10.3389\/fnins.2023.1270850","journal-title":"Front. Neurosci."},{"key":"3824_CR49","unstructured":"Khan, R., Islam, M.S., Kanwal, K., Iqbal, M., Hossain, M.I., Ye, Z.: A Deep Neural Framework for Image Caption Generation Using GRU-Based Attention Mechanism (2022). 
arXiv:2203.01594"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03824-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-025-03824-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-025-03824-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T04:58:35Z","timestamp":1757134715000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-025-03824-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,4]]},"references-count":49,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["3824"],"URL":"https:\/\/doi.org\/10.1007\/s00371-025-03824-w","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-5169088\/v1","asserted-by":"object"}]},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,4]]},"assertion":[{"value":"20 January 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 February 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"All authors agreed with the content and gave explicit consent 
to submit.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}