{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T18:14:09Z","timestamp":1770142449938,"version":"3.49.0"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"23","license":[{"start":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T00:00:00Z","timestamp":1726444800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T00:00:00Z","timestamp":1726444800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["71991464"],"award-info":[{"award-number":["71991464"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"crossref","award":["2022YFB3303400 and 2021YFF0500900"],"award-info":[{"award-number":["2022YFB3303400 and 2021YFF0500900"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Anhui Provincial Major Science and Technology Project","award":["202203a05020016"],"award-info":[{"award-number":["202203a05020016"]}]},{"name":"Pioneer and Leading Goose R&D Program of Zhejiang","award":["2023C01143"],"award-info":[{"award-number":["2023C01143"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-20220-z","type":"journal-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T05:02:35Z","timestamp":1726462955000},"page":"27149-27176","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["CSDNet: cross-sketch with dual gated attention for fine-grained image captioning network"],"prefix":"10.1007","volume":"84","author":[{"given":"Md. Shamim","family":"Hossain","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shamima","family":"Aktar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Md. Bipul","family":"Hossen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mohammad Alamgir","family":"Hossain","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Naijie","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1475-8894","authenticated-orcid":false,"given":"Zhangjin","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,16]]},"reference":[{"key":"20220_CR1","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"20220_CR2","unstructured":"Chen X, Fang H, Lin TY, Vedantam R, Gupta S, Doll\u00e1r P et\u00a0al (2015) Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325"},{"key":"20220_CR3","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R (2018) Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp 2556\u20132565","DOI":"10.18653\/v1\/P18-1238"},{"key":"20220_CR4","doi-asserted-by":"crossref","unstructured":"Farhadi A, Hejrati M, Sadeghi MA, Young P, Rashtchian C, Hockenmaier J et\u00a0al (2010) Every picture tells a story: Generating sentences from images. In: Computer Vision\u2013ECCV 2010: 11th European Conference on Computer Vision, Heraklion, Crete, Greece, September 5-11, 2010, Proceedings, Part IV 11. Springer, pp 15\u201329","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"20220_CR5","doi-asserted-by":"crossref","unstructured":"Ushiku Y, Yamaguchi M, Mukuta Y, Harada T (2015) Common subspace for model and similarity: Phrase learning for caption generation from images. In: Proceedings of the IEEE international conference on computer vision, pp 2668\u20132676","DOI":"10.1109\/ICCV.2015.306"},{"key":"20220_CR6","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"20220_CR7","doi-asserted-by":"crossref","unstructured":"Fang H, Gupta S, Iandola F, Srivastava RK, Deng L, Doll\u00e1r P et\u00a0al (2015) From captions to visual concepts and back. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1473\u20131482","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"20220_CR8","doi-asserted-by":"crossref","unstructured":"Li Y, Sharma P, Ding N, Goodman S, Soricut R (2019) Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. arXiv preprint arXiv:1811.10665","DOI":"10.18653\/v1\/P18-1238"},{"key":"20220_CR9","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R et\u00a0al (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning. PMLR, pp 2048\u20132057"},{"key":"20220_CR10","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"20220_CR11","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. Adv Neural Inf Process Syst 27"},{"issue":"8","key":"20220_CR12","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"20220_CR13","doi-asserted-by":"crossref","unstructured":"Cho K, Van\u00a0Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H et\u00a0al (2014) Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"issue":"11","key":"20220_CR14","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster M, Paliwal KK (1997) Bidirectional recurrent neural networks. IEEE Trans Signal Process 45(11):2673\u20132681","journal-title":"IEEE Trans Signal Process"},{"key":"20220_CR15","doi-asserted-by":"publisher","first-page":"149","DOI":"10.1016\/j.inffus.2021.07.009","volume":"77","author":"S Uppal","year":"2022","unstructured":"Uppal S, Bhagat S, Hazarika D, Majumder N, Poria S, Zimmermann R et al (2022) Multimodal research in vision and language: A review of current and emerging trends. Information Fusion. 77:149\u2013171","journal-title":"Information Fusion."},{"key":"20220_CR16","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y, Mei T (2020) X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10971\u201310980","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"20220_CR17","doi-asserted-by":"publisher","first-page":"76243","DOI":"10.1109\/ACCESS.2020.2986476","volume":"8","author":"Y Zhang","year":"2020","unstructured":"Zhang Y, Yi P, Zhou D, Yang X, Yang D, Zhang Q et al (2020) CSANet: Channel and spatial mixed attention CNN for pedestrian detection. IEEE Access. 8:76243\u201376252","journal-title":"IEEE Access."},{"key":"20220_CR18","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"20220_CR19","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10578\u201310587","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"20220_CR20","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J (2016) Image captioning with semantic attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4651\u20134659","DOI":"10.1109\/CVPR.2016.503"},{"key":"20220_CR21","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S et\u00a0al (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"20220_CR22","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, 28"},{"key":"20220_CR23","doi-asserted-by":"crossref","unstructured":"Zhou L, Palangi H, Zhang L, Hu H, Corso J, Gao J (2020) Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a034, pp 13041\u201313049","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"20220_CR24","unstructured":"Shao Z, Han J, Marnerides D, Debattista K (2022) Region-object relation-aware dense captioning via transformer. IEEE Trans Neural Netw Learn Syst"},{"key":"20220_CR25","doi-asserted-by":"publisher","first-page":"8753","DOI":"10.1109\/TMM.2023.3241517","volume":"25","author":"Z Shao","year":"2023","unstructured":"Shao Z, Han J, Debattista K, Pang Y (2023) Textual context-aware dense captioning with diverse words. IEEE Trans Multimedia 25:8753\u20138766","journal-title":"IEEE Trans Multimedia"},{"key":"20220_CR26","doi-asserted-by":"crossref","unstructured":"Shao Z, Han J, Debattista K, Pang Y (2024) DCMSTRD: end-to-end dense captioning via multi-scale transformer decoding. IEEE Trans Multimed","DOI":"10.1109\/TMM.2024.3369863"},{"key":"20220_CR27","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei XY (2019) Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"20220_CR28","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D et\u00a0al (2014) Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"20220_CR29","doi-asserted-by":"crossref","unstructured":"Gao Y, Beijbom O, Zhang N, Darrell T (2016) Compact bilinear pooling. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 317\u2013326","DOI":"10.1109\/CVPR.2016.41"},{"key":"20220_CR30","unstructured":"Kim JH, On KW, Lim W, Kim J, Ha JW, Zhang BT (2016) Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325"},{"key":"20220_CR31","doi-asserted-by":"crossref","unstructured":"Trottier L, Giguere P, Chaib-Draa B (2017) Parametric exponential linear unit for deep convolutional neural networks. In: 2017 16th IEEE international conference on machine learning and applications (ICMLA). IEEE 207\u2013214","DOI":"10.1109\/ICMLA.2017.00038"},{"key":"20220_CR32","unstructured":"Dauphin YN, Fan A, Auli M, Grangier D (2017) Language modeling with gated convolutional networks. In: International conference on machine learning. PMLR, pp 933\u2013941"},{"key":"20220_CR33","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"20220_CR34","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"20220_CR35","unstructured":"Lin CY (2004) Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381"},{"key":"20220_CR36","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"20220_CR37","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"20220_CR38","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: IEEE conference on computer vision and pattern recognition. Ieee 2009:248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"20220_CR39","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J et al (2017) Visual genome: Connecting language and vision using crowdsourced dense image annotations. Int J Comput Vision 123:32\u201373","journal-title":"Int J Comput Vision"},{"key":"20220_CR40","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Jiang YG, Liu W, Zhang T (2018) Recurrent fusion network for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 499\u2013515","DOI":"10.1007\/978-3-030-01216-8_31"},{"issue":"1","key":"20220_CR41","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1007\/s00530-023-01249-w","volume":"30","author":"MB Hossen","year":"2024","unstructured":"Hossen MB, Ye Z, Abdussalam A, Hossain MI (2024) GVA: guided visual attention approach for automatic image caption generation. Multimedia Syst 30(1):50","journal-title":"Multimedia Syst"},{"key":"20220_CR42","doi-asserted-by":"crossref","unstructured":"Hossen MB, Ye Z, Abdussalam A, Wahab FE (2024) Attribute guided fusion network for obtaining fine-grained image captions. Multimed Tools Appl 1\u201335","DOI":"10.1007\/s11042-024-19410-6"},{"key":"20220_CR43","doi-asserted-by":"publisher","first-page":"111433","DOI":"10.1016\/j.knosys.2024.111433","volume":"287","author":"C Cai","year":"2024","unstructured":"Cai C, Wang S, Yap KH, Wang Y (2024) Top-down framework for weakly-supervised grounded image captioning. Knowl-Based Syst 287:111433","journal-title":"Knowl-Based Syst"},{"key":"20220_CR44","doi-asserted-by":"crossref","unstructured":"Al-Qatf M, Hawbani A, Wang X, Abdusallam A, Alsamhi S, Alhabib M et\u00a0al (2024) RVAIC: Refined visual attention for improved image captioning. Journal of Intelligent & Fuzzy Systems (Preprint):1\u201313","DOI":"10.3233\/JIFS-233004"},{"key":"20220_CR45","doi-asserted-by":"publisher","first-page":"107732","DOI":"10.1016\/j.engappai.2023.107732","volume":"131","author":"M Al-Qatf","year":"2024","unstructured":"Al-Qatf M, Hawbani A, Wang X, Abdusallam A, Zhao L, Alsamhi SH et al (2024) NPoSC-A3: A novel part of speech clues-aware adaptive attention mechanism for image captioning. Eng Appl Artif Intell 131:107732","journal-title":"Eng Appl Artif Intell"},{"key":"20220_CR46","doi-asserted-by":"publisher","first-page":"102798","DOI":"10.1016\/j.displa.2024.102798","volume":"84","author":"MB Hossen","year":"2024","unstructured":"Hossen MB, Ye Z, Abdussalam A, Hossain MA (2024) ICEAP: An advanced fine-grained image captioning network with enhanced attribute predictor. Displays 84:102798","journal-title":"Displays"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20220-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-20220-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-20220-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T22:19:59Z","timestamp":1757110799000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-20220-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,16]]},"references-count":46,"journal-issue":{"issue":"23","published-online":{"date-parts":[[2025,7]]}},"alternative-id":["20220"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-20220-z","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,16]]},"assertion":[{"value":"10 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 August 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 September 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there are no financial conflicts of interest or personal relationships that could have influenced the outcomes or interpretations presented in this paper. They affirm that the work reported is free from any potential bias stemming from personal or financial considerations.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}