{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T06:54:20Z","timestamp":1780988060708,"version":"3.54.1"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"13","license":[{"start":{"date-parts":[[2024,5,27]],"date-time":"2024-05-27T00:00:00Z","timestamp":1716768000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,5,27]],"date-time":"2024-05-27T00:00:00Z","timestamp":1716768000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-19410-6","type":"journal-article","created":{"date-parts":[[2024,5,27]],"date-time":"2024-05-27T07:01:56Z","timestamp":1716793316000},"page":"11907-11941","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Attribute guided fusion network for obtaining fine-grained image captions"],"prefix":"10.1007","volume":"84","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8546-5426","authenticated-orcid":false,"given":"Md. Bipul","family":"Hossen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhongfu","family":"Ye","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Amr","family":"Abdussalam","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fazal E","family":"Wahab","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,5,27]]},"reference":[{"key":"19410_CR1","doi-asserted-by":"crossref","unstructured":"Al-Shamayleh AS, Adwan O, Alsharaiah MA, Hussein AH, Kharma QM, Eke CI (2024) A comprehensive literature review on image captioning methods and metrics based on deep learning technique. Multimed Tools Appl 1\u201350","DOI":"10.1007\/s11042-024-18307-8"},{"key":"19410_CR2","doi-asserted-by":"publisher","first-page":"103138","DOI":"10.1016\/j.jvcir.2021.103138","volume":"78","author":"X Zhong","year":"2021","unstructured":"Zhong X, Nie G, Huang W, Liu W, Ma B, Lin CW (2021) Attention-guided image captioning with adaptive global and local feature fusion. J Vis Commun Image Represent 78:103138","journal-title":"J Vis Commun Image Represent"},{"issue":"6","key":"19410_CR3","doi-asserted-by":"publisher","first-page":"3891","DOI":"10.1007\/s00530-023-01166-y","volume":"29","author":"J Chang","year":"2023","unstructured":"Chang J, Zhang L, Shao Z (2023) View-target relation-guided unsupervised 2D image-based 3D model retrieval via transformer. Multimed Syst 29(6):3891\u20133901","journal-title":"Multimed Syst"},{"key":"19410_CR4","doi-asserted-by":"crossref","unstructured":"Chu F, Cao J, Shao Z, Pang Y (2022) Illumination-guided transformer-based network for multispectral pedestrian detection. In: CAAI International conference on artificial intelligence. pp 343\u2013355. Springer","DOI":"10.1007\/978-3-031-20497-5_28"},{"key":"19410_CR5","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TMM.2020.3011317","volume":"23","author":"J Wu","year":"2021","unstructured":"Wu J, Chen T, Wu H, Yang Z, Luo G, Lin L (2021) Fine-grained image captioning with global-local discriminative objective. IEEE Trans Multimed 23:2413\u20132427","journal-title":"IEEE Trans Multimed"},{"issue":"2","key":"19410_CR6","doi-asserted-by":"publisher","first-page":"710","DOI":"10.1109\/TPAMI.2019.2909864","volume":"44","author":"ZJ Zha","year":"2022","unstructured":"Zha ZJ, Liu D, Zhang H, Zhang Y, Wu F (2022) Context-Aware Visual Policy Network for Fine-Grained Image Captioning. IEEE Trans Pattern Anal Mach Intell 44(2):710\u2013722","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"19410_CR7","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, et\u00a0al (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF conference on computer vision and pattern recognition. pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"19410_CR8","unstructured":"Agarwal L, Verma B (2024) From methods to datasets: A survey on Image-Caption Generators. Multimed Tools Appl 1\u201347"},{"key":"19410_CR9","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.neucom.2021.10.014","volume":"468","author":"Y Wang","year":"2022","unstructured":"Wang Y, Xu J, Sun Y (2022) A visual persistence model for image captioning. Neurocomputing 468:48\u201359","journal-title":"Neurocomputing"},{"key":"19410_CR10","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, et\u00a0al (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning. pp 2048\u20132057. PMLR"},{"key":"19410_CR11","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"19410_CR12","doi-asserted-by":"publisher","first-page":"104340","DOI":"10.1016\/j.imavis.2021.104340","volume":"117","author":"T Chen","year":"2022","unstructured":"Chen T, Li Z, Wu J, Ma H, Su B (2022) Improving image captioning with Pyramid Attention and SC-GAN. Image Vis Comput 117:104340","journal-title":"Image Vis Comput"},{"key":"19410_CR13","doi-asserted-by":"publisher","first-page":"322","DOI":"10.1016\/j.neucom.2019.06.085","volume":"364","author":"F Xiao","year":"2019","unstructured":"Xiao F, Gong X, Zhang Y, Shen Y, Li J, Gao X (2019) DAA: Dual LSTMs with adaptive attention for image captioning. Neurocomputing 364:322\u2013329","journal-title":"Neurocomputing"},{"key":"19410_CR14","doi-asserted-by":"publisher","first-page":"105194","DOI":"10.1016\/j.engappai.2022.105194","volume":"114","author":"C Wang","year":"2022","unstructured":"Wang C, Gu X (2022) Dynamic-balanced double-attention fusion for image captioning. Eng Appl Artif Intell 114:105194","journal-title":"Eng Appl Artif Intell"},{"issue":"4","key":"19410_CR15","doi-asserted-by":"publisher","first-page":"3157","DOI":"10.1007\/s11063-022-10759-z","volume":"54","author":"F Xiao","year":"2022","unstructured":"Xiao F, Xue W, Shen Y, Gao X (2022) A new attention-based LSTM for image captioning. Neural Process Lett 54(4):3157\u20133171","journal-title":"Neural Process Lett"},{"key":"19410_CR16","doi-asserted-by":"publisher","first-page":"4013","DOI":"10.1109\/TIP.2020.2969330","volume":"29","author":"Y Huang","year":"2020","unstructured":"Huang Y, Chen J, Ouyang W, Wan W, Xue Y (2020) Image captioning with end-to-end attribute detection and subsequent attributes prediction. IEEE Trans Image Process 29:4013\u20134026","journal-title":"IEEE Trans Image Process"},{"key":"19410_CR17","doi-asserted-by":"crossref","unstructured":"Al-Qatf M, Wang X, Hawbani A, Abdusallam A, Alsamhi SH (2022) Image captioning with novel topics guidance and retrieval-based topics re-weighting. IEEE Trans Multimed","DOI":"10.1109\/TMM.2022.3202690"},{"key":"19410_CR18","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"issue":"2","key":"19410_CR19","doi-asserted-by":"publisher","first-page":"913","DOI":"10.1109\/TCYB.2019.2914351","volume":"51","author":"X Li","year":"2021","unstructured":"Li X, Yuan A, Lu X (2021) Vision-to-language tasks based on attributes and attention mechanism. IEEE Trans Cybern 51(2):913\u2013926","journal-title":"IEEE Trans Cybern"},{"key":"19410_CR20","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T (2017) Boosting image captioning with attributes. In: Proceedings of the IEEE international conference on computer vision. pp 4894\u20134902","DOI":"10.1109\/ICCV.2017.524"},{"issue":"6","key":"19410_CR21","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2018","unstructured":"Wu Q, Shen C, Wang P, Dick A, Van Den Hengel A (2018) Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans Pattern Anal Mach Intell 40(6):1367\u20131381","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"19410_CR22","doi-asserted-by":"publisher","first-page":"34","DOI":"10.1016\/j.patrec.2021.08.021","volume":"152","author":"Y Zhou","year":"2021","unstructured":"Zhou Y, Long J, Xu S, Shang L (2021) Attribute-driven image captioning via soft-switch pointer. Pattern Recognit Lett 152:34\u201341","journal-title":"Pattern Recognit Lett"},{"key":"19410_CR23","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Lin Z, Zhao S, Han J (2018) Show, Observe and Tell: Attribute-driven Attention Model for Image Captioning. In: Proceedings of the twenty-seventh international joint conference on artificial intelligence, IJCAI-18. International Joint Conferences on Artificial Intelligence Organization. pp 606\u2013612","DOI":"10.24963\/ijcai.2018\/84"},{"issue":"1","key":"19410_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3292058","volume":"15","author":"C He","year":"2019","unstructured":"He C, Hu H (2019) Image captioning with visual-semantic double attention. ACM Trans Multimed Comput Commun Appl (TOMM) 15(1):1\u201316","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"issue":"1","key":"19410_CR25","doi-asserted-by":"publisher","first-page":"1223","DOI":"10.1007\/s11042-022-13279-z","volume":"82","author":"D Zhao","year":"2023","unstructured":"Zhao D, Yang R, Wang Z, Qi Z (2023) A cooperative approach based on self-attention with interactive attribute for image caption. Multimed Tools Appl 82(1):1223\u20131236","journal-title":"Multimed Tools Appl"},{"key":"19410_CR26","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"issue":"1","key":"19410_CR27","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1109\/TCSVT.2021.3067449","volume":"32","author":"C Yan","year":"2022","unstructured":"Yan C, Hao Y, Li L, Yin J, Liu A, Mao Z et al (2022) Task-adaptive attention for image captioning. IEEE Trans Circ Syst Vid Technol 32(1):43\u201351","journal-title":"IEEE Trans Circ Syst Vid Technol"},{"key":"19410_CR28","doi-asserted-by":"publisher","first-page":"57943","DOI":"10.1109\/ACCESS.2020.2981513","volume":"8","author":"C Wu","year":"2020","unstructured":"Wu C, Yuan S, Cao H, Wei Y, Wang L (2020) Hierarchical attention-based fusion for image caption with multi-grained rewards. IEEE Access. 8:57943\u201357951","journal-title":"IEEE Access."},{"key":"19410_CR29","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1016\/j.neucom.2020.06.112","volume":"413","author":"W Cai","year":"2020","unstructured":"Cai W, Liu Q (2020) Image captioning with semantic-enhanced features and extremely hard negative examples. Neurocomputing 413:31\u201340","journal-title":"Neurocomputing"},{"issue":"12","key":"19410_CR30","doi-asserted-by":"publisher","first-page":"18413","DOI":"10.1007\/s11042-021-10578-9","volume":"80","author":"C Sur","year":"2021","unstructured":"Sur C (2021) MRRC: multiple role representation crossover interpretation for image captioning with R-CNN feature distribution composition (FDC). Multimed Tools Appl 80(12):18413\u201318443","journal-title":"Multimed Tools Appl"},{"issue":"8","key":"19410_CR31","doi-asserted-by":"publisher","first-page":"2117","DOI":"10.1109\/TMM.2019.2896516","volume":"21","author":"X Li","year":"2019","unstructured":"Li X, Jiang S (2019) Know more say less: Image captioning based on scene graphs. IEEE Trans Multimed 21(8):2117\u20132130","journal-title":"IEEE Trans Multimed"},{"key":"19410_CR32","doi-asserted-by":"publisher","first-page":"107075","DOI":"10.1016\/j.patcog.2019.107075","volume":"98","author":"J Wang","year":"2020","unstructured":"Wang J, Wang W, Wang L, Wang Z, Feng DD, Tan T (2020) Learning visual relationship and context-aware attention for image captioning. Pattern Recognit 98:107075","journal-title":"Pattern Recognit"},{"key":"19410_CR33","doi-asserted-by":"crossref","unstructured":"Li Y, Pan Y, Yao T, Mei T (2022) Comprehending and ordering semantics for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp 17990\u201317999","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"19410_CR34","unstructured":"Shao Z, Han J, Marnerides D, Debattista K (2022) Region-object relation-aware dense captioning via transformer. IEEE Trans Neural Netw Learn Syst"},{"key":"19410_CR35","doi-asserted-by":"crossref","unstructured":"Shao Z, Han J, Debattista K, Pang Y (2023) Textual context-aware dense captioning with diverse words. IEEE Trans Multimed","DOI":"10.1109\/TMM.2023.3241517"},{"key":"19410_CR36","doi-asserted-by":"crossref","unstructured":"Shao Z, Han J, Debattista K, Pang Y (2024) DCMSTRD: End-to-end Dense Captioning via Multi-Scale Transformer Decoding. IEEE Trans Multimed","DOI":"10.1109\/TMM.2024.3369863"},{"key":"19410_CR37","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X, Pu Y, Tran K, Gao J, et\u00a0al (2017) Semantic compositional networks for visual captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 5630\u20135639","DOI":"10.1109\/CVPR.2017.127"},{"issue":"6","key":"19410_CR38","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149. https:\/\/doi.org\/10.1109\/TPAMI.2016.2577031","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"1","key":"19410_CR39","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1007\/s00530-023-01249-w","volume":"30","author":"MB Hossen","year":"2024","unstructured":"Hossen MB, Ye Z, Abdussalam A, Hossain MI (2024) GVA: guided visual attention approach for automatic image caption generation. Multimed Syst 30(1):50","journal-title":"Multimed Syst"},{"key":"19410_CR40","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D, et\u00a0al (2014) Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer. pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"19410_CR41","doi-asserted-by":"crossref","unstructured":"Plummer BA, Wang L, Cervantes CM, Caicedo JC, Hockenmaier J, Lazebnik S (2015) Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision. pp 2641\u20132649","DOI":"10.1109\/ICCV.2015.303"},{"key":"19410_CR42","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: 2015 IEEE Conference on computer vision and pattern recognition (CVPR). pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"19410_CR43","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics. pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"19410_CR44","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. pp 65\u201372"},{"key":"19410_CR45","unstructured":"Lin CY (2004) Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out. pp 74\u201381"},{"key":"19410_CR46","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"19410_CR47","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer. pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"19410_CR48","doi-asserted-by":"publisher","first-page":"103068","DOI":"10.1016\/j.cviu.2020.103068","volume":"201","author":"H Wei","year":"2020","unstructured":"Wei H, Li Z, Zhang C, Ma H (2020) The synergy of double attention: Combine sentence-level and word-level attention for image captioning. Comput Vis Image Understand 201:103068","journal-title":"Comput Vis Image Understand"},{"key":"19410_CR49","doi-asserted-by":"publisher","first-page":"154953","DOI":"10.1109\/ACCESS.2020.3018752","volume":"8","author":"L Cheng","year":"2020","unstructured":"Cheng L, Wei W, Mao X, Liu Y, Miao C (2020) Stack-VS: Stacked visual-semantic attention for image caption generation. IEEE Access 8:154953\u2013154965","journal-title":"IEEE Access"},{"issue":"4","key":"19410_CR50","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3576927","volume":"19","author":"A Abdussalam","year":"2023","unstructured":"Abdussalam A, Ye Z, Hawbani A, Al-Qatf M, Khan R (2023) NumCap: A Number-controlled Multi-caption Image Captioning Network. ACM Trans Multimed Comput Commun Appl 19(4):1\u201324","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"19410_CR51","doi-asserted-by":"crossref","unstructured":"Qian K, Pan Y, Xu H, Tian L (2023) Transformer model incorporating local graph semantic attention for image caption. Vis Comput 1\u201312","DOI":"10.1007\/s00371-023-03180-7"},{"issue":"2","key":"19410_CR52","doi-asserted-by":"publisher","first-page":"890","DOI":"10.1109\/TCYB.2022.3156367","volume":"54","author":"Y Yang","year":"2024","unstructured":"Yang Y, Wei H, Zhu H, Yu D, Xiong H, Yang J (2024) Exploiting Cross-Modal Prediction and Relation Consistency for Semisupervised Image Captioning. IEEE Trans Cybern 54(2):890\u2013902. https:\/\/doi.org\/10.1109\/TCYB.2022.3156367","journal-title":"IEEE Trans Cybern"},{"issue":"2","key":"19410_CR53","doi-asserted-by":"publisher","first-page":"3447","DOI":"10.3233\/JIFS-233004","volume":"46","author":"M Al-Qatf","year":"2024","unstructured":"Al-Qatf M, Hawbani A, Wang X, Abdusallam A, Alsamhi S, Alhabib M et al (2024) RVAIC: Refined visual attention for improved image captioning. J Intell Fuzzy Syst 46(2):3447\u20133459","journal-title":"J Intell Fuzzy Syst"},{"key":"19410_CR54","doi-asserted-by":"crossref","unstructured":"Wang C, Gu X (2022) Image captioning with adaptive incremental global context attention. Appl Intell pp 1\u201323","DOI":"10.1007\/s10489-021-02734-3"},{"key":"19410_CR55","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1016\/j.neucom.2018.12.026","volume":"333","author":"YH Tan","year":"2019","unstructured":"Tan YH, Chan CS (2019) Phrase-based image caption generator with hierarchical LSTM network. Neurocomputing 333:86\u2013100","journal-title":"Neurocomputing"},{"key":"19410_CR56","doi-asserted-by":"publisher","first-page":"587","DOI":"10.1007\/978-3-030-20876-9_37","volume-title":"Computer Vision - ACCV 2018","author":"W Wang","year":"2019","unstructured":"Wang W, Chen Z, Hu H (2019) Multivariate Attention Network for Image Captioning. In: Jawahar CV, Li H, Mori G, Schindler K (eds) Computer Vision - ACCV 2018. Springer International Publishing, Cham, pp 587\u2013602"},{"key":"19410_CR57","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1016\/j.patrec.2020.12.020","volume":"143","author":"Y Zhang","year":"2021","unstructured":"Zhang Y, Shi X, Mi S, Yang X (2021) Image captioning with transformer and knowledge graph. Pattern Recognit Lett 143:43\u201349","journal-title":"Pattern Recognit Lett"},{"key":"19410_CR58","doi-asserted-by":"publisher","first-page":"111433","DOI":"10.1016\/j.knosys.2024.111433","volume":"287","author":"C Cai","year":"2024","unstructured":"Cai C, Wang S, Yap KH, Wang Y (2024) Top-down framework for weakly-supervised grounded image captioning. Knowl-Based Syst 287:111433","journal-title":"Knowl-Based Syst"},{"issue":"4","key":"19410_CR59","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3460474","volume":"17","author":"W Jiang","year":"2021","unstructured":"Jiang W, Wang W, Hu H (2021) Bi-directional co-attention network for image captioning. ACM Trans Multimed Comput Commun Appl (TOMM) 17(4):1\u201320","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"issue":"7","key":"19410_CR60","doi-asserted-by":"publisher","first-page":"4417","DOI":"10.1109\/TCSVT.2021.3121062","volume":"32","author":"Y Wang","year":"2022","unstructured":"Wang Y, Xu N, Liu AA, Li W, Zhang Y (2022) High-Order Interaction Learning for Image Captioning. IEEE Trans Circ Syst Vid Technol 32(7):4417\u20134430. https:\/\/doi.org\/10.1109\/TCSVT.2021.3121062","journal-title":"IEEE Trans Circ Syst Vid Technol"},{"key":"19410_CR61","doi-asserted-by":"publisher","first-page":"102238","DOI":"10.1016\/j.displa.2022.102238","volume":"73","author":"W Jiang","year":"2022","unstructured":"Jiang W, Li Q, Zhan K, Fang Y, Shen F (2022) Hybrid attention network for image captioning. Displays. 73:102238","journal-title":"Displays."},{"issue":"3","key":"19410_CR62","doi-asserted-by":"publisher","first-page":"103288","DOI":"10.1016\/j.ipm.2023.103288","volume":"60","author":"C Wang","year":"2023","unstructured":"Wang C, Gu X (2023) Learning Double-Level Relationship Networks for image captioning. Inf Process Manag 60(3):103288","journal-title":"Inf Process Manag"},{"issue":"1","key":"19410_CR63","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1186\/s40537-023-00693-9","volume":"10","author":"R Sasibhooshan","year":"2023","unstructured":"Sasibhooshan R, Kumaraswamy S, Sasidharan S (2023) Image caption generation using visual attention prediction and contextual spatial relation extraction. J Big Data 10(1):18","journal-title":"J Big Data"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-19410-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-19410-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-19410-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,2]],"date-time":"2025-05-02T03:27:52Z","timestamp":1746156472000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-19410-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,27]]},"references-count":63,"journal-issue":{"issue":"13","published-online":{"date-parts":[[2025,4]]}},"alternative-id":["19410"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-19410-6","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,5,27]]},"assertion":[{"value":"6 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 April 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 May 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We declare that we have no known conflicts of interest associated with this publication.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}}]}}