{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:15:54Z","timestamp":1771956954289,"version":"3.50.1"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai","doi-asserted-by":"publisher","award":["22ZR1418400"],"award-info":[{"award-number":["22ZR1418400"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s10489-022-04202-y","type":"journal-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T12:26:59Z","timestamp":1665404819000},"page":"13398-13414","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Multi-feature fusion enhanced transformer with multi-layer fused decoding for image captioning"],"prefix":"10.1007","volume":"53","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6270-7771","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongjun","family":"Fang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhe","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"4202_CR1","doi-asserted-by":"crossref","unstructured":"Khan MA, Muhammad K, Sharif M, Akram T, Kadry S (2021) Intelligent fusion-assisted skin lesion localization and classification for smart healthcare. Neural Computing and Applications, 1\u201316","DOI":"10.1007\/s00521-021-06490-w"},{"key":"4202_CR2","doi-asserted-by":"publisher","first-page":"7941","DOI":"10.3390\/s21237941","volume":"21","author":"S Khan","year":"2021","unstructured":"Khan S, Khan MA, Alhaisoni M, Tariq U, Yong H-S, Armghan A, Alenezi F (2021) Human action recognition: a paradigm of best deep learning features selection and serial based extended fusion. Sensors 21:7941","journal-title":"Sensors"},{"key":"4202_CR3","doi-asserted-by":"crossref","unstructured":"Lu J, Yang J, Batra D, Parikh D (2018) Neural baby talk. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7219\u20137228","DOI":"10.1109\/CVPR.2018.00754"},{"key":"4202_CR4","unstructured":"Mitchell M, Dodge J, Goyal A, Yamaguchi K, Stratos K, Han X, Mensch A, Berg A, Berg T, Daum\u00e9 III H (2012) Midge: generating image descriptions from computer vision detections. In: Proceedings of the 13th conference of the european chapter of the association for computational linguistics, pp 747\u2013756"},{"key":"4202_CR5","doi-asserted-by":"crossref","unstructured":"Devlin J, Cheng H, Fang H, Gupta S, Deng L, He X, Zweig G, Mitchell M (2015) Language models for image captioning: the quirks and what works. arXiv:1505.01809","DOI":"10.3115\/v1\/P15-2017"},{"key":"4202_CR6","doi-asserted-by":"crossref","unstructured":"Wang C, Yang H, Bartz C, Meinel C (2016) Image captioning with deep bidirectional LSTMs. In: Proceedings of the 24th ACM international conference on multimedia, pp 988\u2013997","DOI":"10.1145\/2964284.2964299"},{"key":"4202_CR7","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"4202_CR8","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"4202_CR9","doi-asserted-by":"crossref","unstructured":"Li G, Zhu L, Liu P, Yang Y (2019) Entangled transformer for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 8928\u20138937","DOI":"10.1109\/ICCV.2019.00902"},{"key":"4202_CR10","unstructured":"Ashraf AH, Imran M, Qahtani AM, Alsufyani A, Almutiry O, Mahmood A, Habib M (2021) Weapons Detection for Security and Video Surveillance Using CNN and YOLO-v5s"},{"key":"4202_CR11","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. arXiv:1706.03762"},{"key":"4202_CR12","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2020) An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv:2010.11929 [cs]"},{"key":"4202_CR13","doi-asserted-by":"publisher","first-page":"7286","DOI":"10.3390\/s21217286","volume":"21","author":"MA Khan","year":"2021","unstructured":"Khan MA, Alhaisoni M, Tariq U, Hussain N, Majid A, Dama\u0161evi\u010dius R, Maskeli\u016bnas R (2021) COVID-19 case recognition from chest CT images by deep learning, Entropy-controlled Firefly Optimization, and Parallel Feature Fusion. Sensors 21:7286","journal-title":"Sensors"},{"key":"4202_CR14","doi-asserted-by":"crossref","unstructured":"Saeed F, Khan MA, Sharif M, Mittal M, Goyal LM, Roy S (2021) Deep neural network features fusion and selection based on PLS regression with an application for crops diseases classification, vol 103","DOI":"10.1016\/j.asoc.2021.107164"},{"key":"4202_CR15","doi-asserted-by":"publisher","first-page":"434","DOI":"10.3390\/s22020434","volume":"22","author":"M Nawaz","year":"2022","unstructured":"Nawaz M, Nazir T, Javed A, Tariq U, Yong H-S, Khan MA, Cha J (2022) An efficient deep learning approach to automatic glaucoma detection using optic disc and optic cup localization. Sensors 22:434","journal-title":"Sensors"},{"key":"4202_CR16","doi-asserted-by":"publisher","unstructured":"Cho K, van Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN Encoder-Decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1724\u20131734. https:\/\/doi.org\/10.3115\/v1\/D14-1179","DOI":"10.3115\/v1\/D14-1179"},{"key":"4202_CR17","unstructured":"Bahdanau D, Cho K, Bengio Y (2014) Neural machine translation by jointly learning to align and translate. arXiv:1409.0473"},{"key":"4202_CR18","doi-asserted-by":"crossref","unstructured":"Johnson J, Karpathy A, Fei-Fei L (2016) Densecap: fully convolutional localization networks for dense captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4565\u20134574","DOI":"10.1109\/CVPR.2016.494"},{"key":"4202_CR19","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"4202_CR20","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2016","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2016) Show and tell: lessons learned from the 2015 mscoco image captioning challenge. IEEE Trans Pattern Anal Mach Intell 39:652\u2013663","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"4202_CR21","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: neural image caption generation with visual attention. In: International conference on machine learning, pp 2048\u20132057"},{"key":"4202_CR22","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"4202_CR23","doi-asserted-by":"publisher","first-page":"104146","DOI":"10.1016\/j.imavis.2021.104146","volume":"109","author":"Z Zhang","year":"2021","unstructured":"Zhang Z, Wu Q, Wang Y, Chen F (2021) Exploring region relationships implicitly: image captioning with visual relationship attention. Image and Vision Computing 109:104146","journal-title":"Image and Vision Computing"},{"key":"4202_CR24","doi-asserted-by":"crossref","unstructured":"Guo Y, Liu Y, De Boer MH, Liu L, Lew MS (2018) A dual prediction network for image captioning. In: 2018 IEEE international conference on multimedia and expo (ICME), pp 1\u20136","DOI":"10.1109\/ICME.2018.8486491"},{"key":"4202_CR25","doi-asserted-by":"crossref","unstructured":"Zhong X, Nie G, Huang W, Liu W, Ma B, Lin C-W (2021) Attention-guided image captioning with adaptive global and local feature fusion, vol 78","DOI":"10.1016\/j.jvcir.2021.103138"},{"key":"4202_CR26","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"4202_CR27","doi-asserted-by":"crossref","unstructured":"Gu J, Cai J, Wang G, Chen T (2018) Stack-captioning: coarse-to-fine learning for image captioning. In: Proceedings of the AAAI conference on artificial intelligence 32","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"4202_CR28","unstructured":"Xu N, Zhang H, Liu A. -A., Nie W, Su Y, Nie J, Zhang Y (2019) Multi-level policy and reward-based deep reinforcement learning framework for image captioning. IEEE Transactions on Multimedia, 1\u20131."},{"key":"4202_CR29","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1016\/j.neucom.2019.12.073","volume":"387","author":"Y Wei","year":"2020","unstructured":"Wei Y, Wang L, Cao H, Shao M, Wu C (2020) Multi-attention generative adversarial network for image captioning. Neurocomputing 387:91\u201399","journal-title":"Neurocomputing"},{"key":"4202_CR30","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J (2016) Image captioning with semantic attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4651\u20134659","DOI":"10.1109\/CVPR.2016.503"},{"key":"4202_CR31","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X, Pu Y, Tran K, Gao J, Carin L, Deng L (2017) Semantic compositional networks for visual captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 5630\u20135639","DOI":"10.1109\/CVPR.2017.127"},{"key":"4202_CR32","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T (2017) Boosting image captioning with attributes. In: Proceedings of the IEEE international conference on computer vision, pp 4894\u20134902","DOI":"10.1109\/ICCV.2017.524"},{"key":"4202_CR33","doi-asserted-by":"crossref","unstructured":"Li N, Chen Z (2018) Image Cationing with Visual-Semantic LSTM. In: IJCAI, pp 793\u2013799","DOI":"10.24963\/ijcai.2018\/110"},{"key":"4202_CR34","doi-asserted-by":"crossref","unstructured":"Wu Q, Shen C, Liu L, Dick A, Van Den Hengel A (2016) What value do explicit high level concepts have in vision to language problems?. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 203\u2013212","DOI":"10.1109\/CVPR.2016.29"},{"key":"4202_CR35","doi-asserted-by":"crossref","unstructured":"Fang H, Gupta S, Iandola F, Srivastava RK, Deng L, Doll\u00e1r P, Gao J, He X, Mitchell M, Platt JC (2015) From captions to visual concepts and back. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1473\u20131482","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"4202_CR36","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3431\u20133440","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"4202_CR37","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren S, He K, Girshick R, Sun J, Faster R -CNN (2016) Towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39:1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"4202_CR38","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, Cai J (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10685\u201310694","DOI":"10.1109\/CVPR.2019.01094"},{"key":"4202_CR39","doi-asserted-by":"crossref","unstructured":"Guo L, Liu J, Tang J, Li J, Luo W, Lu H (2019) Aligning linguistic words and visual semantic units for image captioning. In: Proceedings of the 27th ACM international conference on multimedia, pp 765\u2013773","DOI":"10.1145\/3343031.3350943"},{"key":"4202_CR40","doi-asserted-by":"publisher","first-page":"103044","DOI":"10.1016\/j.jvcir.2021.103044","volume":"75","author":"J Zhang","year":"2021","unstructured":"Zhang J, Li K, Wang Z (2021) Parallel-fusion LSTM with synchronous semantic and visual information for image captioning. Journal of Visual Communication and Image Representation 75:103044","journal-title":"Journal of Visual Communication and Image Representation"},{"key":"4202_CR41","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R (2018) Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp 2556\u20132565","DOI":"10.18653\/v1\/P18-1238"},{"key":"4202_CR42","unstructured":"Herdade S, Kappeler A, Boakye K, Soares J (2019) Image captioning: transforming objects into words. Advances in Neural Information Processing Systems 32"},{"key":"4202_CR43","doi-asserted-by":"crossref","unstructured":"He S, Liao W, Tavakoli HR, Yang M, Rosenhahn B, Pugeault N (2020) Image captioning through image transformer. In: Proceedings of the Asian conference on computer vision","DOI":"10.1007\/978-3-030-69538-5_10"},{"key":"4202_CR44","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei X-Y (2019) Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"4202_CR45","doi-asserted-by":"publisher","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. In: CVPR, pp 10575\u201310584. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01059","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"4202_CR46","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv:1607.06450"},{"key":"4202_CR47","doi-asserted-by":"crossref","unstructured":"Zhou B, Khosla A, Lapedriza A, Oliva A, Torralba A (2016) Learning deep features for discriminative localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2921\u20132929","DOI":"10.1109\/CVPR.2016.319"},{"key":"4202_CR48","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: common objects in context. In: European conference on computer vision, pp 740\u2013 755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4202_CR49","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"4202_CR50","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L-J, Shamma DA, Bernstein MS, Fei-Fei L (2017) Visual genome: Connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis 123:32\u201373","journal-title":"Int J Comput Vis"},{"key":"4202_CR51","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: 2016 IEEE conference on computer vision and pattern recognition (CVPR), pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"4202_CR52","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the association for computational linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"4202_CR53","unstructured":"Banerjee S, Lavie A (2005) METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the Acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"4202_CR54","unstructured":"Lin C-Y (2004) Rouge: a package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381"},{"key":"4202_CR55","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"4202_CR56","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: semantic propositional image caption evaluation. In: European conference on computer vision, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"4202_CR57","unstructured":"Kingma DP, Ba J (2014) Adam: a method for stochastic optimization. arXiv:1412.6980"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-04202-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-022-04202-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-022-04202-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,5]],"date-time":"2024-10-05T13:41:14Z","timestamp":1728135674000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-022-04202-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":57,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["4202"],"URL":"https:\/\/doi.org\/10.1007\/s10489-022-04202-y","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"22 September 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 October 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}