{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T15:35:18Z","timestamp":1772120118879,"version":"3.50.1"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2023,2,23]],"date-time":"2023-02-23T00:00:00Z","timestamp":1677110400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,2,23]],"date-time":"2023-02-23T00:00:00Z","timestamp":1677110400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172118"],"award-info":[{"award-number":["62172118"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61876049"],"award-info":[{"award-number":["61876049"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61902086"],"award-info":[{"award-number":["61902086"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Nature Science key Foundation of Guangxi","award":["2021GXNSFDA196002"],"award-info":[{"award-number":["2021GXNSFDA196002"]}]},{"DOI":"10.13039\/501100020739","name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing","doi-asserted-by":"publisher","award":["GIIP2006"],"award-info":[{"award-number":["GIIP2006"]}],"id":[{"id":"10.13039\/501100020739","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020739","name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing","doi-asserted-by":"publisher","award":["GIIP2007"],"award-info":[{"award-number":["GIIP2007"]}],"id":[{"id":"10.13039\/501100020739","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020739","name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing","doi-asserted-by":"publisher","award":["GIIP2008"],"award-info":[{"award-number":["GIIP2008"]}],"id":[{"id":"10.13039\/501100020739","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Innovation Project of Guangxi Graduate Education","award":["YCB2021070"],"award-info":[{"award-number":["YCB2021070"]}]},{"name":"Innovation Project of Guangxi Graduate Education","award":["YCBZ2018052"],"award-info":[{"award-number":["YCBZ2018052"]}]},{"name":"Innovation Project of Guangxi Graduate Education","award":["YCSW2022269"],"award-info":[{"award-number":["YCSW2022269"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s13735-023-00266-9","type":"journal-article","created":{"date-parts":[[2023,2,23]],"date-time":"2023-02-23T11:03:05Z","timestamp":1677150185000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["LG-MLFormer: local and global MLP for image 
captioning"],"prefix":"10.1007","volume":"12","author":[{"given":"Zetao","family":"Jiang","sequence":"first","affiliation":[]},{"given":"Xiuxian","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zhongyi","family":"Zhai","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Cheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,2,23]]},"reference":[{"issue":"4","key":"266_CR1","first-page":"652","volume":"39","author":"V Oriol","year":"2016","unstructured":"Oriol V, Alexander T, Samy B, Dumitru E (2016) Show and tell: lessons learned from the 2015 mscoco image captioning challenge. IEEE Trans Pattern Anal Mach Intell 39(4):652\u2013663","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"266_CR2","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 375-383","DOI":"10.1109\/CVPR.2017.345"},{"key":"266_CR3","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and vqa. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077-6086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"266_CR4","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: a neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"266_CR5","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X et al. (2017) Semantic compositional networks for visual captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.127"},{"key":"266_CR6","doi-asserted-by":"crossref","unstructured":"Chen L, Zhang H, Xiao J, Nie L, Shao J, Liu W, Chua T-S (2017) Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.667"},{"key":"266_CR7","unstructured":"Herdade S, Kappeler A, Boakye K, Soares J (2019) Image captioning: Transforming objects into words. In: Advances in neural information processing systems. pp 11137\u201311147"},{"key":"266_CR8","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 10578-10587","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"266_CR9","doi-asserted-by":"crossref","unstructured":"Ji J, Luo Y, Sun X, Chen F, Luo G, Wu Y, Gao Y, Ji R (2021) Improving image captioning by leveraging intra- and inter-layer global representation in transformer network. In: Proceedings of the AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"266_CR10","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y, Mei T (2020) X-linear attention networks for image captioning. 
{"key":"266_CR11","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. In: Advances in neural information processing systems, pp 5998\u20136008"},{"key":"266_CR12","doi-asserted-by":"crossref","unstructured":"Zhang X, Sun X, Luo Y, Ji J, Zhou Y, Wu Y, Huang F, Ji R (2021) RSTNet: captioning with adaptive attention on visual and non-visual words. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 15465\u201315474","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"266_CR13","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian T, Li Z, Zhang C, Ma H (2022) Dual Global Enhanced Transformer for image captioning. Neural Netw 148:129\u2013141","journal-title":"Neural Netw"},{"key":"266_CR14","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-020-3523-6","author":"J Wang","year":"2022","unstructured":"Wang J, Li Y, Pan Y et al (2022) Contextual and selective attention networks for image captioning. Sci China Inf Sci. https:\/\/doi.org\/10.1007\/s11432-020-3523-6","journal-title":"Sci China Inf Sci"},{"key":"266_CR15","doi-asserted-by":"crossref","unstructured":"Luo Y, Ji J, Sun X, Cao L, Wu Y, Huang F, Lin C-W, Ji R (2021) Dual-level collaborative transformer for image captioning. arXiv preprint arXiv:2101.06462","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"266_CR16","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft COCO: common objects in context. In: European conference on computer vision, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"266_CR17","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"266_CR18","unstructured":"Tolstikhin I, Houlsby N, Kolesnikov A et al. (2021) MLP-Mixer: an all-MLP architecture for vision. arXiv preprint arXiv:2105.01601"},{"key":"266_CR19","doi-asserted-by":"crossref","unstructured":"Kolesnikov A, Beyer L, Zhai X et al. (2019) Big transfer (BiT): general visual representation learning. arXiv preprint arXiv:1912.11370","DOI":"10.1007\/978-3-030-58558-7_29"},{"key":"266_CR20","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A et al. (2021) An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"266_CR21","unstructured":"Brock A, De S, Smith SL, Simonyan K (2021) High-performance large-scale image recognition without normalization. arXiv preprint arXiv:2102.06171"},{"key":"266_CR22","doi-asserted-by":"crossref","unstructured":"Vaswani A, Ramachandran P, Srinivas A, Parmar N, Hechtman B, Shlens J (2021) Scaling local self-attention for parameter efficient visual backbones. arXiv preprint arXiv:2103.12731","DOI":"10.1109\/CVPR46437.2021.01270"},{"key":"266_CR23","doi-asserted-by":"crossref","unstructured":"Guo M-H, Liu Z-N, Mu T-J, Hu S-M (2021) Beyond self-attention: external attention using two linear layers for visual tasks. arXiv preprint arXiv:2105.02358","DOI":"10.1109\/TPAMI.2022.3211006"},{"key":"266_CR24","unstructured":"Liu H, Dai Z, So DR, Le QV (2021) Pay attention to MLPs. arXiv preprint arXiv:2105.08050"},{"key":"266_CR25","unstructured":"Ding X, Xia C, Zhang X, Chu X, Han J, Ding G (2021) RepMLP: re-parameterizing convolutions into fully-connected layers for image recognition. arXiv preprint arXiv:2105.01883"},{"key":"266_CR26","doi-asserted-by":"crossref","unstructured":"Chen X, Ma L, Jiang W, Yao J, Liu W (2018) Regularizing RNNs for caption generation by reconstructing the past with the present. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2018.00834"},{"key":"266_CR27","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: a neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"266_CR28","unstructured":"Mao J, Xu W, Yang Y, Wang J, Yuille AL (2014) Explain images with multimodal recurrent neural networks. arXiv preprint arXiv:1410.1090"},{"key":"266_CR29","doi-asserted-by":"crossref","unstructured":"Cornia M, Baraldi L, Cucchiara R (2019) Show, control and tell: a framework for generating controllable and grounded captions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8307\u20138316","DOI":"10.1109\/CVPR.2019.00850"},{"issue":"12","key":"266_CR30","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni G et al (2013) Babytalk: understanding and generating simple image descriptions. IEEE Trans Pattern Anal Mach Intell 35(12):2891\u20132903","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"266_CR31","doi-asserted-by":"crossref","unstructured":"Ushiku Y, Yamaguchi M, Mukuta Y et al. (2015) Common subspace for model and similarity: phrase learning for caption generation from images. In: Proceedings of the IEEE international conference on computer vision, pp 2668\u20132676","DOI":"10.1109\/ICCV.2015.306"},{"key":"266_CR32","unstructured":"Mitchell M et al. (2012) Midge: generating image descriptions from computer vision detections. In: Proceedings of the 13th conference of the European chapter of the association for computational linguistics, pp 747\u2013756"},{"key":"266_CR33","doi-asserted-by":"crossref","unstructured":"Jiang H et al. (2020) In defense of grid features for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 10267\u201310276","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"266_CR34","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.131"},{"key":"266_CR35","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics","DOI":"10.3115\/1073083.1073135"},{"key":"266_CR36","unstructured":"Banerjee S, Lavie A (2005) METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization"},
{"key":"266_CR37","unstructured":"Lin C-Y (2004) ROUGE: a package for automatic evaluation of summaries. In: Text summarization branches out: proceedings of the ACL workshop, vol 8"},{"key":"266_CR38","doi-asserted-by":"crossref","unstructured":"Vedantam R, Zitnick CL, Parikh D (2015) CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"266_CR39","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) SPICE: semantic propositional image caption evaluation. In: Proceedings of the European conference on computer vision","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"266_CR40","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Jiang Y-G, Liu W, Zhang T (2018) Recurrent fusion network for image captioning. In: Proceedings of the European conference on computer vision, pp 499\u2013515","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"266_CR41","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision, pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"266_CR42","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, Cai J (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 10685\u201310694","DOI":"10.1109\/CVPR.2019.01094"},{"key":"266_CR43","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei X-Y (2019) Attention on attention for image captioning. In: Proceedings of the international conference on computer vision","DOI":"10.1109\/ICCV.2019.00473"},{"key":"266_CR44","doi-asserted-by":"crossref","unstructured":"Li G et al. (2019) Entangled transformer for image captioning. In: Proceedings of the international conference on computer vision, pp 8928\u20138937","DOI":"10.1109\/ICCV.2019.00902"},{"key":"266_CR45","doi-asserted-by":"crossref","unstructured":"Barraco M, Stefanini M, Cornia M, Cascianelli S, Baraldi L, Cucchiara R (2022) CaMEL: mean teacher learning for image captioning. In: International conference on pattern recognition","DOI":"10.1109\/ICPR56361.2022.9955644"},{"key":"266_CR46","unstructured":"Li J, Li D, Xiong C, Hoi S (2022) BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. arXiv preprint arXiv:2201.12086"},{"key":"266_CR47","doi-asserted-by":"crossref","unstructured":"Hu X, Gan Z, Wang J, Yang Z, Liu Z, Lu Y, Wang L (2022) Scaling up vision-language pre-training for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 17980\u201317989","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"266_CR48","doi-asserted-by":"crossref","unstructured":"Kuo C-W, Kira Z (2022) Beyond a pre-trained object detector: cross-modal textual and visual context for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"266_CR49","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning"},{"issue":"8","key":"266_CR50","first-page":"3118","volume":"31","author":"W Lingxiang","year":"2020","unstructured":"Lingxiang W, Min X, Lei S, Ting Y, Mei T (2020) Noise augmented double-stream graph convolutional networks for image captioning. IEEE Trans Circuits Syst Video Technol 31(8):3118\u20133127","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"issue":"11","key":"266_CR51","doi-asserted-by":"publisher","first-page":"7706","DOI":"10.1109\/TCSVT.2022.3181490","volume":"32","author":"W Jiang","year":"2022","unstructured":"Jiang W, Zhou W, Hu H (2022) Double-stream position learning transformer network for image captioning. IEEE Trans Circuits Syst Video Technol 32(11):7706\u20137718. https:\/\/doi.org\/10.1109\/TCSVT.2022.3181490","journal-title":"IEEE Trans Circuits Syst Video Technol"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00266-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-023-00266-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00266-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,6,14]],"date-time":"2023-06-14T11:24:24Z","timestamp":1686741864000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-023-00266-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,2,23]]},"references-count":51,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["266"],"URL":"https:\/\/doi.org\/10.1007\/s13735-023-00266-9","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-2148019\/v1","asserted-by":"object"}]},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,2,23]]},"assertion":[{"value":"9 October 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 January 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 January 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 February 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors disclosed no relevant relationships.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}],"article-number":"4"}}