{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T21:30:47Z","timestamp":1774128647710,"version":"3.50.1"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2024,5,17]],"date-time":"2024-05-17T00:00:00Z","timestamp":1715904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,5,17]],"date-time":"2024-05-17T00:00:00Z","timestamp":1715904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Science Foundation of China","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100018617","name":"Liaoning Revitalization Talents Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100018617","id-type":"DOI","asserted-by":"publisher"}]},{"name":"The Scientific Research Project of Liaoning Province\uff08"},{"name":"Key R&D projects of Liaoning Provincial Department of Science and Technology"},{"name":"Liaoning Provincial Key Laboratory Special Fund"},{"DOI":"10.13039\/100019455","name":"\u0110\u1ea1i h\u1ecdc Kinh t\u1ebf Th\u00e0nh ph\u1ed1 H\u1ed3 Ch\u00ed Minh","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100019455","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-19315-4","type":"journal-article","created":{"date-parts":[[2024,5,17]],"date-time":"2024-05-17T04:01:50Z","timestamp":1715918510000},"page":"10645-10664","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Dual visual align-cross attention-based image captioning transformer"],"prefix":"10.1007","volume":"84","author":[{"given":"Yonggong","family":"Ren","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinghan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenqiang","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuzhu","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Fu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2025-8319","authenticated-orcid":false,"given":"Dang N. H.","family":"Thanh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,5,17]]},"reference":[{"key":"19315_CR1","doi-asserted-by":"crossref","unstructured":"Zhou L, Palangi H, Zhang L, Corso J, Gao J (2020) Unified vision-language pre-training for image captioning and vqa. Proc AAAI Confer Artif Intell 34(07):13041\u201313049","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"19315_CR2","doi-asserted-by":"crossref","unstructured":"Hu X, Gan Z, Wang J, Yang Z, Liu Z, Lu Y, Wang L (2022) Scaling up vision-language pre-training for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. IEEE, pp 17980\u201317989","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"19315_CR3","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I\u00a0(2017) Attention is all you need. In: Advances in neural information processing systems, p 30"},{"key":"19315_CR4","unstructured":"Devlin J, Chang MW, Lee K, Toutanova K (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:181004805"},{"key":"19315_CR5","doi-asserted-by":"crossref","unstructured":"Luo Y, Ji J, Sun X, Cao L, Wu Y, Huang F,\u00a0Lin C-W,\u00a0Ji R (2021) Dual-level collaborative transformer for image captioning. Proc AAAI Confer Artif Intell 35(3):2286\u20132293","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"19315_CR6","first-page":"15465","volume":"2021","author":"X Zhang","year":"2021","unstructured":"Zhang X, Sun X, Luo Y, Ji J, Zhou Y, Wu Y, Huang F, Ji R (2021) Rstnet: Captioning with adaptive attention on visual and non-visual words. Proc IEEE Conf Comput Vis Pattern Recognit 2021:15465\u201315474","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"19315_CR7","first-page":"10578","volume":"2020","author":"M Cornia","year":"2020","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. Proc IEEE Conf Comput Vis Pattern Recognit 2020:10578\u201310587","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"19315_CR8","first-page":"6077","volume":"2018","author":"P Anderson","year":"2018","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. Proc IEEE Int Conf Comput Vis Pattern Recog 2018:6077\u20136086","journal-title":"Proc IEEE Int Conf Comput Vis Pattern Recog"},{"key":"19315_CR9","first-page":"375","volume":"2017","author":"J Lu","year":"2017","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. Proc IEEE Int Conf Comput Vis Pattern Recog 2017:375\u2013383","journal-title":"Proc IEEE Int Conf Comput Vis Pattern Recog"},{"key":"19315_CR10","first-page":"7008","volume":"2017","author":"SJ Rennie","year":"2017","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. Proc IEEE Int Conf Comput Vis Pattern Recog 2017:7008\u20137024","journal-title":"Proc IEEE Int Conf Comput Vis Pattern Recog"},{"key":"19315_CR11","first-page":"2048","volume":"2015","author":"K Xu","year":"2015","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: Neural image caption generation with visual attention. Int Conf Mach Learning, PMLR 2015:2048\u20132057","journal-title":"Int Conf Mach Learning, PMLR"},{"key":"19315_CR12","unstructured":"Zhu X, Su W, Lu L, Li B, Wang X, Dai J (2020) Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159"},{"key":"19315_CR13","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian T, Li Z, Zhang C, Ma H (2022) Dual Global Enhanced Transformer for image captioning. Neural Netw 148:129\u2013141. https:\/\/doi.org\/10.1016\/j.neunet.2022.01.011","journal-title":"Neural Netw"},{"key":"19315_CR14","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnic CL (2014) Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13. Springer International Publishing, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"19315_CR15","first-page":"3128","volume":"2015","author":"A Karpathy","year":"2015","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. Proc IEEE Conf Comput Vis Pattern Recognit 2015:3128\u20133137","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"19315_CR16","first-page":"3156","volume":"2015","author":"O Vinyals","year":"2015","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: A neural image caption generator. Proc IEEE Conf Comput Vis Pattern Recognit 2015:3156\u20133164","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"19315_CR17","doi-asserted-by":"crossref","unstructured":"Ji J, Ma Y, Sun X, Zhou Y, Wu Y, Ji R (2022) Knowing what to learn: a metric-oriented focal mechanism for image captioning. IEEE Trans Image Process 31:4321\u20134335","DOI":"10.1109\/TIP.2022.3183434"},{"key":"19315_CR18","doi-asserted-by":"crossref","unstructured":"Ma Y, Ji J, Sun X, Zhou Y, Wu Y, Huang F, Ji R (2022) Knowing what it is: semantic-enhanced dual attention transformer. IEEE Trans Multimed 3723\u20133736","DOI":"10.1109\/TMM.2022.3164787"},{"key":"19315_CR19","first-page":"4894","volume":"2017","author":"T Yao","year":"2017","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T (2017) Boosting image captioning with attributes. Proc IEEE Int Conf Comput Vis 2017:4894\u20134902","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"19315_CR20","doi-asserted-by":"crossref","unstructured":"Zohourianshahzadi Z, Kalita JK (2022) Neural attention for image captioning: review of outstanding methods. Artif Intell Rev 55(5):3833\u20133862","DOI":"10.1007\/s10462-021-10092-2"},{"key":"19315_CR21","unstructured":"Herdade S, Kappeler A, Boakye K, Soares J (2019) Image captioning: transforming objects into words. Adv Neural Inf Proces Syst 32"},{"issue":"6","key":"19315_CR22","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren S, He K, Girshick R, Sun J (2016) Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"8","key":"19315_CR23","doi-asserted-by":"publisher","first-page":"3118","DOI":"10.1109\/TCSVT.2020.3036860","volume":"31","author":"L Wu","year":"2021","unstructured":"Wu L, Xu M, Sang L, Yao T, Mei T (2021) Noise augmented double-stream graph convolutional networks for image captioning. IEEE Trans Circuits Syst Video Technol 31(8):3118\u20133127","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"19315_CR24","first-page":"10971","volume":"2020","author":"Y Pan","year":"2020","unstructured":"Pan Y, Yao T, Li Y, Mei T (2020) X-linear attention networks for image captioning. Proc IEEE Conf Comput Vis Pattern Recognit 2020:10971\u201310980","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"19315_CR25","doi-asserted-by":"publisher","first-page":"117174","DOI":"10.1016\/j.eswa.2022.117174","volume":"201","author":"C Wang","year":"2022","unstructured":"Wang C, Shen Y, Ji L (2022) Geometry attention transformer with position-aware LSTMs for image captioning. Expert Syst Appl 201:117174. https:\/\/doi.org\/10.1016\/j.eswa.2022.117174","journal-title":"Expert Syst Appl"},{"key":"19315_CR26","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.neucom.2021.10.014","volume":"468","author":"Y Wang","year":"2022","unstructured":"Wang Y, Xu J, Sun Y (2022) A visual persistence model for image captioning. Neurocomputing 468:48\u201359. https:\/\/doi.org\/10.1016\/j.neucom.2021.10.014","journal-title":"Neurocomputing"},{"key":"19315_CR27","doi-asserted-by":"crossref","unstructured":"Zhang Z, Wu Q, Wang Y, Chen F (2021) Exploring pairwise relationships adaptively from linguistic context in image captioning. IEEE Trans Multimed 24:3101\u20133113","DOI":"10.1109\/TMM.2021.3093725"},{"key":"19315_CR28","doi-asserted-by":"crossref","unstructured":"Yang X, Liu Y, Wang X (2022) Reformer: the relational transformer for image captioning. In: Proceedings of the 30th ACM International Conference on Multimedia. ACM, pp 5398\u20135406","DOI":"10.1145\/3503161.3548409"},{"key":"19315_CR29","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei XY (2019) Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision. IEEE, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"19315_CR30","doi-asserted-by":"publisher","unstructured":"Shao Z, Han J, Marnerides D, Debattista K (2022) Region-object relation-aware dense captioning via transformer. IEEE\u00a0Trans Neural Netw Learn Syst. https:\/\/doi.org\/10.1109\/TNNLS.2022.3152990","DOI":"10.1109\/TNNLS.2022.3152990"},{"key":"19315_CR31","doi-asserted-by":"publisher","first-page":"8753","DOI":"10.1109\/TMM.2023.3241517","volume":"25","author":"Z Shao","year":"2023","unstructured":"Shao Z, Han J, Debattista K, Pang Y (2023) Textual Context-Aware Dense Captioning With Diverse Words. IEEE Trans Multimedia 25:8753\u20138766.","journal-title":"IEEE Trans Multimedia"},{"key":"19315_CR32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3367416","author":"C Chen","year":"2024","unstructured":"Chen C, Han J, Debattista K (2024) Virtual Category Learning: A Semi-Supervised Learning Method for Dense Prediction with Extremely Limited Labels. IEEE Trans Pattern Anal Mach Intell. https:\/\/doi.org\/10.1109\/TPAMI.2024.3367416","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"19315_CR33","doi-asserted-by":"crossref","unstructured":"Li Y, Pan Y, Yao T, Mei T (2022) Comprehending and ordering semantics for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. IEEE, pp 17990\u201317999","DOI":"10.1109\/CVPR52688.2022.01746"},{"key":"19315_CR34","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. IEEE, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"1","key":"19315_CR35","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L-J, Shamma DA et al (2017) Visual genome: Connecting language and vision using crowdsourced dense image annotations. IJCV 123(1):32\u201373","journal-title":"IJCV"},{"key":"19315_CR36","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"19315_CR37","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"19315_CR38","unstructured":"Lin CY (2004) Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381"},{"key":"19315_CR39","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. IEEE, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"19315_CR40","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer International Publishing, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"19315_CR41","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, Cai J (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10685\u201310694","DOI":"10.1109\/CVPR.2019.01094"},{"key":"19315_CR42","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"19315_CR43","doi-asserted-by":"crossref","unstructured":"Ma Y, Ji J, Sun X, Zhou Y, Ji R (2023) Towards local visual modeling for image captioning. Pattern Recogn 138:109420","DOI":"10.1016\/j.patcog.2023.109420"},{"key":"19315_CR44","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Jiang YG, Liu W, Zhang T (2018) Recurrent fusion network for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 499\u2013515","DOI":"10.1007\/978-3-030-01216-8_31"},{"issue":"2","key":"19315_CR45","doi-asserted-by":"publisher","first-page":"710","DOI":"10.1109\/TPAMI.2019.2909864","volume":"44","author":"Z-J Zha","year":"2022","unstructured":"Zha Z-J, Liu D, Zhang H, Zhang Y, Wu F (2022) Context-aware visual policy network for fine-grained image captioning. IEEE Trans Pattern Anal Mach Intell 44(2):710\u2013722","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"19315_CR46","doi-asserted-by":"crossref","unstructured":"Ji J, Luo Y, Sun X, Chen F, Luo G, Wu Y, Gao Y, Ji R (2021) Improving image captioning by leveraging intra-and inter-layer global representation in transformer network. Proc AAAI Confer Artif Intell 35(2):1655\u20131663","DOI":"10.1609\/aaai.v35i2.16258"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-19315-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-19315-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-19315-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,1]],"date-time":"2025-05-01T05:02:49Z","timestamp":1746075769000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-19315-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,17]]},"references-count":46,"journal-issue":{"issue":"12","published-online":{"date-parts":[[2025,4]]}},"alternative-id":["19315"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-19315-4","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,5,17]]},"assertion":[{"value":"4 February 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 April 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 May 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have no conflicts of interest. The research does not involve human and animal participants.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethics statement"}}]}}