{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T17:10:45Z","timestamp":1772644245547,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"9","license":[{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T00:00:00Z","timestamp":1701907200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Natural Science Foundation Youth Fund of Jiangsu Province of China","award":["No. BK20210931"],"award-info":[{"award-number":["No. BK20210931"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 61902179"],"award-info":[{"award-number":["No. 61902179"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s00371-023-03180-7","type":"journal-article","created":{"date-parts":[[2023,12,7]],"date-time":"2023-12-07T13:02:08Z","timestamp":1701954128000},"page":"6533-6544","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Transformer model incorporating local graph semantic attention for image caption"],"prefix":"10.1007","volume":"40","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6329-3803","authenticated-orcid":false,"given":"Kui","family":"Qian","sequence":"first","affiliation":[]},{"given":"Yuchen","family":"Pan","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Lei","family":"Tian","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,12,7]]},"reference":[{"key":"3180_CR1","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai, S., An, S.: A survey on automatic image caption generation. Neurocomputing 311, 291\u2013304 (2018)","journal-title":"Neurocomputing"},{"issue":"3","key":"3180_CR2","doi-asserted-by":"publisher","first-page":"445","DOI":"10.1007\/s00371-018-1566-y","volume":"35","author":"X Liu","year":"2019","unstructured":"Liu, X., Xu, Q., Wang, N.: A survey on deep neural network-based image captioning. Vis. Comput. 35(3), 445\u2013470 (2019)","journal-title":"Vis. Comput."},{"key":"3180_CR3","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1016\/j.patrec.2019.03.021","volume":"123","author":"S Ding","year":"2019","unstructured":"Ding, S., Qu, S., Xi, Y., et al.: Image caption generation with high-level image features. Pattern Recogn. Lett. 123, 89\u201395 (2019)","journal-title":"Pattern Recogn. Lett."},{"key":"3180_CR4","doi-asserted-by":"crossref","unstructured":"Amritkar, C., Jabade, V.: Image caption generation using deep learning technique. In: 2018 4th International Conference on Computing Communication Control and Automation (ICCUBEA), pp. 1\u20134. IEEE (2018)","DOI":"10.1109\/ICCUBEA.2018.8697360"},{"key":"3180_CR5","doi-asserted-by":"crossref","unstructured":"Li, L.H., Zhang, P., Zhang, H., et al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"issue":"6","key":"3180_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MDZ Hossain","year":"2019","unstructured":"Hossain, M.D.Z., Sohel, F., Shiratuddin, M.F., et al.: A comprehensive survey of deep learning for image captioning. ACM Comput. Surv. 51(6), 1\u201336 (2019)","journal-title":"ACM Comput. Surv."},{"key":"3180_CR7","doi-asserted-by":"crossref","unstructured":"Luo, R.C., Hsu, Y.T., Wen, Y.C., et al.: Visual image caption generation for service robotics and industrial applications. In: 2019 IEEE International Conference on Industrial Cyber Physical Systems (ICPS). IEEE, pp 827\u2013832 (2019)","DOI":"10.1109\/ICPHYS.2019.8780171"},{"issue":"3","key":"3180_CR8","doi-asserted-by":"publisher","first-page":"478","DOI":"10.1109\/JSTSP.2020.2987728","volume":"14","author":"C Zhang","year":"2020","unstructured":"Zhang, C., Yang, Z., He, X., et al.: Multimodal intelligence: representation learning, information fusion, and applications. IEEE J. Sel. Top. Signal Process. 14(3), 478\u2013493 (2020)","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"3180_CR9","doi-asserted-by":"publisher","first-page":"68","DOI":"10.1016\/j.patrec.2020.12.001","volume":"141","author":"X Li","year":"2021","unstructured":"Li, X., Ye, Z., Zhang, Z., et al.: Clothes image caption generation with attribute detection and visual attention model. Pattern Recogn. Lett. 141, 68\u201374 (2021)","journal-title":"Pattern Recogn. Lett."},{"key":"3180_CR10","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., et al.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"3180_CR11","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.: Multimodal neural language models. In: International Conference on Machine Learning, pp. 595\u2013603 (2014)"},{"key":"3180_CR12","doi-asserted-by":"crossref","unstructured":"Parikh, H., Sawant, H., Parmar, B., et al.: Encoder\u2013decoder architecture for image caption generation. In: 2020 3rd International Conference on Communication System, Computing and IT Applications (CSCITA), pp. 174\u2013179. IEEE (2020)","DOI":"10.1109\/CSCITA47329.2020.9137802"},{"key":"3180_CR13","unstructured":"Xu, K., Ba, J., Kiros, R., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning. In: PMLR, pp. 2048\u20132057 (2015)"},{"issue":"2","key":"3180_CR14","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2019.102178","volume":"57","author":"M Liu","year":"2020","unstructured":"Liu, M., Li, L., Hu, H., et al.: Image caption generation with dual attention mechanism. Inf. Process. Manag. 57(2), 102178 (2020)","journal-title":"Inf. Process. Manag."},{"issue":"3","key":"3180_CR15","doi-asserted-by":"publisher","first-page":"2207","DOI":"10.1007\/s00521-021-06557-8","volume":"34","author":"K Qian","year":"2022","unstructured":"Qian, K., Tian, L.: A topic-based multi-channel attention model under hybrid mode for image caption. Neural Comput. Appl. 34(3), 2207\u20132216 (2022)","journal-title":"Neural Comput. Appl."},{"key":"3180_CR16","doi-asserted-by":"crossref","unstructured":"Zhong, J., Cao, Y., Zhu, Y., et al.: Multi-channel weighted fusion for image captioning. Vis. Comput. 1\u201318 (2022)","DOI":"10.1007\/s00371-022-02716-7"},{"key":"3180_CR17","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"issue":"12","key":"3180_CR18","doi-asserted-by":"publisher","first-page":"4467","DOI":"10.1109\/TCSVT.2019.2947482","volume":"30","author":"J Yu","year":"2019","unstructured":"Yu, J., Li, J., Yu, Z., et al.: Multimodal transformer with multi-view visual representation for image captioning. IEEE Trans. Circuits Syst. Video Technol. 30(12), 4467\u20134480 (2019)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3180_CR19","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Zhang, Y., Hu, Z., et al.: Semi-autoregressive transformer for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 3139\u20133143 (2021)","DOI":"10.1109\/ICCVW54120.2021.00350"},{"key":"3180_CR20","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M,. Baraldi, L., et al.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10578-10587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"3180_CR21","doi-asserted-by":"publisher","first-page":"43","DOI":"10.1016\/j.patrec.2020.12.020","volume":"143","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., Shi, X., Mi, S., et al.: Image captioning with transformer and knowledge graph. Pattern Recogn. Lett. 143, 43\u201349 (2021)","journal-title":"Pattern Recogn. Lett."},{"issue":"3","key":"3180_CR22","doi-asserted-by":"publisher","first-page":"1371","DOI":"10.1109\/TPAMI.2020.3025814","volume":"44","author":"T Chen","year":"2020","unstructured":"Chen, T., Lin, L., Chen, R., et al.: Knowledge-guided multi-label few-shot learning for general image recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(3), 1371\u20131384 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3180_CR23","doi-asserted-by":"crossref","unstructured":"Goel, A., Fernando, B., Nguyen, T.S., et al.: Injecting prior knowledge into image caption generation. In: European Conference on Computer Vision, pp. 369\u2013385. Springer, Cham (2020)","DOI":"10.1007\/978-3-030-66096-3_26"},{"issue":"2","key":"3180_CR24","doi-asserted-by":"publisher","first-page":"111","DOI":"10.1007\/s13735-022-00228-7","volume":"11","author":"J Yan","year":"2022","unstructured":"Yan, J., Xie, Y., Luan, X., et al.: Caption TLSTMs: combining transformer with LSTMs for image captioning. Int. J. Multimedia Inf. Retrieval 11(2), 111\u2013121 (2022)","journal-title":"Int. J. Multimedia Inf. Retrieval"},{"key":"3180_CR25","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian, T., Li, Z., Zhang, C., et al.: Dual global enhanced transformer for image captioning. Neural Netw. 148, 129\u2013141 (2022)","journal-title":"Neural Netw."},{"key":"3180_CR26","doi-asserted-by":"crossref","unstructured":"Ji, J., Luo, Y., Sun, X., et al.: Improving image captioning by leveraging intra-and inter-layer global representation in transformer network. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35(2), pp. 1655\u20131663 (2021)","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"3180_CR27","unstructured":"Shao, Z., Han, J., Marnerides, D., et al.: Region-object relation-aware dense captioning via transformer. IEEE Trans. Neural Netw. Learn. Syst. (2022)"},{"key":"3180_CR28","doi-asserted-by":"crossref","unstructured":"Valentini-Botinhao, C., Wang, X., Takaki, S., et al.: Investigating RNN-based speech enhancement methods for noise-robust text-to-speech. In: Proceedings of the 9th ISCA Speech Synthesis Workshop, pp. 146\u2013152 (2016)","DOI":"10.21437\/SSW.2016-24"},{"key":"3180_CR29","doi-asserted-by":"crossref","unstructured":"Deselaers, T., Hasan, S., Bender, O., et al.: A deep learning approach to machine transliteration. In: Proceedings of the 4th Workshop on Statistical Machine Translation, pp 233\u2013241 (2009)","DOI":"10.3115\/1626431.1626476"},{"key":"3180_CR30","doi-asserted-by":"publisher","first-page":"343","DOI":"10.1613\/jair.1.12007","volume":"69","author":"F Stahlberg","year":"2020","unstructured":"Stahlberg, F.: Neural machine translation: a review. J. Artif. Intell. Res. 69, 343\u2013418 (2020)","journal-title":"J. Artif. Intell. Res."},{"key":"3180_CR31","doi-asserted-by":"crossref","unstructured":"Chen, X., Lawrence, Zitnick, C.: Mind\u2019s eye: a recurrent visual representation for image caption generation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2422\u20132431 (2015)","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"3180_CR32","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1016\/j.neucom.2018.12.026","volume":"333","author":"YH Tan","year":"2019","unstructured":"Tan, Y.H., Chan, C.S.: Phrase-based image caption generator with hierarchical LSTM network. Neurocomputing 333, 86\u2013100 (2019)","journal-title":"Neurocomputing"},{"key":"3180_CR33","doi-asserted-by":"crossref","unstructured":"Khademi, M., Schulte, O.: Image caption generation with hierarchical contextual visual spatial attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 1943\u20131951 (2018)","DOI":"10.1109\/CVPRW.2018.00260"},{"key":"3180_CR34","doi-asserted-by":"crossref","unstructured":"Youm, Q., Jin, H., Wang, Z., et al.: Image captioning with semantic attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"3180_CR35","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., et al.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"3180_CR36","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"issue":"10","key":"3180_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3505244","volume":"54","author":"S Khan","year":"2022","unstructured":"Khan, S., Naseer, M., Hayat, M., et al.: Transformers in vision: a survey. ACM Comput. Surv. (CSUR) 54(10), 1\u201341 (2022)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"3180_CR38","doi-asserted-by":"crossref","unstructured":"Li, G., Zhu, L., Liu, P., et al.: Entangled transformer for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8928\u20138937 (2019)","DOI":"10.1109\/ICCV.2019.00902"},{"key":"3180_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/LGRS.2022.3198234","volume":"19","author":"H Kandala","year":"2022","unstructured":"Kandala, H., Saha, S., Banerjee, B., et al.: Exploring transformer and multilabel classification for remote sensing image captioning. IEEE Geosci. Remote Sens. Lett. 19, 1\u20135 (2022)","journal-title":"IEEE Geosci. Remote Sens. Lett."},{"key":"3180_CR40","doi-asserted-by":"crossref","unstructured":"Wang, J., Chen, Z., Ma, A., et al.: Capformer: pure transformer for remote sensing image caption. In: IGARSS 2022\u20132022 IEEE International Geoscience and Remote Sensing Symposium. IEEE, pp. 7996\u20137999 (2022)","DOI":"10.1109\/IGARSS46834.2022.9883199"},{"key":"3180_CR41","doi-asserted-by":"crossref","unstructured":"Nguyen, V. Q., Suganuma, M., Okatani, T.: Grit: Faster and better image captioning transformer using dual visual features. In: European Conference on Computer Vision, pp. 167\u2013184. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"3180_CR42","unstructured":"Dehmamy, N., Barab\u00e1si, A.L., Yu, R.: Understanding the representation power of graph neural networks in learning graph topology. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"3180_CR43","unstructured":"Franceschi, L., Niepert, M., Pontil, M., et al.: Learning discrete structures for graph neural networks. International Conference on Machine Learning. PMLR, pp. 1972\u20131982 (2019)"},{"key":"3180_CR44","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2023.3336053","volume":"61","author":"G Zhou","year":"2023","unstructured":"Zhou, G., Xu, J., Chen, W., et al.: Deep feature enhancement method for land cover with irregular and sparse spatial distribution features: a case study on open-pit mining[J]. IEEE Trans. Geosci. Remote Sens. 61, 1\u201320 (2023)","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"3180_CR45","doi-asserted-by":"crossref","unstructured":"Chen, Z.M., Wei, X.S., Wang, P., et al.: Multi-label image recognition with graph convolutional networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5177\u20135186 (2019)","DOI":"10.1109\/CVPR.2019.00532"},{"key":"3180_CR46","doi-asserted-by":"crossref","unstructured":"Ma, Q., Yuan, C., Zhou, W., et al.: Label-specific dual graph neural network for multi-label text classification. In: Proceedings of the 11th International Joint Conference on Natural Language Processing, pp. 3855\u20133864 (2021)","DOI":"10.18653\/v1\/2021.acl-long.298"},{"issue":"4","key":"3180_CR47","first-page":"691","volume":"22","author":"CL Chowdhary","year":"2019","unstructured":"Chowdhary, C.L., Goyal, A., Vasnani, B.K.: Experimental assessment of beam search algorithm for improvement in image caption generation. J. Appl. Sci. Eng. 22(4), 691\u2013698 (2019)","journal-title":"J. Appl. Sci. Eng."}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-023-03180-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-023-03180-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-023-03180-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,13]],"date-time":"2024-08-13T15:17:06Z","timestamp":1723562226000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-023-03180-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,7]]},"references-count":47,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["3180"],"URL":"https:\/\/doi.org\/10.1007\/s00371-023-03180-7","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,7]]},"assertion":[{"value":"10 November 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 December 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}