{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T04:17:02Z","timestamp":1778559422029,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3]]},"DOI":"10.1145\/3672919.3672964","type":"proceedings-article","created":{"date-parts":[[2024,7,24]],"date-time":"2024-07-24T12:39:43Z","timestamp":1721824783000},"page":"237-245","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["MGTANet: Multi-Scale Guided Token Attention Network for Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2418-4647","authenticated-orcid":false,"given":"Wenhao","family":"Jia","sequence":"first","affiliation":[{"name":"School of Computer and Information, Hefei University of Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7547-5659","authenticated-orcid":false,"given":"Ronggui","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer and Information, Hefei University of Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9004-3244","authenticated-orcid":false,"given":"Juan","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Computer and Information, Hefei University of Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4841-5318","authenticated-orcid":false,"given":"Lixia","family":"Xua","sequence":"additional","affiliation":[{"name":"School of Computer and Information, Hefei University of Technology, China"}]}],"member":"320","published-online":{"date-parts":[[2024,7,24]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998\u20136008.[7]."},{"key":"e_1_3_2_1_2_1","volume-title":"Faster r-cnn: towards real-time object detection with region proposal networks","author":"Ren K.","year":"2016","unstructured":"S. Ren, K. He, R. Girshick, J. Sun, Faster r-cnn: towards real-time object detection with region proposal networks, IEEE transactions on pattern analysis and machine intelligence 39 (6), 2016, 1137\u20131149.[8]."},{"key":"e_1_3_2_1_3_1","first-page":"15474.11","volume-title":"in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Zhang X.","year":"2021","unstructured":"X. Zhang, X. Sun, Y. Luo, J. Ji, Y. Zhou, Y. Wu, F. Huang, R. Ji, Rstnet: Captioning with adaptive attention on visual and non-visual words,in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 15465\u201315474.11."},{"key":"e_1_3_2_1_4_1","first-page":"2293.12","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence, no. 3","author":"Luo J.","year":"2021","unstructured":"Y. Luo, J. Ji, X. Sun, L. Cao, Y. Wu, F. Huang, C.-W. Lin, R. Ji, Dual-level collaborative transformer for image captioning, in: Proceedings of the AAAI Conference on Artificial Intelligence, no. 3, 2021, pp. 2286\u20132293.12."},{"key":"e_1_3_2_1_5_1","volume-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 10575\u201310584","author":"Cornia M.","year":"2019","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 10575\u201310584, 2019, 22."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2598339"},{"key":"e_1_3_2_1_8_1","volume-title":"Annual Meeting of the Association for Computational Linguistics","author":"Papineni K.","year":"2002","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: Bleu: a method for automatic evaluation of machine translation. In: Annual Meeting of the Association for Computational Linguistics, 2002, 25."},{"key":"e_1_3_2_1_9_1","volume-title":"Lavie","author":"Banerjee S.","year":"2005","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: IEEvaluation@ACL, 2005, 26."},{"key":"e_1_3_2_1_10_1","volume-title":"Annual Meeting of the Association for Computational Linguistics","author":"Lin C.-Y.","year":"2004","unstructured":"Lin, C.-Y.: Rouge: A package for automatic evaluation of summaries. In: Annual Meeting of the Association for Computational Linguistics, 2004, 27."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_2_1_12_1","volume-title":"Soares","author":"Herdade S.","year":"1906","unstructured":"Herdade, S., Kappeler, A., Boakye, K., Soares, J.: Image captioning: Transforming objects into words. ArXiv abs\/1906.05963, 2019, 31."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/91"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/224"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2022.117174"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3093725"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-023-00307-3"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Young P Lai A Hodosh M Hockenmaier J 2014 From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans Assoc Comput Linguist 2:67\u20137839.","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080671"},{"key":"e_1_3_2_1_21_1","first-page":"2057","volume-title":"International conference on machine learning, PMLR","author":"Li K.","year":"2015","unstructured":"Li K. Xu, J. Ba, R. Kiros, K. Cho, A. Courville, R. Salakhudinov, R. Zemel, Y. Bengio, Show, attend and tell: Neural image caption generation with visual attention, in: International conference on machine learning, PMLR, 2015, pp. 2048\u20132057."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Ding S Qu S Xi Y Wan S 2020 Stimulus-driven and concept driven analysis for image caption generation. Neurocomputing 398:520\u201353043.","DOI":"10.1016\/j.neucom.2019.04.095"},{"key":"e_1_3_2_1_24_1","first-page":"26254","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Donahue L.","year":"2015","unstructured":"J. Donahue, L. Anne Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, K. Saenko, T. Darrell, Long-term recurrent convolutional networks for visual recognition and description, in: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015, pp. 26254."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Ma Y Ji J Sun X Zhou Y Ji R 2023 Towards local visual modeling for image captioning. Pattern Recogn. 138:10942045.","DOI":"10.1016\/j.patcog.2023.109420"},{"key":"e_1_3_2_1_26_1","first-page":"4575","volume-title":"2015 IEEE conference on computer vision and pattern recognition (CVPR)","author":"Zitnick CL","year":"2014","unstructured":"V edantam R, Zitnick CL, Parikh D, 2014, Cider: Consensus-based image description evaluation. In: 2015 IEEE conference on computer vision and pattern recognition (CVPR), pp 4566\u20134575."},{"key":"e_1_3_2_1_27_1","volume-title":"European Conference on Computer Vision.","author":"Pan Y","year":"2018","unstructured":"Y ao T, Pan Y, Li Y, Mei T, 2018, Exploring visual relationship for image captioning. In: European Conference on Computer Vision."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"}],"event":{"name":"CSAIDE 2024: 2024 3rd International Conference on Cyber Security, Artificial Intelligence and Digital Economy","location":"Nanjing China","acronym":"CSAIDE 2024"},"container-title":["Proceedings of the 2024 3rd International Conference on Cyber Security, Artificial Intelligence and Digital Economy"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672919.3672964","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3672919.3672964","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T16:35:22Z","timestamp":1755880522000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672919.3672964"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3]]},"references-count":28,"alternative-id":["10.1145\/3672919.3672964","10.1145\/3672919"],"URL":"https:\/\/doi.org\/10.1145\/3672919.3672964","relation":{},"subject":[],"published":{"date-parts":[[2024,3]]},"assertion":[{"value":"2024-07-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}