{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T20:14:53Z","timestamp":1779308093343,"version":"3.51.4"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Vietnam National University Ho Chi Minh City","award":["C2023-26-11"],"award-info":[{"award-number":["C2023-26-11"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1007\/s00138-024-01599-z","type":"journal-article","created":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T06:59:25Z","timestamp":1724309965000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Transformer with multi-level grid features and depth pooling for image captioning"],"prefix":"10.1007","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1310-5808","authenticated-orcid":false,"given":"Doanh C.","family":"Bui","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0236-7992","authenticated-orcid":false,"given":"Tam V.","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6571-7075","authenticated-orcid":false,"given":"Khang","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,20]]},"reference":[{"issue":"5","key":"1599_CR1","doi-asserted-by":"publisher","first-page":"77","DOI":"10.1007\/s00138-022-01329-3","volume":"33","author":"M Zhong","year":"2022","unstructured":"Zhong, M., Zhang, H., Wang, Y., Xiong, H.: Bitransformer: augmenting semantic context in video captioning via bidirectional decoder. Mach. Vis. Appl. 33(5), 77 (2022)","journal-title":"Mach. Vis. Appl."},{"key":"1599_CR2","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"1599_CR3","doi-asserted-by":"crossref","unstructured":"Jiang, W., Ma, L., Jiang, Y.-G., Liu, W., Zhang, T.: Recurrent fusion network for image captioning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 499\u2013515 (2018)","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"1599_CR4","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1599_CR5","unstructured":"Xu, K., Ba, J., Kiros, R., et al.: Show, attend and tell: Neural image caption generation with visual attention. In: International Conference on Machine Learning, PMLR, pp. 2048\u20132057 (2015)"},{"key":"1599_CR6","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1599_CR7","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.-Y.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"1599_CR8","unstructured":"Herdade, S., Kappeler, A., Boakye, K., Soares, J.: Image captioning: transforming objects into words. In: Wallach, H., Larochelle, H., Beygelzimer, A., d\u2019Alch\u00e9-Buc, F., Fox, E., Garnett, R. (eds.), Advances in Neural Information Processing Systems, vol. 32, Curran Associates, Inc. (2019). [Online]. Available: https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/680390c55bbd9ce416d1d69a9ab4760d-Paper.pdf"},{"key":"1599_CR9","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10575\u201310584 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"1599_CR10","doi-asserted-by":"crossref","unstructured":"Zhang, X., Sun, X., Luo, Y., et al.: Rstnet: captioning with adaptive attention on visual and non-visual words. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15460\u201315469 (2021)","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"1599_CR11","doi-asserted-by":"crossref","unstructured":"Luo, Y., Ji, J., Sun, X., et al.: Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 2286\u20132293 (2021)","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"1599_CR12","doi-asserted-by":"crossref","unstructured":"Wu, M., Zhang, X., Sun, X., et al.: Difnet: boosting visual information flow for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18020\u201318029 (2022)","DOI":"10.1109\/CVPR52688.2022.01749"},{"key":"1599_CR13","doi-asserted-by":"publisher","first-page":"32443","DOI":"10.1109\/ACCESS.2022.3158763","volume":"10","author":"K Nguyen","year":"2022","unstructured":"Nguyen, K., Bui, D.C., Trinh, T., Vo, N.D.: EAEs: effective augmented embedding spaces for text-based image captioning. IEEE Access 10, 32443\u201332452 (2022)","journal-title":"IEEE Access"},{"key":"1599_CR14","doi-asserted-by":"crossref","unstructured":"Zhang, W., Shi, H., Guo, J., et al.: Magic: multimodal relational graph adversarial inference for diverse and unpaired text-based image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. 3335\u20133343 (2022)","DOI":"10.1609\/aaai.v36i3.20243"},{"key":"1599_CR15","doi-asserted-by":"crossref","unstructured":"Hosseinzadeh, M., Wang, Y.: Image change captioning by learning from an auxiliary task. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2725\u20132734 (2021)","DOI":"10.1109\/CVPR46437.2021.00275"},{"key":"1599_CR16","doi-asserted-by":"crossref","unstructured":"Yue, S., Tu, Y., Li, L., Yang, Y., Gao, S., Yu, Z.: I3n: Intra-and inter-representation interaction network for change captioning. IEEE Trans. Multimedia (2023)","DOI":"10.1109\/TMM.2023.3242142"},{"key":"1599_CR17","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-linear attention networks for image captioning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10968\u201310977 (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"1599_CR18","unstructured":"Zhang, J., Fang, Z., Sun, H., Wang, Z.: Adaptive semantic-enhanced transformer for image captioning. IEEE Trans. Neural Netw. Learn. Syst. 1\u201312 (2022)"},{"key":"1599_CR19","doi-asserted-by":"publisher","first-page":"129","DOI":"10.1016\/j.neunet.2022.01.011","volume":"148","author":"T Xian","year":"2022","unstructured":"Xian, T., Li, Z., Zhang, C., Ma, H.: Dual global enhanced transformer for image captioning. Neural Netw. 148, 129\u2013141 (2022)","journal-title":"Neural Netw."},{"key":"1599_CR20","doi-asserted-by":"crossref","unstructured":"Wei, J., Li, Z., Zhu, J., Ma, H.: Enhance understanding and reasoning ability for image captioning. Appl. Intell. 1\u201317 (2022)","DOI":"10.1007\/s10489-022-03624-y"},{"key":"1599_CR21","doi-asserted-by":"crossref","unstructured":"Gao, Y., Wang, N., Suo, W., Sun, M., Wang, P.: Improving image captioning via enhancing dual-side context awareness. In: Proceedings of the 2022 International Conference on Multimedia Retrieval, pp. 389\u2013397 (2022)","DOI":"10.1145\/3512527.3531379"},{"key":"1599_CR22","doi-asserted-by":"crossref","unstructured":"Geng, M., Zhao, Q.: Improve image captioning by modeling dynamic scene graph extension. In: Proceedings of the 2022 International Conference on Multimedia Retrieval, pp. 398\u2013406 (2022)","DOI":"10.1145\/3512527.3531401"},{"key":"1599_CR23","doi-asserted-by":"crossref","unstructured":"Barraco, M., Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R.: With a little help from your own past: prototypical memory networks for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3021\u20133031 (2023)","DOI":"10.1109\/ICCV51070.2023.00282"},{"key":"1599_CR24","doi-asserted-by":"crossref","unstructured":"Jiang, H., Misra, I., Rohrbach, M., Learned-Miller, E., Chen, X.: In defense of grid features for visual question answering. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10264\u201310273 (2020)","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"1599_CR25","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1179\u20131195 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"1599_CR26","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"issue":"4","key":"1599_CR27","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1007\/s00138-023-01402-5","volume":"34","author":"Y Ma","year":"2023","unstructured":"Ma, Y., Wang, Y.: Feature refinement with multi-level context for object detection. Mach. Vis. Appl. 34(4), 49 (2023)","journal-title":"Mach. Vis. Appl."},{"issue":"4","key":"1599_CR28","doi-asserted-by":"publisher","first-page":"66","DOI":"10.1007\/s00138-023-01425-y","volume":"34","author":"S Sheng","year":"2023","unstructured":"Sheng, S., Jing, J., Jiao, X., Wang, Y., Dong, Z.: M\u00c6idm: multi-scale anomaly embedding inpainting and discrimination for surface anomaly detection. Mach. Vis. Appl. 34(4), 66 (2023)","journal-title":"Mach. Vis. Appl."},{"issue":"6","key":"1599_CR29","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/s00138-023-01460-9","volume":"34","author":"J Zhang","year":"2023","unstructured":"Zhang, J., Liu, M., Wang, X.: Global attention guided multi-scale network for face image super-resolution. Mach. Vis. Appl. 34(6), 106 (2023)","journal-title":"Mach. Vis. Appl."},{"key":"1599_CR30","doi-asserted-by":"crossref","unstructured":"Farooq Bhat, S., Alhashim, I., Wonka, P.: Adabins: Depth estimation using adaptive bins. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4008\u20134017 (2021)","DOI":"10.1109\/CVPR46437.2021.00400"},{"key":"1599_CR31","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"1599_CR32","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding (2018). arXiv preprint arXiv:1810.04805"},{"key":"1599_CR33","doi-asserted-by":"publisher","first-page":"778","DOI":"10.1016\/j.neucom.2022.06.062","volume":"501","author":"J Ji","year":"2022","unstructured":"Ji, J., Wang, M., Zhang, X., Lei, M., Qu, L.: Relation constraint self-attention for image captioning. Neurocomputing 501, 778\u2013789 (2022)","journal-title":"Neurocomputing"},{"key":"1599_CR34","doi-asserted-by":"crossref","unstructured":"Dong, L., Xu, S., Xu, B.: Speech-transformer: a no-recurrence sequence-to-sequence model for speech recognition. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), IEEE, pp. 5884\u20135888 (2018)","DOI":"10.1109\/ICASSP.2018.8462506"},{"key":"1599_CR35","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, vol. 1, p. 2 (2019)"},{"key":"1599_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, P., Li, X., Hu, X., et al.: Vinvl: Revisiting visual representations in vision-language models. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5575\u20135584 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"issue":"8","key":"1599_CR37","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"1599_CR38","first-page":"14200","volume":"34","author":"A Nagrani","year":"2021","unstructured":"Nagrani, A., Yang, S., Arnab, A., Jansen, A., Schmid, C., Sun, C.: Attention bottlenecks for multimodal fusion. Adv. Neural Inf. Process. Syst. 34, 14200\u201314213 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1599_CR39","unstructured":"Liu, F., Liu, Y., Ren, X., He, X., Sun, X.: Aligning visual regions and textual concepts for semantic-grounded image representations. In: NeurIPS, pp. 6847\u20136857 (2019)"},{"key":"1599_CR40","unstructured":"Chen, X., Fang, H., Lin, T.-Y., et al.: Microsoft coco captions: data collection and evaluation server (2015). arXiv preprint arXiv:1504.00325"},{"key":"1599_CR41","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.-J.: Bleu: A method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"1599_CR42","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"1599_CR43","unstructured":"Rouge, L.C.: A package for automatic evaluation of summaries. In: Proceedings of Workshop on Text Summarization of ACL, Spain (2004)"},{"key":"1599_CR44","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: Spice: Semantic propositional image caption evaluation. In: European Conference on Computer Vision, pp. 382\u2013398. Springer (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1599_CR45","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"1599_CR46","doi-asserted-by":"crossref","unstructured":"Robertson, S.: Understanding inverse document frequency: on theoretical arguments for IDF. J. Document. (2004)","DOI":"10.1108\/00220410410560582"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-024-01599-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-024-01599-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-024-01599-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T04:07:04Z","timestamp":1726027624000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-024-01599-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,20]]},"references-count":46,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,9]]}},"alternative-id":["1599"],"URL":"https:\/\/doi.org\/10.1007\/s00138-024-01599-z","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,20]]},"assertion":[{"value":"31 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 April 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 August 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 August 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"118"}}