{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T21:30:46Z","timestamp":1774128646219,"version":"3.50.1"},"reference-count":29,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2022,9,21]],"date-time":"2022-09-21T00:00:00Z","timestamp":1663718400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,9,21]],"date-time":"2022-09-21T00:00:00Z","timestamp":1663718400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006240"],"award-info":[{"award-number":["62006240"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s11760-022-02350-9","type":"journal-article","created":{"date-parts":[[2022,9,21]],"date-time":"2022-09-21T05:02:30Z","timestamp":1663736550000},"page":"1419-1427","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Input enhanced asymmetric transformer for image captioning"],"prefix":"10.1007","volume":"17","author":[{"given":"Chenhao","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Xia","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Qiduo","family":"Lu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,9,21]]},"reference":[{"key":"2350_CR1","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"2350_CR2","doi-asserted-by":"crossref","unstructured":"Cornia, M., et al.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"2350_CR3","doi-asserted-by":"crossref","unstructured":"Luo, Y., et al.: Dual-level collaborative transformer for image captioning. arXiv preprint arXiv:2101.06462 (2021)","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"2350_CR4","doi-asserted-by":"crossref","unstructured":"Zhang, X., et al.: RSTNet: Captioning with adaptive attention on visual and non-visual words. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"2350_CR5","unstructured":"Lu, X., Zhao, T., Lee, K.: Visualsparta: sparse transformer fragment-level matching for large-scale text-to-image search. arXiv e-prints: arXiv-2101 (2021)"},{"key":"2350_CR6","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning. PMLR (2021)"},{"issue":"12","key":"2350_CR7","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni, G., et al.: Babytalk: understanding and generating simple image descriptions. IEEE Trans. Pattern Anal. Mach. Intell. 35(12), 2891\u20132903 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"2350_CR8","unstructured":"Mitchell, M., et al.: Midge: generating image descriptions from computer vision detections. In: Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics (2012)"},{"key":"2350_CR9","doi-asserted-by":"crossref","unstructured":"Ushiku, Y., et al.: Common subspace for model and similarity: phrase learning for caption generation from images. In: Proceedings of the IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.306"},{"key":"2350_CR10","unstructured":"Mao, J., et al.: Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 (2014)"},{"key":"2350_CR11","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning. PMLR (2015)"},{"key":"2350_CR12","doi-asserted-by":"crossref","unstructured":"Ma, S., Han, Y.: Describing images by feeding LSTM with structural words. In: 2016 IEEE International Conference on Multimedia and Expo (ICME). IEEE (2016)","DOI":"10.1109\/ICME.2016.7552883"},{"key":"2350_CR13","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)"},{"key":"2350_CR14","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"2350_CR15","unstructured":"Lebret, R., Pinheiro, P.O., Collobert, R.: Simple image description generator via a linear phrase-based approach. arXiv preprint arXiv:1412.8419 (2014)"},{"key":"2350_CR16","doi-asserted-by":"crossref","unstructured":"Tran, K., et al.: Rich image captioning in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops (2016)","DOI":"10.1109\/CVPRW.2016.61"},{"key":"2350_CR17","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"2350_CR18","doi-asserted-by":"crossref","unstructured":"Papineni, K., et al.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"2350_CR19","doi-asserted-by":"crossref","unstructured":"Farhadi, A., et al.: Every picture tells a story: generating sentences from images. In: European Conference on Computer Vision. Springer, Berlin, Heidelberg (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"2350_CR20","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Artif. Intell. Res. 47, 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"2350_CR21","doi-asserted-by":"crossref","unstructured":"Vinyals, O., et al.: Show and tell: a neural image caption generator. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"2350_CR22","doi-asserted-by":"crossref","unstructured":"Huang, L., et al.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"2350_CR23","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: common objects in context. In: Proceedings of the European Conference on Computer Vision (2014).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2350_CR24","unstructured":"Banerjee, S., Lavie, A.: METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the acl Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization (2005)"},{"key":"2350_CR25","unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. Text summarization branches out (2004)"},{"key":"2350_CR26","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2350_CR27","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Spice: semantic propositional image caption evaluation. In: European Conference on Computer Vision. Springer, Cham (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"2350_CR28","unstructured":"Ren, Shaoqing, et al. \"Faster r-cnn: Towards real-time object detection with region proposal networks.\" Advances in neural information processing systems 28 (2015)."},{"key":"2350_CR29","doi-asserted-by":"crossref","unstructured":"He, K., et al.: Deep residual learning for image recognition. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.90"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-022-02350-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-022-02350-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-022-02350-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,24]],"date-time":"2023-04-24T16:22:06Z","timestamp":1682353326000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-022-02350-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,21]]},"references-count":29,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["2350"],"URL":"https:\/\/doi.org\/10.1007\/s11760-022-02350-9","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,9,21]]},"assertion":[{"value":"2 April 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 June 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 August 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 September 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"I declare that the authors have no competing interests as defined by Springer, or other interests that might be perceived to influence the results and\/or discussion reported in this paper.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Title of manuscript: Input Enhanced Asymmetric Transformer for Image Captioning. Journal: Signal, Image and Video Processing. I give my consent for the publication of identifiable details, which can include photograph(s) and details within the text (\u201cMaterial\u201d) to be published in the above Journal and Article.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Consent for publication"}}]}}
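
The record above is the standard envelope returned by the Crossref REST API for a single work (status/message-type/message). The sketch below shows one plausible way to retrieve and read such a record in Python, using only the standard library; it assumes network access to the public api.crossref.org endpoint, and the client name and mailto address in the User-Agent are placeholders you would replace with your own (Crossref asks clients to identify themselves this way for its "polite" pool).

import json
import urllib.request

# DOI taken from the "DOI" field of the record above.
DOI = "10.1007/s11760-022-02350-9"

# Crossref serves work records at /works/{doi}. The User-Agent values
# here are placeholders, not a real registered client.
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-fetcher/0.1 (mailto:you@example.org)"},
)
with urllib.request.urlopen(req) as resp:
    record = json.load(resp)

# The envelope matches the record shown above.
assert record["status"] == "ok" and record["message-type"] == "work"
work = record["message"]

# Read a few of the fields present in the record above.
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work["author"])
print(f'{authors}: {work["title"][0]}')
print(f'{work["container-title"][0]} {work["volume"]}({work["issue"]}), '
      f'pp. {work["page"]}, DOI: {work["DOI"]}')

For this record, the script would print the author list (Chenhao Zhu, Xia Ye, Qiduo Lu), the article title, and the journal, volume, issue, and page metadata, all drawn from the fields shown in the JSON above.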