{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T15:38:15Z","timestamp":1777995495891,"version":"3.51.4"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2022,7,28]],"date-time":"2022-07-28T00:00:00Z","timestamp":1658966400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,7,28]],"date-time":"2022-07-28T00:00:00Z","timestamp":1658966400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076262"],"award-info":[{"award-number":["62076262"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s11063-022-10980-w","type":"journal-article","created":{"date-parts":[[2022,7,29]],"date-time":"2022-07-29T13:51:10Z","timestamp":1659102670000},"page":"2707-2724","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Hadamard Product Perceptron Attention for Image Captioning"],"prefix":"10.1007","volume":"55","author":[{"given":"Weitao","family":"Jiang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4884-323X","authenticated-orcid":false,"given":"Haifeng","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,7,28]]},"reference":[{"key":"10980_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, Gould S (2016) Spice: Semantic propositional image caption evaluation. In: Proceedings of the European Conference on Computer Vision, Springer, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"10980_CR2","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10980_CR3","unstructured":"Banerjee S, Lavie A (2005) Meteor: An automatic metric for mt evaluation with improved correalation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp 65\u201372"},{"key":"10980_CR4","doi-asserted-by":"crossref","unstructured":"Chen L, Zhang H, Xiao J, Nie L, Shao J, Liu W, Chua T-S (2017) Sca-cnn: spatial and channel-wise attention in convolutional networks for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 5659\u20135667","DOI":"10.1109\/CVPR.2017.667"},{"key":"10980_CR5","doi-asserted-by":"crossref","unstructured":"Clark K, Khandelwal U, Levy O, Manning CD (2019) What does bert look at? an analysis of bert\u2019s attention. arXiv preprint arXiv:1906.04341","DOI":"10.18653\/v1\/W19-4828"},{"key":"10980_CR6","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, Cucchiara R (2020) Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp10578\u201310587","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"10980_CR7","doi-asserted-by":"crossref","unstructured":"Dai Z, Yang Z, Yang Y, Carbonell J, Le QV, Salakhutdinov R (2019) Transformer-xl: Attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860","DOI":"10.18653\/v1\/P19-1285"},{"key":"10980_CR8","unstructured":"Dehghani M, Gouws S, Vinyals O, Uszkoreit J, Kaiser \u0141 (2018) Universal transformers. arXiv preprint arXiv:1807.03819"},{"key":"10980_CR9","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10980_CR10","unstructured":"Friedman N, Russell S (1997) Image segmentation in video sequences: A probabilistic approach. In: Proceedings of the Thirteenth Conference on Uncertainty in Artificial Intelligence, pp 175\u2013181"},{"key":"10980_CR11","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X, Pu Y, Tran K, Gao J, Carin L, Deng L (2017) Semantic compositional networks for visual captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 5630\u20135639","DOI":"10.1109\/CVPR.2017.127"},{"key":"10980_CR12","unstructured":"Gupta A, Verma Y, Jawahar CV (2012) Choosing linguistics over vision to describe images. In: Twenty-Sixth AAAI Conference on Artificial Intelligence"},{"key":"10980_CR13","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"10980_CR14","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"10980_CR15","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, Wei X-Y (2019) Attention on attention for image captioning. In: Proceedings of the IEEE International Conference on Computer Vision, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"issue":"4","key":"10980_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3460474","volume":"17","author":"W Jiang","year":"2021","unstructured":"Jiang W, Wang W, Haifeng H (2021) Bi-directional co-attention network for image captioning. ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM) 17(4):1\u201320","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM)"},{"key":"10980_CR17","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Jiang Y-G, Liu W, Zhang T (2018) Recurrent fusion network for image captioning. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 499\u2013515","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"10980_CR18","doi-asserted-by":"crossref","unstructured":"Kalimuthu M, Mogadala A, Mosbach M, Klakow D (2021) Fusion models for improved image captioning. In International Conference on Pattern Recognition, Springer, pp 381\u2013395","DOI":"10.1007\/978-3-030-68780-9_32"},{"key":"10980_CR19","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"10980_CR20","first-page":"361","volume":"29","author":"J-H Kim","year":"2016","unstructured":"Kim J-H, Lee S-W, Kwak D, Heo M-O, Kim J, Ha J-W, Zhang B-T (2016) Multimodal residual learning for visual qa. In: Proceedings of the Conference on Advances in Neural Information Processing Systems, pp 29:361\u2013369","journal-title":"In: Proceedings of the Conference on Advances in Neural Information Processing Systems, pp"},{"key":"10980_CR21","unstructured":"Kim J-H, On K-W, Lim W, Kim J, Ha J-W, Zhang B-T (2016) Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325"},{"issue":"1","key":"10980_CR22","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L-J, Shamma DA et al (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vision 123(1):32\u201373","journal-title":"Int J Comput Vision"},{"issue":"12","key":"10980_CR23","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni G, Premraj V, Ordonez V, Dhar S, Li S, Choi Y, Berg AC, Berg TL (2013) Babytalk: Understanding and generating simple image descriptions. IEEE Trans Pattern Anal Mach Intell 35(12):2891\u20132903","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10980_CR24","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1162\/tacl_a_00188","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova P, Ordonez V, Berg TL, Choi Y (2014) Treetalk: Composition and compression of trees for image descriptions. Transactions of the Association for Computational Linguistics 2:351\u2013362","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"10980_CR25","doi-asserted-by":"crossref","unstructured":"Li G, Zhu L, Liu P, Yang Y (2019) Entangled transformer for image captioning. In: Proceedings of the IEEE International Conference on Computer Vision, pp 8928\u20138937","DOI":"10.1109\/ICCV.2019.00902"},{"key":"10980_CR26","unstructured":"Lin C-Y (2004) Rouge: A package for automatic evaluation of summaries. In the Workshop on Text Summarization Branches Out, pp 74\u201381"},{"key":"10980_CR27","doi-asserted-by":"crossref","unstructured":"Lin T-Y, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Lawrence ZC (2014) Microsoft coco: common objects in context. In Proceedings of the European Conference on Computer Vision, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10980_CR28","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 375\u2013383","DOI":"10.1109\/CVPR.2017.345"},{"key":"10980_CR29","doi-asserted-by":"crossref","unstructured":"Lu J, Yang J, Batra D, Parikh D (2018) Neural baby talk. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 7219\u20137228","DOI":"10.1109\/CVPR.2018.00754"},{"key":"10980_CR30","doi-asserted-by":"publisher","first-page":"186","DOI":"10.1109\/TIP.2019.2926774","volume":"29","author":"M Meng","year":"2019","unstructured":"Meng M, Lan M, Jun Yu, Jigang W, Tao D (2019) Constrained discriminative projection learning for image classification. IEEE Trans Image Process 29:186\u2013198","journal-title":"IEEE Trans Image Process"},{"key":"10980_CR31","doi-asserted-by":"publisher","first-page":"986","DOI":"10.1109\/TIP.2020.3038365","volume":"30","author":"M Meng","year":"2020","unstructured":"Meng M, Wang H, Jun Yu, Chen H, Jigang W (2020) Asymmetric supervised consistent and specific hashing for cross-modal retrieval. IEEE Trans Image Process 30:986\u20131000","journal-title":"IEEE Trans Image Process"},{"key":"10980_CR32","unstructured":"Mitchell M, Dodge J, Goyal A, Yamaguchi K, Stratos K, Han X, Mensch A, Berg A, Berg T, Daum\u00e9\u00a0III H (2012) Midge: Generating image descriptions from computer vision detections. In: Proceedings of the 13th Conference of the European Chapter of the Association for Computational Linguistics, pp 747\u2013756"},{"key":"10980_CR33","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the Annual Meeting on Association for Computational Linguistics, Association for Computational Linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"10980_CR34","doi-asserted-by":"crossref","unstructured":"Plummer BA, Wang L, Cervantes CM, Caicedo JC, Hockenmaier J, Lazebnik S (2015) Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision, pp 2641\u20132649","DOI":"10.1109\/ICCV.2015.303"},{"key":"10980_CR35","first-page":"91","volume":"28","author":"R Shaoqing","year":"2015","unstructured":"Shaoqing R, Kaiming H, Ross G, Jian S (2015) Faster r-cnn: towards real-time object detection with region proposal networks. In: Proceedings of the Conference on Advances in Neural Information Processing Systems, pp 28:91\u201399","journal-title":"In: Proceedings of the Conference on Advances in Neural Information Processing Systems, pp"},{"key":"10980_CR36","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"10980_CR37","doi-asserted-by":"crossref","unstructured":"Sammani F, Melas-Kyriazi L (2020) Show, edit and tell: A framework for editing image captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4808\u20134816","DOI":"10.1109\/CVPR42600.2020.00486"},{"key":"10980_CR38","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R (2018) Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp 2556\u20132565","DOI":"10.18653\/v1\/P18-1238"},{"key":"10980_CR39","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"key":"10980_CR40","first-page":"5998","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. In: Proceedings of the Conference on Advances in Neural Information Processing Systems, pp 30:5998\u20136008","journal-title":"In: Proceedings of the Conference on Advances in Neural Information Processing Systems, pp"},{"key":"10980_CR41","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10980_CR42","doi-asserted-by":"crossref","unstructured":"Vig J (2019) A multiscale visualization of attention in the transformer model. arXiv preprint arXiv:1906.05714","DOI":"10.18653\/v1\/P19-3007"},{"key":"10980_CR43","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2015) Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"10980_CR44","doi-asserted-by":"crossref","unstructured":"Wang J, Tang J, Luo J (2020) Multimodal attention with image text spatial relationship for ocr-based image captioning. In: Proceedings of the 28th ACM International Conference on Multimedia, pp 4337\u20134345","DOI":"10.1145\/3394171.3413753"},{"key":"10980_CR45","doi-asserted-by":"crossref","unstructured":"Wang X, Ma L, Fu Y, Xue X (2021) Neural symbolic representation learning for image captioning. In: Proceedings of the 2021 International Conference on Multimedia Retrieval, pp 312\u2013321","DOI":"10.1145\/3460426.3463637"},{"key":"10980_CR46","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International Conference on Machine Learning, pp 2048\u20132057"},{"key":"10980_CR47","doi-asserted-by":"publisher","first-page":"835","DOI":"10.1109\/TMM.2020.2990074","volume":"23","author":"L Yang","year":"2021","unstructured":"Yang L, Wang H, Tang P, Li Q (2021) Captionnet: A tailor-made recurrent neural network for generating image descriptions. IEEE Trans Multimedia 23:835\u2013845","journal-title":"IEEE Trans Multimedia"},{"key":"10980_CR48","doi-asserted-by":"crossref","unstructured":"Yang X, Tang K, Zhang H, Cai J (2019) Auto-encoding scene graphs for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 10685\u201310694","DOI":"10.1109\/CVPR.2019.01094"},{"key":"10980_CR49","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2018) Exploring visual relationship for image captioning. In: Proceedings of the European conference on computer vision (ECCV), pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"10980_CR50","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T (2017) Boosting image captioning with attributes. In: Proceedings of the IEEE International Conference on Computer Vision, pp 4894\u20134902","DOI":"10.1109\/ICCV.2017.524"},{"key":"10980_CR51","doi-asserted-by":"crossref","unstructured":"Zhong Y, Wang L, Chen J, Yu D, Li Y (2020) Comprehensive image captioning via scene graph decomposition. In: European Conference on Computer Vision, Springer, pp 211\u2013229","DOI":"10.1007\/978-3-030-58568-6_13"},{"key":"10980_CR52","doi-asserted-by":"crossref","unstructured":"Zhou L, Kalantidis Y, Chen X, Corso JJ, Rohrbach M (2019) Grounded video description. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6578\u20136587","DOI":"10.1109\/CVPR.2019.00674"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-022-10980-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-022-10980-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-022-10980-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,8]],"date-time":"2023-07-08T12:08:28Z","timestamp":1688818108000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-022-10980-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,28]]},"references-count":52,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["10980"],"URL":"https:\/\/doi.org\/10.1007\/s11063-022-10980-w","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,7,28]]},"assertion":[{"value":"20 July 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}