{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T05:45:18Z","timestamp":1748497518903},"reference-count":62,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2023,6,10]],"date-time":"2023-06-10T00:00:00Z","timestamp":1686355200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,6,10]],"date-time":"2023-06-10T00:00:00Z","timestamp":1686355200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s11063-023-11314-0","type":"journal-article","created":{"date-parts":[[2023,6,10]],"date-time":"2023-06-10T11:02:18Z","timestamp":1686394938000},"page":"8339-8363","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Complementary Shifted Transformer for Image Captioning"],"prefix":"10.1007","volume":"55","author":[{"given":"Yanbo","family":"Liu","sequence":"first","affiliation":[]},{"given":"You","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Ruoyu","family":"Xiang","sequence":"additional","affiliation":[]},{"given":"Jixin","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,6,10]]},"reference":[{"key":"11314_CR1","first-page":"3104","volume":"27","author":"I Sutskever","year":"2014","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. Adv Neural Inform Process Syst 27:3104\u20133112","journal-title":"Adv Neural Inform Process Syst"},{"issue":"4","key":"11314_CR2","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2016","unstructured":"Vinyals O, Toshev A, Bengio S et al (2016) Show and tell: lessons learned from the 2015 mscoco image captioning challenge. IEEE Trans Pattern Anal Mach Intell 39(4):652\u2013663","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"11314_CR3","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, et\u00a0al (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: proceedings of the IEEE conference on computer vision and pattern recognition pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"11314_CR4","unstructured":"Xu K, Ba J, Kiros R, et\u00a0al (2015) Show, attend and tell: Neural image caption generation with visual attention. In: international conference on machine learning, PMLR, pp 2048\u20132057"},{"key":"11314_CR5","unstructured":"Vaswani A, Shazeer N, Parmar N, et\u00a0al (2017) Attention is all you need. In: proceedings of the 31st international conference on neural information processing systems. Curran Associates Inc., Red Hook, USA, pp 6000\u20146010"},{"key":"11314_CR6","unstructured":"Wang B, Zhao D, Lioma C, et\u00a0al (2020) Encoding word order in complex embeddings. In: international conference on learning representations"},{"key":"11314_CR7","unstructured":"Herdade S, Kappeler A, Boakye K, Soares J (2019) Image captioning: transforming objects into words. In: proceedings of the 33rd international conference on neural information processing systems, pp. 11137\u201311147"},{"key":"11314_CR8","doi-asserted-by":"crossref","unstructured":"Guo L, Liu J, Zhu X, et\u00a0al (2020) Normalized and geometry-aware self-attention network for image captioning. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,327\u201310,336","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"11314_CR9","doi-asserted-by":"crossref","unstructured":"Zhang X, Sun X, Luo Y, et\u00a0al (2021) Rstnet: Captioning with adaptive attention on visual and non-visual words. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 15,465\u201315,474","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"11314_CR10","doi-asserted-by":"crossref","unstructured":"Shaw P, Uszkoreit J, Vaswani A (2018) Self-attention with relative position representations. In: proceedings of NAACL-HLT, pp 464\u2013468","DOI":"10.18653\/v1\/N18-2074"},{"key":"11314_CR11","doi-asserted-by":"crossref","unstructured":"Hu H, Gu J, Zhang Z, et\u00a0al (2018) Relation networks for object detection. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 3588\u20133597","DOI":"10.1109\/CVPR.2018.00378"},{"key":"11314_CR12","unstructured":"Bhojanapalli S, Yun C, Rawat AS, et\u00a0al (2020) Low-rank bottleneck in multi-head attention models. In: international conference on machine learning, PMLR, pp 864\u2013873"},{"key":"11314_CR13","unstructured":"Shazeer N, Lan Z, Cheng Y, et\u00a0al (2020) Talking-heads attention. arXiv preprint arXiv:2003.02436"},{"key":"11314_CR14","doi-asserted-by":"crossref","unstructured":"Cornia M, Stefanini M, Baraldi L, et\u00a0al (2020) Meshed-memory transformer for image captioning. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,578\u201310,587","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"11314_CR15","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, et\u00a0al (2014) Microsoft coco: Common objects in context. In: computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"11314_CR16","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"11314_CR17","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X, et\u00a0al (2017) Semantic compositional networks for visual captioning. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 5630\u20135639","DOI":"10.1109\/CVPR.2017.127"},{"key":"11314_CR18","doi-asserted-by":"crossref","unstructured":"Wang Y, Lin Z, Shen X, et\u00a0al (2017) Skeleton key: Image captioning by skeleton-attribute decomposition. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 7272\u20137281","DOI":"10.1109\/CVPR.2017.780"},{"key":"11314_CR19","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O et al (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vision 123:32\u201373","journal-title":"Int J Comput Vision"},{"issue":"2","key":"11314_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3177745","volume":"14","author":"M Cornia","year":"2018","unstructured":"Cornia M, Baraldi L, Serra G et al (2018) Paying more attention to saliency: image captioning with saliency and context attention. ACM Trans Multimed Comput Commun Appl (TOMM) 14(2):1\u201321","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"key":"11314_CR21","doi-asserted-by":"crossref","unstructured":"Pan Y, Yao T, Li Y, et\u00a0al (2020) X-linear attention networks for image captioning. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,971\u201310,980","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"11314_CR22","doi-asserted-by":"crossref","unstructured":"Fei Z (2022) Attention-aligned transformer for image captioning. In: proceedings of the AAAI Conference on Artificial Intelligence, pp 607\u2013615","DOI":"10.1609\/aaai.v36i1.19940"},{"issue":"4","key":"11314_CR23","doi-asserted-by":"publisher","first-page":"705","DOI":"10.1007\/s13735-022-00260-7","volume":"11","author":"Y Yang","year":"2022","unstructured":"Yang Y, An Y, Hu J et al (2022) Tri-rat: optimizing the attention scores for image captioning. Int J Multimed Inform Retr 11(4):705\u2013715","journal-title":"Int J Multimed Inform Retr"},{"key":"11314_CR24","doi-asserted-by":"crossref","unstructured":"Zeng P, Zhang H, Song J, et\u00a0al (2022) S2 transformer for image captioning. In: Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, IJCAI-22. international joint conferences on artificial intelligence organization, pp 1608\u20131614","DOI":"10.24963\/ijcai.2022\/224"},{"key":"11314_CR25","doi-asserted-by":"crossref","unstructured":"Ma Y, Ji J, Sun X, et\u00a0al (2022) Knowing what it is: semantic-enhanced dual attention transformer. IEEE Trans Multimed","DOI":"10.1109\/TMM.2022.3164787"},{"key":"11314_CR26","doi-asserted-by":"crossref","unstructured":"Wu M, Zhang X, Sun X, et\u00a0al (2022) Difnet: Boosting visual information flow for image captioning. In: proceedings of the IEEE\/cvf conference on computer vision and pattern recognition pp 18,020\u201318,029","DOI":"10.1109\/CVPR52688.2022.01749"},{"key":"11314_CR27","doi-asserted-by":"crossref","unstructured":"Luo Y, Ji J, Sun X, et\u00a0al (2021) Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI conference on artificial intelligence, pp 2286\u20132293","DOI":"10.1609\/aaai.v35i3.16328"},{"issue":"104","key":"11314_CR28","first-page":"575","volume":"128","author":"J Hu","year":"2022","unstructured":"Hu J, Yang Y, Yao L et al (2022) Position-guided transformer for image captioning. Image Vis Comput 128(104):575","journal-title":"Image Vis Comput"},{"issue":"5","key":"11314_CR29","doi-asserted-by":"publisher","first-page":"2019","DOI":"10.1109\/TIP.2014.2311377","volume":"23","author":"J Yu","year":"2014","unstructured":"Yu J, Rui Y, Tao D (2014) Click prediction for web image reranking using multimodal sparse coding. IEEE Trans Image Process 23(5):2019\u20132032","journal-title":"IEEE Trans Image Process"},{"issue":"5","key":"11314_CR30","doi-asserted-by":"publisher","first-page":"3117","DOI":"10.1002\/int.22814","volume":"37","author":"J Zhang","year":"2022","unstructured":"Zhang J, Yang J, Yu J et al (2022) Semisupervised image classification by mutual learning of multiple self-supervised models. Int J Intell Syst 37(5):3117\u20133141","journal-title":"Int J Intell Syst"},{"key":"11314_CR31","doi-asserted-by":"crossref","unstructured":"Wang X, Ma L, Fu Y, et\u00a0al (2021) Neural symbolic representation learning for image captioning. In: proceedings of the 2021 international conference on multimedia retrieval, pp 312\u2013321","DOI":"10.1145\/3460426.3463637"},{"key":"11314_CR32","doi-asserted-by":"crossref","unstructured":"Jiang W, Hu H (2022) Hadamard product perceptron attention for image captioning. Neural Process Lett 1\u201318","DOI":"10.1007\/s11063-022-10980-w"},{"issue":"2","key":"11314_CR33","doi-asserted-by":"publisher","first-page":"563","DOI":"10.1109\/TPAMI.2019.2932058","volume":"44","author":"J Yu","year":"2019","unstructured":"Yu J, Tan M, Zhang H et al (2019) Hierarchical deep click feature prediction for fine-grained image recognition. IEEE Trans Pattern Anal Mach Intell 44(2):563\u2013578","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"11314_CR34","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, et\u00a0al (2019) Hierarchy parsing for image captioning. In: proceedings of the IEEE\/CVF international conference on computer vision, pp 2621\u20132629","DOI":"10.1109\/ICCV.2019.00271"},{"issue":"107","key":"11314_CR35","first-page":"952","volume":"116","author":"J Zhang","year":"2021","unstructured":"Zhang J, Cao Y, Wu Q (2021) Vector of locally and adaptively aggregated descriptors for image feature representation. Pattern Recogn 116(107):952","journal-title":"Pattern Recogn"},{"key":"11314_CR36","doi-asserted-by":"crossref","unstructured":"Jiang H, Misra I, Rohrbach M, et\u00a0al (2020) In defense of grid features for visual question answering. In: proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10,267\u201310,276","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"11314_CR37","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, et\u00a0al (2020) End-to-end object detection with transformers. In: Computer Vision\u2013ECCV 2020: 16th European conference, Glasgow, UK, Aug 23\u201328, 2020, Proceedings, Part I 16, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"11314_CR38","doi-asserted-by":"crossref","unstructured":"Wu Y, He K (2018) Group normalization. In: computer vision\u2013ECCV 2018: 15th European conference, Munich, Germany, September 8-14, 2018, Proceedings, Part XIII 15, Springer, pp 3\u201319","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"11314_CR39","unstructured":"Fan Y, Xie S, Xia Y, et\u00a0al (2020) Multi-branch attentive transformer. arXiv preprint arXiv:2006.10270"},{"key":"11314_CR40","doi-asserted-by":"crossref","unstructured":"Ji J, Huang X, Sun X, et\u00a0al (2022) Multi-branch distance-sensitive self-attention network for image captioning. IEEE Transactions on Multimedia","DOI":"10.1109\/TMM.2022.3169061"},{"key":"11314_CR41","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, et\u00a0al (2016) Deep residual learning for image recognition. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11314_CR42","unstructured":"Ba JL, Kiros JR, Hinton GE (2016) Layer normalization. arXiv preprint arXiv:1607.06450"},{"key":"11314_CR43","doi-asserted-by":"crossref","unstructured":"Yu T, Li X, Cai Y, et\u00a0al (2021) S2-mlpv2: Improved spatial-shift mlp architecture for vision. arXiv preprint arXiv:2108.01072","DOI":"10.1109\/WACV51458.2022.00367"},{"issue":"109","key":"11314_CR44","first-page":"420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma Y, Ji J, Sun X et al (2023) Towards local visual modeling for image captioning. Pattern Recogn 138(109):420","journal-title":"Pattern Recogn"},{"key":"11314_CR45","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, et\u00a0al (2017) Self-critical sequence training for image captioning. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 7008\u20137024","DOI":"10.1109\/CVPR.2017.131"},{"key":"11314_CR46","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, et\u00a0al (2018) Exploring visual relationship for image captioning. In: proceedings of the European conference on computer vision (ECCV), pp 684\u2013699","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"11314_CR47","doi-asserted-by":"crossref","unstructured":"Huang L, Wang W, Chen J, et\u00a0al (2019) Attention on attention for image captioning. In: proceedings of the IEEE\/CVF international conference on computer vision, pp 4634\u20134643","DOI":"10.1109\/ICCV.2019.00473"},{"key":"11314_CR48","doi-asserted-by":"crossref","unstructured":"Fan Z, Wei Z, Wang S, et\u00a0al (2021) Tcic: Theme concepts learning cross language and vision for image captioning. In: proceedings of the thirtieth international joint conference on artificial intelligence, IJCAI-21 pp 657\u2013663","DOI":"10.24963\/ijcai.2021\/91"},{"issue":"117","key":"11314_CR49","first-page":"174","volume":"201","author":"C Wang","year":"2022","unstructured":"Wang C, Shen Y, Ji L (2022) Geometry attention transformer with position-aware Lstms for image captioning. Expert Syst Appl 201(117):174","journal-title":"Expert Syst Appl"},{"key":"11314_CR50","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1016\/j.neucom.2021.10.014","volume":"468","author":"Y Wang","year":"2022","unstructured":"Wang Y, Xu J, Sun Y (2022) A visual persistence model for image captioning. Neurocomputing 468:48\u201359","journal-title":"Neurocomputing"},{"key":"11314_CR51","doi-asserted-by":"crossref","unstructured":"Agrawal H, Desai K, Wang Y, et\u00a0al (2019) Nocaps: Novel object captioning at scale. In: proceedings of the IEEE\/CVF international conference on computer vision, pp 8948\u20138957","DOI":"10.1109\/ICCV.2019.00904"},{"key":"11314_CR52","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, et\u00a0al (2002) Bleu: a method for automatic evaluation of machine translation. In: proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"11314_CR53","unstructured":"Banerjee S, Lavie A (2005) Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, pp 65\u201372"},{"key":"11314_CR54","unstructured":"Lin CY (2004) Rouge: A package for automatic evaluation of summaries. In: text summarization branches out, pp 74\u201381"},{"key":"11314_CR55","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"11314_CR56","doi-asserted-by":"crossref","unstructured":"Anderson P, Fernando B, Johnson M, et\u00a0al (2016) Spice: Semantic propositional image caption evaluation. In: computer vision\u2013ECCV 2016: 14th European conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14, Springer, pp 382\u2013398","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"11314_CR57","unstructured":"Ren S, He K, Girshick R, et\u00a0al (2015) Faster r-cnn: Towards real-time object detection with region proposal networks. In: advances in neural information processing systems, vol\u00a028. Curran Associates, Inc"},{"key":"11314_CR58","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P, et\u00a0al (2017) Aggregated residual transformations for deep neural networks. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 1492\u20131500","DOI":"10.1109\/CVPR.2017.634"},{"key":"11314_CR59","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, et\u00a0al (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, Ieee, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"11314_CR60","unstructured":"Kingma DP, Ba J (2014) Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980"},{"key":"11314_CR61","doi-asserted-by":"crossref","unstructured":"Li G, Zhu L, Liu P, et\u00a0al (2019) Entangled transformer for image captioning. In: proceedings of the IEEE\/CVF international conference on computer vision, pp 8928\u20138937","DOI":"10.1109\/ICCV.2019.00902"},{"key":"11314_CR62","doi-asserted-by":"crossref","unstructured":"Ji J, Luo Y, Sun X, et\u00a0al (2021) Improving image captioning by leveraging intra-and inter-layer global representation in transformer network. In: proceedings of the AAAI conference on artificial intelligence, pp 1655\u20131663","DOI":"10.1609\/aaai.v35i2.16258"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11314-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11063-023-11314-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-023-11314-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,28]],"date-time":"2023-10-28T19:19:36Z","timestamp":1698520776000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11063-023-11314-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,10]]},"references-count":62,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["11314"],"URL":"https:\/\/doi.org\/10.1007\/s11063-023-11314-0","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"value":"1370-4621","type":"print"},{"value":"1573-773X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,6,10]]},"assertion":[{"value":"24 May 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 June 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}